You've already forked favicon-api-v3
25.08.31
This commit is contained in:
@@ -16,13 +16,11 @@ from urllib3.exceptions import MaxRetryError, ReadTimeoutError, ConnectTimeoutEr
|
||||
from favicon_app.utils import header
|
||||
from favicon_app.utils.filetype import helpers, filetype
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
logging.captureWarnings(True)
|
||||
# 配置日志
|
||||
logger = logging.getLogger()
|
||||
|
||||
# 创建requests会话池
|
||||
requests_session = requests.Session()
|
||||
@@ -76,8 +74,8 @@ class Favicon:
|
||||
elif not (url.startswith('https://') or url.startswith('http://')):
|
||||
self._parse('http://' + url)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error('初始化错误: %s', url)
|
||||
logger.exception('初始化异常:')
|
||||
|
||||
def _parse(self, url: str):
|
||||
"""解析URL,提取协议、域名、路径和端口
|
||||
@@ -96,7 +94,7 @@ class Favicon:
|
||||
if self.scheme not in ['https', 'http']:
|
||||
if self.scheme:
|
||||
logger.warning('不支持的协议类型: %s', self.scheme)
|
||||
self.scheme = 'http' # 默认使用HTTP协议
|
||||
self.scheme = 'http'
|
||||
|
||||
# 检查域名合法性
|
||||
if self.domain and not self._check_url(self.domain):
|
||||
@@ -108,8 +106,8 @@ class Favicon:
|
||||
except Exception as e:
|
||||
self.scheme = None
|
||||
self.domain = None
|
||||
logger.error(e)
|
||||
logger.error('URL解析错误: %s', url)
|
||||
logger.exception('解析异常:')
|
||||
|
||||
def _get_icon_url(self, icon_path: str):
|
||||
"""根据图标路径生成完整的图标URL
|
||||
@@ -128,13 +126,12 @@ class Favicon:
|
||||
elif icon_path.startswith('/'):
|
||||
self.icon_url = f"{self.scheme}://{self.domain}{icon_path}"
|
||||
elif icon_path.startswith('..'):
|
||||
# 处理相对路径
|
||||
clean_path = icon_path.replace('../', '')
|
||||
self.icon_url = f"{self.scheme}://{self.domain}/{clean_path}"
|
||||
elif icon_path.startswith('./'):
|
||||
self.icon_url = f"{self.scheme}://{self.domain}{icon_path[1:]}"
|
||||
elif icon_path.startswith('data:image'):
|
||||
self.icon_url = icon_path # 处理内联base64图片
|
||||
self.icon_url = icon_path
|
||||
else:
|
||||
self.icon_url = f"{self.scheme}://{self.domain}/{icon_path}"
|
||||
|
||||
@@ -186,23 +183,37 @@ class Favicon:
|
||||
_content = base64.b64decode(data_uri[-1])
|
||||
_ct = data_uri[0].split(';')[0].split(':')[-1]
|
||||
else:
|
||||
# 使用请求会话池获取图标
|
||||
_content, _ct = self._req_get(self.icon_url)
|
||||
|
||||
# 验证是否为图片
|
||||
# image/* application/x-ico
|
||||
# if _ct and ('image' in _ct or 'ico' in _ct or helpers.is_image(_content)):
|
||||
if _ct and _content and helpers.is_image(_content):
|
||||
# 检查文件大小,过大的图片会被警告
|
||||
if len(_content) > 5 * 1024 * 1024: # 5MB
|
||||
# 检查文件大小
|
||||
if len(_content) > 5 * 1024 * 1024:
|
||||
logger.warning('图片过大: %d bytes, 域名: %s', len(_content), self.domain)
|
||||
# 确定内容类型
|
||||
content_type = filetype.guess_mime(_content) or _ct
|
||||
return _content, content_type
|
||||
return _content, filetype.guess_mime(_content) or _ct
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error('获取图标文件失败: %s', self.icon_url)
|
||||
logger.exception('获取图标异常:')
|
||||
|
||||
return None, None
|
||||
|
||||
def get_base_url(self) -> Optional[str]:
|
||||
"""获取网站基础URL
|
||||
|
||||
Returns:
|
||||
网站基础URL
|
||||
"""
|
||||
if not self.domain or '.' not in self.domain:
|
||||
return None
|
||||
|
||||
_url = f"{self.scheme}://{self.domain}"
|
||||
if self.port and self.port not in [80, 443]:
|
||||
_url += f":{self.port}"
|
||||
|
||||
return _url
|
||||
|
||||
def req_get(self) -> Optional[bytes]:
|
||||
"""获取网站首页内容
|
||||
|
||||
@@ -212,42 +223,24 @@ class Favicon:
|
||||
if not self.domain or '.' not in self.domain:
|
||||
return None
|
||||
|
||||
# 构建完整URL
|
||||
_url = f"{self.scheme}://{self.domain}"
|
||||
if self.port and self.port not in [80, 443]:
|
||||
_url += f":{self.port}"
|
||||
|
||||
# 获取页面内容
|
||||
_url = self.get_base_url()
|
||||
_content, _ct = self._req_get(_url)
|
||||
|
||||
# 验证内容类型并检查大小
|
||||
# 验证类型并检查大小
|
||||
if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct):
|
||||
if _content and len(_content) > 30 * 1024 * 1024: # 30MB
|
||||
if _content and len(_content) > 30 * 1024 * 1024:
|
||||
logger.error('页面内容过大: %d bytes, URL: %s', len(_content), _url)
|
||||
return None
|
||||
return _content
|
||||
|
||||
return None
|
||||
|
||||
def get_base_url(self) -> Optional[str]:
|
||||
"""获取网站基础URL
|
||||
|
||||
Returns:
|
||||
网站基础URL
|
||||
"""
|
||||
if not self.domain or '.' not in self.domain:
|
||||
return None
|
||||
|
||||
_url = f"{self.scheme}://{self.domain}"
|
||||
# 只有非标准端口才需要添加
|
||||
if self.port and self.port not in [80, 443]:
|
||||
_url += f":{self.port}"
|
||||
|
||||
return _url
|
||||
|
||||
@staticmethod
|
||||
def _req_get(url: str, retries: int = DEFAULT_RETRIES, timeout: int = DEFAULT_TIMEOUT) -> Tuple[
|
||||
Optional[bytes], Optional[str]]:
|
||||
def _req_get(
|
||||
url: str,
|
||||
retries: int = DEFAULT_RETRIES,
|
||||
timeout: int = DEFAULT_TIMEOUT
|
||||
) -> Tuple[Optional[bytes], Optional[str]]:
|
||||
"""发送HTTP GET请求获取内容
|
||||
|
||||
Args:
|
||||
@@ -268,7 +261,8 @@ class Favicon:
|
||||
url,
|
||||
headers=header.get_header(),
|
||||
timeout=timeout,
|
||||
allow_redirects=True
|
||||
allow_redirects=True,
|
||||
verify=False
|
||||
)
|
||||
|
||||
if req.ok:
|
||||
@@ -284,21 +278,20 @@ class Favicon:
|
||||
ct_type = _cts[0].strip()
|
||||
|
||||
# 检查响应大小
|
||||
if ct_length and int(ct_length) > 10 * 1024 * 1024: # 10MB
|
||||
if ct_length and int(ct_length) > 10 * 1024 * 1024:
|
||||
logger.warning('响应过大: %d bytes, URL: %s', int(ct_length), url)
|
||||
|
||||
return req.content, ct_type
|
||||
else:
|
||||
logger.error('请求失败: %d, URL: %s', req.status_code, url)
|
||||
break # 状态码错误不重试
|
||||
break
|
||||
except (ConnectTimeoutError, ReadTimeoutError) as e:
|
||||
retry_count += 1
|
||||
if retry_count > retries:
|
||||
logger.error('请求超时: %s, URL: %s', str(e), url)
|
||||
else:
|
||||
logger.warning('请求超时,正在重试(%d/%d): %s',
|
||||
retry_count, retries, url)
|
||||
continue # 超时错误重试
|
||||
logger.warning('请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
|
||||
continue
|
||||
except MaxRetryError as e:
|
||||
logger.error('重定向次数过多: %s, URL: %s', str(e), url)
|
||||
break
|
||||
@@ -318,7 +311,7 @@ class Favicon:
|
||||
Returns:
|
||||
域名是否合法且非内网地址
|
||||
"""
|
||||
return Favicon._check_internal(domain) and Favicon._pattern_domain.match(domain)
|
||||
return _check_internal(domain) and _pattern_domain.match(domain)
|
||||
|
||||
@staticmethod
|
||||
def _check_internal(domain: str) -> bool:
|
||||
@@ -340,10 +333,8 @@ class Favicon:
|
||||
for ip_info in ips:
|
||||
ip = ip_info[4][0]
|
||||
if '.' in ip:
|
||||
# 只要有一个IP不是内网地址,就认为是非内网
|
||||
if not ipaddress.ip_address(ip).is_private:
|
||||
return True
|
||||
# 所有IP都是内网地址或解析失败
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
|
||||
@@ -351,11 +342,6 @@ class Favicon:
|
||||
|
||||
|
||||
# 域名验证正则表达式
|
||||
Favicon._pattern_domain = re.compile(
|
||||
r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
|
||||
re.I
|
||||
)
|
||||
|
||||
_pattern_domain = re.compile(
|
||||
r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
|
||||
re.I)
|
||||
|
||||
Reference in New Issue
Block a user