You've already forked favicon-api-sync
update
This commit is contained in:
@@ -101,14 +101,14 @@ class Favicon:
|
||||
self.scheme = 'http'
|
||||
|
||||
# 检查域名合法性
|
||||
if self.domain and not self._check_url(self.domain):
|
||||
if self.domain and not _check_url(self.domain):
|
||||
self.domain = None
|
||||
|
||||
# 生成域名MD5哈希值
|
||||
if self.domain:
|
||||
self.domain_md5 = hashlib.md5(self.domain.encode("utf-8")).hexdigest()
|
||||
except Exception as e:
|
||||
failed_url_cache(self.domain, setting.time_of_1_days)
|
||||
failed_urls[self.domain] = setting.time_of_1_days + int(time.time())
|
||||
self.scheme = None
|
||||
self.domain = None
|
||||
logger.error('URL解析错误: %s, URL: %s', str(e), url)
|
||||
@@ -163,6 +163,21 @@ class Favicon:
|
||||
self._get_icon_url(icon_path)
|
||||
return self.icon_url
|
||||
|
||||
def get_base_url(self) -> Optional[str]:
    """Build the site's base URL from scheme, domain and port.

    Returns:
        The base URL string, or None when no usable domain is set
        (empty/None domain, or a domain without a dot).
    """
    domain = self.domain
    if not domain or '.' not in domain:
        return None

    # Append the port only when it is explicitly set and non-default.
    port_part = f":{self.port}" if self.port and self.port not in (80, 443) else ""
    return f"{self.scheme}://{domain}{port_part}"
|
||||
|
||||
def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]:
|
||||
"""获取图标文件内容和类型
|
||||
|
||||
@@ -187,7 +202,7 @@ class Favicon:
|
||||
_content = base64.b64decode(data_uri[-1])
|
||||
_ct = data_uri[0].split(';')[0].split(':')[-1]
|
||||
else:
|
||||
_content, _ct = self._req_get(self.icon_url, domain=self.domain)
|
||||
_content, _ct = _req_get(self.icon_url, domain=self.domain)
|
||||
|
||||
# 验证是否为图片
|
||||
# image/* application/x-ico
|
||||
@@ -202,21 +217,6 @@ class Favicon:
|
||||
|
||||
return None, None
|
||||
|
||||
def get_base_url(self) -> Optional[str]:
|
||||
"""获取网站基础URL
|
||||
|
||||
Returns:
|
||||
网站基础URL
|
||||
"""
|
||||
if not self.domain or '.' not in self.domain:
|
||||
return None
|
||||
|
||||
_url = f"{self.scheme}://{self.domain}"
|
||||
if self.port and self.port not in [80, 443]:
|
||||
_url += f":{self.port}"
|
||||
|
||||
return _url
|
||||
|
||||
def req_get(self) -> Optional[bytes]:
|
||||
"""获取网站首页内容
|
||||
|
||||
@@ -227,7 +227,7 @@ class Favicon:
|
||||
return None
|
||||
|
||||
_url = self.get_base_url()
|
||||
_content, _ct = self._req_get(_url, domain=self.domain)
|
||||
_content, _ct = _req_get(_url, domain=self.domain)
|
||||
|
||||
# 验证类型并检查大小
|
||||
if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct):
|
||||
@@ -238,124 +238,117 @@ class Favicon:
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _req_get(
|
||||
url: str,
|
||||
domain: str,
|
||||
retries: int = DEFAULT_RETRIES,
|
||||
timeout: int = DEFAULT_TIMEOUT
|
||||
) -> Tuple[Optional[bytes], Optional[str]]:
|
||||
"""发送HTTP GET请求获取内容
|
||||
|
||||
Args:
|
||||
url: 请求URL
|
||||
retries: 重试次数
|
||||
timeout: 超时时间(秒)
|
||||
|
||||
Returns:
|
||||
元组(内容, 内容类型)
|
||||
"""
|
||||
logger.debug('发送请求: %s', url)
|
||||
|
||||
retry_count = 0
|
||||
while retry_count <= retries:
|
||||
try:
|
||||
# 使用全局会话池
|
||||
req = requests_session.get(
|
||||
url,
|
||||
headers=header.get_header(),
|
||||
timeout=timeout,
|
||||
allow_redirects=True,
|
||||
verify=False
|
||||
)
|
||||
def _check_internal(domain: str) -> bool:
    """Check that the address is not an internal/private network address.

    Args:
        domain: Hostname or dotted-decimal IPv4 string.

    Returns:
        True: not internal; False: internal, or resolution failed.
    """
    try:
        # Heuristic IP-literal detection: a string of only digits and dots
        # is treated as an IPv4 address.
        # NOTE(review): IPv6 literals are not detected here and would fall
        # through to DNS resolution — confirm callers never pass them.
        if domain.replace('.', '').isdigit():
            return not ipaddress.ip_address(domain).is_private
        else:
            # Resolve the hostname to all of its addresses.
            ips = socket.getaddrinfo(domain, None)
            for ip_info in ips:
                ip = ip_info[4][0]
                # Only IPv4 results ('.' in the address text) are examined;
                # a single public A record is enough to accept the host.
                # NOTE(review): IPv6-only hosts therefore return False, and
                # a host with mixed private+public records is accepted —
                # confirm this is the intended SSRF posture.
                if '.' in ip:
                    if not ipaddress.ip_address(ip).is_private:
                        return True
            return False
    except Exception as e:
        # Resolution/parse failure: ban the domain for one day in the
        # shared failure cache (value is the expiry timestamp).
        failed_urls[domain] = setting.time_of_1_days + int(time.time())
        logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
        return False
|
||||
|
||||
|
||||
def _check_url(domain: str) -> bool:
    """Check that the domain is well-formed and not an internal address.

    Args:
        domain: Hostname to validate.

    Returns:
        True when the domain matches the domain pattern AND does not
        resolve to a private/internal address; False otherwise.
    """
    # bool(...) so the function honours its -> bool annotation:
    # re.match() returns a Match object or None, and a bare `and`
    # would propagate either one directly to the caller.
    # Short-circuit order is kept: cheap regex check first, DNS second.
    return bool(_pattern_domain.match(domain) and _check_internal(domain))
|
||||
|
||||
|
||||
def _req_get(url: str,
             domain: str,
             retries: int = DEFAULT_RETRIES,
             timeout: int = DEFAULT_TIMEOUT) -> Tuple[Optional[bytes], Optional[str]]:
    """Send an HTTP GET request and return the response body and content type.

    Args:
        url: Request URL.
        domain: Domain used as the key in the failure cache on errors.
        retries: Number of retries on timeout.
        timeout: Per-request timeout in seconds.

    Returns:
        Tuple of (content, content type); (None, None) on any failure.
    """
    logger.debug('发送请求: %s', url)

    retry_count = 0
    while retry_count <= retries:
        try:
            # Global pooled session with redirects enabled.
            # NOTE(review): verify=False disables TLS certificate
            # verification — confirm this is intentional for favicon
            # fetching from arbitrary hosts.
            req = requests_session.get(
                url,
                headers=header.get_header(),
                timeout=timeout,
                allow_redirects=True,
                verify=False
            )

            if req.ok:
                ct_type = req.headers.get('Content-Type')
                ct_length = req.headers.get('Content-Length')

                # Strip any "; charset=..." parameter so only the bare
                # media type is returned.
                if ct_type and ';' in ct_type:
                    _cts = ct_type.split(';')
                    if 'charset' in _cts[0]:
                        ct_type = _cts[-1].strip()
                    else:
                        ct_type = _cts[0].strip()

                # Oversized responses (> 10 MiB per Content-Length) are
                # logged but still returned to the caller.
                if ct_length and int(ct_length) > 10 * 1024 * 1024:
                    logger.warning('响应过大: %d bytes, URL: %s', int(ct_length), url)

                return req.content, ct_type
            else:
                # Non-OK status: ban the domain for one hour
                # (cache value is the expiry timestamp).
                failed_urls[domain] = setting.time_of_1_hours + int(time.time())
                logger.error('请求失败: %d, URL: %s', req.status_code, url)
                break
        except (ConnectTimeoutError, ReadTimeoutError) as e:
            # NOTE(review): these are urllib3 exception types; requests
            # normally wraps them in requests.exceptions.ConnectTimeout /
            # ReadTimeout, so confirm this retry branch can actually fire.
            retry_count += 1
            if retry_count > retries:
                # Retries exhausted: short (5-minute) ban.
                failed_urls[domain] = setting.time_of_5_minus + int(time.time())
                logger.error('请求超时: %s, URL: %s', str(e), url)
            else:
                logger.warning('请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
            continue
        except MaxRetryError as e:
            # Too many redirects / internal urllib3 retries: one-hour ban.
            failed_urls[domain] = setting.time_of_1_hours + int(time.time())
            logger.error('重定向次数过多: %s, URL: %s', str(e), url)
            break
        except Exception as e:
            # Any other request failure: one-hour ban.
            failed_urls[domain] = setting.time_of_1_hours + int(time.time())
            logger.error('请求异常: %s, URL: %s', str(e), url)
            break

    return None, None
|
||||
|
||||
|
||||
# 域名验证正则表达式
|
||||
_pattern_domain = re.compile(
|
||||
r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
|
||||
re.I)
|
||||
|
||||
|
||||
def failed_url_cache(_domain: str, _time: int):
    """Record a failed domain in the shared failure cache.

    Args:
        _domain: Domain that failed; falsy values are ignored.
        _time: Ban duration in seconds, added to the current time.

    The cached value is the ban's expiry timestamp.  An entry is only
    (re)written when the domain has no entry yet or its existing ban has
    already expired, so an active longer ban (e.g. 7 days) is never
    shortened by a later, shorter one.
    """
    if _domain:
        _current_time = int(time.time())
        _expiry = failed_urls.get(_domain)
        # Bug fix: the previous comparison (`_current_time <= expiry`)
        # was inverted — it kept overwriting still-active bans while an
        # expired entry was never refreshed, so a domain that failed
        # again after its ban lapsed was never re-banned.
        if _expiry is None or _current_time >= _expiry:
            failed_urls[_domain] = _current_time + _time
|
||||
|
||||
Reference in New Issue
Block a user