master
jinql 2025-09-10 14:49:23 +08:00
parent f0b8929035
commit cb9b9c2d35
5 changed files with 487 additions and 572 deletions

View File

@ -22,6 +22,7 @@ urllib3.disable_warnings()
logging.captureWarnings(True) logging.captureWarnings(True)
# 配置日志 # 配置日志
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# warnings.filterwarnings("ignore", category=RuntimeWarning)
# 创建requests会话池 # 创建requests会话池
requests_session = requests.Session() requests_session = requests.Session()
@ -103,7 +104,7 @@ class Favicon:
self.scheme = 'http' self.scheme = 'http'
# 检查域名合法性 # 检查域名合法性
if self.domain and not self._check_url(self.domain): if self.domain and not _check_url(self.domain):
self.domain = None self.domain = None
# 生成域名MD5哈希值 # 生成域名MD5哈希值
@ -165,6 +166,21 @@ class Favicon:
self._get_icon_url(icon_path) self._get_icon_url(icon_path)
return self.icon_url return self.icon_url
def get_base_url(self) -> Optional[str]:
"""获取网站基础URL
Returns:
网站基础URL
"""
if not self.domain or '.' not in self.domain:
return None
_url = f"{self.scheme}://{self.domain}"
if self.port and self.port not in [80, 443]:
_url += f":{self.port}"
return _url
async def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]: async def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]:
"""获取图标文件内容和类型 """获取图标文件内容和类型
@ -189,7 +205,7 @@ class Favicon:
_content = base64.b64decode(data_uri[-1]) _content = base64.b64decode(data_uri[-1])
_ct = data_uri[0].split(';')[0].split(':')[-1] _ct = data_uri[0].split(';')[0].split(':')[-1]
else: else:
_content, _ct = await self._req_get(self.icon_url, domain=self.domain) _content, _ct = await _req_get(self.icon_url, domain=self.domain)
# 验证是否为图片 # 验证是否为图片
# image/* application/x-ico # image/* application/x-ico
@ -204,21 +220,6 @@ class Favicon:
return None, None return None, None
def get_base_url(self) -> Optional[str]:
"""获取网站基础URL
Returns:
网站基础URL
"""
if not self.domain or '.' not in self.domain:
return None
_url = f"{self.scheme}://{self.domain}"
if self.port and self.port not in [80, 443]:
_url += f":{self.port}"
return _url
async def req_get(self) -> Optional[bytes]: async def req_get(self) -> Optional[bytes]:
"""获取网站首页内容 """获取网站首页内容
@ -229,7 +230,7 @@ class Favicon:
return None return None
_url = self.get_base_url() _url = self.get_base_url()
_content, _ct = await self._req_get(_url, domain=self.domain) _content, _ct = await _req_get(_url, domain=self.domain)
# 验证类型并检查大小 # 验证类型并检查大小
if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct): if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct):
@ -240,118 +241,117 @@ class Favicon:
return None return None
@staticmethod
async def _req_get(
url: str,
domain: str,
retries: int = DEFAULT_RETRIES,
timeout: int = DEFAULT_TIMEOUT
) -> Tuple[Optional[bytes], Optional[str]]:
"""异步发送HTTP GET请求获取内容
Args: def _check_internal(domain: str) -> bool:
url: 请求URL """检查网址是否非内网地址
retries: 重试次数
timeout: 超时时间()
Returns: Args:
元组(内容, 内容类型) domain: 域名
"""
global _aiohttp_client
logger.debug('发送异步请求: %s', url)
# 初始化aiohttp客户端会话 Returns:
if _aiohttp_client is None: True: 非内网False: 是内网/无法解析
_aiohttp_client = aiohttp.ClientSession( """
connector=aiohttp.TCPConnector(verify_ssl=False, limit=1000), try:
timeout=aiohttp.ClientTimeout(total=timeout), # 检查是否为IP地址
raise_for_status=False if domain.replace('.', '').isdigit():
) return not ipaddress.ip_address(domain).is_private
else:
retry_count = 0 # 解析域名获取IP地址
while retry_count <= retries: ips = socket.getaddrinfo(domain, None)
try: for ip_info in ips:
async with _aiohttp_client.get( ip = ip_info[4][0]
url, if '.' in ip:
headers=header.get_header(), if not ipaddress.ip_address(ip).is_private:
allow_redirects=True, return True
timeout=timeout,
) as resp:
if resp.ok:
ct_type = resp.headers.get('Content-Type')
ct_length = resp.headers.get('Content-Length')
# 处理Content-Type
if ct_type and ';' in ct_type:
_cts = ct_type.split(';')
if 'charset' in _cts[0]:
ct_type = _cts[-1].strip()
else:
ct_type = _cts[0].strip()
# 检查响应大小
if ct_length and int(ct_length) > 10 * 1024 * 1024:
logger.warning('响应过大: %d bytes, URL: %s', int(ct_length), url)
content = await resp.read()
return content, ct_type
else:
await redis_pool.set_failed_domain(domain, setting.time_of_7_days)
logger.error('异步请求失败: %d, URL: %s', resp.status, url)
break
except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e:
retry_count += 1
if retry_count > retries:
logger.error('异步请求超时: %s, URL: %s', str(e), url)
else:
logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
continue
except Exception as e:
await redis_pool.set_failed_domain(domain, setting.time_of_7_days)
logger.error('异步请求异常: %s, URL: %s', str(e), url)
break
return None, None
@staticmethod
def _check_url(domain: str) -> bool:
"""检查域名是否合法且非内网地址
Args:
domain: 域名
Returns:
域名是否合法且非内网地址
"""
return Favicon.check_internal(domain) and _pattern_domain.match(domain)
@staticmethod
def check_internal(domain: str) -> bool:
"""检查网址是否非内网地址
Args:
domain: 域名
Returns:
True: 非内网False: 是内网/无法解析
"""
try:
# 检查是否为IP地址
if domain.replace('.', '').isdigit():
return not ipaddress.ip_address(domain).is_private
else:
# 解析域名获取IP地址
ips = socket.getaddrinfo(domain, None)
for ip_info in ips:
ip = ip_info[4][0]
if '.' in ip:
if not ipaddress.ip_address(ip).is_private:
return True
return False
except Exception as e:
redis_pool.set_failed_domain(domain, setting.time_of_7_days)
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
return False return False
except Exception as e:
redis_pool.set_failed_domain(domain, setting.time_of_1_days)
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
return False
def _check_url(domain: str) -> bool:
"""检查域名是否合法且非内网地址
Args:
domain: 域名
Returns:
域名是否合法且非内网地址
"""
return _pattern_domain.match(domain) and _check_internal(domain)
async def _req_get(url: str,
domain: str,
retries: int = DEFAULT_RETRIES,
timeout: int = DEFAULT_TIMEOUT) -> Tuple[Optional[bytes], Optional[str]]:
"""异步发送HTTP GET请求获取内容
Args:
url: 请求URL
retries: 重试次数
timeout: 超时时间()
Returns:
元组(内容, 内容类型)
"""
global _aiohttp_client
logger.debug('发送异步请求: %s', url)
# 初始化aiohttp客户端会话
if _aiohttp_client is None:
_aiohttp_client = aiohttp.ClientSession(
connector=aiohttp.TCPConnector(verify_ssl=False, limit=1000),
timeout=aiohttp.ClientTimeout(total=timeout),
raise_for_status=False
)
retry_count = 0
while retry_count <= retries:
try:
async with _aiohttp_client.get(
url,
headers=header.get_header(),
allow_redirects=True,
timeout=timeout,
) as resp:
if resp.ok:
ct_type = resp.headers.get('Content-Type')
ct_length = resp.headers.get('Content-Length')
# 处理Content-Type
if ct_type and ';' in ct_type:
_cts = ct_type.split(';')
if 'charset' in _cts[0]:
ct_type = _cts[-1].strip()
else:
ct_type = _cts[0].strip()
# 检查响应大小
if ct_length and int(ct_length) > 10 * 1024 * 1024:
logger.warning('响应过大: %d bytes, URL: %s', int(ct_length), url)
content = await resp.read()
return content, ct_type
else:
await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求失败: %d, URL: %s', resp.status, url)
break
except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e:
retry_count += 1
if retry_count > retries:
await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求超时: %s, URL: %s', str(e), url)
else:
logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
continue
except Exception as e:
await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求异常: %s, URL: %s', str(e), url)
break
return None, None
# 域名验证正则表达式 # 域名验证正则表达式

View File

@ -19,9 +19,6 @@ logger = logging.getLogger(__name__)
_icon_root_path = setting.icon_root_path _icon_root_path = setting.icon_root_path
_default_icon_path = setting.default_icon_path _default_icon_path = setting.default_icon_path
# 创建全局服务实例
_service = favicon_service.FaviconService()
# 创建FastAPI路由器 # 创建FastAPI路由器
favicon_router = APIRouter(prefix="", tags=["favicon"]) favicon_router = APIRouter(prefix="", tags=["favicon"])
@ -35,13 +32,13 @@ async def get_favicon(
refresh: Optional[str] = Query(None, include_in_schema=False), refresh: Optional[str] = Query(None, include_in_schema=False),
): ):
"""获取网站图标""" """获取网站图标"""
return await _service.get_favicon_handler(request, bg_tasks, url, refresh) return await favicon_service.get_favicon_handler(request, bg_tasks, url, refresh)
@favicon_router.get('/icon/default') @favicon_router.get('/icon/default')
async def get_default_icon(): async def get_default_icon():
"""获取默认图标""" """获取默认图标"""
return _service.get_default() return favicon_service.get_default()
@favicon_router.get('/icon/referer', include_in_schema=False) @favicon_router.get('/icon/referer', include_in_schema=False)

View File

@ -7,7 +7,7 @@ import random
import re import re
import time import time
import warnings import warnings
from typing import Tuple, List, Optional from typing import Tuple, Optional
import bs4 import bs4
import urllib3 import urllib3
@ -30,429 +30,356 @@ warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
_current_dir = os.path.dirname(os.path.abspath(__file__)) _current_dir = os.path.dirname(os.path.abspath(__file__))
class FaviconService: async def get_favicon_handler(request: Request,
"""图标服务类,封装所有与图标获取、缓存和处理相关的功能""" bg_tasks: BackgroundTasks,
url: Optional[str] = None,
refresh: Optional[str] = None) -> dict[str, str] | Response:
"""异步处理获取图标的请求"""
def __init__(self): logger.info(
# 全局计数器 f"队列大小(异步) queue/failed"
self.url_count = 0 f"{await redis_pool.get_cache_size(prefix=redis_pool.ICON_QUEUE_PREFIX)} "
self.request_icon_count = 0 f"| {await redis_pool.get_cache_size(prefix=redis_pool.FAILED_DOMAINS_PREFIX)}")
self.request_cache_count = 0
# 预编译正则表达式,提高性能 # 验证URL参数
self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I) if not url:
self.pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)', return {"message": "请提供url参数"}
re.I)
# 计算默认图标的MD5值 try:
self.default_icon_md5 = self._initialize_default_icon_md5() entity = Favicon(url)
def _initialize_default_icon_md5(self) -> List[str]: # 验证域名
"""初始化默认图标MD5值列表""" if not entity.domain:
md5_list = [self._get_file_md5(setting.default_icon_path), logger.warning(f"无效的URL: {url}")
'05231fb6b69aff47c3f35efe09c11ba0', return get_default(setting.time_of_7_days)
'3ca64f83fdcf25135d87e08af65e68c9',
'db470fd0b65c8c121477343c37f74f02',
'52419f3f4f7d11945d272facc76c9e6a',
'b8a0bf372c762e966cc99ede8682bc71',
'71e9c45f29eadfa2ec5495302c22bcf6',
'ababc687adac587b8a06e580ee79aaa1',
'43802bddf65eeaab643adb8265bfbada']
# 过滤掉None值
return [md5 for md5 in md5_list if md5]
@staticmethod # 检查缓存中的失败URL
def _get_file_md5(file_path: str) -> Optional[str]: if await redis_pool.is_domain_failed(entity.domain):
"""计算文件的MD5值""" return get_default(setting.time_of_1_days)
try:
md5 = hashlib.md5()
with open(file_path, 'rb') as f:
while True:
buffer = f.read(1024 * 8)
if not buffer:
break
md5.update(buffer)
return md5.hexdigest().lower()
except Exception as e:
logger.error(f"计算文件MD5失败 {file_path}: {e}")
return None
def _is_default_icon_md5(self, icon_md5: str) -> bool: # 检查缓存
"""检查图标MD5是否为默认图标""" _cached, cached_icon = _get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1'])
return icon_md5 in self.default_icon_md5
def _is_default_icon_file(self, file_path: str) -> bool: if _cached or cached_icon:
"""检查文件是否为默认图标""" # 使用缓存图标
if os.path.exists(file_path) and os.path.isfile(file_path): icon_content = cached_icon if cached_icon else _cached
md5 = self._get_file_md5(file_path)
return md5 in self.default_icon_md5 if md5 else False
return False
def _is_default_icon_byte(self, file_content: bytes) -> bool: # 确定内容类型和缓存时间
"""检查字节内容是否为默认图标""" content_type = filetype.guess_mime(icon_content) if icon_content else ""
try: cache_time = setting.time_of_12_hours \
md5 = hashlib.md5(file_content).hexdigest().lower() if _is_default_icon_byte(icon_content) else setting.time_of_7_days
return md5 in self.default_icon_md5
except Exception as e:
logger.error(f"计算字节内容MD5失败: {e}")
return False
def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]: # 乐观缓存机制:检查缓存是否已过期但仍有缓存内容
"""从缓存中获取图标文件""" # _cached 存在但 cached_icon 为 None 表示缓存已过期
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', domain + '.png') if _cached and not cached_icon:
if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0: # 缓存已过期,后台刷新缓存
try: logger.info(f"缓存已过期,加入后台队列刷新(异步): {entity.domain}")
cached_icon = FileUtil.read_file(cache_path, mode='rb') await redis_pool.set_cache(
file_time = int(os.path.getmtime(cache_path)) f"{entity.domain}",
entity.domain,
setting.time_of_2_hours,
prefix=redis_pool.ICON_QUEUE_PREFIX
)
bg_tasks.add_task(get_icon_async, entity, _cached)
# 验证是否为有效的图片文件 return Response(content=icon_content,
if not helpers.is_image(cached_icon): media_type=content_type if content_type else "image/x-icon",
logger.warning(f"缓存的图标不是有效图片: {cache_path}") headers=_get_header(content_type, cache_time))
return None, None else:
# 开始图标处理,加入队列
await redis_pool.set_cache(
f"{entity.domain}",
entity.domain,
setting.time_of_2_hours,
prefix=redis_pool.ICON_QUEUE_PREFIX
)
# 处理刷新请求或缓存过期情况 # 没有缓存,实时处理,检查队列大小
if refresh: _queue_size = await redis_pool.get_cache_size(prefix=redis_pool.ICON_QUEUE_PREFIX)
if int(time.time()) - file_time <= setting.time_of_12_hours: if _queue_size >= setting.MAX_QUEUE_SIZE:
logger.info(f"缓存文件修改时间在有效期内,不执行刷新: {cache_path}") # 加入后台队列并返回默认图片
return cached_icon, cached_icon logger.info(
return cached_icon, None f"队列大小({_queue_size})>={setting.MAX_QUEUE_SIZE},返回默认图片并加入后台队列(异步): {entity.domain}")
bg_tasks.add_task(get_icon_async, entity, _cached)
# 检查缓存是否过期最大30天 return get_default(0)
if int(time.time()) - file_time > setting.time_of_30_days:
logger.info(f"图标缓存过期(>30天): {cache_path}")
return cached_icon, None
# 默认图标,使用随机的缓存时间
if (int(time.time()) - file_time > setting.time_of_1_days * random.randint(1, 7)
and self._is_default_icon_file(cache_path)):
logger.info(f"默认图标缓存过期: {cache_path}")
return cached_icon, None
return cached_icon, cached_icon
except Exception as e:
logger.error(f"读取缓存文件失败 {cache_path}: {e}")
return None, None
return None, None
def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""获取缓存的图标"""
_cached, cached_icon = self._get_cache_file(domain_md5, refresh)
# 替换默认图标
if _cached and self._is_default_icon_byte(_cached):
_cached = setting.default_icon_file
if cached_icon and self._is_default_icon_byte(cached_icon):
cached_icon = setting.default_icon_file
return _cached, cached_icon
def _get_header(self, content_type: str, cache_time: int = None) -> dict:
"""生成响应头"""
if cache_time is None:
cache_time = setting.time_of_7_days
_ct = 'image/x-icon'
if content_type and content_type in header.image_type:
_ct = content_type
cache_control = 'no-store, no-cache, must-revalidate, max-age=0' if cache_time == 0 else f'public, max-age={cache_time}'
return {
'Content-Type': _ct,
'Cache-Control': cache_control,
'X-Robots-Tag': 'noindex, nofollow'
}
def _parse_html(self, content: Optional[bytes], entity: Favicon) -> Optional[str]:
"""从HTML内容中解析图标URL"""
if not content:
return None
try:
# 尝试将bytes转换为字符串
# str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
content_str = content.decode('utf-8', 'replace')
# 使用更高效的解析器
bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
if len(bs) == 0:
bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))
html_links = bs.find_all("link", rel=self.pattern_icon)
# 如果没有找到,尝试使用正则表达式直接匹配
if not html_links or len(html_links) == 0:
content_links = self.pattern_link.findall(content_str)
c_link = ''.join([_links[0] for _links in content_links])
bs = bs4.BeautifulSoup(c_link, features='lxml')
html_links = bs.find_all("link", rel=self.pattern_icon)
if html_links and len(html_links) > 0:
# 优先查找指定rel类型的图标
icon_url = (self._get_link_rel(html_links, entity, 'shortcut icon') or
self._get_link_rel(html_links, entity, 'icon') or
self._get_link_rel(html_links, entity, 'alternate icon') or
self._get_link_rel(html_links, entity, ''))
if icon_url:
logger.info(f"-> 从HTML获取图标URL: {icon_url}")
return icon_url
except Exception as e:
logger.error(f"解析HTML失败: {e}")
return None
@staticmethod
def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
"""从链接列表中查找指定rel类型的图标URL"""
if not links:
return None
for link in links:
r = link.get('rel')
_r = ' '.join(r) if isinstance(r, list) else r
_href = link.get('href')
if _rel:
if _r.lower() == _rel:
return entity.get_icon_url(str(_href))
else: else:
return entity.get_icon_url(str(_href)) # 队列<MAX_QUEUE_SIZE实时处理
logger.info(f"队列大小({_queue_size})<{setting.MAX_QUEUE_SIZE},实时处理(异步): {entity.domain}")
return None # 始终使用异步方法获取图标
icon_content = await get_icon_async(entity, _cached)
def get_default(self, cache_time: int = None) -> Response: if not icon_content:
if cache_time is None: # 获取失败,返回默认图标
cache_time = setting.time_of_1_days return get_default()
return Response(content=setting.default_icon_file,
media_type="image/png",
headers=self._get_header("image/png", cache_time))
def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
"""同步获取图标"""
icon_content = None
try:
# 尝试从网站获取HTML内容
html_content = entity.req_get()
if html_content:
icon_url = self._parse_html(html_content, entity)
else:
icon_url = None
# 尝试不同的图标获取策略
strategies = [
# 1. 从原始网页标签链接中获取
lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
# 2. 从 gstatic.cn 接口获取
lambda: (
f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
"gstatic接口"),
# 3. 从网站默认位置获取
lambda: ('', "网站默认位置/favicon.ico"),
# 4. 从其他api接口获取
lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
# 99. 最后的尝试cloudflare workers
# lambda: (f'https://favicon.cary.cc/?url={entity.get_base_url()}', "cloudflare"),
]
for strategy in strategies:
if icon_content:
break
strategy_url, strategy_name = strategy()
if strategy_url is not None:
logger.debug(f"-> 尝试从 {strategy_name} 获取图标")
icon_content, icon_type = entity.get_icon_file(strategy_url, strategy_url == '')
# 图标获取失败,或图标不是支持的图片格式,写入默认图标
if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)):
logger.warning(f"-> 获取图标失败,使用默认图标: {entity.domain}")
icon_content = _cached if _cached else setting.default_icon_file
if icon_content:
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', entity.domain_md5 + '.png')
md5_path = os.path.join(setting.icon_root_path, 'data', 'text', entity.domain_md5 + '.txt')
try:
# 确保目录存在
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
os.makedirs(os.path.dirname(md5_path), exist_ok=True)
# 写入缓存文件
FileUtil.write_file(cache_path, icon_content, mode='wb')
FileUtil.write_file(md5_path, entity.domain, mode='w')
except Exception as e:
logger.error(f"写入缓存文件失败: {e}")
self.request_icon_count += 1
return icon_content
except Exception as e:
logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
return _cached or setting.default_icon_file
finally:
redis_pool.remove_cache(f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}")
async def get_icon_async(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
"""异步获取图标"""
icon_content = None
try:
# 尝试从网站异步获取HTML内容
html_content = await entity.req_get()
if html_content:
icon_url = self._parse_html(html_content, entity)
else:
icon_url = None
# 尝试不同的图标获取策略
strategies = [
# 1. 从原始网页标签链接中获取
lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
# 2. 从 gstatic.cn 接口获取
lambda: (
f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
"gstatic接口"),
# 3. 从网站默认位置获取
lambda: ('', "网站默认位置/favicon.ico"),
# 4. 从其他api接口获取
lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
]
for strategy in strategies:
if icon_content:
break
strategy_url, strategy_name = strategy()
if strategy_url is not None:
logger.debug(f"-> 异步尝试从 {strategy_name} 获取图标")
icon_content, icon_type = await entity.get_icon_file(strategy_url, strategy_url == '')
# 图标获取失败,或图标不是支持的图片格式,写入默认图标
if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)):
logger.warning(f"-> 异步获取图标失败,使用默认图标: {entity.domain}")
icon_content = _cached if _cached else setting.default_icon_file
if icon_content:
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', entity.domain_md5 + '.png')
md5_path = os.path.join(setting.icon_root_path, 'data', 'text', entity.domain_md5 + '.txt')
try:
# 确保目录存在
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
os.makedirs(os.path.dirname(md5_path), exist_ok=True)
# 写入缓存文件注意文件IO操作仍然是同步的
FileUtil.write_file(cache_path, icon_content, mode='wb')
FileUtil.write_file(md5_path, entity.domain, mode='w')
except Exception as e:
logger.error(f"异步写入缓存文件失败: {e}")
self.request_icon_count += 1
return icon_content
except Exception as e:
logger.error(f"异步获取图标时发生错误 {entity.domain}: {e}")
return _cached or setting.default_icon_file
finally:
await redis_pool.remove_cache(f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}")
async def get_favicon_handler(
self,
request: Request,
bg_tasks: BackgroundTasks,
url: Optional[str] = None,
refresh: Optional[str] = None,
) -> dict[str, str] | Response:
"""异步处理获取图标的请求"""
logger.info(
f"队列大小(异步) queue/failed{await redis_pool.get_cache_size(f"{redis_pool.ICON_QUEUE_PREFIX}")} | {await redis_pool.get_cache_size(f"{redis_pool.FAILED_DOMAINS_PREFIX}")}")
self.url_count += 1
# 验证URL参数
if not url:
return {"message": "请提供url参数"}
try:
entity = Favicon(url)
# 验证域名
if not entity.domain:
logger.warning(f"无效的URL: {url}")
return self.get_default(setting.time_of_7_days)
# 检查缓存中的失败URL
if await redis_pool.is_domain_failed(entity.domain):
return self.get_default(setting.time_of_7_days)
# 检查缓存
_cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1'])
if _cached or cached_icon:
# 使用缓存图标
icon_content = cached_icon if cached_icon else _cached
self.request_cache_count += 1
# 确定内容类型和缓存时间 # 确定内容类型和缓存时间
content_type = filetype.guess_mime(icon_content) if icon_content else "" content_type = filetype.guess_mime(icon_content) if icon_content else ""
cache_time = setting.time_of_12_hours \ cache_time = setting.time_of_12_hours \
if self._is_default_icon_byte(icon_content) else setting.time_of_7_days if _is_default_icon_byte(icon_content) else setting.time_of_7_days
# 乐观缓存机制:检查缓存是否已过期但仍有缓存内容
# _cached 存在但 cached_icon 为 None 表示缓存已过期
if _cached and not cached_icon:
# 缓存已过期,后台刷新缓存
logger.info(f"缓存已过期,加入后台队列刷新(异步): {entity.domain}")
await redis_pool.set_cache(
f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}",
entity.domain,
setting.time_of_2_hours
)
bg_tasks.add_task(self.get_icon_sync, entity, _cached)
return Response(content=icon_content, return Response(content=icon_content,
media_type=content_type if content_type else "image/x-icon", media_type=content_type if content_type else "image/x-icon",
headers=self._get_header(content_type, cache_time)) headers=_get_header(content_type, cache_time))
else: except Exception as e:
# 开始图标处理,加入队列 logger.error(f"处理图标请求时发生错误 {url}: {e}")
await redis_pool.set_cache( # 返回默认图标
f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}", return get_default()
entity.domain,
setting.time_of_2_hours
)
# 没有缓存,实时处理,检查队列大小
_queue_size = await redis_pool.get_cache_size(f"{redis_pool.ICON_QUEUE_PREFIX}")
if _queue_size >= setting.MAX_QUEUE_SIZE:
# 加入后台队列并返回默认图片
logger.info(
f"队列大小({_queue_size})>={setting.MAX_QUEUE_SIZE},返回默认图片并加入后台队列(异步): {entity.domain}")
bg_tasks.add_task(self.get_icon_sync, entity, _cached)
return self.get_default(0)
else:
# 队列<MAX_QUEUE_SIZE实时处理
logger.info(f"队列大小({_queue_size})<{setting.MAX_QUEUE_SIZE},实时处理(异步): {entity.domain}")
# 始终使用异步方法获取图标 async def get_icon_async(entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
icon_content = await self.get_icon_async(entity, _cached) """异步获取图标"""
icon_content = None
if not icon_content: try:
# 获取失败,返回默认图标 # 尝试从网站异步获取HTML内容
return self.get_default() html_content = await entity.req_get()
if html_content:
icon_url = _parse_html(html_content, entity)
else:
icon_url = None
# 确定内容类型和缓存时间 # 尝试不同的图标获取策略
content_type = filetype.guess_mime(icon_content) if icon_content else "" strategies = [
cache_time = setting.time_of_12_hours \ # 1. 从原始网页标签链接中获取
if self._is_default_icon_byte(icon_content) else setting.time_of_7_days lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
# 2. 从 gstatic.cn 接口获取
lambda: (
f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
"gstatic接口"),
# 3. 从网站默认位置获取
lambda: ('', "网站默认位置/favicon.ico"),
# 4. 从其他api接口获取
lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
]
return Response(content=icon_content, for strategy in strategies:
media_type=content_type if content_type else "image/x-icon", if icon_content:
headers=self._get_header(content_type, cache_time)) break
strategy_url, strategy_name = strategy()
if strategy_url is not None:
logger.debug(f"-> 异步尝试从 {strategy_name} 获取图标")
icon_content, icon_type = await entity.get_icon_file(strategy_url, strategy_url == '')
# 图标获取失败,或图标不是支持的图片格式,写入默认图标
if (not icon_content) or (not helpers.is_image(icon_content) or _is_default_icon_byte(icon_content)):
logger.warning(f"-> 异步获取图标失败,使用默认图标: {entity.domain}")
icon_content = _cached if _cached else setting.default_icon_file
if icon_content:
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', entity.domain_md5 + '.png')
md5_path = os.path.join(setting.icon_root_path, 'data', 'text', entity.domain_md5 + '.txt')
try:
# 确保目录存在
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
os.makedirs(os.path.dirname(md5_path), exist_ok=True)
# 写入缓存文件注意文件IO操作仍然是同步的
FileUtil.write_file(cache_path, icon_content, mode='wb')
FileUtil.write_file(md5_path, entity.domain, mode='w')
except Exception as e:
logger.error(f"异步写入缓存文件失败: {e}")
return icon_content
except Exception as e:
logger.error(f"异步获取图标时发生错误 {entity.domain}: {e}")
return _cached or setting.default_icon_file
finally:
await redis_pool.remove_cache(f"{entity.domain}", prefix=redis_pool.ICON_QUEUE_PREFIX)
# 预编译正则表达式,提高性能
pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)', re.I)
def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
"""从链接列表中查找指定rel类型的图标URL"""
if not links:
return None
for link in links:
r = link.get('rel')
_r = ' '.join(r) if isinstance(r, list) else r
_href = link.get('href')
if _rel:
if _r.lower() == _rel:
return entity.get_icon_url(str(_href))
else:
return entity.get_icon_url(str(_href))
return None
def _parse_html(content: Optional[bytes], entity: Favicon) -> Optional[str]:
"""从HTML内容中解析图标URL"""
if not content:
return None
try:
# 尝试将bytes转换为字符串
# str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
# content_str = content.decode('utf-8', 'replace')
content_str = str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
# 使用更高效的解析器
bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
if len(bs) == 0:
bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))
html_links = bs.find_all("link", rel=pattern_icon)
# 如果没有找到,尝试使用正则表达式直接匹配
if not html_links or len(html_links) == 0:
content_links = pattern_link.findall(content_str)
c_link = ''.join([_links[0] for _links in content_links])
bs = bs4.BeautifulSoup(c_link, features='lxml')
html_links = bs.find_all("link", rel=pattern_icon)
if html_links and len(html_links) > 0:
# 优先查找指定rel类型的图标
icon_url = (_get_link_rel(html_links, entity, 'shortcut icon') or
_get_link_rel(html_links, entity, 'icon') or
_get_link_rel(html_links, entity, 'alternate icon') or
_get_link_rel(html_links, entity, ''))
if icon_url:
logger.debug(f"-> 从HTML获取图标URL: {icon_url}")
return icon_url
except Exception as e:
logger.error(f"解析HTML失败: {e}")
return None
def _get_file_md5(file_path: str) -> Optional[str]:
"""计算文件的MD5值"""
try:
md5 = hashlib.md5()
with open(file_path, 'rb') as f:
while True:
buffer = f.read(1024 * 8)
if not buffer:
break
md5.update(buffer)
return md5.hexdigest().lower()
except Exception as e:
logger.error(f"计算文件MD5失败 {file_path}: {e}")
return None
default_icon_md5 = [
_get_file_md5(setting.default_icon_path),
'05231fb6b69aff47c3f35efe09c11ba0',
'3ca64f83fdcf25135d87e08af65e68c9',
'db470fd0b65c8c121477343c37f74f02',
'52419f3f4f7d11945d272facc76c9e6a',
'b8a0bf372c762e966cc99ede8682bc71',
'71e9c45f29eadfa2ec5495302c22bcf6',
'ababc687adac587b8a06e580ee79aaa1',
'43802bddf65eeaab643adb8265bfbada',
]
def _get_header(content_type: str, cache_time: int = None) -> dict:
"""生成响应头"""
if cache_time is None:
cache_time = setting.time_of_7_days
_ct = 'image/x-icon'
if content_type and content_type in header.image_type:
_ct = content_type
cache_control = 'no-store, no-cache, must-revalidate, max-age=0' if cache_time == 0 else f'public, max-age={cache_time}'
return {
'Content-Type': _ct,
'Cache-Control': cache_control,
'X-Robots-Tag': 'noindex, nofollow'
}
def get_default(cache_time: int = None) -> Response:
if cache_time is None:
cache_time = setting.time_of_1_days
return Response(content=setting.default_icon_file,
media_type="image/png",
headers=_get_header("image/png", cache_time))
def _is_default_icon_md5(icon_md5: str) -> bool:
"""检查图标MD5是否为默认图标"""
return icon_md5 in default_icon_md5
def _is_default_icon_file(file_path: str) -> bool:
"""检查文件是否为默认图标"""
if os.path.exists(file_path) and os.path.isfile(file_path):
md5 = _get_file_md5(file_path)
return md5 in default_icon_md5 if md5 else False
return False
def _is_default_icon_byte(file_content: bytes) -> bool:
"""检查字节内容是否为默认图标"""
try:
md5 = hashlib.md5(file_content).hexdigest().lower()
return md5 in default_icon_md5
except Exception as e:
logger.error(f"计算字节内容MD5失败: {e}")
return False
def _get_cache_file(domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""从缓存中获取图标文件"""
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', domain + '.png')
if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
try:
cached_icon = FileUtil.read_file(cache_path, mode='rb')
file_time = int(os.path.getmtime(cache_path))
# 验证是否为有效的图片文件
if not helpers.is_image(cached_icon):
logger.warning(f"缓存的图标不是有效图片: {cache_path}")
return None, None
# 处理刷新请求或缓存过期情况
if refresh:
if int(time.time()) - file_time <= setting.time_of_12_hours:
logger.info(f"缓存文件修改时间在有效期内,不执行刷新: {cache_path}")
return cached_icon, cached_icon
return cached_icon, None
# 检查缓存是否过期最大30天
if int(time.time()) - file_time > setting.time_of_30_days:
logger.info(f"图标缓存过期(>30天): {cache_path}")
return cached_icon, None
# 默认图标,使用随机的缓存时间
if (int(time.time()) - file_time > setting.time_of_1_days * random.randint(1, 7)
and _is_default_icon_file(cache_path)):
logger.info(f"默认图标缓存过期: {cache_path}")
return cached_icon, None
return cached_icon, cached_icon
except Exception as e: except Exception as e:
logger.error(f"处理图标请求时发生错误 {url}: {e}") logger.error(f"读取缓存文件失败 {cache_path}: {e}")
# 返回默认图标 return None, None
return self.get_default() return None, None
def _get_cache_icon(domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""获取缓存的图标"""
_cached, cached_icon = _get_cache_file(domain_md5, refresh)
# 替换默认图标
if _cached and _is_default_icon_byte(_cached):
_cached = setting.default_icon_file
if cached_icon and _is_default_icon_byte(cached_icon):
cached_icon = setting.default_icon_file
return _cached, cached_icon

View File

@ -29,34 +29,43 @@ async def get_redis() -> AsyncGenerator[Redis, None]:
yield conn yield conn
async def set_cache(key: str, value: str | int, ttl: int = None, prefix: str = None) -> None:
    """Store a value in Redis, optionally under a prefix namespace.

    When ``prefix`` is given the actual key becomes ``f"{prefix}{key}"`` and
    the bare key is also tracked in a Redis Set named after the prefix, so the
    group can later be counted (see ``get_cache_size``).

    Args:
        key: Cache key; falsy keys are ignored.
        value: Value to store.
        ttl: Expiry in seconds, applied to both the key and the prefix Set.
        prefix: Optional namespace prefix.
    """
    if not key:
        return
    try:
        async for redis in get_redis():
            _key = key
            if prefix:
                _key = f"{prefix}{key}"
                # Track the bare key in the prefix Set for group accounting.
                await redis.sadd(prefix, key)
                await redis.expire(prefix, ttl)
            await redis.set(_key, value, ex=ttl)
    except Exception as e:
        logger.error(f"存入redis时出错:{e}")
async def get_cache(key: str, prefix: str = None) -> Optional[str | int]:
    """Read a value from Redis.

    Args:
        key: Cache key; falsy keys return None immediately.
        prefix: Optional namespace prefix prepended to the key.

    Returns:
        The stored value, or None when the key is falsy, missing, or the
        Redis call fails (errors are logged, not raised).
    """
    if not key:
        return None
    try:
        async for redis in get_redis():
            if prefix:
                key = f"{prefix}{key}"
            return await redis.get(key)
    except Exception as e:
        logger.error(f"读取redis时出错:{e}")
async def exist_cache(key: str) -> bool: async def exist_cache(key: str, prefix: str = None) -> bool:
if not key: if not key:
return False return False
try: try:
async for redis in get_redis(): async for redis in get_redis():
if prefix:
key = f"{prefix}{key}"
result = await redis.exists(key) result = await redis.exists(key)
return result > 0 return result > 0
except Exception as e: except Exception as e:
@ -64,80 +73,62 @@ async def exist_cache(key: str) -> bool:
return False return False
async def remove_cache(key: str, prefix: str = None) -> None:
    """Delete a value from Redis, keeping the prefix Set in sync.

    Args:
        key: Cache key; falsy keys are ignored.
        prefix: Optional namespace prefix. When given, the bare key is also
            removed from the prefix Set maintained by ``set_cache``.
    """
    if not key:
        return
    try:
        async for redis in get_redis():
            _key = key
            if prefix:
                _key = f"{prefix}{key}"
                # Keep the group-accounting Set consistent with the deletion.
                await redis.srem(prefix, key)
            await redis.delete(_key)
    except Exception as e:
        logger.error(f"删除redis时出错:{e}")
async def get_cache_size(prefix: str = None) -> int:
    """Count entries under a prefix via the Set maintained by ``set_cache``.

    Args:
        prefix: Namespace prefix whose tracking Set is counted (SCARD).

    Returns:
        Number of members in the prefix Set, or 0 on error.
    """
    try:
        async for redis in get_redis():
            return await redis.scard(prefix)
    except Exception as e:
        logger.error(f"获取队列大小时出错:{e}")
    return 0
async def set_failed_domain(domain: str, expire_seconds: int = None) -> None:
    """Record a failed domain in Redis under the failed-domains prefix.

    Args:
        domain: The domain that failed; falsy values are ignored.
        expire_seconds: TTL for the record (None means no explicit TTL here;
            behavior follows ``set_cache``).
    """
    if not domain:
        return
    try:
        await set_cache(f"{domain}", domain, ttl=expire_seconds, prefix=FAILED_DOMAINS_PREFIX)
        logger.debug(f"已将失败域名 {domain} 存入Redis,过期时间:{expire_seconds}秒")
    except Exception as e:
        logger.error(f"将失败域名存入Redis时出错:{e}")
async def is_domain_failed(domain: str) -> bool:
    """Check whether a domain is in the Redis failed-domains list.

    Args:
        domain: Domain to check; falsy values return False.

    Returns:
        True when the domain is recorded as failed; False otherwise or when
        the Redis lookup fails (errors are logged, not raised).
    """
    if not domain:
        return False
    try:
        return await exist_cache(domain, prefix=FAILED_DOMAINS_PREFIX)
    except Exception as e:
        logger.error(f"检查域名是否失败时出错:{e}")
        return False
async def delete_failed_domain(domain: str) -> None:
    """Remove a domain from the Redis failed-domains list.

    Args:
        domain: Domain to delete; falsy values are ignored.
    """
    if not domain:
        return
    try:
        await remove_cache(domain, prefix=FAILED_DOMAINS_PREFIX)
        logger.debug(f"已从Redis删除失败域名 {domain}")
    except Exception as e:
        logger.error(f"从Redis删除失败域名时出错:{e}")

View File

@ -18,7 +18,7 @@ default_icon_file = FileUtil.read_file(default_icon_path, mode='rb')
referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt') referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt')
# 队列阈值常量配置 # 队列阈值常量配置
MAX_QUEUE_SIZE = 3 MAX_QUEUE_SIZE = 10
# 时间常量 # 时间常量
time_of_1_minus = 1 * 60 time_of_1_minus = 1 * 60