master
jinql 2025-09-10 14:49:23 +08:00
parent f0b8929035
commit cb9b9c2d35
5 changed files with 487 additions and 572 deletions

View File

@ -22,6 +22,7 @@ urllib3.disable_warnings()
logging.captureWarnings(True) logging.captureWarnings(True)
# 配置日志 # 配置日志
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# warnings.filterwarnings("ignore", category=RuntimeWarning)
# 创建requests会话池 # 创建requests会话池
requests_session = requests.Session() requests_session = requests.Session()
@ -103,7 +104,7 @@ class Favicon:
self.scheme = 'http' self.scheme = 'http'
# 检查域名合法性 # 检查域名合法性
if self.domain and not self._check_url(self.domain): if self.domain and not _check_url(self.domain):
self.domain = None self.domain = None
# 生成域名MD5哈希值 # 生成域名MD5哈希值
@ -165,6 +166,21 @@ class Favicon:
self._get_icon_url(icon_path) self._get_icon_url(icon_path)
return self.icon_url return self.icon_url
def get_base_url(self) -> Optional[str]:
    """Build the site's base URL from the parsed scheme, domain and port.

    Returns:
        ``"<scheme>://<domain>"``, with ``":<port>"`` appended when a port is
        set and is neither 80 nor 443. ``None`` when the domain is empty or
        contains no dot.
    """
    host = self.domain
    if not host or '.' not in host:
        return None

    port = self.port
    # Ports 80 and 443 are left implicit, whatever the scheme is.
    suffix = f":{port}" if port and port not in [80, 443] else ""
    return f"{self.scheme}://{host}{suffix}"
async def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]: async def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]:
"""获取图标文件内容和类型 """获取图标文件内容和类型
@ -189,7 +205,7 @@ class Favicon:
_content = base64.b64decode(data_uri[-1]) _content = base64.b64decode(data_uri[-1])
_ct = data_uri[0].split(';')[0].split(':')[-1] _ct = data_uri[0].split(';')[0].split(':')[-1]
else: else:
_content, _ct = await self._req_get(self.icon_url, domain=self.domain) _content, _ct = await _req_get(self.icon_url, domain=self.domain)
# 验证是否为图片 # 验证是否为图片
# image/* application/x-ico # image/* application/x-ico
@ -204,21 +220,6 @@ class Favicon:
return None, None return None, None
def get_base_url(self) -> Optional[str]:
    """Build the site's base URL from the parsed scheme, domain and port.

    Returns:
        ``"<scheme>://<domain>"``, with ``":<port>"`` appended when a port is
        set and is neither 80 nor 443. ``None`` when the domain is empty or
        contains no dot.
    """
    host = self.domain
    if not host or '.' not in host:
        return None

    port = self.port
    # Ports 80 and 443 are left implicit, whatever the scheme is.
    suffix = f":{port}" if port and port not in [80, 443] else ""
    return f"{self.scheme}://{host}{suffix}"
async def req_get(self) -> Optional[bytes]: async def req_get(self) -> Optional[bytes]:
"""获取网站首页内容 """获取网站首页内容
@ -229,7 +230,7 @@ class Favicon:
return None return None
_url = self.get_base_url() _url = self.get_base_url()
_content, _ct = await self._req_get(_url, domain=self.domain) _content, _ct = await _req_get(_url, domain=self.domain)
# 验证类型并检查大小 # 验证类型并检查大小
if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct): if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct):
@ -240,13 +241,51 @@ class Favicon:
return None return None
@staticmethod
async def _req_get( def _check_internal(domain: str) -> bool:
url: str, """检查网址是否非内网地址
Args:
domain: 域名
Returns:
True: 非内网False: 是内网/无法解析
"""
try:
# 检查是否为IP地址
if domain.replace('.', '').isdigit():
return not ipaddress.ip_address(domain).is_private
else:
# 解析域名获取IP地址
ips = socket.getaddrinfo(domain, None)
for ip_info in ips:
ip = ip_info[4][0]
if '.' in ip:
if not ipaddress.ip_address(ip).is_private:
return True
return False
except Exception as e:
redis_pool.set_failed_domain(domain, setting.time_of_1_days)
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
return False
def _check_url(domain: str) -> bool:
    """Check that a domain is syntactically valid and not an internal address.

    The pattern check runs first, so only well-formed names reach the
    resolver-backed ``_check_internal`` check.

    Args:
        domain: Domain name to validate.

    Returns:
        True only when the domain matches ``_pattern_domain`` and
        ``_check_internal`` accepts it. Always a real ``bool``, never a
        ``Match`` object or ``None``.
    """
    if not _pattern_domain.match(domain):
        return False
    return bool(_check_internal(domain))
async def _req_get(url: str,
domain: str, domain: str,
retries: int = DEFAULT_RETRIES, retries: int = DEFAULT_RETRIES,
timeout: int = DEFAULT_TIMEOUT timeout: int = DEFAULT_TIMEOUT) -> Tuple[Optional[bytes], Optional[str]]:
) -> Tuple[Optional[bytes], Optional[str]]:
"""异步发送HTTP GET请求获取内容 """异步发送HTTP GET请求获取内容
Args: Args:
@ -296,63 +335,24 @@ class Favicon:
content = await resp.read() content = await resp.read()
return content, ct_type return content, ct_type
else: else:
await redis_pool.set_failed_domain(domain, setting.time_of_7_days) await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求失败: %d, URL: %s', resp.status, url) logger.error('异步请求失败: %d, URL: %s', resp.status, url)
break break
except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e: except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e:
retry_count += 1 retry_count += 1
if retry_count > retries: if retry_count > retries:
await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求超时: %s, URL: %s', str(e), url) logger.error('异步请求超时: %s, URL: %s', str(e), url)
else: else:
logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url) logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
continue continue
except Exception as e: except Exception as e:
await redis_pool.set_failed_domain(domain, setting.time_of_7_days) await redis_pool.set_failed_domain(domain, setting.time_of_5_minus)
logger.error('异步请求异常: %s, URL: %s', str(e), url) logger.error('异步请求异常: %s, URL: %s', str(e), url)
break break
return None, None return None, None
@staticmethod
def _check_url(domain: str) -> bool:
    """Check that a domain is syntactically valid and not an internal address.

    The pattern check runs before the resolver-backed check, so malformed
    input is rejected without a DNS lookup and without being recorded as a
    failed domain.

    Args:
        domain: Domain name to validate.

    Returns:
        True only when the domain matches ``_pattern_domain`` and
        ``Favicon.check_internal`` accepts it. Always a real ``bool``, never
        a ``Match`` object or ``None``.
    """
    if not _pattern_domain.match(domain):
        return False
    return bool(Favicon.check_internal(domain))
@staticmethod
def check_internal(domain: str) -> bool:
"""检查网址是否非内网地址
Args:
domain: 域名
Returns:
True: 非内网False: 是内网/无法解析
"""
try:
# 检查是否为IP地址
if domain.replace('.', '').isdigit():
return not ipaddress.ip_address(domain).is_private
else:
# 解析域名获取IP地址
ips = socket.getaddrinfo(domain, None)
for ip_info in ips:
ip = ip_info[4][0]
if '.' in ip:
if not ipaddress.ip_address(ip).is_private:
return True
return False
except Exception as e:
redis_pool.set_failed_domain(domain, setting.time_of_7_days)
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
return False
# 域名验证正则表达式 # 域名验证正则表达式
_pattern_domain = re.compile( _pattern_domain = re.compile(

View File

@ -19,9 +19,6 @@ logger = logging.getLogger(__name__)
_icon_root_path = setting.icon_root_path _icon_root_path = setting.icon_root_path
_default_icon_path = setting.default_icon_path _default_icon_path = setting.default_icon_path
# 创建全局服务实例
_service = favicon_service.FaviconService()
# 创建FastAPI路由器 # 创建FastAPI路由器
favicon_router = APIRouter(prefix="", tags=["favicon"]) favicon_router = APIRouter(prefix="", tags=["favicon"])
@ -35,13 +32,13 @@ async def get_favicon(
refresh: Optional[str] = Query(None, include_in_schema=False), refresh: Optional[str] = Query(None, include_in_schema=False),
): ):
"""获取网站图标""" """获取网站图标"""
return await _service.get_favicon_handler(request, bg_tasks, url, refresh) return await favicon_service.get_favicon_handler(request, bg_tasks, url, refresh)
@favicon_router.get('/icon/default') @favicon_router.get('/icon/default')
async def get_default_icon(): async def get_default_icon():
"""获取默认图标""" """获取默认图标"""
return _service.get_default() return favicon_service.get_default()
@favicon_router.get('/icon/referer', include_in_schema=False) @favicon_router.get('/icon/referer', include_in_schema=False)

View File

@ -7,7 +7,7 @@ import random
import re import re
import time import time
import warnings import warnings
from typing import Tuple, List, Optional from typing import Tuple, Optional
import bs4 import bs4
import urllib3 import urllib3
@ -30,272 +30,104 @@ warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
_current_dir = os.path.dirname(os.path.abspath(__file__)) _current_dir = os.path.dirname(os.path.abspath(__file__))
class FaviconService: async def get_favicon_handler(request: Request,
"""图标服务类,封装所有与图标获取、缓存和处理相关的功能""" bg_tasks: BackgroundTasks,
url: Optional[str] = None,
refresh: Optional[str] = None) -> dict[str, str] | Response:
"""异步处理获取图标的请求"""
def __init__(self): logger.info(
# 全局计数器 f"队列大小(异步) queue/failed"
self.url_count = 0 f"{await redis_pool.get_cache_size(prefix=redis_pool.ICON_QUEUE_PREFIX)} "
self.request_icon_count = 0 f"| {await redis_pool.get_cache_size(prefix=redis_pool.FAILED_DOMAINS_PREFIX)}")
self.request_cache_count = 0
# 预编译正则表达式,提高性能 # 验证URL参数
self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I) if not url:
self.pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)', return {"message": "请提供url参数"}
re.I)
# 计算默认图标的MD5值
self.default_icon_md5 = self._initialize_default_icon_md5()
def _initialize_default_icon_md5(self) -> List[str]:
"""初始化默认图标MD5值列表"""
md5_list = [self._get_file_md5(setting.default_icon_path),
'05231fb6b69aff47c3f35efe09c11ba0',
'3ca64f83fdcf25135d87e08af65e68c9',
'db470fd0b65c8c121477343c37f74f02',
'52419f3f4f7d11945d272facc76c9e6a',
'b8a0bf372c762e966cc99ede8682bc71',
'71e9c45f29eadfa2ec5495302c22bcf6',
'ababc687adac587b8a06e580ee79aaa1',
'43802bddf65eeaab643adb8265bfbada']
# 过滤掉None值
return [md5 for md5 in md5_list if md5]
@staticmethod
def _get_file_md5(file_path: str) -> Optional[str]:
"""计算文件的MD5值"""
try:
md5 = hashlib.md5()
with open(file_path, 'rb') as f:
while True:
buffer = f.read(1024 * 8)
if not buffer:
break
md5.update(buffer)
return md5.hexdigest().lower()
except Exception as e:
logger.error(f"计算文件MD5失败 {file_path}: {e}")
return None
def _is_default_icon_md5(self, icon_md5: str) -> bool:
"""检查图标MD5是否为默认图标"""
return icon_md5 in self.default_icon_md5
def _is_default_icon_file(self, file_path: str) -> bool:
"""检查文件是否为默认图标"""
if os.path.exists(file_path) and os.path.isfile(file_path):
md5 = self._get_file_md5(file_path)
return md5 in self.default_icon_md5 if md5 else False
return False
def _is_default_icon_byte(self, file_content: bytes) -> bool:
"""检查字节内容是否为默认图标"""
try:
md5 = hashlib.md5(file_content).hexdigest().lower()
return md5 in self.default_icon_md5
except Exception as e:
logger.error(f"计算字节内容MD5失败: {e}")
return False
def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""从缓存中获取图标文件"""
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', domain + '.png')
if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
try:
cached_icon = FileUtil.read_file(cache_path, mode='rb')
file_time = int(os.path.getmtime(cache_path))
# 验证是否为有效的图片文件
if not helpers.is_image(cached_icon):
logger.warning(f"缓存的图标不是有效图片: {cache_path}")
return None, None
# 处理刷新请求或缓存过期情况
if refresh:
if int(time.time()) - file_time <= setting.time_of_12_hours:
logger.info(f"缓存文件修改时间在有效期内,不执行刷新: {cache_path}")
return cached_icon, cached_icon
return cached_icon, None
# 检查缓存是否过期最大30天
if int(time.time()) - file_time > setting.time_of_30_days:
logger.info(f"图标缓存过期(>30天): {cache_path}")
return cached_icon, None
# 默认图标,使用随机的缓存时间
if (int(time.time()) - file_time > setting.time_of_1_days * random.randint(1, 7)
and self._is_default_icon_file(cache_path)):
logger.info(f"默认图标缓存过期: {cache_path}")
return cached_icon, None
return cached_icon, cached_icon
except Exception as e:
logger.error(f"读取缓存文件失败 {cache_path}: {e}")
return None, None
return None, None
def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""获取缓存的图标"""
_cached, cached_icon = self._get_cache_file(domain_md5, refresh)
# 替换默认图标
if _cached and self._is_default_icon_byte(_cached):
_cached = setting.default_icon_file
if cached_icon and self._is_default_icon_byte(cached_icon):
cached_icon = setting.default_icon_file
return _cached, cached_icon
def _get_header(self, content_type: str, cache_time: int = None) -> dict:
"""生成响应头"""
if cache_time is None:
cache_time = setting.time_of_7_days
_ct = 'image/x-icon'
if content_type and content_type in header.image_type:
_ct = content_type
cache_control = 'no-store, no-cache, must-revalidate, max-age=0' if cache_time == 0 else f'public, max-age={cache_time}'
return {
'Content-Type': _ct,
'Cache-Control': cache_control,
'X-Robots-Tag': 'noindex, nofollow'
}
def _parse_html(self, content: Optional[bytes], entity: Favicon) -> Optional[str]:
"""从HTML内容中解析图标URL"""
if not content:
return None
try: try:
# 尝试将bytes转换为字符串 entity = Favicon(url)
# str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
content_str = content.decode('utf-8', 'replace')
# 使用更高效的解析器 # 验证域名
bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link")) if not entity.domain:
if len(bs) == 0: logger.warning(f"无效的URL: {url}")
bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link")) return get_default(setting.time_of_7_days)
html_links = bs.find_all("link", rel=self.pattern_icon) # 检查缓存中的失败URL
if await redis_pool.is_domain_failed(entity.domain):
return get_default(setting.time_of_1_days)
# 如果没有找到,尝试使用正则表达式直接匹配 # 检查缓存
if not html_links or len(html_links) == 0: _cached, cached_icon = _get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1'])
content_links = self.pattern_link.findall(content_str)
c_link = ''.join([_links[0] for _links in content_links])
bs = bs4.BeautifulSoup(c_link, features='lxml')
html_links = bs.find_all("link", rel=self.pattern_icon)
if html_links and len(html_links) > 0: if _cached or cached_icon:
# 优先查找指定rel类型的图标 # 使用缓存图标
icon_url = (self._get_link_rel(html_links, entity, 'shortcut icon') or icon_content = cached_icon if cached_icon else _cached
self._get_link_rel(html_links, entity, 'icon') or
self._get_link_rel(html_links, entity, 'alternate icon') or
self._get_link_rel(html_links, entity, ''))
if icon_url: # 确定内容类型和缓存时间
logger.info(f"-> 从HTML获取图标URL: {icon_url}") content_type = filetype.guess_mime(icon_content) if icon_content else ""
cache_time = setting.time_of_12_hours \
if _is_default_icon_byte(icon_content) else setting.time_of_7_days
return icon_url # 乐观缓存机制:检查缓存是否已过期但仍有缓存内容
except Exception as e: # _cached 存在但 cached_icon 为 None 表示缓存已过期
logger.error(f"解析HTML失败: {e}") if _cached and not cached_icon:
# 缓存已过期,后台刷新缓存
logger.info(f"缓存已过期,加入后台队列刷新(异步): {entity.domain}")
await redis_pool.set_cache(
f"{entity.domain}",
entity.domain,
setting.time_of_2_hours,
prefix=redis_pool.ICON_QUEUE_PREFIX
)
bg_tasks.add_task(get_icon_async, entity, _cached)
return None return Response(content=icon_content,
media_type=content_type if content_type else "image/x-icon",
@staticmethod headers=_get_header(content_type, cache_time))
def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
"""从链接列表中查找指定rel类型的图标URL"""
if not links:
return None
for link in links:
r = link.get('rel')
_r = ' '.join(r) if isinstance(r, list) else r
_href = link.get('href')
if _rel:
if _r.lower() == _rel:
return entity.get_icon_url(str(_href))
else: else:
return entity.get_icon_url(str(_href)) # 开始图标处理,加入队列
await redis_pool.set_cache(
f"{entity.domain}",
entity.domain,
setting.time_of_2_hours,
prefix=redis_pool.ICON_QUEUE_PREFIX
)
return None # 没有缓存,实时处理,检查队列大小
_queue_size = await redis_pool.get_cache_size(prefix=redis_pool.ICON_QUEUE_PREFIX)
def get_default(self, cache_time: int = None) -> Response: if _queue_size >= setting.MAX_QUEUE_SIZE:
if cache_time is None: # 加入后台队列并返回默认图片
cache_time = setting.time_of_1_days logger.info(
return Response(content=setting.default_icon_file, f"队列大小({_queue_size})>={setting.MAX_QUEUE_SIZE},返回默认图片并加入后台队列(异步): {entity.domain}")
media_type="image/png", bg_tasks.add_task(get_icon_async, entity, _cached)
headers=self._get_header("image/png", cache_time)) return get_default(0)
def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
"""同步获取图标"""
icon_content = None
try:
# 尝试从网站获取HTML内容
html_content = entity.req_get()
if html_content:
icon_url = self._parse_html(html_content, entity)
else: else:
icon_url = None # 队列<MAX_QUEUE_SIZE实时处理
logger.info(f"队列大小({_queue_size})<{setting.MAX_QUEUE_SIZE},实时处理(异步): {entity.domain}")
# 尝试不同的图标获取策略 # 始终使用异步方法获取图标
strategies = [ icon_content = await get_icon_async(entity, _cached)
# 1. 从原始网页标签链接中获取
lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
# 2. 从 gstatic.cn 接口获取
lambda: (
f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
"gstatic接口"),
# 3. 从网站默认位置获取
lambda: ('', "网站默认位置/favicon.ico"),
# 4. 从其他api接口获取
lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
# 99. 最后的尝试cloudflare workers
# lambda: (f'https://favicon.cary.cc/?url={entity.get_base_url()}', "cloudflare"),
]
for strategy in strategies: if not icon_content:
if icon_content: # 获取失败,返回默认图标
break return get_default()
strategy_url, strategy_name = strategy() # 确定内容类型和缓存时间
if strategy_url is not None: content_type = filetype.guess_mime(icon_content) if icon_content else ""
logger.debug(f"-> 尝试从 {strategy_name} 获取图标") cache_time = setting.time_of_12_hours \
icon_content, icon_type = entity.get_icon_file(strategy_url, strategy_url == '') if _is_default_icon_byte(icon_content) else setting.time_of_7_days
# 图标获取失败,或图标不是支持的图片格式,写入默认图标 return Response(content=icon_content,
if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)): media_type=content_type if content_type else "image/x-icon",
logger.warning(f"-> 获取图标失败,使用默认图标: {entity.domain}") headers=_get_header(content_type, cache_time))
icon_content = _cached if _cached else setting.default_icon_file
if icon_content:
cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', entity.domain_md5 + '.png')
md5_path = os.path.join(setting.icon_root_path, 'data', 'text', entity.domain_md5 + '.txt')
try:
# 确保目录存在
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
os.makedirs(os.path.dirname(md5_path), exist_ok=True)
# 写入缓存文件
FileUtil.write_file(cache_path, icon_content, mode='wb')
FileUtil.write_file(md5_path, entity.domain, mode='w')
except Exception as e: except Exception as e:
logger.error(f"写入缓存文件失败: {e}") logger.error(f"处理图标请求时发生错误 {url}: {e}")
# 返回默认图标
return get_default()
self.request_icon_count += 1
return icon_content async def get_icon_async(entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
except Exception as e:
logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
return _cached or setting.default_icon_file
finally:
redis_pool.remove_cache(f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}")
async def get_icon_async(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
"""异步获取图标""" """异步获取图标"""
icon_content = None icon_content = None
@ -303,7 +135,7 @@ class FaviconService:
# 尝试从网站异步获取HTML内容 # 尝试从网站异步获取HTML内容
html_content = await entity.req_get() html_content = await entity.req_get()
if html_content: if html_content:
icon_url = self._parse_html(html_content, entity) icon_url = _parse_html(html_content, entity)
else: else:
icon_url = None icon_url = None
@ -331,7 +163,7 @@ class FaviconService:
icon_content, icon_type = await entity.get_icon_file(strategy_url, strategy_url == '') icon_content, icon_type = await entity.get_icon_file(strategy_url, strategy_url == '')
# 图标获取失败,或图标不是支持的图片格式,写入默认图标 # 图标获取失败,或图标不是支持的图片格式,写入默认图标
if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)): if (not icon_content) or (not helpers.is_image(icon_content) or _is_default_icon_byte(icon_content)):
logger.warning(f"-> 异步获取图标失败,使用默认图标: {entity.domain}") logger.warning(f"-> 异步获取图标失败,使用默认图标: {entity.domain}")
icon_content = _cached if _cached else setting.default_icon_file icon_content = _cached if _cached else setting.default_icon_file
@ -350,109 +182,204 @@ class FaviconService:
except Exception as e: except Exception as e:
logger.error(f"异步写入缓存文件失败: {e}") logger.error(f"异步写入缓存文件失败: {e}")
self.request_icon_count += 1
return icon_content return icon_content
except Exception as e: except Exception as e:
logger.error(f"异步获取图标时发生错误 {entity.domain}: {e}") logger.error(f"异步获取图标时发生错误 {entity.domain}: {e}")
return _cached or setting.default_icon_file return _cached or setting.default_icon_file
finally: finally:
await redis_pool.remove_cache(f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}") await redis_pool.remove_cache(f"{entity.domain}", prefix=redis_pool.ICON_QUEUE_PREFIX)
async def get_favicon_handler(
self,
request: Request,
bg_tasks: BackgroundTasks,
url: Optional[str] = None,
refresh: Optional[str] = None,
) -> dict[str, str] | Response:
"""异步处理获取图标的请求"""
logger.info( # 预编译正则表达式,提高性能
f"队列大小(异步) queue/failed{await redis_pool.get_cache_size(f"{redis_pool.ICON_QUEUE_PREFIX}")} | {await redis_pool.get_cache_size(f"{redis_pool.FAILED_DOMAINS_PREFIX}")}") pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)', re.I)
self.url_count += 1
# 验证URL参数 def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
if not url: """从链接列表中查找指定rel类型的图标URL"""
return {"message": "请提供url参数"} if not links:
return None
for link in links:
r = link.get('rel')
_r = ' '.join(r) if isinstance(r, list) else r
_href = link.get('href')
if _rel:
if _r.lower() == _rel:
return entity.get_icon_url(str(_href))
else:
return entity.get_icon_url(str(_href))
return None
def _parse_html(content: Optional[bytes], entity: Favicon) -> Optional[str]:
"""从HTML内容中解析图标URL"""
if not content:
return None
try: try:
entity = Favicon(url) # 尝试将bytes转换为字符串
# str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
# content_str = content.decode('utf-8', 'replace')
content_str = str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
# 验证域名 # 使用更高效的解析器
if not entity.domain: bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
logger.warning(f"无效的URL: {url}") if len(bs) == 0:
return self.get_default(setting.time_of_7_days) bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))
# 检查缓存中的失败URL html_links = bs.find_all("link", rel=pattern_icon)
if await redis_pool.is_domain_failed(entity.domain):
return self.get_default(setting.time_of_7_days)
# 检查缓存 # 如果没有找到,尝试使用正则表达式直接匹配
_cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1']) if not html_links or len(html_links) == 0:
content_links = pattern_link.findall(content_str)
c_link = ''.join([_links[0] for _links in content_links])
bs = bs4.BeautifulSoup(c_link, features='lxml')
html_links = bs.find_all("link", rel=pattern_icon)
if _cached or cached_icon: if html_links and len(html_links) > 0:
# 使用缓存图标 # 优先查找指定rel类型的图标
icon_content = cached_icon if cached_icon else _cached icon_url = (_get_link_rel(html_links, entity, 'shortcut icon') or
self.request_cache_count += 1 _get_link_rel(html_links, entity, 'icon') or
_get_link_rel(html_links, entity, 'alternate icon') or
_get_link_rel(html_links, entity, ''))
# 确定内容类型和缓存时间 if icon_url:
content_type = filetype.guess_mime(icon_content) if icon_content else "" logger.debug(f"-> 从HTML获取图标URL: {icon_url}")
cache_time = setting.time_of_12_hours \
if self._is_default_icon_byte(icon_content) else setting.time_of_7_days
# 乐观缓存机制:检查缓存是否已过期但仍有缓存内容 return icon_url
# _cached 存在但 cached_icon 为 None 表示缓存已过期
if _cached and not cached_icon:
# 缓存已过期,后台刷新缓存
logger.info(f"缓存已过期,加入后台队列刷新(异步): {entity.domain}")
await redis_pool.set_cache(
f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}",
entity.domain,
setting.time_of_2_hours
)
bg_tasks.add_task(self.get_icon_sync, entity, _cached)
return Response(content=icon_content,
media_type=content_type if content_type else "image/x-icon",
headers=self._get_header(content_type, cache_time))
else:
# 开始图标处理,加入队列
await redis_pool.set_cache(
f"{redis_pool.ICON_QUEUE_PREFIX}{entity.domain}",
entity.domain,
setting.time_of_2_hours
)
# 没有缓存,实时处理,检查队列大小
_queue_size = await redis_pool.get_cache_size(f"{redis_pool.ICON_QUEUE_PREFIX}")
if _queue_size >= setting.MAX_QUEUE_SIZE:
# 加入后台队列并返回默认图片
logger.info(
f"队列大小({_queue_size})>={setting.MAX_QUEUE_SIZE},返回默认图片并加入后台队列(异步): {entity.domain}")
bg_tasks.add_task(self.get_icon_sync, entity, _cached)
return self.get_default(0)
else:
# 队列<MAX_QUEUE_SIZE实时处理
logger.info(f"队列大小({_queue_size})<{setting.MAX_QUEUE_SIZE},实时处理(异步): {entity.domain}")
# 始终使用异步方法获取图标
icon_content = await self.get_icon_async(entity, _cached)
if not icon_content:
# 获取失败,返回默认图标
return self.get_default()
# 确定内容类型和缓存时间
content_type = filetype.guess_mime(icon_content) if icon_content else ""
cache_time = setting.time_of_12_hours \
if self._is_default_icon_byte(icon_content) else setting.time_of_7_days
return Response(content=icon_content,
media_type=content_type if content_type else "image/x-icon",
headers=self._get_header(content_type, cache_time))
except Exception as e: except Exception as e:
logger.error(f"处理图标请求时发生错误 {url}: {e}") logger.error(f"解析HTML失败: {e}")
# 返回默认图标
return self.get_default() return None
def _get_file_md5(file_path: str) -> Optional[str]:
"""计算文件的MD5值"""
try:
md5 = hashlib.md5()
with open(file_path, 'rb') as f:
while True:
buffer = f.read(1024 * 8)
if not buffer:
break
md5.update(buffer)
return md5.hexdigest().lower()
except Exception as e:
logger.error(f"计算文件MD5失败 {file_path}: {e}")
return None
# MD5 digests of icons that are treated as "default" icons: the configured
# default icon file plus a fixed list of known digests.
# _get_file_md5 returns None when the file cannot be read; such an entry is
# dropped so that a None digest can never match the list.
default_icon_md5 = [
    digest for digest in (
        _get_file_md5(setting.default_icon_path),
        '05231fb6b69aff47c3f35efe09c11ba0',
        '3ca64f83fdcf25135d87e08af65e68c9',
        'db470fd0b65c8c121477343c37f74f02',
        '52419f3f4f7d11945d272facc76c9e6a',
        'b8a0bf372c762e966cc99ede8682bc71',
        '71e9c45f29eadfa2ec5495302c22bcf6',
        'ababc687adac587b8a06e580ee79aaa1',
        '43802bddf65eeaab643adb8265bfbada',
    ) if digest
]
def _get_header(content_type: str, cache_time: int = None) -> dict:
"""生成响应头"""
if cache_time is None:
cache_time = setting.time_of_7_days
_ct = 'image/x-icon'
if content_type and content_type in header.image_type:
_ct = content_type
cache_control = 'no-store, no-cache, must-revalidate, max-age=0' if cache_time == 0 else f'public, max-age={cache_time}'
return {
'Content-Type': _ct,
'Cache-Control': cache_control,
'X-Robots-Tag': 'noindex, nofollow'
}
def get_default(cache_time: int = None) -> Response:
    """Build a response that serves the default icon.

    Args:
        cache_time: ``max-age`` in seconds for the response; ``None`` falls
            back to ``setting.time_of_1_days``.

    Returns:
        A ``Response`` whose body is ``setting.default_icon_file``, sent as
        ``image/png`` with headers from ``_get_header``.
    """
    ttl = setting.time_of_1_days if cache_time is None else cache_time
    return Response(
        content=setting.default_icon_file,
        media_type="image/png",
        headers=_get_header("image/png", ttl),
    )
def _is_default_icon_md5(icon_md5: str) -> bool:
    """Tell whether an MD5 digest belongs to one of the default icons.

    Args:
        icon_md5: Hex digest to look up.

    Returns:
        True when the digest is listed in ``default_icon_md5``.
    """
    known_digests = default_icon_md5
    return icon_md5 in known_digests
def _is_default_icon_file(file_path: str) -> bool:
"""检查文件是否为默认图标"""
if os.path.exists(file_path) and os.path.isfile(file_path):
md5 = _get_file_md5(file_path)
return md5 in default_icon_md5 if md5 else False
return False
def _is_default_icon_byte(file_content: bytes) -> bool:
    """Tell whether raw icon bytes are one of the default icons.

    Args:
        file_content: Icon content to hash.

    Returns:
        True when the MD5 digest of the content is listed in
        ``default_icon_md5``; False otherwise, including when hashing raises
        (the error is logged).
    """
    try:
        fingerprint = hashlib.md5(file_content).hexdigest().lower()
        is_known = fingerprint in default_icon_md5
    except Exception as e:
        logger.error(f"计算字节内容MD5失败: {e}")
        is_known = False
    return is_known
def _get_cache_file(domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
    """Load an icon from the file cache and report whether it is still fresh.

    Args:
        domain: Base name of the cache file, read from
            ``<setting.icon_root_path>/data/icon/<domain>.png``. The caller
            visible in this file passes the domain's MD5 here.
        refresh: True when the caller explicitly asked for a refresh.

    Returns:
        A pair ``(icon, fresh_icon)``:

        * ``(None, None)``: no usable cache (missing or empty file, content
          is not an image, or reading failed).
        * ``(icon, icon)``: the cached icon can be served as-is.
        * ``(icon, None)``: cached content exists but should be refreshed.
    """
    cache_path = os.path.join(setting.icon_root_path, 'data', 'icon', domain + '.png')
    if not (os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0):
        return None, None

    try:
        cached_icon = FileUtil.read_file(cache_path, mode='rb')
        file_time = int(os.path.getmtime(cache_path))

        # Reject cache files that are not valid images.
        if not helpers.is_image(cached_icon):
            logger.warning(f"缓存的图标不是有效图片: {cache_path}")
            return None, None

        # Sample the clock once so every check below sees the same age.
        age = int(time.time()) - file_time

        # Explicit refresh: honoured only when the file is older than 12 hours.
        if refresh:
            if age <= setting.time_of_12_hours:
                logger.info(f"缓存文件修改时间在有效期内,不执行刷新: {cache_path}")
                return cached_icon, cached_icon
            return cached_icon, None

        # Hard upper bound on cache age (30 days).
        if age > setting.time_of_30_days:
            logger.info(f"图标缓存过期(>30天): {cache_path}")
            return cached_icon, None

        # Default icons expire after a random 1-7 days. The bytes already in
        # memory are hashed instead of re-reading the file from disk.
        if (age > setting.time_of_1_days * random.randint(1, 7)
                and _is_default_icon_byte(cached_icon)):
            logger.info(f"默认图标缓存过期: {cache_path}")
            return cached_icon, None

        return cached_icon, cached_icon
    except Exception as e:
        logger.error(f"读取缓存文件失败 {cache_path}: {e}")
        return None, None
def _get_cache_icon(domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
    """Fetch the cached icon pair, substituting the configured default icon.

    Args:
        domain_md5: MD5 of the domain, used as the cache file's base name.
        refresh: Passed through to ``_get_cache_file``.

    Returns:
        The ``(icon, fresh_icon)`` pair from ``_get_cache_file``, where any
        element recognised as a default icon is replaced by
        ``setting.default_icon_file``.
    """
    def _normalise(icon: Optional[bytes]) -> Optional[bytes]:
        # Swap any known default icon for the configured default icon bytes.
        if icon and _is_default_icon_byte(icon):
            return setting.default_icon_file
        return icon

    any_cached, fresh_cached = _get_cache_file(domain_md5, refresh)
    return _normalise(any_cached), _normalise(fresh_cached)

View File

@ -29,34 +29,43 @@ async def get_redis() -> AsyncGenerator[Redis, None]:
yield conn yield conn
async def set_cache(key: str, value: [str | int], ttl: int = None) -> None: async def set_cache(key: str, value: [str | int], ttl: int = None, prefix: str = None) -> None:
if not key: if not key:
return return
try: try:
async for redis in get_redis(): async for redis in get_redis():
await redis.set(key, value, ex=ttl) _key = key
if prefix:
_key = f"{prefix}{key}"
await redis.sadd(prefix, key)
await redis.expire(prefix, ttl)
await redis.set(_key, value, ex=ttl)
except Exception as e: except Exception as e:
logger.error(f"存入redis时出错{e}") logger.error(f"存入redis时出错{e}")
async def get_cache(key: str) -> Optional[str | int]: async def get_cache(key: str, prefix: str = None) -> Optional[str | int]:
if not key: if not key:
return None return None
try: try:
async for redis in get_redis(): async for redis in get_redis():
if prefix:
key = f"{prefix}{key}"
return await redis.get(key) return await redis.get(key)
except Exception as e: except Exception as e:
logger.error(f"读取redis时出错{e}") logger.error(f"读取redis时出错{e}")
async def exist_cache(key: str) -> bool: async def exist_cache(key: str, prefix: str = None) -> bool:
if not key: if not key:
return False return False
try: try:
async for redis in get_redis(): async for redis in get_redis():
if prefix:
key = f"{prefix}{key}"
result = await redis.exists(key) result = await redis.exists(key)
return result > 0 return result > 0
except Exception as e: except Exception as e:
@ -64,80 +73,62 @@ async def exist_cache(key: str) -> bool:
return False return False
async def remove_cache(key: str) -> None: async def remove_cache(key: str, prefix: str = None) -> None:
if not key: if not key:
return return
try: try:
async for redis in get_redis(): async for redis in get_redis():
await redis.delete(key) _key = key
if prefix:
_key = f"{prefix}{key}"
await redis.srem(prefix, key)
await redis.delete(_key)
except Exception as e: except Exception as e:
logger.error(f"删除redis时出错{e}") logger.error(f"删除redis时出错{e}")
async def get_cache_size(cache_name: str = "default") -> int: async def get_cache_size(prefix: str = None) -> int:
"""根据前缀统计数量用于统计Set集合
"""
try: try:
async for redis in get_redis(): async for redis in get_redis():
return await redis.llen(cache_name) return await redis.scard(prefix)
except Exception as e: except Exception as e:
logger.error(f"获取队列大小时出错:{e}") logger.error(f"获取队列大小时出错:{e}")
return 0 return 0
async def set_failed_domain(domain: str, expire_seconds: int = setting.time_of_7_days) -> None: async def set_failed_domain(domain: str, expire_seconds: int = None) -> None:
"""将失败的域名存入Redis并设置过期时间
Args:
domain: 失败的域名
expire_seconds: 过期时间默认为7天
"""
if not domain: if not domain:
return return
try: try:
async for redis in get_redis(): await set_cache(f"{domain}", domain, ttl=expire_seconds, prefix=FAILED_DOMAINS_PREFIX)
redis_key = f"{FAILED_DOMAINS_PREFIX}{domain}"
await redis.set(redis_key, domain, ex=expire_seconds)
logger.debug(f"已将失败域名 {domain} 存入Redis过期时间{expire_seconds}") logger.debug(f"已将失败域名 {domain} 存入Redis过期时间{expire_seconds}")
except Exception as e: except Exception as e:
logger.error(f"将失败域名存入Redis时出错{e}") logger.error(f"将失败域名存入Redis时出错{e}")
async def is_domain_failed(domain: str) -> bool: async def is_domain_failed(domain: str) -> bool:
"""检查域名是否在Redis的失败列表中
Args:
domain: 要检查的域名
Returns:
True: 域名在失败列表中False: 不在或Redis查询失败
"""
if not domain: if not domain:
return False return False
try: try:
async for redis in get_redis(): return await exist_cache(domain, prefix=FAILED_DOMAINS_PREFIX)
redis_key = f"{FAILED_DOMAINS_PREFIX}{domain}"
result = await redis.exists(redis_key)
return result > 0
except Exception as e: except Exception as e:
logger.error(f"检查域名是否失败时出错:{e}") logger.error(f"检查域名是否失败时出错:{e}")
return False return False
async def delete_failed_domain(domain: str) -> None: async def delete_failed_domain(domain: str) -> None:
"""从Redis中删除失败域名记录
Args:
domain: 要删除的域名
"""
if not domain: if not domain:
return return
try: try:
async for redis in get_redis(): await remove_cache(domain, prefix=FAILED_DOMAINS_PREFIX)
redis_key = f"{FAILED_DOMAINS_PREFIX}{domain}"
await redis.delete(redis_key)
logger.debug(f"已从Redis删除失败域名 {domain}") logger.debug(f"已从Redis删除失败域名 {domain}")
except Exception as e: except Exception as e:
logger.error(f"从Redis删除失败域名时出错{e}") logger.error(f"从Redis删除失败域名时出错{e}")

View File

@ -18,7 +18,7 @@ default_icon_file = FileUtil.read_file(default_icon_path, mode='rb')
referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt') referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt')
# 队列阈值常量配置 # 队列阈值常量配置
MAX_QUEUE_SIZE = 3 MAX_QUEUE_SIZE = 10
# 时间常量 # 时间常量
time_of_1_minus = 1 * 60 time_of_1_minus = 1 * 60