# -*- coding: utf-8 -*-

import hashlib
import logging
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Lock
from typing import Optional, Tuple, Dict, Set, List, Union

import bs4
import urllib3
from bs4 import SoupStrainer
from fastapi import APIRouter, Request, Query
from fastapi.responses import Response

from favicon_app.models import Favicon
from favicon_app.utils import header, file_util
from favicon_app.utils.filetype import helpers, filetype

urllib3.disable_warnings()
logging.captureWarnings(True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# FastAPI router that exposes the favicon endpoints.
favicon_router = APIRouter(prefix="", tags=["favicon"])

# Absolute path of the directory containing this module.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Root directory for icon storage: two levels above this module.
icon_root_path = os.path.abspath(os.path.join(current_dir, '..', '..'))
default_icon_path = os.path.join(icon_root_path, 'favicon.png')

try:
    default_icon_content = file_util.read_file(default_icon_path, mode='rb')
except Exception as e:
    # The default icon file could not be read; fall back to a built-in value.
    # NOTE(review): the literal below looks like base64 *text*, not decoded
    # PNG bytes, and is served as-is -- confirm whether it should be decoded.
    logger.warning(f"无法读取默认图标文件,使用内置图标: {e}")
    default_icon_content = (
        b'iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAAsTAAALEwEAmpwYAAAKT2lDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVNnVFPpFj333vRCS4iAlEtvUhUIIFJCi4AUkSYqIQkQSoghodkVUcERRUUEG8igiAOOjoCMFVEsDIoK2AfkIaKOg6OIisr74Xuja9a89+bN/rXXPues852zzwfACAyWSDNRNYAMqUIeEeCDx8TG4eQuQIEKJHAAEAizZCFz/SMBAPh+PDwrIsAHvgABeNMLCADATZvAMByH/w/qQplcAYCEAcB0kThLCIAUAEB6jkKmAEBGAYCdmCZTAKAEAGDLY2LjAFAtAGAnf+bTAICd+Jl7AQBblCEVAaCRACATZYhEAGg7AKzPVopFAFgwABRmS8Q5ANgtADBJV2ZIALC3AMDOEAuyAAgMADBRiIUpAAR7AGDIIyN4AISZABRG8lc88SuuEOcqAAB4mbI8uSQ5RYFbCC1xB1dXLh4ozkkXKxQ2YQJhmkAuwnmZGTKBNA/g88wAAKCRFRHgg/P9eM4Ors7ONo62Dl8t6r8G/yJiYuP+5c+rcEAAAOF0ftH+LC+zGoA7BoBt/qIl7gRoXgugdfeLZrIPQLUAoOnaV/Nw+H48PEWhkLnZ2eXk5NhKxEJbYcpXff5nwl/AV/1s+X48/Pf14L7iJIEyXYFHBPjgwsz0TKUcz5IJhGLc5o9H/LcL//wd0yLESWK5WCoU41EScY5EmozzMqUiiUKSKcUl0v9k4t8s+wM+3zUAsGo+AXuRLahdYwP2SycQWHTA4vcAAPK7b8HUKAgDgGiD4c93/+8//UegJQCAZkmScQAAXkQkLlTKsz/HCAAARKCBKrBBG/TBGCzABhzBBdzBC/xgNoRCJMTCQhBCCmSAHHJgKayCQiiGzbAdKmAv1EAdNMBRaIaTcA4uwlW4Dj1wD/phCJ7BKLyBCQRByAgTYSHaiAFiilgjjggXmYX4IcFIBBKLJCDJiBRRIkuRNUgxUopUIFVIHfI9cgI5h1xGupE7yAAygvyGvEcxlIGyUT3UDLVDuag3GoRGogvQZHQxmo8WoJvQcrQaPYw2oefQq2gP2o8+Q8cwwOgYBzPEbDAuxsNCsTgsCZNjy7EirAyrxhqwVqwDu4n1Y8+xdwQSgUXACTYEd0IgYR5BSFhMWE7YSKggHCQ0EdoJNwkDhFHCJyKTqEu0JroR+cQYYjIxh1hILCPWEo8TLxB7iEPENyQSiUMyJ7mQAkmxpFTSEtJG0m5SI+ksqZs0SBojk8naZGuyBzmULCAryIXkneTD5DPkG+Qh8lsKnWJAcaT4U+IoUspqShnlEOU05QZlmDJBVaOaUt2ooVQRNY9aQq2htlKvUYeoEzR1mjnNgxZJS6WtopXTGmgXaPdpr+h0uhHdlR5Ol9BX0svpR+iX6AP0dwwNhhWDx4hnKBmbGAcYZxl3GK+YTKYZ04sZx1QwNzHrmOeZD5lvVVgqtip8FZHKCpVKlSaVGyovVKmqpqreqgtV81XLVI+pXlN9rkZVM1PjqQnUlqtVqp1Q61MbU2epO6iHqmeob1Q/pH5Z/YkGWcNMw09DpFGgsV/jvMYgC2MZs3gsIWsNq4Z1gTXEJrHN2Xx2KruY/R27iz2qqaE5QzNKM1ezUvOUZj8H45hx+Jx0TgnnKKeX836K3hTvKeIpG6Y0TLkxZVxrqpaXllirSKtRq0frvTau7aedpr1Fu1n7gQ5Bx0onXCdHZ4/OBZ3nU9lT3acKpxZNPTr1ri6qa6UbobtEd79up+6Ynr5egJ5Mb6feeb3n+hx9L/1U/W36p/VHDFgGswwkBtsMzhg8xTVxbzwdL8fb8VFDXcNAQ6VhlWGX4YSRudE8o9VGjUYPjGnGXOMk423GbcajJgYmISZLTepN7ppSTbmmKaY7TDtMx83MzaLN1pk1mz0x1zLnm+eb15vft2BaeF'
        b'ostqi2uGVJsuRaplnutrxuhVo5WaVYVVpds0atna0l1rutu6cRp7lOk06rntZnw7Dxtsm2qbcZsOXYBtuutm22fWFnYhdnt8Wuw+6TvZN9un2N/T0HDYfZDqsdWh1+c7RyFDpWOt6azpzuP33F9JbpL2dYzxDP2DPjthPLKcRpnVOb00dnF2e5c4PziIuJS4LLLpc+Lpsbxt3IveRKdPVxXeF60vWdm7Obwu2o26/uNu5p7ofcn8w0nymeWTNz0MPIQ+BR5dE/C5+VMGvfrH5PQ0+BZ7XnIy9jL5FXrdewt6V3qvdh7xc+9j5yn+M+4zw33jLeWV/MN8C3yLfLT8Nvnl+F30N/I/9k/3r/0QCngCUBZwOJgUGBWwL7+Hp8Ib+OPzrbZfay2e1BjKC5QRVBj4KtguXBrSFoyOyQrSH355jOkc5pDoVQfujW0Jnjr0YfN1DO8PauXp5epj7PPL5Iq4R8uHBchF2e3kZSOzTrMbMZaROWJKTdMLj2Vx9BjFhVypQa5SaTb5Mw9jdvRcPEfOU4oJxYhKkv5HrvXiw6jeP3FXB9f0iOv5zQxN0c8qSHo4a3N3uB9Y+7wV/WT//6qy8JxjZsmxxy5+4w9CDNJY09T072iKG0EnOS0arEYgXqYnXcYHwjTtUNAcMelOd4xpkoqiTYICWFq0JSiPfPDQdnt+4/wuqcXY47QILbgAAAABJRU5ErkJggg=='
    )


class FaviconService:
    """Fetch, cache and serve site favicons.

    Bundles the counters, queues, thread pool and helper methods used by the
    favicon routes. Shared counters and collections are guarded by one lock.
    """

    # MD5 digests of icons that are treated as "default" placeholders.
    # NOTE(review): the last entry is 33 characters long, so it can never
    # equal a real MD5 hex digest -- confirm the intended value.
    _KNOWN_DEFAULT_ICON_MD5 = (
        '05231fb6b69aff47c3f35efe09c11ba0',
        '3ca64f83fdcf25135d87e08af65e68c9',
        'db470fd0b65c8c121477343c37f74f02',
        '52419f3f4f7d11945d272facc76c9e6a',
        'b8a0bf372c762e966cc99ede8682bc71',
        '71e9c45f29eadfa2ec5495302c22bcf6',
        'ababc687adac587b8a06e580ee79aaa1',
        '43802b9f029eadfa2ec5495302c22bcf6',
    )

    def __init__(self):
        # Guards the counters, the referrer set and the in-flight domain list.
        self._lock = Lock()

        # Global counters and collections.
        self.url_count = 0
        self.request_icon_count = 0
        self.request_cache_count = 0
        self.href_referrer: Set[str] = set()
        self.domain_list: List[str] = list()

        # Queues used to track pending icon requests.
        self.icon_queue = Queue()
        self.total_queue = Queue()

        # Thread pool used to fetch icons in the background.
        self.executor = ThreadPoolExecutor(15)

        # Time constants, in seconds.
        self.time_of_1_minus = 1 * 60
        self.time_of_5_minus = 5 * self.time_of_1_minus
        self.time_of_10_minus = 10 * self.time_of_1_minus
        self.time_of_30_minus = 30 * self.time_of_1_minus

        self.time_of_1_hours = 1 * 60 * 60
        self.time_of_2_hours = 2 * self.time_of_1_hours
        self.time_of_3_hours = 3 * self.time_of_1_hours
        self.time_of_6_hours = 6 * self.time_of_1_hours
        self.time_of_12_hours = 12 * self.time_of_1_hours

        self.time_of_1_days = 1 * 24 * 60 * 60
        self.time_of_7_days = 7 * self.time_of_1_days
        self.time_of_15_days = 15 * self.time_of_1_days
        self.time_of_30_days = 30 * self.time_of_1_days

        # Pre-compiled patterns. ``pattern_icon`` matches the rel attribute of
        # icon links; ``pattern_link`` matches whole ``<link ...>`` tags in raw
        # HTML and is the fallback when the HTML parser finds nothing.
        self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
        self.pattern_link = re.compile(
            r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)', re.I)

        # MD5 digests that identify a default icon.
        self.default_icon_md5 = self._initialize_default_icon_md5()

    def _initialize_default_icon_md5(self) -> List[str]:
        """Return the list of MD5 digests that identify a default icon.

        The digest of the on-disk default icon comes first when it can be
        computed, followed by the built-in list of known digests.
        """
        known = list(self._KNOWN_DEFAULT_ICON_MD5)
        # _get_file_md5 logs and returns None on failure, so it never raises.
        file_md5 = self._get_file_md5(default_icon_path)
        return [file_md5] + known if file_md5 else known

    def _get_file_md5(self, file_path: str) -> Optional[str]:
        """Return the lowercase MD5 hex digest of a file, or None on error."""
        try:
            md5 = hashlib.md5()
            with open(file_path, 'rb') as f:
                # Read in 8 KiB chunks so large files are not loaded at once.
                for buffer in iter(lambda: f.read(1024 * 8), b''):
                    md5.update(buffer)
            return md5.hexdigest().lower()
        except Exception as e:
            logger.error(f"计算文件MD5失败 {file_path}: {e}")
            return None

    def _is_default_icon_md5(self, icon_md5: str) -> bool:
        """Return True if the digest belongs to a default icon."""
        return icon_md5 in self.default_icon_md5

    def _is_default_icon_file(self, file_path: str) -> bool:
        """Return True if the file exists and hashes to a default icon."""
        if os.path.exists(file_path) and os.path.isfile(file_path):
            md5 = self._get_file_md5(file_path)
            return md5 in self.default_icon_md5 if md5 else False
        return False

    def _is_default_icon_byte(self, file_content: bytes) -> bool:
        """Return True if the byte content hashes to a default icon."""
        try:
            md5 = hashlib.md5(file_content).hexdigest().lower()
            return md5 in self.default_icon_md5
        except Exception as e:
            logger.error(f"计算字节内容MD5失败: {e}")
            return False

    def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """Read an icon from the on-disk cache.

        Returns a ``(stale, fresh)`` pair. ``fresh`` is the content when the
        cache entry can be served directly. When the entry exists but is
        expired, or a refresh was requested, only ``stale`` is set. Both are
        None when there is no usable cache entry.
        """
        cache_path = os.path.join(icon_root_path, 'icon', domain + '.png')
        if not (os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0):
            return None, None

        try:
            cached_icon = file_util.read_file(cache_path, mode='rb')
            file_time = int(os.path.getmtime(cache_path))

            # Reject cache entries that are not a valid image.
            if not helpers.is_image(cached_icon):
                logger.warning(f"缓存的图标不是有效图片: {cache_path}")
                return None, None

            # A refresh request keeps the content only as a fallback.
            if refresh:
                return cached_icon, None

            age = int(time.time()) - file_time

            # Regular entries expire after 30 days.
            if age > self.time_of_30_days:
                logger.info(f"图标缓存过期(>30天): {cache_path}")
                return cached_icon, None

            # Default icons expire after a random 1-7 days. The bytes are
            # already in memory, so hash them instead of re-reading the file.
            if age > self.time_of_1_days * random.randint(1, 7) and self._is_default_icon_byte(cached_icon):
                logger.info(f"默认图标缓存过期: {cache_path}")
                return cached_icon, None

            return cached_icon, cached_icon
        except Exception as e:
            logger.error(f"读取缓存文件失败 {cache_path}: {e}")
            return None, None

    def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """Return the cached ``(stale, fresh)`` pair for a domain.

        Any value that hashes to a known default icon is replaced by the
        current default icon content.
        """
        _cached, cached_icon = self._get_cache_file(domain_md5, refresh)

        if _cached and self._is_default_icon_byte(_cached):
            _cached = default_icon_content
        if cached_icon and self._is_default_icon_byte(cached_icon):
            cached_icon = default_icon_content

        return _cached, cached_icon

    def _get_header(self, content_type: str, cache_time: Optional[int] = None) -> dict:
        """Build the response headers.

        ``cache_time`` is the max-age in seconds. None means 7 days, and 0
        disables caching.
        """
        if cache_time is None:
            cache_time = self.time_of_7_days

        # Only accept content types listed in header.image_type.
        _ct = 'image/x-icon'
        if content_type and content_type in header.image_type:
            _ct = content_type

        if cache_time == 0:
            cache_control = 'no-store, no-cache, must-revalidate, max-age=0'
        else:
            cache_control = f'public, max-age={cache_time}'

        return {
            'Content-Type': _ct,
            'Cache-Control': cache_control,
            'X-Robots-Tag': 'noindex, nofollow'
        }

    def _queue_pull(self, is_pull: bool = True, _queue: Queue = None) -> None:
        """Remove one item from the queue (``icon_queue`` by default)."""
        if _queue is None:
            _queue = self.icon_queue
        if is_pull and not _queue.empty():
            try:
                # Non-blocking: another consumer may have emptied the queue.
                _queue.get_nowait()
                _queue.task_done()
            except Exception as e:
                logger.error(f"从队列中取出元素失败: {e}")

    def _parse_html(self, content: bytes, entity: Favicon) -> Optional[str]:
        """Extract the icon URL from HTML content, or return None."""
        if not content:
            return None

        try:
            # Accept str as well as bytes; undecodable bytes are replaced.
            content_str = content.decode('utf-8', 'replace') if isinstance(content, bytes) else str(content)

            # Parse only <link> tags; fall back to html.parser if lxml
            # yields nothing.
            bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
            if len(bs) == 0:
                bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))

            html_links = bs.find_all("link", rel=self.pattern_icon)

            # Nothing found: match <link> tags with the regex and re-parse
            # just those.
            if not html_links:
                content_links = self.pattern_link.findall(content_str)
                c_link = ''.join([_links[0] for _links in content_links])
                bs = bs4.BeautifulSoup(c_link, features='lxml')
                html_links = bs.find_all("link", rel=self.pattern_icon)

            if html_links:
                # Try rel types in priority order, then any matching link.
                icon_url = (self._get_link_rel(html_links, entity, 'shortcut icon') or
                            self._get_link_rel(html_links, entity, 'icon') or
                            self._get_link_rel(html_links, entity, 'alternate icon') or
                            self._get_link_rel(html_links, entity, ''))

                if icon_url:
                    logger.info(f"-> 从HTML获取图标URL: {icon_url}")
                    return icon_url
        except Exception as e:
            logger.error(f"解析HTML失败: {e}")

        return None

    def _get_link_rel(self, links, entity: Favicon, _rel: str) -> Optional[str]:
        """Return the icon URL of the first link whose rel equals ``_rel``.

        An empty ``_rel`` matches any link. Links without an href are skipped
        so that a missing attribute never becomes the literal URL ``'None'``.
        """
        if not links:
            return None

        for link in links:
            href = link.get('href')
            if not href:
                continue

            if _rel:
                rel = link.get('rel')
                # rel may be a list of tokens, a plain string, or missing.
                rel_text = ' '.join(rel) if isinstance(rel, list) else (rel or '')
                if rel_text.lower() != _rel:
                    continue

            return entity.get_icon_url(str(href))

        return None

    async def _referer(self, req: Request) -> None:
        """Record the request's referrer header in referrer.txt, once per value."""
        _referrer = req.headers.get('referrer') or req.headers.get('referer')
        if not _referrer:
            return

        logger.debug(f"-> Referrer: {_referrer}")

        _path = os.path.join(icon_root_path, 'referrer.txt')

        with self._lock:
            # Load previously recorded referrers while the set is still empty.
            if len(self.href_referrer) == 0 and os.path.exists(_path):
                try:
                    with open(_path, 'r', encoding='utf-8') as ff:
                        self.href_referrer = {line.strip() for line in ff}
                except Exception as e:
                    logger.error(f"读取referrer文件失败: {e}")

            # Append referrers that have not been seen before.
            if _referrer not in self.href_referrer:
                self.href_referrer.add(_referrer)
                try:
                    file_util.write_file(_path, f'{_referrer}\n', mode='a')
                except Exception as e:
                    logger.error(f"写入referrer文件失败: {e}")

    def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
        """Fetch the icon for ``entity`` synchronously and write it to the cache.

        Returns the icon bytes. When every strategy fails these are ``_cached``
        or the default icon. Returns None if the domain is already being
        fetched or an unexpected error occurs.
        """
        with self._lock:
            if entity.domain in self.domain_list:
                # Another call is already fetching this domain.
                self._queue_pull(True, self.total_queue)
                return None
            self.domain_list.append(entity.domain)

        try:
            icon_url, icon_content = None, None

            # Fetch the page HTML and look for an icon link.
            html_content = entity.req_get()
            if html_content:
                icon_url = self._parse_html(html_content, entity)

            # Strategies, in order. They are lambdas so each URL is only
            # built when that strategy is actually reached.
            strategies = [
                # 1. Link tag found in the original page.
                lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
                # 2. gstatic.cn endpoint.
                lambda: (
                    f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
                    "gstatic接口"),
                # 3. The site's default location; the empty URL is passed
                #    through together with a True flag.
                lambda: ('', "网站默认位置/favicon.ico"),
                # 4. Another third-party API.
                lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API")
            ]

            for strategy in strategies:
                if icon_content:
                    break

                strategy_url, strategy_name = strategy()
                if strategy_url is not None:
                    logger.info(f"-> 尝试从 {strategy_name} 获取图标")
                    icon_content, _icon_type = entity.get_icon_file(strategy_url, strategy_url == '')

            # Fetch failed, content is not an image, or it is a known default
            # icon: fall back to the stale cache or the default icon.
            if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)):
                logger.warning(f"-> 获取图标失败,使用默认图标: {entity.domain}")
                icon_content = _cached if _cached else default_icon_content

            if icon_content:
                cache_path = os.path.join(icon_root_path, 'icon', entity.domain_md5 + '.png')
                md5_path = os.path.join(icon_root_path, 'md5', entity.domain_md5 + '.txt')

                try:
                    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
                    os.makedirs(os.path.dirname(md5_path), exist_ok=True)

                    # Write the icon and the domain it belongs to.
                    file_util.write_file(cache_path, icon_content, mode='wb')
                    file_util.write_file(md5_path, entity.domain, mode='w')
                except Exception as e:
                    logger.error(f"写入缓存文件失败: {e}")

            with self._lock:
                self.request_icon_count += 1

            return icon_content
        except Exception as e:
            logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
            return None
        finally:
            # Always release the in-flight marker and the queue slot.
            with self._lock:
                if entity.domain in self.domain_list:
                    self.domain_list.remove(entity.domain)
            self._queue_pull(True, self.total_queue)

    def get_icon_background(self, entity: Favicon, _cached: bytes = None) -> None:
        """Submit ``get_icon_sync`` to the thread pool without waiting for it."""
        self.executor.submit(self.get_icon_sync, entity, _cached)

    def get_count(self) -> Dict[str, int]:
        """Return a snapshot of the counters and queue sizes."""
        with self._lock:
            return {
                'url_count': self.url_count,
                'request_icon_count': self.request_icon_count,
                'request_cache_count': self.request_cache_count,
                'queue_size': self.icon_queue.qsize(),
                'total_queue_size': self.total_queue.qsize(),
                'href_referrer': len(self.href_referrer),
            }

    async def get_favicon_handler(self, request: Request, url: Optional[str] = None,
                                  refresh: Optional[str] = None) -> Union[Response, Dict[str, str]]:
        """Handle a favicon request.

        Returns an image ``Response``, or a message dict when ``url`` is
        missing. ``refresh`` values ``'true'`` and ``'1'`` bypass a fresh
        cache entry.
        """
        with self._lock:
            self.url_count += 1

        # Without a url parameter only a hint message is returned.
        if not url:
            return {"message": "请提供url参数"}

        try:
            entity = Favicon(url)

            # Invalid URL: serve the default icon.
            if not entity.domain:
                logger.warning(f"无效的URL: {url}")
                return Response(content=default_icon_content, media_type="image/x-icon",
                                headers=self._get_header("", self.time_of_7_days))

            await self._referer(request)

            if self.icon_queue.qsize() > 100:
                logger.warning(f'-> 警告: 队列大小已达到 => {self.icon_queue.qsize()}')

            _cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1'])

            if cached_icon:
                # Serve straight from the cache.
                icon_content = cached_icon
                with self._lock:
                    self.request_cache_count += 1
            else:
                self.icon_queue.put(entity.domain)
                self.total_queue.put(entity.domain)

                if self.icon_queue.qsize() > 10:
                    # Queue is busy: fetch in the background and answer now
                    # with the default icon, marked as not cacheable.
                    self.get_icon_background(entity, _cached)
                    self._queue_pull(True)
                    return Response(content=default_icon_content, media_type="image/x-icon",
                                    headers=self._get_header("", 0))

                # NOTE(review): this call is blocking inside an async
                # handler -- consider running it in the executor.
                icon_content = self.get_icon_sync(entity, _cached)
                self._queue_pull(True)

                if not icon_content:
                    # Fetch failed: default icon, not cacheable.
                    return Response(content=default_icon_content, media_type="image/x-icon",
                                    headers=self._get_header("", 0))

            # Default icons get a short random cache time (1-6 hours), real
            # icons get 7 days.
            content_type = filetype.guess_mime(icon_content) if icon_content else ""
            if self._is_default_icon_byte(icon_content):
                cache_time = self.time_of_1_hours * random.randint(1, 6)
            else:
                cache_time = self.time_of_7_days

            return Response(content=icon_content, media_type=content_type if content_type else "image/x-icon",
                            headers=self._get_header(content_type, cache_time))
        except Exception as e:
            logger.error(f"处理图标请求时发生错误 {url}: {e}")
            # On any error serve the default icon, not cacheable.
            return Response(content=default_icon_content, media_type="image/x-icon",
                            headers=self._get_header("", 0))


# Global service instance shared by the route functions.
favicon_service = FaviconService()


# Route functions, kept for backward compatibility.
@favicon_router.get('/icon/')
@favicon_router.get('/')
async def get_favicon(
        request: Request,
        url: Optional[str] = Query(None, description="要获取图标的网址"),
        refresh: Optional[str] = Query(None, description="是否刷新缓存,'true'或'1'表示刷新")
):
    """Return the favicon for ``url``; delegates to the shared service."""
    return await favicon_service.get_favicon_handler(request, url, refresh)


@favicon_router.get('/icon/count')
async def get_count():
    """Return the service's request and queue statistics."""
    return favicon_service.get_count()


@favicon_router.get('/icon/default')
async def get_default_icon(cache_time: int = Query(favicon_service.time_of_1_days, description="缓存时间")):
    """Return the default icon with the requested cache time in seconds."""
    return Response(content=default_icon_content, media_type="image/x-icon",
                    headers=favicon_service._get_header("", cache_time))


@favicon_router.get('/icon/referrer')
async def get_referrer():
    """Return the recorded referrers as plain text, or 'None' if there are none."""
    content = 'None'
    path = os.path.join(icon_root_path, 'referrer.txt')
    if os.path.exists(path):
        try:
            # An empty file also yields 'None'.
            content = file_util.read_file(path, mode='r') or 'None'
        except Exception as e:
            logger.error(f"读取referrer文件失败: {e}")

    return Response(content=content, media_type="text/plain")


def _queue_pull(is_pull=True, _queue=favicon_service.icon_queue):
    """Remove one item from ``_queue`` without blocking.

    The default queue is the ``icon_queue`` of the global service instance,
    bound when this function is defined.
    """
    from queue import Empty

    if is_pull and _queue.qsize() != 0:
        try:
            # A blocking get() could hang forever if another consumer
            # drained the queue after the size check.
            _queue.get_nowait()
        except Empty:
            pass