favicon-api-v3/favicon_app/routes/favicon_service.py

460 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
import asyncio
import hashlib
import logging
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Lock
from typing import Optional, Tuple, Dict, Set, List

import bs4
import urllib3
from bs4 import SoupStrainer
from fastapi import Request
from fastapi.responses import Response

from favicon_app.models import Favicon
from favicon_app.utils import header, file_util
from favicon_app.utils.filetype import helpers, filetype
# Silence urllib3's warnings and route anything emitted through the
# `warnings` module into the logging system instead of stderr.
urllib3.disable_warnings()
logging.captureWarnings(True)
# The root logger is used here (no name is passed to getLogger).
logger = logging.getLogger()
# Absolute path of the directory that contains this module.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Absolute root path for icon storage: two directory levels above this module.
icon_root_path = os.path.abspath(os.path.join(current_dir, '..', '..'))
# The fallback icon lives directly under the storage root.
default_icon_path = os.path.join(icon_root_path, 'favicon.png')
# Read once at import time; presumably the raw bytes of favicon.png
# (depends on file_util.read_file — confirm).
default_icon_content = file_util.read_file(default_icon_path, mode='rb')
class FaviconService:
    """Favicon service: fetching, on-disk caching and serving of site icons.

    One instance holds the request counters, the set of seen referrers, the
    list of domains currently being fetched, two bookkeeping queues and a
    thread pool used for the blocking fetch work.
    """

    def __init__(self):
        # Guards the counters, the in-flight domain list and the referrer set.
        self._lock = Lock()

        # Request statistics and bookkeeping collections.
        self.url_count = 0
        self.request_icon_count = 0
        self.request_cache_count = 0
        self.href_referrer: Set[str] = set()
        self.domain_list: List[str] = []

        # Queues are only used as counters of pending work (see _queue_pull).
        self.icon_queue = Queue()
        self.total_queue = Queue()

        # Pool that runs the blocking icon fetches (get_icon_sync).
        self.executor = ThreadPoolExecutor(15)

        # Durations in seconds.
        self.time_of_1_minus = 1 * 60
        self.time_of_5_minus = 5 * self.time_of_1_minus
        self.time_of_10_minus = 10 * self.time_of_1_minus
        self.time_of_30_minus = 30 * self.time_of_1_minus
        self.time_of_1_hours = 1 * 60 * 60
        self.time_of_2_hours = 2 * self.time_of_1_hours
        self.time_of_3_hours = 3 * self.time_of_1_hours
        self.time_of_6_hours = 6 * self.time_of_1_hours
        self.time_of_12_hours = 12 * self.time_of_1_hours
        self.time_of_1_days = 1 * 24 * 60 * 60
        self.time_of_7_days = 7 * self.time_of_1_days
        self.time_of_15_days = 15 * self.time_of_1_days
        self.time_of_30_days = 30 * self.time_of_1_days

        # Compiled once: matches icon-like `rel` values / whole <link> tags.
        self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
        self.pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)',
                                       re.I)

        # MD5 digests of every image that counts as "the default icon".
        self.default_icon_md5 = self._initialize_default_icon_md5()

    def _initialize_default_icon_md5(self) -> List[str]:
        """Return the MD5 digests treated as the default icon.

        The list is the digest of the local default icon file followed by a
        set of hard-coded digests; a failed local digest (None) is dropped.
        """
        md5_list = [self._get_file_md5(default_icon_path),
                    '05231fb6b69aff47c3f35efe09c11ba0',
                    '3ca64f83fdcf25135d87e08af65e68c9',
                    'db470fd0b65c8c121477343c37f74f02',
                    '52419f3f4f7d11945d272facc76c9e6a',
                    'b8a0bf372c762e966cc99ede8682bc71',
                    '71e9c45f29eadfa2ec5495302c22bcf6',
                    'ababc687adac587b8a06e580ee79aaa1',
                    '43802bddf65eeaab643adb8265bfbada']
        return [md5 for md5 in md5_list if md5]

    @staticmethod
    def _get_file_md5(file_path: str) -> Optional[str]:
        """Return the lowercase hex MD5 of a file, or None if it cannot be read."""
        try:
            md5 = hashlib.md5()
            with open(file_path, 'rb') as f:
                # Read in 8 KiB chunks until read() returns b'' (EOF).
                for chunk in iter(lambda: f.read(1024 * 8), b''):
                    md5.update(chunk)
            return md5.hexdigest().lower()
        except Exception as e:
            logger.error(f"计算文件MD5失败 {file_path}: {e}")
            return None

    def _is_default_icon_md5(self, icon_md5: str) -> bool:
        """Return True if the given MD5 digest belongs to a default icon."""
        return icon_md5 in self.default_icon_md5

    def _is_default_icon_file(self, file_path: str) -> bool:
        """Return True if the file exists and its MD5 is a default-icon digest."""
        if os.path.exists(file_path) and os.path.isfile(file_path):
            md5 = self._get_file_md5(file_path)
            return md5 in self.default_icon_md5 if md5 else False
        return False

    def _is_default_icon_byte(self, file_content: bytes) -> bool:
        """Return True if the MD5 of the given bytes is a default-icon digest.

        Any hashing error (e.g. a non-bytes argument) is logged and reported
        as False.
        """
        try:
            md5 = hashlib.md5(file_content).hexdigest().lower()
            return md5 in self.default_icon_md5
        except Exception as e:
            logger.error(f"计算字节内容MD5失败: {e}")
            return False

    def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """Load `<icon_root>/icon/<domain>.png` from the disk cache.

        Returns a pair ``(stale, fresh)``:

        * ``(None, None)``  - no usable cache file (missing, empty, not an
          image, or unreadable);
        * ``(icon, None)``  - a cache file exists but must be re-fetched
          (refresh requested, older than 30 days, or a default icon older
          than a random 1-7 days);
        * ``(icon, icon)``  - the cached icon can be served as is.
        """
        cache_path = os.path.join(icon_root_path, 'icon', domain + '.png')
        if not (os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0):
            return None, None

        try:
            cached_icon = file_util.read_file(cache_path, mode='rb')
            file_time = int(os.path.getmtime(cache_path))

            # Reject cache entries that are not valid images.
            if not helpers.is_image(cached_icon):
                logger.warning(f"缓存的图标不是有效图片: {cache_path}")
                return None, None

            # Explicit refresh: hand back the old bytes only as a fallback.
            if refresh:
                return cached_icon, None

            age = int(time.time()) - file_time

            # Hard upper bound on cache lifetime: 30 days.
            if age > self.time_of_30_days:
                logger.info(f"图标缓存过期(>30天): {cache_path}")
                return cached_icon, None

            # Default icons expire after a randomised 1-7 days.
            if age > self.time_of_1_days * random.randint(1, 7) and self._is_default_icon_file(cache_path):
                logger.info(f"默认图标缓存过期: {cache_path}")
                return cached_icon, None

            return cached_icon, cached_icon
        except Exception as e:
            logger.error(f"读取缓存文件失败 {cache_path}: {e}")
            return None, None

    def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """Like `_get_cache_file`, but normalise any default icon.

        Whenever a returned element hashes to a known default icon it is
        replaced by the module-level `default_icon_content`.
        """
        _cached, cached_icon = self._get_cache_file(domain_md5, refresh)

        if _cached and self._is_default_icon_byte(_cached):
            _cached = default_icon_content
        if cached_icon and self._is_default_icon_byte(cached_icon):
            cached_icon = default_icon_content

        return _cached, cached_icon

    def get_header(self, content_type: str, cache_time: Optional[int] = None) -> dict:
        """Public alias of `_get_header`."""
        return self._get_header(content_type, cache_time)

    def _get_header(self, content_type: str, cache_time: Optional[int] = None) -> dict:
        """Build the response headers for an icon.

        Args:
            content_type: MIME type to advertise; anything not listed in
                `header.image_type` (or empty) falls back to 'image/x-icon'.
            cache_time: max-age in seconds. None means 7 days; 0 disables
                caching entirely.
        """
        if cache_time is None:
            cache_time = self.time_of_7_days

        _ct = 'image/x-icon'
        if content_type and content_type in header.image_type:
            _ct = content_type

        if cache_time == 0:
            cache_control = 'no-store, no-cache, must-revalidate, max-age=0'
        else:
            cache_control = f'public, max-age={cache_time}'

        return {
            'Content-Type': _ct,
            'Cache-Control': cache_control,
            'X-Robots-Tag': 'noindex, nofollow'
        }

    def _queue_pull(self, is_pull: bool = True, _queue: Optional[Queue] = None) -> None:
        """Remove one element from a queue without blocking.

        Args:
            is_pull: when False the call is a no-op.
            _queue: queue to pop from; defaults to `self.icon_queue`.
        """
        if _queue is None:
            _queue = self.icon_queue

        if is_pull and not _queue.empty():
            try:
                # Non-blocking: another thread may have emptied the queue
                # between the check above and this call.
                _queue.get_nowait()
                _queue.task_done()
            except Exception as e:
                logger.error(f"从队列中取出元素失败: {e}")

    def _parse_html(self, content: bytes, entity: "Favicon") -> Optional[str]:
        """Extract an icon URL from raw HTML.

        The `<link>` tags are parsed with lxml, then html.parser if lxml
        yields nothing, then a regex over the raw text as a last resort.
        Among the icon links the rel priority is 'shortcut icon', 'icon',
        'alternate icon', then any remaining match.

        Returns the URL produced by `entity.get_icon_url`, or None when no
        icon link is found or parsing fails.
        """
        if not content:
            return None

        try:
            content_str = content.decode('utf-8', 'replace')

            # Only <link> tags are parsed, which keeps the tree small.
            bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
            if len(bs) == 0:
                bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))

            html_links = bs.find_all("link", rel=self.pattern_icon)

            # Nothing found by the parsers: match <link> tags with the regex
            # and parse just those fragments.
            if not html_links:
                content_links = self.pattern_link.findall(content_str)
                c_link = ''.join(_links[0] for _links in content_links)
                bs = bs4.BeautifulSoup(c_link, features='lxml')
                html_links = bs.find_all("link", rel=self.pattern_icon)

            if html_links:
                icon_url = (self._get_link_rel(html_links, entity, 'shortcut icon') or
                            self._get_link_rel(html_links, entity, 'icon') or
                            self._get_link_rel(html_links, entity, 'alternate icon') or
                            self._get_link_rel(html_links, entity, ''))

                if icon_url:
                    logger.info(f"-> 从HTML获取图标URL: {icon_url}")
                    return icon_url
        except Exception as e:
            logger.error(f"解析HTML失败: {e}")

        return None

    @staticmethod
    def _get_link_rel(links, entity: "Favicon", _rel: str) -> Optional[str]:
        """Return the icon URL of the first link whose rel equals `_rel`.

        Args:
            links: iterable of tag-like objects exposing ``get('rel')`` and
                ``get('href')``.
            entity: object whose ``get_icon_url(href)`` resolves the href.
            _rel: lowercase rel value to look for; an empty string accepts
                any link.

        Links without an href are skipped (they used to yield the literal
        string 'None'), and a link without a rel never matches a non-empty
        `_rel` instead of raising AttributeError.
        """
        if not links:
            return None

        for link in links:
            _href = link.get('href')
            if not _href:
                # An icon link without a target is useless; try the next one.
                continue

            if not _rel:
                return entity.get_icon_url(str(_href))

            r = link.get('rel')
            # Multi-valued rel arrives as a list, e.g. ['shortcut', 'icon'].
            _r = ' '.join(r) if isinstance(r, list) else r
            if _r and _r.lower() == _rel:
                return entity.get_icon_url(str(_href))

        return None

    async def _referer(self, req: "Request") -> None:
        """Record the request's referrer header in memory and in referrer.txt.

        The known referrers are lazily loaded from the file the first time
        the in-memory set is empty; each new value is appended to the file.
        """
        _referrer = req.headers.get('referrer') or req.headers.get('referer')
        if not _referrer:
            return

        logger.debug(f"-> Referrer: {_referrer}")
        _path = os.path.join(icon_root_path, 'referrer.txt')

        with self._lock:
            # First use: seed the set from the existing file.
            if not self.href_referrer and os.path.exists(_path):
                try:
                    with open(_path, 'r', encoding='utf-8') as ff:
                        self.href_referrer = {line.strip() for line in ff}
                except Exception as e:
                    logger.error(f"读取referrer文件失败: {e}")

            # Persist referrers that have not been seen before.
            if _referrer not in self.href_referrer:
                self.href_referrer.add(_referrer)
                try:
                    file_util.write_file(_path, f'{_referrer}\n', mode='a')
                except Exception as e:
                    logger.error(f"写入referrer文件失败: {e}")

    def get_icon_sync(self, entity: "Favicon", _cached: Optional[bytes] = None) -> Optional[bytes]:
        """Fetch the icon for `entity` (blocking) and store it in the cache.

        Sources are tried in order until one returns content: the URL found
        in the page's HTML, the gstatic endpoint, the site's default
        location, and a third-party API. If nothing usable is obtained, the
        stale cached bytes (`_cached`) or the default icon are used instead.

        Returns the icon bytes, or None when the same domain is already
        being fetched by another call or an unexpected error occurs.
        """
        with self._lock:
            if entity.domain in self.domain_list:
                # Another call is already fetching this domain.
                self._queue_pull(True, self.total_queue)
                return None
            self.domain_list.append(entity.domain)

        try:
            icon_url, icon_content = None, None

            # Look for an icon link in the site's HTML first.
            html_content = entity.req_get()
            if html_content:
                icon_url = self._parse_html(html_content, entity)

            # Each strategy yields (url, label); a None url means "skip".
            strategies = [
                # 1. URL taken from the page's <link> tags.
                lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
                # 2. gstatic.cn endpoint.
                lambda: (
                    f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
                    "gstatic接口"),
                # 3. The site's default location (empty url + flag below).
                lambda: ('', "网站默认位置/favicon.ico"),
                # 4. Third-party API.
                lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API")
            ]

            for strategy in strategies:
                if icon_content:
                    break

                strategy_url, strategy_name = strategy()
                if strategy_url is not None:
                    logger.info(f"-> 尝试从 {strategy_name} 获取图标")
                    # Second argument is True only for the default-location
                    # strategy (empty url).
                    icon_content, _icon_type = entity.get_icon_file(strategy_url, strategy_url == '')

            # No content, not an image, or a known default icon: fall back.
            if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)):
                logger.warning(f"-> 获取图标失败,使用默认图标: {entity.domain}")
                icon_content = _cached if _cached else default_icon_content

            if icon_content:
                cache_path = os.path.join(icon_root_path, 'icon', entity.domain_md5 + '.png')
                md5_path = os.path.join(icon_root_path, 'md5', entity.domain_md5 + '.txt')

                try:
                    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
                    os.makedirs(os.path.dirname(md5_path), exist_ok=True)

                    # Icon bytes plus a sidecar mapping the md5 to the domain.
                    file_util.write_file(cache_path, icon_content, mode='wb')
                    file_util.write_file(md5_path, entity.domain, mode='w')
                except Exception as e:
                    logger.error(f"写入缓存文件失败: {e}")

            with self._lock:
                self.request_icon_count += 1

            return icon_content
        except Exception as e:
            logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
            return None
        finally:
            with self._lock:
                if entity.domain in self.domain_list:
                    self.domain_list.remove(entity.domain)
                self._queue_pull(True, self.total_queue)

    def get_icon_background(self, entity: "Favicon", _cached: Optional[bytes] = None) -> None:
        """Submit `get_icon_sync` to the thread pool without waiting for it."""
        self.executor.submit(self.get_icon_sync, entity, _cached)

    def get_count(self) -> Dict[str, int]:
        """Return a snapshot of the counters and queue sizes."""
        with self._lock:
            return {
                'url_count': self.url_count,
                'request_icon_count': self.request_icon_count,
                'request_cache_count': self.request_cache_count,
                'queue_size': self.icon_queue.qsize(),
                'total_queue_size': self.total_queue.qsize(),
                'href_referrer': len(self.href_referrer),
            }

    async def get_favicon_handler(self, request: "Request", url: Optional[str] = None,
                                  refresh: Optional[str] = None) -> "Response":
        """Handle a favicon request.

        Args:
            request: incoming request (only its referrer header is read).
            url: site whose icon is requested.
            refresh: 'true' or '1' forces a re-fetch even if a cache entry
                exists.

        Returns the icon as a Response. The default icon is returned with
        caching disabled when the fetch is deferred to the background, fails,
        or an error occurs.
        """
        with self._lock:
            self.url_count += 1

        if not url:
            # NOTE(review): this is a plain dict rather than a Response;
            # presumably the route lets FastAPI serialise it as JSON - confirm.
            return {"message": "请提供url参数"}

        try:
            entity = Favicon(url)

            if not entity.domain:
                logger.warning(f"无效的URL: {url}")
                return Response(content=default_icon_content, media_type="image/x-icon",
                                headers=self._get_header("", self.time_of_7_days))

            await self._referer(request)

            if self.icon_queue.qsize() > 100:
                logger.warning(f'-> 警告: 队列大小已达到 => {self.icon_queue.qsize()}')

            _cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1'])

            if cached_icon:
                icon_content = cached_icon
                with self._lock:
                    self.request_cache_count += 1
            else:
                # Register the pending fetch in both queues.
                self.icon_queue.put(entity.domain)
                self.total_queue.put(entity.domain)

                if self.icon_queue.qsize() > 10:
                    # Too many fetches in flight: fetch in the background and
                    # answer immediately with an uncached default icon.
                    self.get_icon_background(entity, _cached)
                    self._queue_pull(True)
                    return Response(content=default_icon_content, media_type="image/x-icon",
                                    headers=self._get_header("", 0))

                try:
                    # get_icon_sync performs blocking network and disk I/O;
                    # running it on the pool keeps the event loop free to
                    # serve other requests while this one waits.
                    loop = asyncio.get_running_loop()
                    icon_content = await loop.run_in_executor(
                        self.executor, self.get_icon_sync, entity, _cached)
                finally:
                    # Keep the queue balanced even if the await is cancelled.
                    self._queue_pull(True)

                if not icon_content:
                    return Response(content=default_icon_content, media_type="image/x-icon",
                                    headers=self._get_header("", 0))

            content_type = filetype.guess_mime(icon_content) if icon_content else ""

            # Default icons get a short random lifetime (1-6 hours) so real
            # icons are picked up soon; everything else is cached for 7 days.
            if self._is_default_icon_byte(icon_content):
                cache_time = self.time_of_1_hours * random.randint(1, 6)
            else:
                cache_time = self.time_of_7_days

            return Response(content=icon_content, media_type=content_type if content_type else "image/x-icon",
                            headers=self._get_header(content_type, cache_time))
        except Exception as e:
            logger.error(f"处理图标请求时发生错误 {url}: {e}")
            return Response(content=default_icon_content, media_type="image/x-icon", headers=self._get_header("", 0))