476 lines
20 KiB
Python
476 lines
20 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
import hashlib
|
||
import logging
|
||
import os
|
||
import random
|
||
import re
|
||
import time
|
||
from queue import Queue
|
||
from threading import Lock
|
||
from typing import Optional, Tuple, Dict, Set, List
|
||
|
||
import bs4
|
||
import urllib3
|
||
from bs4 import SoupStrainer
|
||
from fastapi import Request, BackgroundTasks
|
||
from fastapi.responses import Response
|
||
|
||
from favicon_app.models import Favicon
|
||
from favicon_app.utils import header
|
||
from favicon_app.utils.file_util import FileUtil
|
||
from favicon_app.utils.filetype import helpers, filetype
|
||
|
||
# Disable urllib3's warnings and route Python warnings through the logging system.
urllib3.disable_warnings()
logging.captureWarnings(True)
logger = logging.getLogger(__name__)

# Absolute path of the directory containing this module.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Absolute root path for icon storage: two directory levels above this module.
icon_root_path = os.path.abspath(os.path.join(current_dir, '..', '..'))
# Default (fallback) icon; its bytes are read once at import time.
default_icon_path = os.path.join(icon_root_path, 'favicon.png')
default_icon_content = FileUtil.read_file(default_icon_path, mode='rb')
|
||
|
||
|
||
class FaviconService:
|
||
"""图标服务类,封装所有与图标获取、缓存和处理相关的功能"""
|
||
|
||
def __init__(self):
    """Create the lock, counters, queues, duration constants and icon regexes."""
    # Lock guarding the mutable counters and collections below.
    self._lock = Lock()

    # Request statistics and bookkeeping collections.
    self.url_count = 0
    self.request_icon_count = 0
    self.request_cache_count = 0
    self.href_referrer: Set[str] = set()
    self.domain_list: List[str] = []

    # Work queues.
    self.icon_queue = Queue()
    self.total_queue = Queue()

    # Duration constants, all expressed in seconds.
    minute = 60
    hour = 60 * minute
    day = 24 * hour

    self.time_of_1_minus = minute
    self.time_of_5_minus = 5 * minute
    self.time_of_10_minus = 10 * minute
    self.time_of_30_minus = 30 * minute

    self.time_of_1_hours = hour
    self.time_of_2_hours = 2 * hour
    self.time_of_3_hours = 3 * hour
    self.time_of_6_hours = 6 * hour
    self.time_of_12_hours = 12 * hour

    self.time_of_1_days = day
    self.time_of_7_days = 7 * day
    self.time_of_15_days = 15 * day
    self.time_of_30_days = 30 * day

    # Compiled once here so they are not rebuilt on every HTML parse.
    self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
    self.pattern_link = re.compile(
        r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)',
        re.I,
    )

    # MD5 digests that identify a "default" icon.
    self.default_icon_md5 = self._initialize_default_icon_md5()
|
||
|
||
def _initialize_default_icon_md5(self) -> List[str]:
    """Return the MD5 digests that are treated as a default icon.

    The list starts with the digest of the bundled default icon file and is
    followed by a fixed set of hard-coded digests (presumably placeholder
    icons served by upstream services -- TODO confirm). A digest that could
    not be computed (``None``) is left out.
    """
    hard_coded = (
        '05231fb6b69aff47c3f35efe09c11ba0',
        '3ca64f83fdcf25135d87e08af65e68c9',
        'db470fd0b65c8c121477343c37f74f02',
        '52419f3f4f7d11945d272facc76c9e6a',
        'b8a0bf372c762e966cc99ede8682bc71',
        '71e9c45f29eadfa2ec5495302c22bcf6',
        'ababc687adac587b8a06e580ee79aaa1',
        '43802bddf65eeaab643adb8265bfbada',
    )
    candidates = [self._get_file_md5(default_icon_path), *hard_coded]
    # Drop falsy entries, i.e. a failed digest of the bundled icon.
    return list(filter(None, candidates))
|
||
|
||
@staticmethod
def _get_file_md5(file_path: str) -> Optional[str]:
    """Return the lowercase hex MD5 digest of a file, or ``None`` on any error.

    The file is read in 8 KiB chunks. Any exception is logged and turned
    into a ``None`` result.
    """
    chunk_size = 1024 * 8
    try:
        digest = hashlib.md5()
        with open(file_path, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest().lower()
    except Exception as e:
        logger.error(f"计算文件MD5失败 {file_path}: {e}")
        return None
|
||
|
||
def _is_default_icon_md5(self, icon_md5: str) -> bool:
    """Return True when *icon_md5* equals one of the known default-icon digests."""
    return any(known == icon_md5 for known in self.default_icon_md5)
|
||
|
||
def _is_default_icon_file(self, file_path: str) -> bool:
    """Return True when the file at *file_path* hashes to a default-icon digest.

    A missing path, a non-regular file, or a digest that could not be
    computed all give False.
    """
    if not (os.path.exists(file_path) and os.path.isfile(file_path)):
        return False

    digest = self._get_file_md5(file_path)
    if not digest:
        return False
    return digest in self.default_icon_md5
|
||
|
||
def _is_default_icon_byte(self, file_content: bytes) -> bool:
    """Return True when *file_content* hashes to a default-icon digest.

    Any exception raised while hashing or comparing is logged and reported
    as False.
    """
    try:
        hasher = hashlib.md5()
        hasher.update(file_content)
        return hasher.hexdigest().lower() in self.default_icon_md5
    except Exception as e:
        logger.error(f"计算字节内容MD5失败: {e}")
        return False
|
||
|
||
def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
    """Load a cached icon from disk and decide whether it is still fresh.

    Args:
        domain: File stem of the cache entry under ``data/icon`` (the caller
            in this class passes the domain's MD5).
        refresh: When True the caller asked for a refresh; the cached bytes
            are then only considered fresh if the file was modified within
            the last 12 hours.

    Returns:
        A ``(content, fresh_content)`` pair. ``content`` is the cached bytes
        whenever a non-empty, valid image file exists. ``fresh_content`` is
        the same bytes when the entry may be served as-is, or ``None`` when
        the entry is expired and should be fetched again. ``(None, None)``
        is returned when there is no usable cache file or reading it failed.
    """
    cache_path = os.path.join(icon_root_path, 'data/icon', domain + '.png')

    try:
        # Every filesystem call lives inside the try block so that a file
        # removed between the checks is reported as a cache miss instead of
        # raising into the caller.
        if not os.path.isfile(cache_path):
            return None, None
        file_stat = os.stat(cache_path)
        if file_stat.st_size <= 0:
            return None, None

        cached_icon = FileUtil.read_file(cache_path, mode='rb')

        # Reject cache entries that are not a recognised image.
        if not helpers.is_image(cached_icon):
            logger.warning(f"缓存的图标不是有效图片: {cache_path}")
            return None, None

        # Age in whole seconds, computed once and reused by every check.
        age = int(time.time()) - int(file_stat.st_mtime)

        # Explicit refresh: only honoured when the file is older than 12 hours.
        if refresh:
            if age <= self.time_of_12_hours:
                logger.info(f"缓存文件修改时间在有效期内,不执行刷新: {cache_path}")
                return cached_icon, cached_icon
            return cached_icon, None

        # Hard expiry after 30 days.
        if age > self.time_of_30_days:
            logger.info(f"图标缓存过期(>30天): {cache_path}")
            return cached_icon, None

        # Default icons expire after a random 1-7 days. The bytes already in
        # memory are hashed instead of re-reading the file from disk.
        if age > self.time_of_1_days * random.randint(1, 7) and self._is_default_icon_byte(cached_icon):
            logger.info(f"默认图标缓存过期: {cache_path}")
            return cached_icon, None

        return cached_icon, cached_icon
    except Exception as e:
        logger.error(f"读取缓存文件失败 {cache_path}: {e}")
        return None, None
|
||
|
||
def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
    """Return the cached icon pair, with default-icon bytes normalised.

    Delegates to ``_get_cache_file`` and replaces either element that
    hashes to a known default icon with the bundled default icon content.
    """
    stale_or_fresh, fresh_only = self._get_cache_file(domain_md5, refresh)

    def normalise(blob: Optional[bytes]) -> Optional[bytes]:
        # Empty/None values pass through untouched.
        if blob and self._is_default_icon_byte(blob):
            return default_icon_content
        return blob

    return normalise(stale_or_fresh), normalise(fresh_only)
|
||
|
||
def _get_header(self, content_type: str, cache_time: int = None) -> dict:
    """Build the response headers for an icon.

    Args:
        content_type: Desired MIME type; used only when it is listed in
            ``header.image_type``, otherwise ``image/x-icon`` is sent.
        cache_time: ``max-age`` in seconds. ``None`` means seven days and
            ``0`` disables caching entirely.
    """
    ttl = self.time_of_7_days if cache_time is None else cache_time

    if content_type and content_type in header.image_type:
        mime = content_type
    else:
        mime = 'image/x-icon'

    if ttl == 0:
        cache_control = 'no-store, no-cache, must-revalidate, max-age=0'
    else:
        cache_control = f'public, max-age={ttl}'

    headers = {}
    headers['Content-Type'] = mime
    headers['Cache-Control'] = cache_control
    headers['X-Robots-Tag'] = 'noindex, nofollow'
    return headers
|
||
|
||
def _queue_pull(self, is_pull: bool = True, _queue: Queue = None) -> None:
    """Remove one item from a queue without blocking.

    Args:
        is_pull: When False the call is a no-op.
        _queue: Queue to pull from; defaults to ``self.icon_queue``.

    Errors while pulling (for example the queue being drained by another
    thread in the meantime) are logged and swallowed.
    """
    target = self.icon_queue if _queue is None else _queue

    if not is_pull or target.empty():
        return

    try:
        target.get_nowait()
        target.task_done()
    except Exception as e:
        logger.error(f"从队列中取出元素失败: {e}")
|
||
|
||
def _parse_html(self, content: bytes, entity: Favicon) -> Optional[str]:
    """Extract an icon URL from raw HTML bytes.

    ``<link>`` tags whose ``rel`` matches the icon pattern are collected,
    first with BeautifulSoup and, if that finds nothing, from tags located
    by a regular expression. The rel types are then tried in priority
    order: ``shortcut icon``, ``icon``, ``alternate icon``, then any link.

    Returns the resolved icon URL, or ``None`` when the content is empty,
    nothing matched, or parsing raised.
    """
    if not content:
        return None

    try:
        # Decode leniently; undecodable bytes become replacement characters.
        text = content.decode('utf-8', 'replace')

        # Parse only <link> tags; fall back to html.parser when lxml yields nothing.
        soup = bs4.BeautifulSoup(text, features='lxml', parse_only=SoupStrainer("link"))
        if len(soup) == 0:
            soup = bs4.BeautifulSoup(text, features='html.parser', parse_only=SoupStrainer("link"))

        html_links = soup.find_all("link", rel=self.pattern_icon)

        # Nothing found: locate the tags by regex and parse just those.
        if not html_links:
            matched_tags = [groups[0] for groups in self.pattern_link.findall(text)]
            soup = bs4.BeautifulSoup(''.join(matched_tags), features='lxml')
            html_links = soup.find_all("link", rel=self.pattern_icon)

        if html_links:
            icon_url = None
            # An empty rel means "take the first link whatever its rel is".
            for wanted_rel in ('shortcut icon', 'icon', 'alternate icon', ''):
                icon_url = self._get_link_rel(html_links, entity, wanted_rel)
                if icon_url:
                    break

            if icon_url:
                logger.info(f"-> 从HTML获取图标URL: {icon_url}")

            return icon_url
    except Exception as e:
        logger.error(f"解析HTML失败: {e}")

    return None
|
||
|
||
@staticmethod
def _get_link_rel(links, entity: "Favicon", _rel: str) -> Optional[str]:
    """Return the icon URL of the first usable link with the requested rel.

    Args:
        links: Iterable of link elements exposing ``.get('rel')`` and
            ``.get('href')``; ``rel`` may be a list of tokens or a string.
        entity: Object whose ``get_icon_url(href)`` resolves the href into
            the final icon URL.
        _rel: Lower-case rel value to match exactly (for example
            ``'shortcut icon'``). An empty string accepts any rel.

    Returns:
        The value of ``entity.get_icon_url`` for the first matching link
        that has a non-blank ``href``, or ``None`` when there is none.
    """
    if not links:
        return None

    for link in links:
        href = link.get('href')
        # A link with no (or a blank) href cannot point at an icon. Without
        # this guard str(None) would hand the literal "None" to the resolver.
        if not href or not str(href).strip():
            continue

        if not _rel:
            return entity.get_icon_url(str(href))

        rel = link.get('rel')
        # Multi-valued rel attributes arrive as a list of tokens; a missing
        # rel is treated as an empty string rather than raising.
        rel_text = ' '.join(rel) if isinstance(rel, list) else str(rel or '')
        if rel_text.lower() == _rel:
            return entity.get_icon_url(str(href))

    return None
|
||
|
||
async def _referer(self, req: Request) -> None:
    """Record the request's referrer in memory and in ``conf/referrer.txt``.

    The ``referrer`` header is preferred, falling back to ``referer``.
    Requests without either header are ignored. Each distinct value is
    appended to the file once; read and write failures are only logged.
    """
    request_headers = req.headers
    _referrer = request_headers.get('referrer') or request_headers.get('referer')
    if not _referrer:
        return

    logger.debug(f"-> Referrer: {_referrer}")
    _path = os.path.join(icon_root_path, 'conf', 'referrer.txt')

    with self._lock:
        # While the in-memory set is empty, seed it from the file if present.
        if not self.href_referrer and os.path.exists(_path):
            try:
                with open(_path, 'r', encoding='utf-8') as ff:
                    self.href_referrer = {line.strip() for line in ff}
            except Exception as e:
                logger.error(f"读取referrer文件失败: {e}")

        # Already known: nothing to persist.
        if _referrer in self.href_referrer:
            return

        self.href_referrer.add(_referrer)
        try:
            FileUtil.write_file(_path, f'{_referrer}\n', mode='a')
        except Exception as e:
            logger.error(f"写入referrer文件失败: {e}")
|
||
|
||
def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
    """Fetch the icon for *entity* synchronously and write it to the cache.

    Sources are tried in order: the icon URL found in the site's HTML (if
    any), the gstatic favicon endpoint, the site's default ``/favicon.ico``
    location, and a third-party API. The first response that is a valid
    image and not a known default/placeholder icon wins.

    Args:
        entity: Target site; provides ``domain``, ``domain_md5``,
            ``req_get()``, ``get_base_url()`` and ``get_icon_file()``.
        _cached: Previously cached bytes, used as the fallback when no
            source yields a usable icon. When absent the bundled default
            icon is used instead.

    Returns:
        The icon bytes that were stored, or ``None`` when the domain is
        already being fetched by another call or an unexpected error
        occurred.
    """
    # In-flight de-duplication: only one fetch per domain at a time.
    with self._lock:
        if entity.domain in self.domain_list:
            self._queue_pull(True, self.total_queue)
            return None
        self.domain_list.append(entity.domain)

    try:
        icon_url = None

        # Look for an icon <link> in the site's HTML.
        html_content = entity.req_get()
        if html_content:
            icon_url = self._parse_html(html_content, entity)

        def candidates():
            """Yield (url, label) pairs lazily, in priority order."""
            # 1. Icon link declared in the page itself.
            if icon_url:
                yield icon_url, "原始网页标签"
            # 2. gstatic favicon endpoint.
            yield (
                f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
                "gstatic接口",
            )
            # 3. The site's default location; an empty URL selects it.
            yield '', "网站默认位置/favicon.ico"
            # 4. Third-party API.
            yield f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"

        icon_content = None
        for strategy_url, strategy_name in candidates():
            logger.info(f"-> 尝试从 {strategy_name} 获取图标")
            fetched, _icon_type = entity.get_icon_file(strategy_url, strategy_url == '')

            # Accept only a real, non-placeholder image. Otherwise keep
            # going, so an HTML error page or a service's default icon does
            # not stop the remaining fallbacks from being tried.
            if fetched and helpers.is_image(fetched) and not self._is_default_icon_byte(fetched):
                icon_content = fetched
                break

        # Every source failed: fall back to the stale cache or the default icon.
        if not icon_content:
            logger.warning(f"-> 获取图标失败,使用默认图标: {entity.domain}")
            icon_content = _cached if _cached else default_icon_content

        if icon_content:
            cache_path = os.path.join(icon_root_path, 'data/icon', entity.domain_md5 + '.png')
            md5_path = os.path.join(icon_root_path, 'data/text', entity.domain_md5 + '.txt')

            try:
                # Make sure both target directories exist.
                os.makedirs(os.path.dirname(cache_path), exist_ok=True)
                os.makedirs(os.path.dirname(md5_path), exist_ok=True)

                # Store the icon and a text file mapping the hash back to the domain.
                FileUtil.write_file(cache_path, icon_content, mode='wb')
                FileUtil.write_file(md5_path, entity.domain, mode='w')
            except Exception as e:
                logger.error(f"写入缓存文件失败: {e}")

            with self._lock:
                self.request_icon_count += 1

        return icon_content
    except Exception as e:
        logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
        return None
    finally:
        # Always release the in-flight marker and pull one queue slot.
        with self._lock:
            if entity.domain in self.domain_list:
                self.domain_list.remove(entity.domain)
            self._queue_pull(True, self.total_queue)
|
||
|
||
def get_count(self) -> Dict[str, int]:
    """Return a snapshot of the service counters, taken under the lock."""
    with self._lock:
        snapshot = dict(
            url_count=self.url_count,
            request_icon_count=self.request_icon_count,
            request_cache_count=self.request_cache_count,
            queue_size=self.icon_queue.qsize(),
            total_queue_size=self.total_queue.qsize(),
            href_referrer=len(self.href_referrer),
        )
    return snapshot
|
||
|
||
async def get_favicon_handler(
        self,
        request: Request,
        bg_tasks: BackgroundTasks,
        url: Optional[str] = None,
        refresh: Optional[str] = None,
        sync: Optional[str] = None
) -> dict[str, str] | Response:
    """Handle a favicon request.

    Args:
        request: Incoming request; only used to record its referrer.
        bg_tasks: Background task collection used to schedule icon fetches.
        url: Site whose icon is wanted. When missing, a JSON message is
            returned instead of an image.
        refresh: ``'true'``, ``'1'`` or ``'True'`` requests a cache refresh.
        sync: ``'true'``, ``'1'`` or ``'True'`` requests that an uncached
            icon be fetched before responding (only while the icon queue
            holds fewer than 16 items).

    Returns:
        A ``Response`` with the icon bytes. A fresh cache entry is served
        directly. An expired entry is served as-is while a background
        refresh is scheduled, unless a synchronous fetch was requested and
        is possible. With no cache entry the default icon is returned
        (uncached) and the fetch is scheduled in the background. Any
        unexpected error yields the default icon.
    """
    # Imported locally: asyncio is not among the module-level imports.
    import asyncio

    enabled_flags = ('true', '1', 'True')

    def build_response(icon_bytes: bytes) -> Response:
        """Wrap icon bytes in a Response with type and cache headers."""
        content_type = filetype.guess_mime(icon_bytes) if icon_bytes else ""
        # Default icons get a short random client cache time (1-6 hours).
        if self._is_default_icon_byte(icon_bytes):
            cache_time = self.time_of_1_hours * random.randint(1, 6)
        else:
            cache_time = self.time_of_7_days
        return Response(content=icon_bytes,
                        media_type=content_type if content_type else "image/x-icon",
                        headers=self._get_header(content_type, cache_time))

    with self._lock:
        self.url_count += 1

    # Validate the url parameter.
    if not url:
        return {"message": "请提供url参数"}

    logger.info('##########################################################')
    try:
        entity = Favicon(url)

        # Validate the domain.
        if not entity.domain:
            logger.warning(f"无效的URL: {url}")
            return self.get_default(self.time_of_7_days)

        # Record the referrer.
        await self._referer(request)

        # _cached holds any cached bytes; cached_icon is set only when fresh.
        _cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in enabled_flags)

        if cached_icon:
            with self._lock:
                self.request_cache_count += 1
            return build_response(cached_icon)

        def defer_to_background(reason: str) -> Response:
            """Schedule the fetch and answer now with the best bytes available."""
            bg_tasks.add_task(self.get_icon_sync, entity, _cached)
            if _cached:
                # Optimistic caching: an expired entry is still better than
                # the placeholder. This check used to sit inside the
                # `if cached_icon:` branch, where it could never be true.
                logger.info(f"缓存已过期,加入后台队列刷新: {entity.domain}")
                return build_response(_cached)
            logger.info(reason)
            return self.get_default(0)

        if sync not in enabled_flags:
            return defer_to_background(f"返回默认图片并加入后台队列: {entity.domain}")

        # Synchronous fetch requested: only allowed while the queue is short.
        queue_size = self.icon_queue.qsize()
        if queue_size >= 16:
            return defer_to_background(
                f"队列大小({queue_size})>=16,返回默认图片并加入后台队列: {entity.domain}")

        logger.info(f"队列大小({queue_size})<16,实时处理: {entity.domain}")
        # get_icon_sync performs blocking network and disk I/O; run it in a
        # worker thread so the event loop keeps serving other requests.
        icon_content = await asyncio.to_thread(self.get_icon_sync, entity, _cached)

        if not icon_content:
            # Fetch failed: return the default icon without client caching.
            return self.get_default(0)

        return build_response(icon_content)
    except Exception as e:
        logger.error(f"处理图标请求时发生错误 {url}: {e}")
        # Fall back to the default icon.
        return self.get_default(0)
|
||
|
||
def get_header(self, content_type: str, cache_time: int = None) -> dict:
    """Public wrapper around ``_get_header``; returns its result unchanged."""
    response_headers = self._get_header(content_type, cache_time)
    return response_headers
|
||
|
||
def get_default(self, cache_time: int = None) -> Response:
    """Return a PNG response carrying the bundled default icon.

    Args:
        cache_time: ``max-age`` in seconds for the response headers;
            ``None`` means one day.
    """
    ttl = self.time_of_1_days if cache_time is None else cache_time
    mime = "image/png"
    response_headers = self._get_header(mime, ttl)
    return Response(content=default_icon_content,
                    media_type=mime,
                    headers=response_headers)
|