This commit is contained in:
jinql
2025-09-03 21:57:10 +08:00
parent 452e110ee5
commit e54e69b940
6 changed files with 296 additions and 37 deletions

View File

@@ -34,7 +34,7 @@ async def get_favicon(
bg_tasks: BackgroundTasks,
url: Optional[str] = Query(None, description="网址eg. https://www.baidu.com"),
refresh: Optional[str] = Query(None, include_in_schema=False),
sync: Optional[str] = Query('false', description="是否使用同步方式获取")
sync: Optional[str] = Query('true', description="是否使用同步方式获取")
):
"""获取网站图标"""
return await _service.get_favicon_handler(request, bg_tasks, url, refresh, sync)

View File

@@ -3,6 +3,7 @@
import hashlib
import logging
import os
import platform
import random
import re
import time
@@ -21,6 +22,15 @@ from favicon_app.utils import header
from favicon_app.utils.file_util import FileUtil
from favicon_app.utils.filetype import helpers, filetype
# Pick the file-locking backend for the running platform: the standard
# library ships msvcrt only on Windows and fcntl only on Unix-like systems.
if platform.system() == 'Windows':
import msvcrt
else:
import fcntl
# Cross-process locking: directory that holds the per-domain lock files,
# resolved relative to this module as ../../data/locks.
LOCKS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'data', 'locks')
# Created at import time so lock files can be created without further checks.
os.makedirs(LOCKS_DIR, exist_ok=True)
urllib3.disable_warnings()
logging.captureWarnings(True)
logger = logging.getLogger(__name__)
@@ -50,6 +60,9 @@ class FaviconService:
self.icon_queue = Queue()
self.total_queue = Queue()
# 队列阈值常量配置
self.MAX_QUEUE_SIZE = 3
# 时间常量
self.time_of_1_minus = 1 * 60
self.time_of_5_minus = 5 * self.time_of_1_minus
@@ -127,7 +140,7 @@ class FaviconService:
def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
"""从缓存中获取图标文件"""
cache_path = os.path.join(icon_root_path, 'data/icon', domain + '.png')
cache_path = os.path.join(icon_root_path, 'data', 'icon', domain + '.png')
if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
try:
cached_icon = FileUtil.read_file(cache_path, mode='rb')
@@ -262,6 +275,70 @@ class FaviconService:
return None
@staticmethod
def _lock_file(file_handle, lock_type='exclusive'):
"""跨平台文件锁"""
if platform.system() == 'Windows':
try:
msvcrt.locking(file_handle.fileno(), msvcrt.LK_LOCK, 1)
return True
except Exception:
time.sleep(0.01)
try:
msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
return True
except:
return False
else:
if lock_type == 'exclusive':
fcntl.flock(file_handle, fcntl.LOCK_EX)
else:
fcntl.flock(file_handle, fcntl.LOCK_SH)
return True
@staticmethod
def _unlock_file(file_handle):
"""释放文件锁"""
if platform.system() == 'Windows':
try:
msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
except Exception as e:
logger.error(f"释放Windows文件锁失败: {e}")
else:
try:
fcntl.flock(file_handle, fcntl.LOCK_UN)
except Exception as e:
logger.error(f"释放Unix文件锁失败: {e}")
def _get_domain_lock_path(self, domain: str) -> str:
    """Return the path of the lock file that guards ``domain``.

    The file name is the hex MD5 digest of the UTF-8 encoded domain with a
    ``.lock`` suffix, placed inside ``LOCKS_DIR``.
    """
    digest = hashlib.md5(domain.encode('utf-8')).hexdigest()
    lock_name = f"{digest}.lock"
    return os.path.join(LOCKS_DIR, lock_name)
def _acquire_domain_lock(self, domain: str, timeout: float = 5.0) -> Optional[str]:
"""获取域名锁防止多进程同时获取同一个域名的favicon"""
lock_path = self._get_domain_lock_path(domain)
start_time = time.time()
while time.time() - start_time < timeout:
try:
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
os.close(fd)
return lock_path
except FileExistsError:
time.sleep(0.1)
logger.warning(f"获取域名锁超时: {domain}")
return None
def _release_domain_lock(self, lock_path: str) -> None:
"""释放域名锁"""
try:
if os.path.exists(lock_path):
os.remove(lock_path)
except Exception as e:
logger.error(f"释放锁文件失败 {lock_path}: {e}")
async def _referer(self, req: Request) -> None:
"""记录请求来源"""
_referrer = req.headers.get('referrer') or req.headers.get('referer')
@@ -270,40 +347,79 @@ class FaviconService:
logger.debug(f"-> Referrer: {_referrer}")
_path = os.path.join(icon_root_path, 'conf', 'referrer.txt')
os.makedirs(os.path.dirname(_path), exist_ok=True)
with self._lock:
# 首次加载现有referrer数据
if len(self.href_referrer) == 0 and os.path.exists(_path):
try:
with open(_path, 'r', encoding='utf-8') as ff:
self.href_referrer = {line.strip() for line in ff.readlines()}
except Exception as e:
logger.error(f"读取referrer文件失败: {e}")
try:
if _referrer in self.href_referrer:
return
# 添加新的referrer
if _referrer not in self.href_referrer:
self.href_referrer.add(_referrer)
with open(_path, 'a+', encoding='utf-8') as f:
try:
FileUtil.write_file(_path, f'{_referrer}\n', mode='a')
except Exception as e:
logger.error(f"写入referrer文件失败: {e}")
locked = self._lock_file(f, 'exclusive')
if not locked:
logger.warning(f"无法获取文件锁,跳过referrer记录: {_referrer}")
return
f.seek(0)
existing_referrers = {line.strip() for line in f.readlines()}
if _referrer not in existing_referrers:
f.seek(0, os.SEEK_END)
f.write(f'{_referrer}\n')
f.flush()
if platform.system() != 'Windows':
os.fsync(f.fileno())
logger.debug(f"成功添加新referrer: {_referrer}")
self.href_referrer.add(_referrer)
else:
if _referrer not in self.href_referrer:
self.href_referrer.add(_referrer)
finally:
self._unlock_file(f)
except Exception as e:
logger.error(f"处理referrer文件失败: {e}")
if len(self.href_referrer) > 1000 or random.random() < 0.01:
await self._refresh_referrer_cache(_path)
async def _refresh_referrer_cache(self, file_path: str) -> None:
"""刷新内存中的referrer缓存"""
try:
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
try:
locked = self._lock_file(f, 'shared')
if locked:
self.href_referrer = {line.strip() for line in f.readlines() if line.strip()}
finally:
self._unlock_file(f)
except Exception as e:
logger.error(f"刷新referrer缓存失败: {e}")
def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
"""同步获取图标"""
with self._lock:
if entity.domain in self.domain_list:
self._queue_pull(True, self.total_queue)
return None
else:
self.domain_list.append(entity.domain)
domain_lock = None
icon_content = None
try:
icon_url, icon_content = None, None
domain_lock = self._acquire_domain_lock(entity.domain)
if not domain_lock:
logger.warning(f"无法获取域名锁,跳过获取图标: {entity.domain}")
return _cached or default_icon_content
with self._lock:
if entity.domain in self.domain_list:
self._queue_pull(True, self.total_queue)
return _cached or default_icon_content
else:
self.domain_list.append(entity.domain)
# 尝试从网站获取HTML内容
html_content = entity.req_get()
if html_content:
icon_url = self._parse_html(html_content, entity)
else:
icon_url = None
# 尝试不同的图标获取策略
strategies = [
@@ -336,8 +452,8 @@ class FaviconService:
icon_content = _cached if _cached else default_icon_content
if icon_content:
cache_path = os.path.join(icon_root_path, 'data/icon', entity.domain_md5 + '.png')
md5_path = os.path.join(icon_root_path, 'data/text', entity.domain_md5 + '.txt')
cache_path = os.path.join(icon_root_path, 'data', 'icon', entity.domain_md5 + '.png')
md5_path = os.path.join(icon_root_path, 'data', 'text', entity.domain_md5 + '.txt')
try:
# 确保目录存在
@@ -356,8 +472,11 @@ class FaviconService:
return icon_content
except Exception as e:
logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
return None
return _cached or default_icon_content
finally:
if domain_lock:
self._release_domain_lock(domain_lock)
with self._lock:
if entity.domain in self.domain_list:
self.domain_list.remove(entity.domain)
@@ -411,7 +530,7 @@ class FaviconService:
icon_content = cached_icon
with self._lock:
self.request_cache_count += 1
# 确定内容类型和缓存时间
content_type = filetype.guess_mime(icon_content) if icon_content else ""
cache_time = self.time_of_1_hours * random.randint(1, 6) if self._is_default_icon_byte(icon_content) else self.time_of_7_days
@@ -429,7 +548,7 @@ class FaviconService:
else:
# 检查sync参数
is_sync = sync in ['true', '1', 'True']
if not is_sync:
# 返回默认图片并加入后台队列
logger.info(f"返回默认图片并加入后台队列: {entity.domain}")
@@ -438,20 +557,20 @@ class FaviconService:
else:
# 没有缓存,实时处理,检查队列大小
queue_size = self.icon_queue.qsize()
if queue_size >= 16:
if queue_size >= self.MAX_QUEUE_SIZE:
# 加入后台队列并返回默认图片
logger.info(f"队列大小({queue_size})>=16,返回默认图片并加入后台队列: {entity.domain}")
logger.info(f"队列大小({queue_size})>={self.MAX_QUEUE_SIZE},返回默认图片并加入后台队列: {entity.domain}")
bg_tasks.add_task(self.get_icon_sync, entity, _cached)
return self.get_default(0)
else:
# 队列<16,实时处理
logger.info(f"队列大小({queue_size})<16,实时处理: {entity.domain}")
# 队列<MAX_QUEUE_SIZE,实时处理
logger.info(f"队列大小({queue_size})<{self.MAX_QUEUE_SIZE},实时处理: {entity.domain}")
icon_content = self.get_icon_sync(entity, _cached)
if not icon_content:
# 获取失败,返回默认图标,不缓存
return self.get_default(0)
# 确定内容类型和缓存时间
content_type = filetype.guess_mime(icon_content) if icon_content else ""
cache_time = self.time_of_1_hours * random.randint(1, 6) if self._is_default_icon_byte(icon_content) else self.time_of_7_days