diff --git a/Dockerfile b/Dockerfile index 1bacfd1..e75a530 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,4 +34,4 @@ VOLUME ["/app/data", "/app/conf", "/app/logs"] ENTRYPOINT ["entrypoint.sh"] # CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] -CMD ["gunicorn", "-c", "conf/gunicorn.conf.pyc", "main:app"] +CMD ["gunicorn", "-c", "conf/gunicorn.conf.py", "main:app"] diff --git a/conf.default/gunicorn.conf.py b/conf.default/gunicorn.conf.py new file mode 100644 index 0000000..389fafd --- /dev/null +++ b/conf.default/gunicorn.conf.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +from pathlib import Path + +import yaml + +# 绑定地址和端口 +bind = "0.0.0.0:8000" + +# Worker 进程数(推荐 CPU 核心数 * 2 + 1) +workers = 4 + +# 工作模式(sync、gevent、uvicorn.workers.UvicornWorker) +worker_class = "uvicorn.workers.UvicornWorker" + +# 日志目录 +log_dir = Path("logs") +log_dir.mkdir(exist_ok=True) + +# 日志配置 +with open(Path(__file__).with_name("logging.yaml"), "r", encoding="utf-8") as f: + logconfig_dict = yaml.safe_load(f) + +# 日志级别(debug、info、warning、error、critical);以 YAML 配置优先 +loglevel = "info" + +# 访问日志文件("-" 表示输出到 stdout);以 YAML 配置优先 +accesslog = "logs/access.log" + +# 错误日志文件;以 YAML 配置优先 +errorlog = "-" + +# access_log_format 仅在 同步 worker 下有效,UvicornWorker下不可用;以 YAML 配置优先 +# access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s' +raw_env = [ + "UVICORN_ACCESS_LOGFORMAT=%(h)s %(l)s %(u)s %(t)s \"%(r)s\" %(s)s %(b)s \"%(f)s\" \"%(a)s\" %(D)s" +] + +# 可选:超时时间(秒) +timeout = 120 + +# Keep - Alive超时 +keepalive = 5 + +# 进程名(ps aux 中显示) +# proc_name = "gunicorn" + +# 守护进程运行(后台运行,默认 False) +# daemon = True diff --git a/conf.default/logging.yaml b/conf.default/logging.yaml new file mode 100644 index 0000000..28dc01f --- /dev/null +++ b/conf.default/logging.yaml @@ -0,0 +1,60 @@ +version: 1 +disable_existing_loggers: false +formatters: + default: + format: "[%(levelname)-7s] %(asctime)s [%(process)d] -[%(name)s:%(lineno)d] %(message)s" + datefmt: "%Y-%m-%d %H:%M:%S" + +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: default + stream: ext://sys.stdout + file_info: + class: logging.handlers.TimedRotatingFileHandler + level: INFO + formatter: default + filename: logs/info.log + when: midnight + interval: 1 + backupCount: 7 + encoding: utf8 + delay: true + file_error: + class: logging.handlers.TimedRotatingFileHandler + level: ERROR + formatter: default + filename: logs/error.log + when: midnight + interval: 1 + backupCount: 7 + encoding: utf8 + delay: true + +loggers: + uvicorn: + level: INFO + handlers: + - console + - file_info + propagate: false + uvicorn.error: + level: INFO + handlers: + - console + - file_error + propagate: false + uvicorn.access: + level: INFO + handlers: + - console + - file_info + propagate: false + +root: + level: INFO + handlers: + - console + - file_info + - file_error diff --git a/entrypoint.sh b/entrypoint.sh index 70224e6..f0069c2 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,7 +2,38 @@ set -e -# 首次启动时把镜像里的默认配置拷到挂载点 -[ -z "$(ls -A /app/conf)" ] && cp -r /app/conf.default/* /app/conf/ +mkdir -p /app/conf + +default_conf_dir="/app/conf.default/" +gunicorn_conf="/app/conf/gunicorn.conf.py" +logging_conf="/app/conf/logging.yaml" + +if [ ! -f "$gunicorn_conf" ]; then + echo "复制默认的gunicorn.conf.py配置文件..." + if [ -f "$default_conf_dir/gunicorn.conf.py" ]; then + cp "$default_conf_dir/gunicorn.conf.py" "$gunicorn_conf" + chmod 644 "$gunicorn_conf" + echo "已成功复制gunicorn.conf.py" + else + echo "警告:默认配置文件 $default_conf_dir/gunicorn.conf.py 不存在,创建空文件" + touch "$gunicorn_conf" + chmod 644 "$gunicorn_conf" + fi +fi + +if [ ! -f "$logging_conf" ]; then + echo "复制默认的logging.yaml配置文件..." + if [ -f "$default_conf_dir/logging.yaml" ]; then + cp "$default_conf_dir/logging.yaml" "$logging_conf" + chmod 644 "$logging_conf" + echo "已成功复制logging.yaml" + else + echo "警告:默认配置文件 $default_conf_dir/logging.yaml 不存在,创建空文件" + touch "$logging_conf" + chmod 644 "$logging_conf" + fi +fi + +mkdir -p /app/logs /app/data/icon /app/data/text exec "$@" diff --git a/favicon_app/routes/favicon_routes.py b/favicon_app/routes/favicon_routes.py index da750d5..11cd0c8 100644 --- a/favicon_app/routes/favicon_routes.py +++ b/favicon_app/routes/favicon_routes.py @@ -34,7 +34,7 @@ async def get_favicon( bg_tasks: BackgroundTasks, url: Optional[str] = Query(None, description="网址:eg. https://www.baidu.com"), refresh: Optional[str] = Query(None, include_in_schema=False), - sync: Optional[str] = Query('false', description="是否使用同步方式获取") + sync: Optional[str] = Query('true', description="是否使用同步方式获取") ): """获取网站图标""" return await _service.get_favicon_handler(request, bg_tasks, url, refresh, sync) diff --git a/favicon_app/routes/favicon_service.py b/favicon_app/routes/favicon_service.py index 2ba5dab..c68ea52 100644 --- a/favicon_app/routes/favicon_service.py +++ b/favicon_app/routes/favicon_service.py @@ -3,6 +3,7 @@ import hashlib import logging import os +import platform import random import re import time @@ -21,6 +22,15 @@ from favicon_app.utils import header from favicon_app.utils.file_util import FileUtil from favicon_app.utils.filetype import helpers, filetype +if platform.system() == 'Windows': + import msvcrt +else: + import fcntl + +# 多进程加锁 +LOCKS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'data', 'locks') +os.makedirs(LOCKS_DIR, exist_ok=True) + urllib3.disable_warnings() logging.captureWarnings(True) logger = logging.getLogger(__name__) @@ -50,6 +60,9 @@ class FaviconService: self.icon_queue = Queue() self.total_queue = Queue() + # 队列阈值常量配置 + self.MAX_QUEUE_SIZE = 3 + # 时间常量 self.time_of_1_minus = 1 * 60 self.time_of_5_minus = 5 * self.time_of_1_minus @@ -127,7 +140,7 @@ class FaviconService: def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]: """从缓存中获取图标文件""" - cache_path = os.path.join(icon_root_path, 'data/icon', domain + '.png') + cache_path = os.path.join(icon_root_path, 'data', 'icon', domain + '.png') if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0: try: cached_icon = FileUtil.read_file(cache_path, mode='rb') @@ -262,6 +275,70 @@ class FaviconService: return None + @staticmethod + def _lock_file(file_handle, lock_type='exclusive'): + """跨平台文件锁""" + if platform.system() == 'Windows': + try: + msvcrt.locking(file_handle.fileno(), msvcrt.LK_LOCK, 1) + return True + except Exception: + time.sleep(0.01) + try: + msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1) + return True + except: + return False + else: + if lock_type == 'exclusive': + fcntl.flock(file_handle, fcntl.LOCK_EX) + else: + fcntl.flock(file_handle, fcntl.LOCK_SH) + return True + + @staticmethod + def _unlock_file(file_handle): + """释放文件锁""" + if platform.system() == 'Windows': + try: + msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1) + except Exception as e: + logger.error(f"释放Windows文件锁失败: {e}") + else: + try: + fcntl.flock(file_handle, fcntl.LOCK_UN) + except Exception as e: + logger.error(f"释放Unix文件锁失败: {e}") + + def _get_domain_lock_path(self, domain: str) -> str: + """获取域名对应的锁文件路径""" + domain_hash = hashlib.md5(domain.encode('utf-8')).hexdigest() + return os.path.join(LOCKS_DIR, f"{domain_hash}.lock") + + def _acquire_domain_lock(self, domain: str, timeout: float = 5.0) -> Optional[str]: + """获取域名锁,防止多进程同时获取同一个域名的favicon""" + lock_path = self._get_domain_lock_path(domain) + start_time = time.time() + + while time.time() - start_time < timeout: + try: + fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY) + os.close(fd) + return lock_path + except FileExistsError: + time.sleep(0.1) + + logger.warning(f"获取域名锁超时: {domain}") + return None + + def _release_domain_lock(self, lock_path: str) -> None: + """释放域名锁""" + try: + if os.path.exists(lock_path): + os.remove(lock_path) + except Exception as e: + logger.error(f"释放锁文件失败 {lock_path}: {e}") + async def _referer(self, req: Request) -> None: """记录请求来源""" _referrer = req.headers.get('referrer') or req.headers.get('referer') @@ -270,40 +347,79 @@ class FaviconService: logger.debug(f"-> Referrer: {_referrer}") _path = os.path.join(icon_root_path, 'conf', 'referrer.txt') + os.makedirs(os.path.dirname(_path), exist_ok=True) - with self._lock: - # 首次加载现有referrer数据 - if len(self.href_referrer) == 0 and os.path.exists(_path): - try: - with open(_path, 'r', encoding='utf-8') as ff: - self.href_referrer = {line.strip() for line in ff.readlines()} - except Exception as e: - logger.error(f"读取referrer文件失败: {e}") + try: + if _referrer in self.href_referrer: + return - # 添加新的referrer - if _referrer not in self.href_referrer: - self.href_referrer.add(_referrer) + with open(_path, 'a+', encoding='utf-8') as f: try: - FileUtil.write_file(_path, f'{_referrer}\n', mode='a') - except Exception as e: - logger.error(f"写入referrer文件失败: {e}") + locked = self._lock_file(f, 'exclusive') + if not locked: + logger.warning(f"无法获取文件锁,跳过referrer记录: {_referrer}") + return + + f.seek(0) + existing_referrers = {line.strip() for line in f.readlines()} + + if _referrer not in existing_referrers: + f.seek(0, os.SEEK_END) + f.write(f'{_referrer}\n') + f.flush() + if platform.system() != 'Windows': + os.fsync(f.fileno()) + logger.debug(f"成功添加新referrer: {_referrer}") + self.href_referrer.add(_referrer) + else: + if _referrer not in self.href_referrer: + self.href_referrer.add(_referrer) + finally: + self._unlock_file(f) + except Exception as e: + logger.error(f"处理referrer文件失败: {e}") + + if len(self.href_referrer) > 1000 or random.random() < 0.01: + await self._refresh_referrer_cache(_path) + + async def _refresh_referrer_cache(self, file_path: str) -> None: + """刷新内存中的referrer缓存""" + try: + if os.path.exists(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + try: + locked = self._lock_file(f, 'shared') + if locked: + self.href_referrer = {line.strip() for line in f.readlines() if line.strip()} + finally: + self._unlock_file(f) + except Exception as e: + logger.error(f"刷新referrer缓存失败: {e}") def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]: """同步获取图标""" - with self._lock: - if entity.domain in self.domain_list: - self._queue_pull(True, self.total_queue) - return None - else: - self.domain_list.append(entity.domain) + domain_lock = None + icon_content = None try: - icon_url, icon_content = None, None + domain_lock = self._acquire_domain_lock(entity.domain) + if not domain_lock: + logger.warning(f"无法获取域名锁,跳过获取图标: {entity.domain}") + return _cached or default_icon_content + + with self._lock: + if entity.domain in self.domain_list: + self._queue_pull(True, self.total_queue) + return _cached or default_icon_content + else: + self.domain_list.append(entity.domain) # 尝试从网站获取HTML内容 html_content = entity.req_get() if html_content: icon_url = self._parse_html(html_content, entity) + else: + icon_url = None # 尝试不同的图标获取策略 strategies = [ @@ -336,8 +452,8 @@ class FaviconService: icon_content = _cached if _cached else default_icon_content if icon_content: - cache_path = os.path.join(icon_root_path, 'data/icon', entity.domain_md5 + '.png') - md5_path = os.path.join(icon_root_path, 'data/text', entity.domain_md5 + '.txt') + cache_path = os.path.join(icon_root_path, 'data', 'icon', entity.domain_md5 + '.png') + md5_path = os.path.join(icon_root_path, 'data', 'text', entity.domain_md5 + '.txt') try: # 确保目录存在 @@ -356,8 +472,11 @@ class FaviconService: return icon_content except Exception as e: logger.error(f"获取图标时发生错误 {entity.domain}: {e}") - return None + return _cached or default_icon_content finally: + if domain_lock: + self._release_domain_lock(domain_lock) + with self._lock: if entity.domain in self.domain_list: self.domain_list.remove(entity.domain) @@ -411,7 +530,7 @@ class FaviconService: icon_content = cached_icon with self._lock: self.request_cache_count += 1 - + # 确定内容类型和缓存时间 content_type = filetype.guess_mime(icon_content) if icon_content else "" cache_time = self.time_of_1_hours * random.randint(1, 6) if self._is_default_icon_byte(icon_content) else self.time_of_7_days @@ -429,7 +548,7 @@ class FaviconService: else: # 检查sync参数 is_sync = sync in ['true', '1', 'True'] - + if not is_sync: # 返回默认图片并加入后台队列 logger.info(f"返回默认图片并加入后台队列: {entity.domain}") @@ -438,20 +557,20 @@ class FaviconService: else: # 没有缓存,实时处理,检查队列大小 queue_size = self.icon_queue.qsize() - if queue_size >= 16: + if queue_size >= self.MAX_QUEUE_SIZE: # 加入后台队列并返回默认图片 - logger.info(f"队列大小({queue_size})>=16,返回默认图片并加入后台队列: {entity.domain}") + logger.info(f"队列大小({queue_size})>={self.MAX_QUEUE_SIZE},返回默认图片并加入后台队列: {entity.domain}") bg_tasks.add_task(self.get_icon_sync, entity, _cached) return self.get_default(0) else: - # 队列<16,实时处理 - logger.info(f"队列大小({queue_size})<16,实时处理: {entity.domain}") + # 队列