# favicon-api-v3/favicon_app/routes/favicon_service.py

# -*- coding: utf-8 -*-

import hashlib
import logging
import os
import random
import re
import time
from queue import Queue
from threading import Lock
from typing import Optional, Tuple, Dict, Set, List

import bs4
import urllib3
from bs4 import SoupStrainer
from fastapi import Request, BackgroundTasks
from fastapi.responses import Response

from favicon_app.models import Favicon
from favicon_app.utils import header
from favicon_app.utils.file_util import FileUtil
from favicon_app.utils.filetype import helpers, filetype

# Silence urllib3 warnings and route messages issued through the `warnings`
# module into the logging system.
urllib3.disable_warnings()
logging.captureWarnings(True)
logger = logging.getLogger(__name__)

# Absolute path of the directory that contains this module
current_dir = os.path.dirname(os.path.abspath(__file__))
# Absolute root path for icon storage: two levels above this module's directory
icon_root_path = os.path.abspath(os.path.join(current_dir, '..', '..'))
# Default icon file and its bytes, read once when the module is imported
default_icon_path = os.path.join(icon_root_path, 'favicon.png')
default_icon_content = FileUtil.read_file(default_icon_path, mode='rb')


class FaviconService:
    """图标服务类，封装所有与图标获取、缓存和处理相关的功能"""

    def __init__(self):
        # 使用锁保证线程安全
        self._lock = Lock()
        # 全局计数器和集合
        self.url_count = 0
        self.request_icon_count = 0
        self.request_cache_count = 0
        self.href_referrer: Set[str] = set()
        self.domain_list: List[str] = list()

        # 初始化队列
        self.icon_queue = Queue()
        self.total_queue = Queue()

        # 时间常量
        self.time_of_1_minus = 1 * 60
        self.time_of_5_minus = 5 * self.time_of_1_minus
        self.time_of_10_minus = 10 * self.time_of_1_minus
        self.time_of_30_minus = 30 * self.time_of_1_minus

        self.time_of_1_hours = 1 * 60 * 60
        self.time_of_2_hours = 2 * self.time_of_1_hours
        self.time_of_3_hours = 3 * self.time_of_1_hours
        self.time_of_6_hours = 6 * self.time_of_1_hours
        self.time_of_12_hours = 12 * self.time_of_1_hours

        self.time_of_1_days = 1 * 24 * 60 * 60
        self.time_of_7_days = 7 * self.time_of_1_days
        self.time_of_15_days = 15 * self.time_of_1_days
        self.time_of_30_days = 30 * self.time_of_1_days

        # 预编译正则表达式，提高性能
        self.pattern_icon = re.compile(r'(icon|shortcut icon|alternate icon|apple-touch-icon)+', re.I)
        self.pattern_link = re.compile(r'(<link[^>]+rel=.(icon|shortcut icon|alternate icon|apple-touch-icon)[^>]+>)',
                                       re.I)

        # 计算默认图标的MD5值
        self.default_icon_md5 = self._initialize_default_icon_md5()

    def _initialize_default_icon_md5(self) -> List[str]:
        """初始化默认图标MD5值列表"""
        md5_list = [self._get_file_md5(default_icon_path),
                    '05231fb6b69aff47c3f35efe09c11ba0',
                    '3ca64f83fdcf25135d87e08af65e68c9',
                    'db470fd0b65c8c121477343c37f74f02',
                    '52419f3f4f7d11945d272facc76c9e6a',
                    'b8a0bf372c762e966cc99ede8682bc71',
                    '71e9c45f29eadfa2ec5495302c22bcf6',
                    'ababc687adac587b8a06e580ee79aaa1',
                    '43802bddf65eeaab643adb8265bfbada']
        # 过滤掉None值
        return [md5 for md5 in md5_list if md5]

    @staticmethod
    def _get_file_md5(file_path: str) -> Optional[str]:
        """计算文件的MD5值"""
        try:
            md5 = hashlib.md5()
            with open(file_path, 'rb') as f:
                while True:
                    buffer = f.read(1024 * 8)
                    if not buffer:
                        break
                    md5.update(buffer)
            return md5.hexdigest().lower()
        except Exception as e:
            logger.error(f"计算文件MD5失败 {file_path}: {e}")
            return None

    def _is_default_icon_md5(self, icon_md5: str) -> bool:
        """检查图标MD5是否为默认图标"""
        return icon_md5 in self.default_icon_md5

    def _is_default_icon_file(self, file_path: str) -> bool:
        """检查文件是否为默认图标"""
        if os.path.exists(file_path) and os.path.isfile(file_path):
            md5 = self._get_file_md5(file_path)
            return md5 in self.default_icon_md5 if md5 else False
        return False

    def _is_default_icon_byte(self, file_content: bytes) -> bool:
        """检查字节内容是否为默认图标"""
        try:
            md5 = hashlib.md5(file_content).hexdigest().lower()
            return md5 in self.default_icon_md5
        except Exception as e:
            logger.error(f"计算字节内容MD5失败: {e}")
            return False

    def _get_cache_file(self, domain: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """从缓存中获取图标文件"""
        cache_path = os.path.join(icon_root_path, 'data/icon', domain + '.png')
        if os.path.exists(cache_path) and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
            try:
                cached_icon = FileUtil.read_file(cache_path, mode='rb')
                file_time = int(os.path.getmtime(cache_path))

                # 验证是否为有效的图片文件
                if not helpers.is_image(cached_icon):
                    logger.warning(f"缓存的图标不是有效图片: {cache_path}")
                    return None, None

                # 处理刷新请求或缓存过期情况
                if refresh:
                    if int(time.time()) - file_time <= self.time_of_12_hours:
                        logger.info(f"缓存文件修改时间在有效期内，不执行刷新: {cache_path}")
                        return cached_icon, cached_icon
                    return cached_icon, None

                # 检查缓存是否过期（最大30天）
                if int(time.time()) - file_time > self.time_of_30_days:
                    logger.info(f"图标缓存过期(>30天): {cache_path}")
                    return cached_icon, None

                # 默认图标，使用随机的缓存时间
                if int(time.time()) - file_time > self.time_of_1_days * random.randint(1, 7) and self._is_default_icon_file(cache_path):
                    logger.info(f"默认图标缓存过期: {cache_path}")
                    return cached_icon, None

                return cached_icon, cached_icon
            except Exception as e:
                logger.error(f"读取缓存文件失败 {cache_path}: {e}")
                return None, None
        return None, None

    def _get_cache_icon(self, domain_md5: str, refresh: bool = False) -> Tuple[Optional[bytes], Optional[bytes]]:
        """获取缓存的图标"""
        _cached, cached_icon = self._get_cache_file(domain_md5, refresh)

        # 替换默认图标
        if _cached and self._is_default_icon_byte(_cached):
            _cached = default_icon_content
        if cached_icon and self._is_default_icon_byte(cached_icon):
            cached_icon = default_icon_content

        return _cached, cached_icon

    def _get_header(self, content_type: str, cache_time: int = None) -> dict:
        """生成响应头"""
        if cache_time is None:
            cache_time = self.time_of_7_days

        _ct = 'image/x-icon'
        if content_type and content_type in header.image_type:
            _ct = content_type

        cache_control = 'no-store, no-cache, must-revalidate, max-age=0' if cache_time == 0 else f'public, max-age={cache_time}'

        return {
            'Content-Type': _ct,
            'Cache-Control': cache_control,
            'X-Robots-Tag': 'noindex, nofollow'
        }

    def _queue_pull(self, is_pull: bool = True, _queue: Queue = None) -> None:
        """从队列中取出元素"""
        if _queue is None:
            _queue = self.icon_queue

        if is_pull and not _queue.empty():
            # _queue.get()
            try:
                _queue.get_nowait()
                _queue.task_done()
            except Exception as e:
                logger.error(f"从队列中取出元素失败: {e}")

    def _parse_html(self, content: bytes, entity: Favicon) -> Optional[str]:
        """从HTML内容中解析图标URL"""
        if not content:
            return None

        try:
            # 尝试将bytes转换为字符串
            # str(content).encode('utf-8', 'replace').decode('utf-8', 'replace')
            content_str = content.decode('utf-8', 'replace')

            # 使用更高效的解析器
            bs = bs4.BeautifulSoup(content_str, features='lxml', parse_only=SoupStrainer("link"))
            if len(bs) == 0:
                bs = bs4.BeautifulSoup(content_str, features='html.parser', parse_only=SoupStrainer("link"))

            html_links = bs.find_all("link", rel=self.pattern_icon)

            # 如果没有找到，尝试使用正则表达式直接匹配
            if not html_links or len(html_links) == 0:
                content_links = self.pattern_link.findall(content_str)
                c_link = ''.join([_links[0] for _links in content_links])
                bs = bs4.BeautifulSoup(c_link, features='lxml')
                html_links = bs.find_all("link", rel=self.pattern_icon)

            if html_links and len(html_links) > 0:
                # 优先查找指定rel类型的图标
                icon_url = (self._get_link_rel(html_links, entity, 'shortcut icon') or
                            self._get_link_rel(html_links, entity, 'icon') or
                            self._get_link_rel(html_links, entity, 'alternate icon') or
                            self._get_link_rel(html_links, entity, ''))

                if icon_url:
                    logger.info(f"-> 从HTML获取图标URL: {icon_url}")

                return icon_url
        except Exception as e:
            logger.error(f"解析HTML失败: {e}")

        return None

    @staticmethod
    def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
        """从链接列表中查找指定rel类型的图标URL"""
        if not links:
            return None

        for link in links:
            r = link.get('rel')
            _r = ' '.join(r) if isinstance(r, list) else r
            _href = link.get('href')

            if _rel:
                if _r.lower() == _rel:
                    return entity.get_icon_url(str(_href))
            else:
                return entity.get_icon_url(str(_href))

        return None

    async def _referer(self, req: Request) -> None:
        """记录请求来源"""
        _referrer = req.headers.get('referrer') or req.headers.get('referer')

        if _referrer:
            logger.debug(f"-> Referrer: {_referrer}")

            _path = os.path.join(icon_root_path, 'conf', 'referrer.txt')

            with self._lock:
                # 首次加载现有referrer数据
                if len(self.href_referrer) == 0 and os.path.exists(_path):
                    try:
                        with open(_path, 'r', encoding='utf-8') as ff:
                            self.href_referrer = {line.strip() for line in ff.readlines()}
                    except Exception as e:
                        logger.error(f"读取referrer文件失败: {e}")

                # 添加新的referrer
                if _referrer not in self.href_referrer:
                    self.href_referrer.add(_referrer)
                    try:
                        FileUtil.write_file(_path, f'{_referrer}\n', mode='a')
                    except Exception as e:
                        logger.error(f"写入referrer文件失败: {e}")

    def get_icon_sync(self, entity: Favicon, _cached: bytes = None) -> Optional[bytes]:
        """同步获取图标"""
        with self._lock:
            if entity.domain in self.domain_list:
                self._queue_pull(True, self.total_queue)
                return None
            else:
                self.domain_list.append(entity.domain)

        try:
            icon_url, icon_content = None, None

            # 尝试从网站获取HTML内容
            html_content = entity.req_get()
            if html_content:
                icon_url = self._parse_html(html_content, entity)

            # 尝试不同的图标获取策略
            strategies = [
                # 1. 从原始网页标签链接中获取
                lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
                # 2. 从 gstatic.cn 接口获取
                lambda: (
                    f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
                    "gstatic接口"),
                # 3. 从网站默认位置获取
                lambda: ('', "网站默认位置/favicon.ico"),
                # 4. 从其他api接口获取
                lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
                # 99. 最后的尝试，cloudflare workers
                # lambda: (f'https://favicon.cary.cc/?url={entity.get_base_url()}', "cloudflare"),
            ]

            for strategy in strategies:
                if icon_content:
                    break

                strategy_url, strategy_name = strategy()
                if strategy_url is not None:
                    logger.info(f"-> 尝试从 {strategy_name} 获取图标")
                    icon_content, icon_type = entity.get_icon_file(strategy_url, strategy_url == '')

            # 图标获取失败，或图标不是支持的图片格式，写入默认图标
            if (not icon_content) or (not helpers.is_image(icon_content) or self._is_default_icon_byte(icon_content)):
                logger.warning(f"-> 获取图标失败，使用默认图标: {entity.domain}")
                icon_content = _cached if _cached else default_icon_content

            if icon_content:
                cache_path = os.path.join(icon_root_path, 'data/icon', entity.domain_md5 + '.png')
                md5_path = os.path.join(icon_root_path, 'data/text', entity.domain_md5 + '.txt')

                try:
                    # 确保目录存在
                    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
                    os.makedirs(os.path.dirname(md5_path), exist_ok=True)

                    # 写入缓存文件
                    FileUtil.write_file(cache_path, icon_content, mode='wb')
                    FileUtil.write_file(md5_path, entity.domain, mode='w')
                except Exception as e:
                    logger.error(f"写入缓存文件失败: {e}")

            with self._lock:
                self.request_icon_count += 1

            return icon_content
        except Exception as e:
            logger.error(f"获取图标时发生错误 {entity.domain}: {e}")
            return None
        finally:
            with self._lock:
                if entity.domain in self.domain_list:
                    self.domain_list.remove(entity.domain)
                self._queue_pull(True, self.total_queue)

    def get_count(self) -> Dict[str, int]:
        """获取统计数据"""
        with self._lock:
            return {
                'url_count': self.url_count,
                'request_icon_count': self.request_icon_count,
                'request_cache_count': self.request_cache_count,
                'queue_size': self.icon_queue.qsize(),
                'total_queue_size': self.total_queue.qsize(),
                'href_referrer': len(self.href_referrer),
            }

    async def get_favicon_handler(
            self,
            request: Request,
            bg_tasks: BackgroundTasks,
            url: Optional[str] = None,
            refresh: Optional[str] = None,
            sync: Optional[str] = None
    ) -> dict[str, str] | Response:
        """处理获取图标的请求"""
        with self._lock:
            self.url_count += 1

        # 验证URL参数
        if not url:
            return {"message": "请提供url参数"}

        logger.info('##########################################################')
        try:
            entity = Favicon(url)

            # 验证域名
            if not entity.domain:
                logger.warning(f"无效的URL: {url}")
                return self.get_default(self.time_of_7_days)

            # 检测并记录referer
            await self._referer(request)

            # 检查缓存
            _cached, cached_icon = self._get_cache_icon(entity.domain_md5, refresh=refresh in ['true', '1', 'True'])

            if cached_icon:
                # 使用缓存图标
                icon_content = cached_icon
                with self._lock:
                    self.request_cache_count += 1

                # 确定内容类型和缓存时间
                content_type = filetype.guess_mime(icon_content) if icon_content else ""
                cache_time = self.time_of_1_hours * random.randint(1, 6) if self._is_default_icon_byte(icon_content) else self.time_of_7_days

                # 乐观缓存机制：检查缓存是否已过期但仍有缓存内容
                # _cached 存在但 cached_icon 为 None 表示缓存已过期
                if _cached and not cached_icon:
                    # 缓存已过期，后台刷新缓存
                    logger.info(f"缓存已过期，加入后台队列刷新: {entity.domain}")
                    bg_tasks.add_task(self.get_icon_sync, entity, _cached)

                return Response(content=icon_content,
                                media_type=content_type if content_type else "image/x-icon",
                                headers=self._get_header(content_type, cache_time))
            else:
                # 检查sync参数
                is_sync = sync in ['true', '1', 'True']

                if not is_sync:
                    # 返回默认图片并加入后台队列
                    logger.info(f"返回默认图片并加入后台队列: {entity.domain}")
                    bg_tasks.add_task(self.get_icon_sync, entity, _cached)
                    return self.get_default(0)
                else:
                    # 没有缓存，实时处理，检查队列大小
                    queue_size = self.icon_queue.qsize()
                    if queue_size >= 16:
                        # 加入后台队列并返回默认图片
                        logger.info(f"队列大小({queue_size})>=16，返回默认图片并加入后台队列: {entity.domain}")
                        bg_tasks.add_task(self.get_icon_sync, entity, _cached)
                        return self.get_default(0)
                    else:
                        # 队列<16，实时处理
                        logger.info(f"队列大小({queue_size})<16，实时处理: {entity.domain}")
                        icon_content = self.get_icon_sync(entity, _cached)

                        if not icon_content:
                            # 获取失败，返回默认图标，不缓存
                            return self.get_default(0)

                        # 确定内容类型和缓存时间
                        content_type = filetype.guess_mime(icon_content) if icon_content else ""
                        cache_time = self.time_of_1_hours * random.randint(1, 6) if self._is_default_icon_byte(icon_content) else self.time_of_7_days

                        return Response(content=icon_content,
                                        media_type=content_type if content_type else "image/x-icon",
                                        headers=self._get_header(content_type, cache_time))
        except Exception as e:
            logger.error(f"处理图标请求时发生错误 {url}: {e}")
            # 返回默认图标
            return self.get_default(0)

    def get_header(self, content_type: str, cache_time: int = None) -> dict:
        return self._get_header(content_type, cache_time)

    def get_default(self, cache_time: int = None) -> Response:
        if cache_time is None:
            cache_time = self.time_of_1_days
        return Response(content=default_icon_content,
                        media_type="image/png",
                        headers=self._get_header("image/png", cache_time))