From b8b10048c30188ee6e0938cdeb8f248e412fa88d Mon Sep 17 00:00:00 2001 From: jinql Date: Thu, 18 Sep 2025 23:54:13 +0800 Subject: [PATCH] =?UTF-8?q?1.=20API=E5=A4=96=E9=83=A8=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E5=8F=AF=E9=85=8D=E7=BD=AE=202.=20=E5=A4=B1=E8=B4=A5=E7=9A=84u?= =?UTF-8?q?rl=E4=BF=9D=E5=AD=98=E5=88=B0=E6=96=87=E4=BB=B6=E4=B8=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- favicon_app/models/favicon.py | 143 +++++++++++++++++++++++++- favicon_app/routes/favicon_service.py | 18 ++-- setting.py | 20 ++++ 3 files changed, 165 insertions(+), 16 deletions(-) diff --git a/favicon_app/models/favicon.py b/favicon_app/models/favicon.py index 5034ec4..fabb404 100644 --- a/favicon_app/models/favicon.py +++ b/favicon_app/models/favicon.py @@ -4,6 +4,7 @@ import base64 import hashlib import ipaddress import logging +import os import re import socket import time @@ -35,6 +36,8 @@ DEFAULT_RETRIES = 2 # 存储失败的URL,值为缓存过期时间戳 failed_urls: Dict[str, int] = dict() +# 记录上次保存失败URL的时间 +_last_saved_failed_urls = time.time() # 创建aiohttp客户端会话池 _aiohttp_client = None @@ -111,7 +114,8 @@ class Favicon: if self.domain: self.domain_md5 = hashlib.md5(self.domain.encode("utf-8")).hexdigest() except Exception as e: - failed_urls[self.domain] = setting.time_of_1_days + int(time.time()) + # failed_urls[self.domain] = setting.time_of_1_days + int(time.time()) + add_failed_url(self.domain, setting.time_of_1_days + int(time.time())) self.scheme = None self.domain = None logger.error('URL解析错误: %s, URL: %s', str(e), url) @@ -271,7 +275,8 @@ def _check_internal(domain: str) -> bool: return True return False except Exception as e: - failed_urls[domain] = setting.time_of_1_days + int(time.time()) + # failed_urls[domain] = setting.time_of_1_days + int(time.time()) + add_failed_url(domain, setting.time_of_1_days + int(time.time())) logger.error('解析域名出错: %s, 错误: %s', domain, str(e)) return False @@ -341,25 +346,153 @@ async def _req_get(url: str, content 
def _read_failed_urls_file() -> Dict[str, int]:
    """Parse ``setting.failed_urls_file`` into ``{domain: expire_timestamp}``.

    File format: one ``domain\\ttimestamp`` entry per line.  Malformed lines
    are skipped; a missing or unreadable file yields an empty dict.
    """
    entries: Dict[str, int] = {}
    if not os.path.exists(setting.failed_urls_file):
        return entries
    try:
        with open(setting.failed_urls_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or '\t' not in line:
                    continue
                domain, _, ts_str = line.partition('\t')
                try:
                    entries[domain] = int(ts_str)
                except ValueError:
                    # was a bare ``except`` — narrow to the only expected error
                    continue
    except Exception as e:
        logger.error('读取失败URL文件出错: %s', str(e))
    return entries


def add_failed_url(domain: str, expire_time: int) -> None:
    """Record a failed URL (by domain) with its cache-expiry timestamp.

    Persists the accumulated set to disk each time the number of tracked
    failures reaches a multiple of ``setting.FAILED_URLS_SAVE_THRESHOLD``.

    Args:
        domain: the domain that failed; ignored when empty/None.
        expire_time: unix timestamp after which the failure entry expires.
    """
    if not domain:  # guard against empty/None keys
        return

    is_new = domain not in failed_urls
    failed_urls[domain] = expire_time

    # BUGFIX: the original also tested ``(new_count - old_count) >= THRESHOLD``,
    # which is dead code — the count grows by at most 1 per call while the
    # threshold is 10.  Gating on ``is_new`` additionally prevents re-saving on
    # every *update* of an existing domain while the count sits on a multiple
    # of the threshold.
    if is_new and len(failed_urls) % setting.FAILED_URLS_SAVE_THRESHOLD == 0:
        save_failed_urls()


def save_failed_urls() -> None:
    """Merge in-memory failed URLs with those on disk and persist the result.

    Duplicates keep the later expiry.  Entries whose timestamp is already in
    the past are dropped — both from the file and (unlike the original, which
    let them accumulate) from the in-memory ``failed_urls`` dict.
    """
    global failed_urls, _last_saved_failed_urls

    try:
        now = time.time()

        # Start from what is already on disk so entries written by a previous
        # run are not lost, then overlay the in-memory ones.
        merged = _read_failed_urls_file()
        for domain, ts in failed_urls.items():
            if ts > merged.get(domain, 0):
                merged[domain] = ts

        # Prune expired entries once, for file and memory alike.
        merged = {d: ts for d, ts in merged.items() if ts > now}

        os.makedirs(os.path.dirname(setting.failed_urls_file), exist_ok=True)
        with open(setting.failed_urls_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{d}\t{ts}\n" for d, ts in merged.items())

        failed_urls = merged
        _last_saved_failed_urls = now
        # lazy %-formatting, consistent with the rest of this module's logging
        logger.info('成功保存%d个失败URL到文件', len(merged))
    except Exception as e:
        logger.error('保存失败URL到文件出错: %s', str(e))


def load_failed_urls() -> None:
    """Load unexpired failed URLs from ``setting.failed_urls_file``.

    Called at import time when ``failed_urls`` is empty; only entries whose
    expiry timestamp lies in the future are merged into the dict.
    """
    try:
        if not os.path.exists(setting.failed_urls_file):
            logger.info('失败URL文件不存在,无需加载')
            return

        now = time.time()
        loaded = {d: ts for d, ts in _read_failed_urls_file().items() if ts > now}

        if loaded:
            failed_urls.update(loaded)
            logger.info('成功从文件加载%d个未过期的失败URL', len(loaded))
        else:
            logger.info('文件中没有未过期的失败URL需要加载')
    except Exception as e:
        logger.error('从文件加载失败URL出错: %s', str(e))


# Bootstrap: populate the cache from disk on first import.
if not failed_urls:
    load_failed_urls()
从配置文件加载其他图标获取接口 + for _template, _name in setting.FAVICON_APIS: + strategies.append( + lambda template=_template, name=_name: + (template.format(domain=entity.domain, base_url=entity.get_base_url()), name) + ) for strategy in strategies: if icon_content: diff --git a/setting.py b/setting.py index ac32ecf..18bf947 100644 --- a/setting.py +++ b/setting.py @@ -16,6 +16,10 @@ default_icon_path = os.path.join(icon_root_path, 'favicon.png') default_icon_file = FileUtil.read_file(default_icon_path, mode='rb') # 定义referer日志文件路径 referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt') +# 定义失败URL日志文件路径 +failed_urls_file = os.path.join(icon_root_path, 'data', 'failedurls.txt') +# 失败URL保存阈值,当失败URL数量达到此值的倍数时保存到文件 +FAILED_URLS_SAVE_THRESHOLD = 10 # 时间常量 time_of_1_minus = 1 * 60 @@ -33,3 +37,19 @@ time_of_1_days = 1 * 24 * 60 * 60 time_of_7_days = 7 * time_of_1_days time_of_15_days = 15 * time_of_1_days time_of_30_days = 30 * time_of_1_days + +# 图标获取接口配置 +# 格式: (模板URL, 名称) +# 支持的变量: {domain} - 域名, {base_url} - 基础URL +FAVICON_APIS = [ + # gstatic.cn 接口 + ('https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={base_url}', + 'gstatic接口'), + + # 第三方API + ('https://favicon.is/{domain}', '第三方API'), + ('https://ico.kucat.cn/get.php?url={base_url}', '第三方API'), + + # 网站默认位置,放在最后 + ('', '网站默认位置/favicon.ico'), +]