1. API外部接口可配置

2. 失败的url保存到文件中
This commit is contained in:
jinql
2025-09-18 23:54:13 +08:00
parent 3455cb8b50
commit b8b10048c3
3 changed files with 165 additions and 16 deletions

View File

@@ -4,6 +4,7 @@ import base64
import hashlib
import ipaddress
import logging
import os
import re
import socket
import time
@@ -35,6 +36,8 @@ DEFAULT_RETRIES = 2
# Failed domains, each mapped to the Unix timestamp at which its cached failure expires.
failed_urls: Dict[str, int] = dict()
# Time at which the failed-URL cache was last saved to disk (initialised to import time).
_last_saved_failed_urls = time.time()
# Shared aiohttp client session holder; starts as None.
# NOTE(review): presumably created lazily elsewhere in this module — not visible here.
_aiohttp_client = None
@@ -111,7 +114,8 @@ class Favicon:
if self.domain:
self.domain_md5 = hashlib.md5(self.domain.encode("utf-8")).hexdigest()
except Exception as e:
failed_urls[self.domain] = setting.time_of_1_days + int(time.time())
# failed_urls[self.domain] = setting.time_of_1_days + int(time.time())
add_failed_url(self.domain, setting.time_of_1_days + int(time.time()))
self.scheme = None
self.domain = None
logger.error('URL解析错误: %s, URL: %s', str(e), url)
@@ -271,7 +275,8 @@ def _check_internal(domain: str) -> bool:
return True
return False
except Exception as e:
failed_urls[domain] = setting.time_of_1_days + int(time.time())
# failed_urls[domain] = setting.time_of_1_days + int(time.time())
add_failed_url(domain, setting.time_of_1_days + int(time.time()))
logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
return False
@@ -341,25 +346,153 @@ async def _req_get(url: str,
content = await resp.read()
return content, ct_type
else:
failed_urls[domain] = setting.time_of_1_hours + int(time.time())
# failed_urls[domain] = setting.time_of_1_hours + int(time.time())
add_failed_url(domain, setting.time_of_1_hours + int(time.time()))
logger.error('异步请求失败: %d, URL: %s', resp.status, url)
break
except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e:
retry_count += 1
if retry_count > retries:
failed_urls[domain] = setting.time_of_5_minus + int(time.time())
# failed_urls[domain] = setting.time_of_5_minus + int(time.time())
add_failed_url(domain, setting.time_of_5_minus + int(time.time()))
logger.error('异步请求超时: %s, URL: %s', str(e), url)
else:
logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
continue
except Exception as e:
failed_urls[domain] = setting.time_of_1_hours + int(time.time())
# failed_urls[domain] = setting.time_of_1_hours + int(time.time())
add_failed_url(domain, setting.time_of_1_hours + int(time.time()))
logger.error('异步请求异常: %s, URL: %s', str(e), url)
break
return None, None
def add_failed_url(domain: str, expire_time: int):
    """Record a failed domain and periodically persist the failure cache.

    The domain is stored in the module-level ``failed_urls`` mapping together
    with its expiry timestamp. ``save_failed_urls()`` is triggered only when
    this call inserts a *new* domain and the resulting number of entries is a
    multiple of ``setting.FAILED_URLS_SAVE_THRESHOLD``. Refreshing the expiry
    of a domain that is already cached never touches the disk, so a domain
    that keeps failing cannot cause a file rewrite on every request.

    Args:
        domain: Domain name that failed; falsy values (``None``, ``""``) are
            ignored.
        expire_time: Unix timestamp after which the entry is considered stale.
    """
    if not domain:
        return

    is_new_domain = domain not in failed_urls
    failed_urls[domain] = expire_time
    if not is_new_domain:
        # Only the expiry changed; the entry count did not grow, so there is
        # nothing new that would justify another write.
        return

    threshold = setting.FAILED_URLS_SAVE_THRESHOLD
    # A non-positive threshold disables periodic saving instead of raising
    # ZeroDivisionError inside the caller's error-handling path.
    if threshold > 0 and len(failed_urls) % threshold == 0:
        save_failed_urls()
def save_failed_urls():
    """Merge the in-memory failure cache with the on-disk file and rewrite it.

    Entries already stored in ``setting.failed_urls_file`` are merged with
    ``failed_urls``; when a domain appears in both, the later expiry wins.
    Entries that have already expired are dropped, the remainder is written
    back as ``domain<TAB>timestamp`` lines, and ``failed_urls`` is updated
    *in place* to that same pruned mapping (so other modules holding a
    reference to the dict keep seeing current data).

    The file is written to a temporary sibling and then moved over the
    target with ``os.replace`` so an interrupted write cannot leave a
    truncated file behind.

    All errors are logged and swallowed: a persistence problem must not break
    the request path that recorded the failure.
    """
    global _last_saved_failed_urls
    path = setting.failed_urls_file
    try:
        # Start from what is already on disk so entries written earlier are
        # not lost when memory holds only a subset.
        merged_urls = {}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    domain, sep, timestamp_str = raw_line.strip().partition('\t')
                    if not sep:
                        continue  # not a "domain<TAB>timestamp" line
                    try:
                        merged_urls[domain] = int(timestamp_str)
                    except ValueError:
                        continue  # corrupt timestamp; skip just this line
        except FileNotFoundError:
            pass  # first save: nothing to merge
        except Exception as e:
            logger.error('读取失败URL文件出错: %s', str(e))

        # In-memory entries override the file only when they expire later.
        for domain, timestamp in failed_urls.items():
            if domain not in merged_urls or timestamp > merged_urls[domain]:
                merged_urls[domain] = timestamp

        # Evaluate "now" once so every entry is judged against the same instant.
        now = time.time()
        active_urls = {
            domain: timestamp
            for domain, timestamp in merged_urls.items()
            if timestamp > now
        }

        directory = os.path.dirname(path)
        if directory:  # os.makedirs('') raises, so skip for bare file names
            os.makedirs(directory, exist_ok=True)
        tmp_path = f'{path}.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.writelines(
                f"{domain}\t{timestamp}\n"
                for domain, timestamp in active_urls.items()
            )
        os.replace(tmp_path, path)

        # Mutate rather than rebind so existing references stay valid.
        failed_urls.clear()
        failed_urls.update(active_urls)
        _last_saved_failed_urls = time.time()
        logger.info(f'成功保存{len(active_urls)}个失败URL到文件')
    except Exception as e:
        logger.error('保存失败URL到文件出错: %s', str(e))
def load_failed_urls():
    """Load unexpired entries from the failed-URL file into ``failed_urls``.

    Each line of ``setting.failed_urls_file`` is expected to have the form
    ``domain<TAB>expiry_timestamp``. Lines without a tab, lines whose
    timestamp is not an integer, and entries whose expiry is not in the
    future are skipped. Loaded entries are merged into the existing mapping
    with ``dict.update``.

    A missing file is reported at info level and is not an error. Any other
    problem is logged at error level; nothing is raised to the caller. This
    function only reads: it never creates files or directories.
    """
    try:
        loaded_urls = {}
        current_time = time.time()
        try:
            with open(setting.failed_urls_file, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    domain, sep, timestamp_str = raw_line.strip().partition('\t')
                    if not sep:
                        continue  # not a "domain<TAB>timestamp" line
                    try:
                        timestamp = int(timestamp_str)
                    except ValueError:
                        continue  # corrupt timestamp; skip just this line
                    # Keep only entries that are still in force.
                    if timestamp > current_time:
                        loaded_urls[domain] = timestamp
        except FileNotFoundError:
            logger.info('失败URL文件不存在无需加载')
            return

        if loaded_urls:
            failed_urls.update(loaded_urls)
            logger.info(f'成功从文件加载{len(loaded_urls)}个未过期的失败URL')
        else:
            logger.info('文件中没有未过期的失败URL需要加载')
    except Exception as e:
        logger.error('从文件加载失败URL出错: %s', str(e))
# At import time, warm the in-memory failure cache from the file, but only
# when nothing has populated it yet.
if len(failed_urls) == 0:
    load_failed_urls()
# 域名验证正则表达式
_pattern_domain = re.compile(
r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',

View File

@@ -332,18 +332,14 @@ async def get_icon_async(entity: Favicon, _cached: bytes = None) -> Optional[byt
strategies = [
# 0. 从原始网页标签链接中获取
lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
# 从 gstatic.cn 接口获取
lambda: (
f'https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={entity.get_base_url()}',
"gstatic接口"),
# 从网站默认位置获取
lambda: ('', "网站默认位置/favicon.ico"),
# 从其他api接口获取
lambda: (f'https://favicon.is/{entity.domain}', "第三方API"),
lambda: (f'https://ico.kucat.cn/get.php?url={entity.get_base_url()}', "第三方API"),
# 99. 最后的尝试cloudflare workers
# lambda: (f'https://favicon.cary.cc/?url={entity.get_base_url()}', "cloudflare"),
]
# 2. 从配置文件加载其他图标获取接口
for _template, _name in setting.FAVICON_APIS:
strategies.append(
lambda template=_template, name=_name:
(template.format(domain=entity.domain, base_url=entity.get_base_url()), name)
)
for strategy in strategies:
if icon_content:

View File

@@ -16,6 +16,10 @@ default_icon_path = os.path.join(icon_root_path, 'favicon.png')
# Contents of the default icon, read with mode='rb' via FileUtil.read_file.
default_icon_file = FileUtil.read_file(default_icon_path, mode='rb')
# Path of the referer log file.
referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt')
# Path of the file in which failed URLs are persisted.
failed_urls_file = os.path.join(icon_root_path, 'data', 'failedurls.txt')
# Save threshold for failed URLs: the cache is written to the file when the
# number of failed URLs reaches a multiple of this value.
FAILED_URLS_SAVE_THRESHOLD = 10
# Time constants, expressed in seconds.
# NOTE(review): "minus" in these names appears to mean "minutes"; the name is
# part of the settings interface, so it is left unchanged.
time_of_1_minus = 1 * 60
@@ -33,3 +37,19 @@ time_of_1_days = 1 * 24 * 60 * 60
# Multi-day durations, each a whole multiple of time_of_1_days.
time_of_7_days = 7 * time_of_1_days
time_of_15_days = 15 * time_of_1_days
time_of_30_days = 30 * time_of_1_days
# Favicon lookup API configuration.
# Format: (URL template, display name)
# Supported placeholders: {domain} - the domain name, {base_url} - the base URL
FAVICON_APIS = [
    # gstatic.cn endpoint
    ('https://t3.gstatic.cn/faviconV2?client=SOCIAL&fallback_opts=TYPE,SIZE,URL&type=FAVICON&size=128&url={base_url}',
     'gstatic接口'),
    # Third-party APIs
    ('https://favicon.is/{domain}', '第三方API'),
    ('https://ico.kucat.cn/get.php?url={base_url}', '第三方API'),
    # The site's own default location, kept last. The template is empty;
    # NOTE(review): presumably the consumer treats an empty URL as
    # "<site>/favicon.ico" — confirm against the code that reads this list.
    ('', '网站默认位置/favicon.ico'),
]