Rework the failed-URL handling logic

parent 1b9a50ec71
commit 634cb504da
@@ -8,7 +8,7 @@ import os
 import re
 import socket
 import time
-from typing import Tuple, Optional, Dict
+from typing import Tuple, Optional
 from urllib.parse import urlparse, unquote

 import aiohttp
@@ -17,12 +17,11 @@ import urllib3

 import setting
 from favicon_app.utils import header
 from favicon_app.utils.file_util import FileUtil
 from favicon_app.utils.filetype import helpers, filetype

 # Disable SSL warnings
 urllib3.disable_warnings()
 logging.captureWarnings(True)
 # Configure logging
 logger = logging.getLogger(__name__)

 # Create the requests session pool
@@ -34,10 +33,8 @@ requests_session.verify = False
 DEFAULT_TIMEOUT = 10
 DEFAULT_RETRIES = 2

-# Failed URLs, keyed by domain; the value is the cache-expiry timestamp
-failed_urls: Dict[str, int] = dict()
-# Time the failed URLs were last saved
-_last_saved_failed_urls = time.time()
+# Temporary cache of domains and their MD5 digests
+domain_md5_mapping = dict()

 # Create the aiohttp client session pool
 _aiohttp_client = None
@@ -114,8 +111,7 @@ class Favicon:
             if self.domain:
                 self.domain_md5 = hashlib.md5(self.domain.encode("utf-8")).hexdigest()
         except Exception as e:
-            # failed_urls[self.domain] = setting.time_of_1_days + int(time.time())
-            add_failed_url(self.domain, setting.time_of_1_days + int(time.time()))
+            add_failed_url(self.domain)
             self.scheme = None
             self.domain = None
             logger.error('URL解析错误: %s, URL: %s', str(e), url)
@@ -275,8 +271,7 @@ def _check_internal(domain: str) -> bool:
             return True
         return False
     except Exception as e:
-        # failed_urls[domain] = setting.time_of_1_days + int(time.time())
-        add_failed_url(domain, setting.time_of_1_days + int(time.time()))
+        add_failed_url(domain)
        logger.error('解析域名出错: %s, 错误: %s', domain, str(e))
        return False

@@ -346,153 +341,100 @@ async def _req_get(url: str,
                     content = await resp.read()
                     return content, ct_type
                 else:
-                    # failed_urls[domain] = setting.time_of_1_hours + int(time.time())
-                    add_failed_url(domain, setting.time_of_1_hours + int(time.time()))
+                    add_failed_url(domain)
                     logger.error('异步请求失败: %d, URL: %s', resp.status, url)
                     break
         except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError) as e:
             retry_count += 1
             if retry_count > retries:
-                # failed_urls[domain] = setting.time_of_5_minus + int(time.time())
-                add_failed_url(domain, setting.time_of_5_minus + int(time.time()))
+                add_failed_url(domain)
                 logger.error('异步请求超时: %s, URL: %s', str(e), url)
             else:
                 logger.warning('异步请求超时,正在重试(%d/%d): %s', retry_count, retries, url)
                 continue
         except Exception as e:
-            # failed_urls[domain] = setting.time_of_1_hours + int(time.time())
-            add_failed_url(domain, setting.time_of_1_hours + int(time.time()))
+            add_failed_url(domain)
             logger.error('异步请求异常: %s, URL: %s', str(e), url)
             break

     return None, None


-def add_failed_url(domain: str, expire_time: int):
-    """Record a failed URL; flush to file whenever the count reaches a multiple of ten.
+def add_failed_url(domain: str):
+    """Record a failed URL by writing it to a marker file of its own.

     Args:
         domain: the domain
-        expire_time: expiry timestamp
     """
-    global failed_urls
-
-    # Add or update the failed URL
-    if not domain:  # make sure the domain is not empty
+    # Make sure the domain is not empty
+    if not domain:
         return

-    old_count = len(failed_urls)
-    failed_urls[domain] = expire_time
-    new_count = len(failed_urls)
-
-    # Check whether a save to file is needed (a batch of new URLs was added, or the total is a multiple of the threshold)
-    if (new_count % setting.FAILED_URLS_SAVE_THRESHOLD == 0
-            or (new_count - old_count) >= setting.FAILED_URLS_SAVE_THRESHOLD):
-        save_failed_urls()
-
-
-def save_failed_urls():
-    """Persist the failed URLs to file, triggered for every ten new URLs."""
-    global failed_urls, _last_saved_failed_urls
-
     try:
-        # Read the existing file content
-        existing_urls = {}
-        if os.path.exists(setting.failed_urls_file):
-            try:
-                # Make sure the directory exists
-                os.makedirs(os.path.dirname(setting.failed_urls_file), exist_ok=True)
+        # Make sure the failed-URL directory exists
+        os.makedirs(setting.failed_urls_dir, exist_ok=True)

-                # Read the file content
-                with open(setting.failed_urls_file, 'r', encoding='utf-8') as f:
-                    lines = f.readlines()
+        # Use the MD5 of the domain as the file name
+        domain_md5 = hashlib.md5(domain.encode("utf-8")).hexdigest()
+        file_path = os.path.join(setting.failed_urls_dir, f"{domain_md5}.txt")

-                # Parse the file content
-                for line in lines:
-                    line = line.strip()
-                    if line and '\t' in line:
-                        try:
-                            domain, timestamp_str = line.split('\t', 1)
-                            timestamp = int(timestamp_str)
-                            existing_urls[domain] = timestamp
-                        except:
-                            continue
-            except Exception as e:
-                logger.error('读取失败URL文件出错: %s', str(e))
+        # Format the current time
+        formatted_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

-        # Merge the current failed URLs with those in the file, keeping the latest expiry time
-        merged_urls = {**existing_urls}
-        for domain, timestamp in failed_urls.items():
-            # Keep only the entry with the later expiry time
-            if domain not in merged_urls or timestamp > merged_urls[domain]:
-                merged_urls[domain] = timestamp
+        # Write the domain and timestamp to the file
+        FileUtil.write_file(file_path, f"{domain}--{formatted_time}")

-        # Save the merged result
-        os.makedirs(os.path.dirname(setting.failed_urls_file), exist_ok=True)
-        with open(setting.failed_urls_file, 'w', encoding='utf-8') as f:
-            for domain, timestamp in merged_urls.items():
-                # Keep only unexpired URLs (timestamp later than now)
-                if timestamp > time.time():
-                    f.write(f"{domain}\t{timestamp}\n")
+        # Cache the domain-to-MD5 mapping
+        domain_md5_mapping[domain] = domain_md5

-        # Replace the in-memory failed_urls with the merged, de-duplicated result
-        failed_urls = merged_urls
-        _last_saved_failed_urls = time.time()
-
-        logger.info(f'成功保存{len(merged_urls)}个失败URL到文件')
+        logger.debug('成功添加失败URL到文件: %s', domain)
     except Exception as e:
-        logger.error('保存失败URL到文件出错: %s', str(e))
+        logger.error('添加失败URL到文件出错: %s, 域名: %s', str(e), domain)


-def load_failed_urls():
-    """Load the failed URLs from the file into memory.
+def is_failed_url(domain: str) -> bool:
+    """Check whether a domain is a failed URL that has not yet expired.

-    Called when failed_urls is empty; reads failed_urls_file into the failed_urls dict.
-    Only unexpired URLs are loaded.
+    Args:
+        domain: the domain
+
+    Returns:
+        True if the domain is an unexpired failed URL; False if it is not, or has expired.
     """
-    global failed_urls
-
     try:
-        if not os.path.exists(setting.failed_urls_file):
-            logger.info('失败URL文件不存在,无需加载')
-            return
-
-        # Make sure the directory exists
-        os.makedirs(os.path.dirname(setting.failed_urls_file), exist_ok=True)
-
-        # Read the file content
-        with open(setting.failed_urls_file, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-
-        # Parse the file content, loading only unexpired URLs
-        loaded_urls = {}
-        current_time = time.time()
-        for line in lines:
-            line = line.strip()
-            if line and '\t' in line:
-                try:
-                    domain, timestamp_str = line.split('\t', 1)
-                    timestamp = int(timestamp_str)
-                    # Load only unexpired URLs
-                    if timestamp > current_time:
-                        loaded_urls[domain] = timestamp
-                except:
-                    continue
-
-        # Update the in-memory failed_urls
-        if loaded_urls:
-            failed_urls.update(loaded_urls)
-            logger.info(f'成功从文件加载{len(loaded_urls)}个未过期的失败URL')
+        # Fetch the domain's MD5 from the cache, computing it when missing
+        if domain in domain_md5_mapping:
+            domain_md5 = domain_md5_mapping[domain]
         else:
-            logger.info('文件中没有未过期的失败URL需要加载')
+            domain_md5 = hashlib.md5(domain.encode("utf-8")).hexdigest()
+            domain_md5_mapping[domain] = domain_md5
+
+        file_path = os.path.join(setting.failed_urls_dir, f"{domain_md5}.txt")
+
+        # Check whether the marker file exists
+        if not os.path.exists(file_path):
+            return False
+
+        # Get the file's modification time
+        file_mtime = os.path.getmtime(file_path)
+        current_time = time.time()
+
+        # Check whether the file is still within the expiry window
+        if current_time - file_mtime <= setting.FAILED_URL_EXPIRE_TIME:
+            return True
+        else:
+            try:
+                os.remove(file_path)
+                if domain in domain_md5_mapping:
+                    del domain_md5_mapping[domain]
+            except:
+                pass
+            return False
     except Exception as e:
-        logger.error('从文件加载失败URL出错: %s', str(e))
+        logger.error('检查失败URL出错: %s, 域名: %s', str(e), domain)
+        return False


-# On startup, load from the file when failed_urls is empty
-if not failed_urls:
-    load_failed_urls()
-
 # Domain-validation regex
 _pattern_domain = re.compile(
     r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
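The hunk above replaces the in-memory failed_urls dict (periodically flushed by save_failed_urls() and reloaded by load_failed_urls()) with one marker file per domain under setting.failed_urls_dir: a domain counts as failed while its marker file's mtime is within setting.FAILED_URL_EXPIRE_TIME. A minimal self-contained sketch of that scheme, using stand-in constants and helper names rather than the project's setting module:

import hashlib
import os
import time

FAILED_URLS_DIR = "data/failed_urls"   # stand-in for setting.failed_urls_dir
EXPIRE_SECONDS = 6 * 60 * 60           # stand-in for setting.FAILED_URL_EXPIRE_TIME


def _marker_path(domain: str) -> str:
    # One file per domain, named after the domain's MD5, as in the new add_failed_url()
    digest = hashlib.md5(domain.encode("utf-8")).hexdigest()
    return os.path.join(FAILED_URLS_DIR, f"{digest}.txt")


def mark_failed(domain: str) -> None:
    # Write (or rewrite) the marker file; its mtime is the start of the expiry window
    os.makedirs(FAILED_URLS_DIR, exist_ok=True)
    with open(_marker_path(domain), "w", encoding="utf-8") as f:
        f.write(f"{domain}--{time.strftime('%Y-%m-%d %H:%M:%S')}")


def is_failed(domain: str) -> bool:
    path = _marker_path(domain)
    if not os.path.exists(path):
        return False
    if time.time() - os.path.getmtime(path) <= EXPIRE_SECONDS:
        return True       # marker still fresh: keep treating the domain as failed
    os.remove(path)        # marker expired: clean up so the domain can be retried
    return False


if __name__ == "__main__":
    mark_failed("example.com")
    assert is_failed("example.com") is True
    assert is_failed("unseen.example") is False

Because expiry is driven by the marker file's mtime rather than a timestamp stored inside it, re-adding a domain simply rewrites the file and restarts the clock, and no periodic save/load pass is needed.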
@@ -20,11 +20,10 @@ _icon_root_path = setting.icon_root_path
 _default_icon_path = setting.default_icon_path

 # Create the FastAPI router
-favicon_router = APIRouter(prefix="", tags=["favicon"])
+favicon_router = APIRouter(prefix="/icon", tags=["favicon"])


-@favicon_router.get('/icon/')
-@favicon_router.get('/icon')
+@favicon_router.get('/')
 async def get_favicon(
         request: Request,
         bg_tasks: BackgroundTasks,
@@ -37,13 +36,13 @@ async def get_favicon(
     return await favicon_service.get_favicon_handler(request, bg_tasks, url, refresh)


-@favicon_router.get('/icon/default')
+@favicon_router.get('/default')
 async def get_default_icon():
     """Return the default icon."""
     return favicon_service.get_default()


-@favicon_router.get('/icon/referer', include_in_schema=False)
+@favicon_router.get('/referer', include_in_schema=False)
 async def get_referrer(unique: Optional[str] = Query(None)):
     """Return referrer information; de-duplicated when the unique parameter is given."""
     content = 'None'
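With the router now built as APIRouter(prefix="/icon", ...), the shortened decorator paths in the two hunks above resolve to the same public URLs as before: /icon/, /icon/default and /icon/referer. A small sketch of how FastAPI composes a router prefix with route paths (toy handlers, not the project's):

from fastapi import APIRouter, FastAPI
from fastapi.testclient import TestClient

router = APIRouter(prefix="/icon", tags=["favicon"])


@router.get("/")
async def icon_index():
    # Served at /icon/ once the prefix is applied
    return {"route": "index"}


@router.get("/default")
async def icon_default():
    # Served at /icon/default
    return {"route": "default"}


app = FastAPI()
app.include_router(router)

if __name__ == "__main__":
    client = TestClient(app)
    assert client.get("/icon/").json() == {"route": "index"}
    assert client.get("/icon/default").json() == {"route": "default"}

Requests to the bare /icon path should still be handled through Starlette's default redirect_slashes behaviour (a 307 to /icon/), which is presumably why the separate '/icon' decorator could be dropped.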
@@ -258,15 +258,11 @@ async def get_favicon_handler(request: Request,
         return get_default(setting.time_of_1_days)

-    # Check the failed-URL cache
-    if entity.domain in favicon.failed_urls:
-        if int(time.time()) <= favicon.failed_urls.get(entity.domain):
-            return get_default(setting.time_of_1_days)
-        else:
-            del favicon.failed_urls[entity.domain]
+    if favicon.is_failed_url(entity.domain):
+        return get_default(setting.time_of_1_days)

-    logger.info(
-        f"-> count (failed/cached/icon/url): "
-        f"{len(favicon.failed_urls)}/{_cache_count}/{_icon_count}/{_url_count}"
+    logger.debug(
+        f"-> count (cached/icon/url): "f"{_cache_count}/{_icon_count}/{_url_count}"
     )

     # Check the cache
@@ -333,7 +329,7 @@ async def get_icon_async(entity: Favicon, _cached: bytes = None) -> Optional[byt
         # 0. From the link tag of the original page
         lambda: (icon_url, "原始网页标签") if icon_url else (None, None),
     ]


     # 2. Load additional icon-fetching APIs from configuration
     for _template, _name in setting.FAVICON_APIS:
         strategies.append(
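Each (template URL, name) pair from setting.FAVICON_APIS ends up as one more strategy appended to the list above; the template's {domain} / {base_url} placeholders presumably get filled in before the request is issued. A hedged sketch of that expansion, using illustrative stand-in values rather than the project's real configuration:

# Stand-ins for setting.FAVICON_APIS and the current request context (illustrative only).
FAVICON_APIS = [
    ("https://icons.example-provider.com/{domain}.png", "example icon API"),
    ("{base_url}/favicon.ico", "site root favicon"),
]
domain = "xinac.net"
base_url = "https://xinac.net"

strategies = []
for _template, _name in FAVICON_APIS:
    # Bind the loop variables via default arguments to avoid Python's late-binding trap.
    strategies.append(
        lambda t=_template, n=_name: (t.format(domain=domain, base_url=base_url), n)
    )

for build in strategies:
    print(build())
# ('https://icons.example-provider.com/xinac.net.png', 'example icon API')
# ('https://xinac.net/favicon.ico', 'site root favicon')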
main.py
@@ -5,7 +5,6 @@ import os

 from fastapi import FastAPI, Request
 from fastapi.responses import Response
 from fastapi.staticfiles import StaticFiles

 import setting
 from favicon_app.routes import favicon_router
@@ -24,7 +23,6 @@ referer_log_file = setting.referer_log_file

 # fastapi
 app = FastAPI(title="Favicon API", description="获取网站favicon图标", version="3.0")
 app.mount("/static", StaticFiles(directory="static"), name="static")
 app.include_router(favicon_router)


@@ -18,8 +18,8 @@ default_icon_file = FileUtil.read_file(default_icon_path, mode='rb')
 referer_log_file = os.path.join(icon_root_path, 'data', 'referer.txt')
 # Failed-URL log file path
 failed_urls_file = os.path.join(icon_root_path, 'data', 'failedurls.txt')
-# Save threshold: flush failed URLs to file whenever the count reaches a multiple of this value
-FAILED_URLS_SAVE_THRESHOLD = 10
+# Directory where per-domain failed-URL marker files are stored
+failed_urls_dir = os.path.join(icon_root_path, 'data', 'failed_urls')

 # Time constants
 time_of_1_minus = 1 * 60
@@ -38,6 +38,9 @@ time_of_7_days = 7 * time_of_1_days
 time_of_15_days = 15 * time_of_1_days
 time_of_30_days = 30 * time_of_1_days

+# Default expiry time for failed URLs
+FAILED_URL_EXPIRE_TIME = time_of_6_hours
+
 # Icon-fetching API configuration
 # Format: (template URL, name)
 # Supported placeholders: {domain} - the domain, {base_url} - the base URL
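The FAVICON_APIS list itself lies outside this hunk; going by the comments above, an entry could look like the following (purely illustrative values, not the project's real configuration):

FAVICON_APIS = [
    ("{base_url}/favicon.ico", "site root"),                             # hypothetical entry
    ("https://icon.example-provider.com/{domain}", "third-party API"),   # hypothetical entry
]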
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
@@ -1,2 +0,0 @@
webpackJsonp([1],{Lptq:function(e,t){},NHnr:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var i=n("zxxf"),a=n("yebG"),o=(n("Lptq"),{name:"App",data:function(){return{height:600,marginTop:200,year:(new Date).getFullYear(),iconUrl:"",iconImg:"https://api.xinac.net/icon/default",defaultIcon:"https://api.xinac.net/icon/default",placeholder:"https://www.xinac.net",headerLink:"https://api.xinac.net"}},methods:{handle:function(){console.log("表单提交的数据:",this.iconUrl),this.iconUrl?this.iconImg="https://api.xinac.net/icon/?url="+this.iconUrl:this.iconImg=this.defaultIcon},resize:function(){this.height=document.documentElement.clientHeight,this.marginTop=(this.height-300)/2},linkTo:function(){window.location.href=this.headerLink}},mounted:function(){var e=this;this.resize(),window.onresize=function(){e.resize()}}}),l={render:function(){var e=this,t=e.$createElement,n=e._self._c||t;return n("div",{attrs:{id:"app"}},[n("el-container",{style:{height:e.height+"px"}},[n("el-header",[n("h2",{on:{click:e.linkTo}},[e._v(e._s(e.headerLink))])]),e._v(" "),n("el-main",{style:{marginTop:e.marginTop+"px"}},[n("div",[n("el-input",{attrs:{placeholder:e.placeholder,clearable:"",autofocus:""},on:{change:e.handle},model:{value:e.iconUrl,callback:function(t){e.iconUrl=t},expression:"iconUrl"}},[n("template",{attrs:{slot:"prepend"},slot:"prepend"},[e._v("https://api.xinac.net/icon/?url=")]),e._v(" "),n("template",{attrs:{slot:"append"},slot:"append"},[n("el-image",{staticStyle:{width:"32px",height:"32px"},attrs:{src:e.iconImg}})],1)],2)],1)]),e._v(" "),n("el-footer",[e._v("© "+e._s(e.year)+" "),n("a",{attrs:{href:e.placeholder}},[e._v("xinac.net")])])],1)],1)},staticRenderFns:[]};var u=n("lp4z")(o,l,!1,function(e){n("jQJW")},null,null).exports;i.default.config.productionTip=!1,i.default.use(a.Autocomplete),i.default.use(a.Input),i.default.use(a.Select),i.default.use(a.Button),i.default.use(a.ButtonGroup),i.default.use(a.Tooltip),i.default.use(a.Form),i.default.use(a.FormItem),i.default.use(a.Icon),i.default.use(a.Container),i.default.use(a.Header),i.default.use(a.Aside),i.default.use(a.Main),i.default.use(a.Footer),i.default.use(a.Image),new i.default({el:"#app",render:function(e){return e(u)}})},jQJW:function(e,t){}},["NHnr"]);
//# sourceMappingURL=app.c97f95b50095b442df6d.js.map
File diff suppressed because one or more lines are too long
@@ -1,2 +0,0 @@
!function(e){var n=window.webpackJsonp;window.webpackJsonp=function(r,c,a){for(var i,u,f,s=0,l=[];s<r.length;s++)u=r[s],t[u]&&l.push(t[u][0]),t[u]=0;for(i in c)Object.prototype.hasOwnProperty.call(c,i)&&(e[i]=c[i]);for(n&&n(r,c,a);l.length;)l.shift()();if(a)for(s=0;s<a.length;s++)f=o(o.s=a[s]);return f};var r={},t={2:0};function o(n){if(r[n])return r[n].exports;var t=r[n]={i:n,l:!1,exports:{}};return e[n].call(t.exports,t,t.exports,o),t.l=!0,t.exports}o.e=function(e){var n=t[e];if(0===n)return new Promise(function(e){e()});if(n)return n[2];var r=new Promise(function(r,o){n=t[e]=[r,o]});n[2]=r;var c=document.getElementsByTagName("head")[0],a=document.createElement("script");a.type="text/javascript",a.charset="utf-8",a.async=!0,a.timeout=12e4,o.nc&&a.setAttribute("nonce",o.nc),a.src=o.p+"static/js/"+e+"."+{0:"c377031f0b4534916d3a",1:"c97f95b50095b442df6d"}[e]+".js";var i=setTimeout(u,12e4);function u(){a.onerror=a.onload=null,clearTimeout(i);var n=t[e];0!==n&&(n&&n[1](new Error("Loading chunk "+e+" failed.")),t[e]=void 0)}return a.onerror=a.onload=u,c.appendChild(a),r},o.m=e,o.c=r,o.d=function(e,n,r){o.o(e,n)||Object.defineProperty(e,n,{configurable:!1,enumerable:!0,get:r})},o.n=function(e){var n=e&&e.__esModule?function(){return e.default}:function(){return e};return o.d(n,"a",n),n},o.o=function(e,n){return Object.prototype.hasOwnProperty.call(e,n)},o.p="/",o.oe=function(e){throw console.error(e),e}}([]);
//# sourceMappingURL=manifest.3c886754347e2a8c0b4d.js.map
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +1,17 @@
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>favicon图标获取API</title><link href=/static/css/app.6e1a9be5bf9dacce170f04018b053469.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.3c886754347e2a8c0b4d.js></script><script type=text/javascript src=/static/js/vendor.c377031f0b4534916d3a.js></script><script type=text/javascript src=/static/js/app.c97f95b50095b442df6d.js></script></body></html>
<!DOCTYPE html>
<html lang="zh" translate="no">
<head>
<meta charset="utf-8">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>favicon图标获取API</title>
<link href="https://api.xinac.net/static/css/app.6e1a9be5bf9dacce170f04018b053469.css" rel="stylesheet">
</head>
<body>
<div id="app"></div>
<script src="https://api.xinac.net/static/js/manifest.3c886754347e2a8c0b4d.js"></script>
<script src="https://api.xinac.net/static/js/vendor.c377031f0b4534916d3a.js"></script>
<script src="https://api.xinac.net/static/js/app.c97f95b50095b442df6d.js"></script>
</body>
</html>