Commit d3b21d6a11 by jinql, 2025-09-10 22:13:22 +08:00 (parent 4c8f92f49b)
6 changed files with 436 additions and 448 deletions


@@ -101,14 +101,14 @@ class Favicon:
                 self.scheme = 'http'
             # Check domain validity
-            if self.domain and not self._check_url(self.domain):
+            if self.domain and not _check_url(self.domain):
                 self.domain = None
             # Generate the domain's MD5 hash
             if self.domain:
                 self.domain_md5 = hashlib.md5(self.domain.encode("utf-8")).hexdigest()
         except Exception as e:
-            failed_url_cache(self.domain, setting.time_of_1_days)
+            failed_urls[self.domain] = setting.time_of_1_days + int(time.time())
             self.scheme = None
             self.domain = None
             logger.error('URL parse error: %s, URL: %s', str(e), url)
@@ -163,6 +163,21 @@ class Favicon:
         self._get_icon_url(icon_path)
         return self.icon_url
 
+    def get_base_url(self) -> Optional[str]:
+        """Get the site's base URL
+
+        Returns:
+            The site's base URL
+        """
+        if not self.domain or '.' not in self.domain:
+            return None
+        _url = f"{self.scheme}://{self.domain}"
+        if self.port and self.port not in [80, 443]:
+            _url += f":{self.port}"
+        return _url
+
     def get_icon_file(self, icon_path: str, default: bool = False) -> Tuple[Optional[bytes], Optional[str]]:
         """Get the icon file content and type
@@ -187,7 +202,7 @@ class Favicon:
             _content = base64.b64decode(data_uri[-1])
             _ct = data_uri[0].split(';')[0].split(':')[-1]
         else:
-            _content, _ct = self._req_get(self.icon_url, domain=self.domain)
+            _content, _ct = _req_get(self.icon_url, domain=self.domain)
 
         # Verify the content is an image
         # image/* application/x-ico
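
The first branch above decodes inline `data:` favicons. A self-contained sketch of that split-and-decode step, assuming the usual `data:<mime>;base64,<payload>` layout (`decode_data_uri` is a hypothetical helper):

```python
import base64

def decode_data_uri(uri: str):
    """Split 'data:image/png;base64,...' into (content bytes, content type)."""
    head, _, payload = uri.partition(',')
    content = base64.b64decode(payload)
    ct = head.split(';')[0].split(':')[-1]  # e.g. 'image/png'
    return content, ct

content, ct = decode_data_uri('data:image/png;base64,' + base64.b64encode(b'\x89PNG').decode())
assert ct == 'image/png' and content.startswith(b'\x89PNG')
```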
@@ -202,21 +217,6 @@ class Favicon:
         return None, None
 
-    def get_base_url(self) -> Optional[str]:
-        """Get the site's base URL
-
-        Returns:
-            The site's base URL
-        """
-        if not self.domain or '.' not in self.domain:
-            return None
-        _url = f"{self.scheme}://{self.domain}"
-        if self.port and self.port not in [80, 443]:
-            _url += f":{self.port}"
-        return _url
-
     def req_get(self) -> Optional[bytes]:
         """Fetch the site's homepage content
@@ -227,7 +227,7 @@ class Favicon:
             return None
 
         _url = self.get_base_url()
-        _content, _ct = self._req_get(_url, domain=self.domain)
+        _content, _ct = _req_get(_url, domain=self.domain)
 
         # Validate the content type and check size
         if _ct and ('text' in _ct or 'html' in _ct or 'xml' in _ct):
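
`req_get` keeps only textual responses for parsing. A tiny sketch of that gate (`looks_like_text` is an illustrative name):

```python
from typing import Optional

def looks_like_text(content_type: Optional[str]) -> bool:
    """Accept text/*, */html and */xml responses, as req_get does."""
    return bool(content_type) and any(t in content_type for t in ('text', 'html', 'xml'))

assert looks_like_text('text/html; charset=utf-8')
assert not looks_like_text('image/png')
```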
@@ -238,124 +238,117 @@ class Favicon:
         return None
 
-    @staticmethod
-    def _req_get(
-            url: str,
-            domain: str,
-            retries: int = DEFAULT_RETRIES,
-            timeout: int = DEFAULT_TIMEOUT
-    ) -> Tuple[Optional[bytes], Optional[str]]:
-        """Send an HTTP GET request and return the content
-
-        Args:
-            url: request URL
-            domain: domain recorded in the failure cache on errors
-            retries: number of retries
-            timeout: timeout in seconds
-
-        Returns:
-            Tuple of (content, content type)
-        """
-        logger.debug('Sending request: %s', url)
-        retry_count = 0
-        while retry_count <= retries:
-            try:
-                # Use the global session pool
-                req = requests_session.get(
-                    url,
-                    headers=header.get_header(),
-                    timeout=timeout,
-                    allow_redirects=True,
-                    verify=False
-                )
-                if req.ok:
-                    ct_type = req.headers.get('Content-Type')
-                    ct_length = req.headers.get('Content-Length')
-
-                    # Normalize Content-Type
-                    if ct_type and ';' in ct_type:
-                        _cts = ct_type.split(';')
-                        if 'charset' in _cts[0]:
-                            ct_type = _cts[-1].strip()
-                        else:
-                            ct_type = _cts[0].strip()
-
-                    # Check response size
-                    if ct_length and int(ct_length) > 10 * 1024 * 1024:
-                        logger.warning('Response too large: %d bytes, URL: %s', int(ct_length), url)
-
-                    return req.content, ct_type
-                else:
-                    failed_url_cache(domain, setting.time_of_7_days)
-                    logger.error('Request failed: %d, URL: %s', req.status_code, url)
-                    break
-            except (ConnectTimeoutError, ReadTimeoutError) as e:
-                retry_count += 1
-                if retry_count > retries:
-                    logger.error('Request timed out: %s, URL: %s', str(e), url)
-                else:
-                    logger.warning('Request timed out, retrying (%d/%d): %s', retry_count, retries, url)
-                    continue
-            except MaxRetryError as e:
-                logger.error('Too many redirects: %s, URL: %s', str(e), url)
-                break
-            except Exception as e:
-                failed_url_cache(domain, setting.time_of_7_days)
-                logger.error('Request error: %s, URL: %s', str(e), url)
-                break
-        return None, None
-    @staticmethod
-    def _check_url(domain: str) -> bool:
-        """Check that the domain is valid and not an internal address
-
-        Args:
-            domain: the domain name
-
-        Returns:
-            Whether the domain is valid and not an internal address
-        """
-        return Favicon.check_internal(domain) and _pattern_domain.match(domain)
-
-    @staticmethod
-    def check_internal(domain: str) -> bool:
-        """Check that the address is not an internal (private) one
-
-        Args:
-            domain: the domain name
-
-        Returns:
-            True: not internal; False: internal or unresolvable
-        """
-        try:
-            # Check whether it is a literal IP address
-            if domain.replace('.', '').isdigit():
-                return not ipaddress.ip_address(domain).is_private
-            else:
-                # Resolve the domain to IP addresses
-                ips = socket.getaddrinfo(domain, None)
-                for ip_info in ips:
-                    ip = ip_info[4][0]
-                    if '.' in ip:
-                        if not ipaddress.ip_address(ip).is_private:
-                            return True
-                return False
-        except Exception as e:
-            failed_url_cache(domain, setting.time_of_7_days)
-            logger.error('Failed to resolve domain: %s, error: %s', domain, str(e))
-            return False
+def _check_internal(domain: str) -> bool:
+    """Check that the address is not an internal (private) one
+
+    Args:
+        domain: the domain name
+
+    Returns:
+        True: not internal; False: internal or unresolvable
+    """
+    try:
+        # Check whether it is a literal IP address
+        if domain.replace('.', '').isdigit():
+            return not ipaddress.ip_address(domain).is_private
+        else:
+            # Resolve the domain to IP addresses
+            ips = socket.getaddrinfo(domain, None)
+            for ip_info in ips:
+                ip = ip_info[4][0]
+                if '.' in ip:
+                    if not ipaddress.ip_address(ip).is_private:
+                        return True
+            return False
+    except Exception as e:
+        failed_urls[domain] = setting.time_of_1_days + int(time.time())
+        logger.error('Failed to resolve domain: %s, error: %s', domain, str(e))
+        return False
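
For context, a standalone sketch of the same public-address test using only the standard library (`resolves_to_public_ip` is a hypothetical name, not part of this commit):

```python
import ipaddress
import socket

def resolves_to_public_ip(domain: str) -> bool:
    """True if the host is (or resolves to) at least one public IPv4 address."""
    try:
        # Literal IPv4 address: check it directly.
        if domain.replace('.', '').isdigit():
            return not ipaddress.ip_address(domain).is_private
        # Otherwise resolve the name and scan the IPv4 results.
        for *_, sockaddr in socket.getaddrinfo(domain, None):
            ip = sockaddr[0]
            if '.' in ip and not ipaddress.ip_address(ip).is_private:
                return True
        return False
    except (socket.gaierror, ValueError):
        return False

# Private addresses are rejected outright.
assert resolves_to_public_ip('10.0.0.1') is False
```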
+def _check_url(domain: str) -> bool:
+    """Check that the domain is valid and not an internal address
+
+    Args:
+        domain: the domain name
+
+    Returns:
+        Whether the domain is valid and not an internal address
+    """
+    return bool(_pattern_domain.match(domain)) and _check_internal(domain)
+def _req_get(url: str,
+             domain: str,
+             retries: int = DEFAULT_RETRIES,
+             timeout: int = DEFAULT_TIMEOUT) -> Tuple[Optional[bytes], Optional[str]]:
+    """Send an HTTP GET request and return the content
+
+    Args:
+        url: request URL
+        domain: domain recorded in the failure cache on errors
+        retries: number of retries
+        timeout: timeout in seconds
+
+    Returns:
+        Tuple of (content, content type)
+    """
+    logger.debug('Sending request: %s', url)
+    retry_count = 0
+    while retry_count <= retries:
+        try:
+            # Use the global session pool
+            req = requests_session.get(
+                url,
+                headers=header.get_header(),
+                timeout=timeout,
+                allow_redirects=True,
+                verify=False
+            )
+            if req.ok:
+                ct_type = req.headers.get('Content-Type')
+                ct_length = req.headers.get('Content-Length')
+
+                # Normalize Content-Type
+                if ct_type and ';' in ct_type:
+                    _cts = ct_type.split(';')
+                    if 'charset' in _cts[0]:
+                        ct_type = _cts[-1].strip()
+                    else:
+                        ct_type = _cts[0].strip()
+
+                # Check response size
+                if ct_length and int(ct_length) > 10 * 1024 * 1024:
+                    logger.warning('Response too large: %d bytes, URL: %s', int(ct_length), url)
+
+                return req.content, ct_type
+            else:
+                failed_urls[domain] = setting.time_of_1_hours + int(time.time())
+                logger.error('Request failed: %d, URL: %s', req.status_code, url)
+                break
+        except (ConnectTimeoutError, ReadTimeoutError) as e:
+            retry_count += 1
+            if retry_count > retries:
+                failed_urls[domain] = setting.time_of_5_minus + int(time.time())
+                logger.error('Request timed out: %s, URL: %s', str(e), url)
+            else:
+                logger.warning('Request timed out, retrying (%d/%d): %s', retry_count, retries, url)
+                continue
+        except MaxRetryError as e:
+            failed_urls[domain] = setting.time_of_1_hours + int(time.time())
+            logger.error('Too many redirects: %s, URL: %s', str(e), url)
+            break
+        except Exception as e:
+            failed_urls[domain] = setting.time_of_1_hours + int(time.time())
+            logger.error('Request error: %s, URL: %s', str(e), url)
+            break
+    return None, None
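
One easily missed detail in `_req_get` is that a `Content-Type` header may carry the charset on either side of the semicolon. A standalone sketch of the same normalization (`normalize_content_type` is a hypothetical name):

```python
from typing import Optional

def normalize_content_type(raw: Optional[str]) -> Optional[str]:
    """Strip charset parameters: 'text/html; charset=utf-8' -> 'text/html'."""
    if raw and ';' in raw:
        parts = raw.split(';')
        # Occasionally the MIME type follows the charset; keep whichever part is the type.
        return parts[-1].strip() if 'charset' in parts[0] else parts[0].strip()
    return raw

assert normalize_content_type('text/html; charset=utf-8') == 'text/html'
assert normalize_content_type('charset=utf-8; text/html') == 'text/html'
assert normalize_content_type('image/png') == 'image/png'
```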
 # Domain validation regex
 _pattern_domain = re.compile(
     r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
     re.I)
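
Note that `_pattern_domain` also admits CJK labels (`\u4E00-\u9FA5`), so internationalized hosts pass validation. A quick demonstration of what it accepts and rejects:

```python
import re

_pattern_domain = re.compile(
    r'[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62}'
    r'(\.[a-zA-Z0-9\u4E00-\u9FA5][-a-zA-Z0-9\u4E00-\u9FA5]{0,62})+\.?',
    re.I)

assert _pattern_domain.match('example.com')
assert _pattern_domain.match('例子.中国')        # CJK labels are allowed
assert not _pattern_domain.match('no-dot-host')  # at least one dot is required
assert not _pattern_domain.match('-bad.com')     # labels cannot start with '-'
```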
 def failed_url_cache(_domain: str, _time: int):
     """Record (or extend) a failure entry for _domain that expires _time seconds from now."""
     if _domain:
         _current_time = int(time.time())
         if (not failed_urls.get(_domain)) or (_current_time <= failed_urls.get(_domain)):
             failed_urls[_domain] = _current_time + _time
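
Finally, `failed_url_cache` stores a per-domain expiry timestamp so hosts that keep failing are skipped until the TTL passes. A minimal sketch of how such a cache would be consulted (`should_skip` is a hypothetical consumer, not part of this commit):

```python
import time

failed_urls: dict[str, int] = {}

def failed_url_cache(domain: str, ttl: int) -> None:
    """Record (or extend) a failure entry that expires ttl seconds from now."""
    if domain:
        now = int(time.time())
        if not failed_urls.get(domain) or now <= failed_urls[domain]:
            failed_urls[domain] = now + ttl

def should_skip(domain: str) -> bool:
    """Skip domains whose failure entry has not yet expired."""
    expiry = failed_urls.get(domain)
    return expiry is not None and int(time.time()) < expiry

failed_url_cache('bad.example', 3600)
assert should_skip('bad.example')
```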