275 lines
11 KiB
Python
275 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
import logging
|
||
import random
|
||
import threading
|
||
from typing import Dict, Optional
|
||
|
||
# 配置日志
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class HeaderConfig:
    """Build HTTP request headers from named templates plus optional overrides.

    A header dict is assembled from one entry of ``_HEADER_TEMPLATES``,
    optionally given a ``User-Agent`` chosen at random from ``_USER_AGENTS``,
    then overlaid with the instance-wide custom headers and finally with any
    per-call headers.

    ``_USER_AGENTS`` is a class attribute, so an agent registered through
    ``add_user_agent`` is visible to every instance of this class.
    """

    _USER_AGENTS = [
        # Firefox
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:112.0) Gecko/20100101 Firefox/112.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:110.0) Gecko/20100101 Firefox/110.0',
        # Chrome
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
        # Edge
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36 Edg/103.0.1264.77',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
        # macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        # iOS
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1',
        # Android
        'Mozilla/5.0 (Linux; Android 13; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36'
    ]

    # Content types accepted by is_image_content_type(): the merged image
    # lists of two earlier versions plus further common formats. Note that
    # the two PDF types at the end are deliberately part of this list.
    IMAGE_TYPES = [
        'image/gif',
        'image/jpeg',
        'image/png',
        'image/svg+xml',
        'image/tiff',
        'image/vnd.wap.wbmp',
        'image/webp',
        'image/x-icon',
        'image/x-jng',
        'image/x-ms-bmp',
        'image/vnd.microsoft.icon',
        'image/vnd.dwg',
        'image/vnd.dxf',
        'image/jpx',
        'image/apng',
        'image/bmp',
        'image/vnd.ms-photo',
        'image/vnd.adobe.photoshop',
        'image/heic',
        'image/avif',
        'image/jfif',
        'image/pjpeg',
        'image/vnd.adobe.illustrator',
        'application/pdf',
        'application/x-pdf'
    ]

    # Default content type (used by the 'api' template below).
    CONTENT_TYPE = 'application/json; charset=utf-8'

    # Header templates for the different request scenarios.
    _HEADER_TEMPLATES = {
        'default': {
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Accept-Encoding': 'gzip, deflate',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'Connection': 'keep-alive'
        },
        'image': {
            'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        },
        'api': {
            'Accept': 'application/json, application/xml',
            'Content-Type': CONTENT_TYPE,
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        }
    }

    def __init__(self):
        """Create a configuration that starts without custom headers."""
        # Re-entrant lock: get_headers() calls get_random_user_agent() while
        # it is already holding the lock.
        self._lock = threading.RLock()
        # Headers overlaid on the template by every get_headers() call.
        self._custom_headers: Dict[str, str] = {}

    def get_random_user_agent(self) -> str:
        """Return one User-Agent string picked at random from the pool."""
        with self._lock:
            return random.choice(self._USER_AGENTS)

    def get_headers(
        self,
        template: str = 'default',
        include_user_agent: bool = True,
        custom_headers: Optional[Dict[str, str]] = None
    ) -> Dict[str, str]:
        """
        Build a request-header dict.

        Args:
            template: Template name: 'default', 'image' or 'api'. An unknown
                name falls back to the 'default' template.
            include_user_agent: Whether to add a random User-Agent.
            custom_headers: Per-call headers; they override every other source.

        Returns:
            A new dict, so callers may modify it without affecting the
            templates.
        """
        with self._lock:
            # Start from a copy of the chosen template.
            base = self._HEADER_TEMPLATES.get(template, self._HEADER_TEMPLATES['default'])
            headers = dict(base)

            # Add a random User-Agent.
            if include_user_agent:
                headers['User-Agent'] = self.get_random_user_agent()

            # Instance-wide custom headers override the template and the
            # random User-Agent.
            if self._custom_headers:
                headers.update(self._custom_headers)

            # Headers passed to this call have the final say.
            if custom_headers:
                headers.update(custom_headers)

            return headers

    def set_custom_header(self, key: str, value: str) -> None:
        """Store a custom header that is applied to all headers built afterwards.

        An empty key or value is rejected with a warning and nothing is stored.
        """
        if not key or not value:
            logger.warning("尝试设置空的请求头键或值")
            return

        with self._lock:
            self._custom_headers[key] = value
            logger.debug(f"已设置自定义请求头: {key} = {value}")

    def remove_custom_header(self, key: str) -> None:
        """Remove a custom header; a key that was never set is ignored."""
        with self._lock:
            if key in self._custom_headers:
                del self._custom_headers[key]
                logger.debug(f"已移除自定义请求头: {key}")

    def clear_custom_headers(self) -> None:
        """Remove every custom header."""
        with self._lock:
            self._custom_headers.clear()
            logger.debug("已清除所有自定义请求头")

    def is_image_content_type(self, content_type: str) -> bool:
        """Return True when *content_type* is listed in ``IMAGE_TYPES``.

        Parameters after ';' are ignored and the comparison is
        case-insensitive, so 'Image/PNG; charset=utf-8' matches 'image/png'.
        An empty value returns False.
        """
        if not content_type:
            return False

        # Drop parameters such as '; charset=utf-8' before comparing.
        base_type = content_type.split(';')[0].strip().lower()
        return base_type in self.IMAGE_TYPES

    def add_user_agent(self, user_agent: str) -> None:
        """Add a User-Agent to the pool; empty or duplicate values are ignored."""
        if not user_agent:
            return

        with self._lock:
            # The duplicate check happens under the lock so that two threads
            # adding the same value cannot both pass it and append twice.
            if user_agent in self._USER_AGENTS:
                return
            self._USER_AGENTS.append(user_agent)
            logger.debug("已添加自定义User-Agent")

    def get_specific_headers(
        self,
        url: Optional[str] = None,
        referer: Optional[str] = None,
        content_type: Optional[str] = None
    ) -> Dict[str, str]:
        """
        Build 'default' headers extended for one concrete request.

        Args:
            url: Target URL; its "host[:port]" part becomes the Host header.
            referer: Value for the Referer header.
            content_type: Value for the Content-Type header.

        Returns:
            The header dict. Arguments that are empty or None add nothing.
            A URL that cannot be parsed is logged as a warning and leaves
            Host unset.
        """
        headers = self.get_headers()

        # Set Host.
        if url:
            from urllib.parse import urlparse

            try:
                # netloc may start with "user:password@"; that part must not
                # be sent in the Host header, so keep only what follows the
                # last '@'.
                host = urlparse(url).netloc.rpartition('@')[2]
            except (ValueError, TypeError, AttributeError) as e:
                logger.warning(f"解析URL失败: {e}")
            else:
                if host:
                    headers['Host'] = host

        # Set Referer.
        if referer:
            headers['Referer'] = referer

        # Set Content-Type.
        if content_type:
            headers['Content-Type'] = content_type

        return headers
|
||
|
||
|
||
# Module-wide HeaderConfig instance used by the backward-compatible helper
# functions in this module.
_header_config = HeaderConfig()

# Module-wide header dict kept for backward compatibility. It starts with a
# placeholder User-Agent and is replaced by every get_header() call.
_headers: Dict[str, str] = {'User-Agent': '-'}

# Backward-compatible aliases for the class-level constants.
content_type = HeaderConfig.CONTENT_TYPE
image_type = HeaderConfig.IMAGE_TYPES
|
||
|
||
|
||
def get_header():
    """Backward-compatible helper: build and return a 'default' header dict.

    The dict is also stored in the module-level ``_headers``, replacing the
    one produced by the previous call.
    """
    global _headers
    fresh = _header_config.get_headers(template='default')
    _headers = fresh
    return fresh
|
||
|
||
|
||
def set_header(key: str, value: str):
    """Backward-compatible helper: set a custom request header.

    An empty key or value is ignored here, before the shared configuration
    is touched.
    """
    if not (key and value):
        return
    _header_config.set_custom_header(key, value)
|
||
|
||
|
||
def del_header(key: str):
    """Backward-compatible helper: remove a custom request header.

    Delegates to the shared HeaderConfig instance.
    """
    _header_config.remove_custom_header(key)
|
||
|
||
|
||
def get_user_agent():
    """Backward-compatible helper: return the User-Agent currently in effect.

    A User-Agent installed as a custom header (for example through
    ``set_user_agent()``) is returned first, so the value is visible
    immediately rather than only after the next ``get_header()`` call.
    Otherwise the User-Agent of the cached ``_headers`` dict is returned,
    or '' when that dict has none.
    """
    # The 'default' template contains no User-Agent, so with the random one
    # switched off any value found here comes from the custom headers.
    override = _header_config.get_headers(include_user_agent=False).get('User-Agent')
    if override:
        return override
    return _headers.get('User-Agent', '')
|
||
|
||
|
||
def set_user_agent(ua: str):
    """Backward-compatible helper: set the User-Agent request header.

    The value is stored as a custom 'User-Agent' header on the shared
    configuration; an empty value is ignored.
    """
    if not ua:
        return
    _header_config.set_custom_header('User-Agent', ua)
|