master
jinql 2025-09-10 22:56:31 +08:00
parent d3b21d6a11
commit 6c564e6e99
1 changed file with 11 additions and 3 deletions

View File

@ -173,6 +173,7 @@ def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
if not links:
return None
_result = None
for link in links:
r = link.get('rel')
_r = ' '.join(r) if isinstance(r, list) else r
@ -180,11 +181,11 @@ def _get_link_rel(links, entity: Favicon, _rel: str) -> Optional[str]:
if _rel:
if _r.lower() == _rel:
return entity.get_icon_url(str(_href))
_result = entity.get_icon_url(str(_href))
else:
return entity.get_icon_url(str(_href))
_result = entity.get_icon_url(str(_href))
return None
return _result
def _parse_html(content: bytes, entity: Favicon) -> Optional[str]:
@ -203,6 +204,13 @@ def _parse_html(content: bytes, entity: Favicon) -> Optional[str]:
html_links = bs.find_all("link", rel=pattern_icon)
# 处理<base>问题
base_soup = bs4.BeautifulSoup(content_str, 'lxml', parse_only=SoupStrainer("base"))
if base_soup:
_base = base_soup.select_one('base[href]')
if _base:
logger.warning(f"-> 页面检测到<base>标签:{_base['href']} | {entity.domain} <-")
# 如果没有找到,尝试使用正则表达式直接匹配
if not html_links or len(html_links) == 0:
content_links = pattern_link.findall(content_str)