Spaces:
Paused
Paused
| from __future__ import annotations | |
| from typing import Any | |
| from urllib.parse import urljoin, urlsplit | |
| from bs4 import BeautifulSoup | |
| from .models import FetchResult | |
def parse_page(item: FetchResult) -> tuple[dict[str, Any] | None, list[str]]:
    """Extract visible text and outgoing links from a fetched HTML page.

    Args:
        item: Fetch result; this function reads ``item.html``, ``item.url``
            and ``item.fetched_at``.

    Returns:
        A ``(record, links)`` pair. ``record`` is a dict with the keys
        ``text``, ``url``, ``domain`` and ``timestamp``; ``links`` holds every
        non-empty anchor ``href`` resolved against ``item.url``. When the
        page has no HTML, or no text remains after the non-content tags are
        removed, the result is ``(None, [])``.
    """
    if not item.html:
        return None, []

    soup = BeautifulSoup(item.html, "lxml")
    # Remove elements whose contents should not count as page text.
    for tag in soup(["script", "style", "noscript", "svg", "iframe", "canvas"]):
        tag.decompose()

    text = soup.get_text(" ", strip=True)
    if not text:
        return None, []

    links: list[str] = []
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "").strip()
        if not href:
            continue
        try:
            resolved = urljoin(item.url, href)
        except ValueError:
            # urljoin rejects some malformed hrefs (e.g. an unbalanced IPv6
            # bracket such as "http://[bad"); skip that one link instead of
            # failing the whole page.
            continue
        links.append(resolved)

    # Lower-case the hostname and strip leading/trailing dots.
    domain = (urlsplit(item.url).hostname or "").lower().strip(".")
    record = {
        "text": text,
        "url": item.url,
        "domain": domain,
        "timestamp": item.fetched_at,
    }
    return record, links