Spaces:

DataMuncher-Labs
/

AutoWS

Paused

Upload AutoWS app files without plan/readme

f55f92e verified 16 days ago

963 Bytes

	from __future__ import annotations

	from typing import Any
	from urllib.parse import urljoin, urlsplit

	from bs4 import BeautifulSoup

	from .models import FetchResult


	def parse_page(item: FetchResult) -> tuple[dict[str, Any] | None, list[str]]:
	    """Extract the visible text and outgoing links from a fetched page.

	    Args:
	        item: Fetch result; this function reads ``item.html``, ``item.url``
	            and ``item.fetched_at``.

	    Returns:
	        A ``(record, links)`` pair. ``record`` is a dict with the keys
	        ``"text"``, ``"url"``, ``"domain"`` and ``"timestamp"``; ``links``
	        holds every non-empty anchor ``href`` resolved against ``item.url``.
	        ``(None, [])`` is returned when ``item.html`` is empty or when no
	        text remains once non-content tags are removed.
	    """
	    if not item.html:
	        return None, []

	    soup = BeautifulSoup(item.html, "lxml")

	    # Drop elements whose contents are not readable page text so they do
	    # not leak into the extracted text.
	    for tag in soup(["script", "style", "noscript", "svg", "iframe", "canvas"]):
	        tag.decompose()

	    text = soup.get_text(" ", strip=True)
	    if not text:
	        return None, []

	    links: list[str] = []
	    for anchor in soup.find_all("a", href=True):
	        href = anchor.get("href", "").strip()
	        if not href:
	            continue
	        try:
	            links.append(urljoin(item.url, href))
	        except ValueError:
	            # urljoin rejects malformed targets (e.g. an unterminated IPv6
	            # bracket); skip that link instead of failing the whole page.
	            continue

	    # Normalise the host: lower-case, without leading/trailing dots.
	    domain = (urlsplit(item.url).hostname or "").lower().strip(".")
	    record = {
	        "text": text,
	        "url": item.url,
	        "domain": domain,
	        "timestamp": item.fetched_at,
	    }
	    return record, links