Spaces:
Sleeping
Sleeping
| from typing import Optional | |
| from urllib.parse import urljoin, urlparse | |
| from bs4 import BeautifulSoup | |
| from fastapi import APIRouter | |
| from httpx import AsyncClient, HTTPStatusError, RequestError | |
| from pydantic import BaseModel, Field | |
| from reworkd_platform.web.api.errors import PlatformaticError | |
# Module-level router object; no routes are attached to it in the visible
# part of this file.
router: APIRouter = APIRouter()
# Summary of a web page as produced by ``extract_metadata``. Every field is
# optional and defaults to None.
class Metadata(BaseModel):
    # Text of the page's <title> element, when one is present.
    title: Optional[str] = Field(None, description="Title of the page")
    # Host component of the URL the metadata was requested for.
    hostname: Optional[str] = Field(None, description="Hostname of the page")
    # URL of the page's favicon.
    favicon: Optional[str] = Field(None, description="Favicon of the page")
# ``rel`` values (compared case-insensitively) that mark a <link> as a favicon.
_FAVICON_RELS = ("icon", "shortcut icon")


def _is_favicon_rel(value: Optional[str]) -> bool:
    """Return True when a ``rel`` attribute value identifies a favicon link."""
    # BeautifulSoup passes None when the attribute is absent, so guard on type.
    return isinstance(value, str) and value.strip().lower() in _FAVICON_RELS


def _default_favicon(url: str) -> Optional[str]:
    """Return the conventional ``/favicon.ico`` URL for the host of ``url``.

    Returns None when ``url`` has no scheme or hostname, because no usable
    absolute URL can be built from it.
    """
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.hostname:
        return None
    return f"{parsed.scheme}://{parsed.hostname}/favicon.ico"


def _find_favicon(soup: BeautifulSoup, page_url: str) -> Optional[str]:
    """Return the absolute URL of the first favicon <link> with a usable href."""
    for link in soup.find_all("link", rel=_is_favicon_rel):
        href = (link.get("href") or "").strip()
        if href:
            # urljoin leaves absolute URLs untouched and resolves relative or
            # protocol-relative ones against the page URL.
            return urljoin(page_url, href)
    return None


async def extract_metadata(url: str) -> Metadata:
    """Fetch ``url`` and extract its title, hostname and favicon.

    Args:
        url: Address of the page to inspect.

    Returns:
        A ``Metadata`` instance. When the request fails or the server answers
        with an error status, only ``hostname`` and the default
        ``/favicon.ico`` location are filled in. ``favicon`` is None when no
        icon link is found and ``url`` has no scheme or hostname.

    Raises:
        PlatformaticError: If anything other than a request/HTTP status error
            goes wrong while fetching or parsing the page.
    """
    try:
        # Present a browser-like User-Agent with the request.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_5) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/104.0.0.0 Safari/537.36"
            )
        }
        async with AsyncClient() as client:
            res = await client.get(url, headers=headers)
            res.raise_for_status()

        soup = BeautifulSoup(res.text, "html.parser")

        # ``Tag.string`` is None for an empty <title> or one with nested
        # markup, so read the text through get_text() instead.
        title_tag = soup.title
        title = title_tag.get_text().strip() if title_tag is not None else None

        return Metadata(
            hostname=urlparse(url).hostname,
            title=title,
            favicon=_find_favicon(soup, url) or _default_favicon(url),
        )
    except (RequestError, HTTPStatusError):
        # Best effort: the page could not be fetched, so report only what can
        # be derived from the URL itself.
        return Metadata(
            hostname=urlparse(url).hostname,
            favicon=_default_favicon(url),
        )
    except Exception as e:
        raise PlatformaticError(
            base_exception=e,
            detail=f"Could not extract metadata from {url}",
            should_log=False,
        ) from e