import re
from typing import Dict, List
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment

from settings import settings
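
# Note: `settings.extraction` comes from the project-local `settings` module,
# which is not shown here. Judging from how it is used below, it is assumed to
# expose at least the following fields (names and values are illustrative):
#
#   ignore_selectors: List[str]   # e.g. ["script", "style", "nav", "footer"]
#   content_selectors: List[str]  # e.g. ["article", "main", "section", "p"]
#   min_text_length: int          # e.g. 20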

class DataExtractor:
    def __init__(self):
        self.config = settings.extraction

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')
        # Remove unwanted elements before extracting
        self._clean_html(soup)
        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup),
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction."""
        # Drop elements matching the configured ignore selectors
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()
        # Remove HTML comments (the `text=` argument is deprecated in bs4;
        # `string=` is the current spelling)
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks."""
        content_blocks = []
        for selector in self.config.content_selectors:
            for elem in soup.select(selector):
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {},
                    })
        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup),
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract the heading hierarchy for document structure."""
        headings = []
        for level in range(1, 7):  # h1 through h6
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', ''),
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure counts for relationship analysis."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form')),
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract links for relationship mapping."""
        links = []
        base_domain = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])  # resolve relative URLs
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_domain,
            })
        return links[:50]  # Cap the list size for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', ''),
            })
        return images[:20]  # Cap the list size for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing."""
        # Use a space separator so text from adjacent tags is not run together
        text = soup.get_text(separator=' ')
        # Normalize runs of whitespace to single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Truncate for token efficiency
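
# A minimal usage sketch, assuming a `settings` module shaped as noted above.
# The HTML and URL here are illustrative only.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>Example Page</title>
        <meta name="description" content="A short example.">
      </head>
      <body>
        <article><p>Some article text long enough to clear the minimum-length filter.</p></article>
      </body>
    </html>
    """
    extractor = DataExtractor()
    data = extractor.extract_structured_data(sample_html, "https://example.com/page")
    print(data["metadata"]["title"])          # "Example Page"
    print(len(data["content"]), "content block(s) extracted")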