from bs4 import BeautifulSoup, Comment
from typing import Dict, List, Optional
import re
from urllib.parse import urljoin, urlparse

from settings import settings


class DataExtractor:
    """Extract structured, LLM-friendly data from raw HTML pages.

    Extraction behavior (selectors, minimum text length) is driven by
    ``settings.extraction``; output sizes are capped for token efficiency.
    """

    def __init__(self):
        # Extraction tuning (ignore/content selectors, min_text_length)
        # comes from application settings.
        self.config = settings.extraction

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption.

        Args:
            html: Raw HTML document.
            url: Page URL; used to resolve relative links/images and to
                derive the domain in metadata.

        Returns:
            Dict with keys ``content``, ``metadata``, ``structure``,
            ``links``, ``images``, and ``text_summary``.
        """
        soup = BeautifulSoup(html, 'lxml')
        # Strip boilerplate/noise before any extraction pass.
        self._clean_html(soup)
        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup),
        }

    def _clean_html(self, soup: BeautifulSoup) -> None:
        """Remove unwanted elements (configured selectors and HTML comments).

        Mutates ``soup`` in place.
        """
        # Drop everything matching the configured ignore selectors
        # (scripts, nav, ads, ... — whatever settings specify).
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()
        # Strip HTML comment nodes. BUGFIX: bs4 deprecated the ``text=``
        # keyword in favor of ``string=`` (BeautifulSoup >= 4.4); the old
        # spelling emits a DeprecationWarning.
        for element in soup.find_all(string=lambda s: isinstance(s, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks matching the configured selectors.

        Elements matched by more than one selector are emitted once
        (BUGFIX: overlapping selectors such as ``article`` and ``div``
        previously produced duplicate blocks).
        """
        content_blocks: List[Dict] = []
        seen_ids: set = set()  # element identity — bs4 Tags are unhashable by value
        for selector in self.config.content_selectors:
            for elem in soup.select(selector):
                if id(elem) in seen_ids:
                    continue  # already captured via an earlier selector
                seen_ids.add(id(elem))
                text = elem.get_text(strip=True)
                # Skip fragments too short to be meaningful content.
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {},
                    })
        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata: title, meta description, URL, domain, headings."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup),
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract the h1-h6 hierarchy as (level, text, id) records."""
        headings: List[Dict] = []
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', ''),
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Count structural elements for a coarse view of the DOM layout."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form')),
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract anchor links resolved against ``base_url``, capped at 50.

        Each record carries the absolute URL, anchor text, and whether the
        link points at the same domain as ``base_url``.
        """
        links: List[Dict] = []
        # Loop-invariant: the page's own domain for the internal/external test.
        base_netloc = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            if len(links) >= 50:  # Limit for performance
                break
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_netloc,
            })
        return links

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images (absolute src, alt text, title-as-caption), capped at 20."""
        images: List[Dict] = []
        for img in soup.find_all('img', src=True):
            if len(images) >= 20:  # Limit for performance
                break
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', ''),
            })
        return images

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract whitespace-normalized plain text, truncated to 5000 chars."""
        text = soup.get_text()
        # Collapse all whitespace runs to single spaces for token efficiency.
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency