import re
from typing import Dict, List
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment

from settings import settings


class DataExtractor:
    def __init__(self):
        self.config = settings.extraction
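        # Fields read from settings.extraction by the methods below:
        # ignore_selectors (CSS selectors to strip before extraction),
        # content_selectors (CSS selectors for main content blocks),
        # and min_text_length (minimum text length for a content block).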

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')
        # Remove unwanted elements
        self._clean_html(soup)
        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction."""
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()
        # Remove HTML comments
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks."""
        content_blocks = []
        for selector in self.config.content_selectors:
            for elem in soup.select(selector):
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })
        # Note: overlapping content_selectors can produce nested or duplicate
        # blocks; dedupe downstream if the selectors are not disjoint.
        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy for structure."""
        headings = []
        # Walk headings in document order so the hierarchy is preserved
        # (iterating h1..h6 separately would group them by level instead)
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            headings.append({
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "id": heading.get('id', '')
            })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure counts for relationships."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract all links for relationship mapping."""
        links = []
        base_netloc = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                # No true <figcaption> lookup here; the title attribute
                # serves as a caption fallback
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing."""
        text = soup.get_text()
        # Collapse runs of whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
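

# Minimal usage sketch (illustrative only): assumes a settings module whose
# settings.extraction exposes the fields noted in __init__, and that the
# lxml parser is installed. The sample HTML and URL are hypothetical.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>Example</title>
        <meta name="description" content="A sample page">
      </head>
      <body>
        <h1 id="intro">Intro</h1>
        <article><p>Body text long enough to pass the min_text_length filter.</p></article>
        <a href="/about">About</a>
        <img src="/logo.png" alt="Logo" title="Site logo">
      </body>
    </html>
    """
    extractor = DataExtractor()
    data = extractor.extract_structured_data(sample_html, "https://example.com/")
    print(data["metadata"]["title"])   # -> Example
    print(data["links"])               # one internal link to https://example.com/about
    print(data["images"][0]["src"])    # -> https://example.com/logo.png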