import re
from typing import Dict, List
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment

from settings import settings
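
# Note: `settings.extraction` comes from the project-local `settings` module,
# which is not shown here. Judging from how it is used below, it is assumed to
# expose at least the following fields (names and values are illustrative):
#
#   ignore_selectors: List[str]   # e.g. ["script", "style", "nav", "footer"]
#   content_selectors: List[str]  # e.g. ["article", "main", "section", "p"]
#   min_text_length: int          # e.g. 20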

class DataExtractor:
    def __init__(self):
        self.config = settings.extraction

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')
        # Remove unwanted elements before extracting
        self._clean_html(soup)
        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup),
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction."""
        # Drop elements matching the configured ignore selectors
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()
        # Remove HTML comments (the `text=` argument is deprecated in bs4;
        # `string=` is the current spelling)
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks."""
        content_blocks = []
        for selector in self.config.content_selectors:
            for elem in soup.select(selector):
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {},
                    })
        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup),
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract the heading hierarchy for document structure."""
        headings = []
        for level in range(1, 7):  # h1 through h6
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', ''),
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure counts for relationship analysis."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form')),
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract links for relationship mapping."""
        links = []
        base_domain = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])  # resolve relative URLs
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_domain,
            })
        return links[:50]  # Cap the list size for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', ''),
            })
        return images[:20]  # Cap the list size for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing."""
        # Use a space separator so text from adjacent tags is not run together
        text = soup.get_text(separator=' ')
        # Normalize runs of whitespace to single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Truncate for token efficiency
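
# A minimal usage sketch, assuming a `settings` module shaped as noted above.
# The HTML and URL here are illustrative only.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>Example Page</title>
        <meta name="description" content="A short example.">
      </head>
      <body>
        <article><p>Some article text long enough to clear the minimum-length filter.</p></article>
      </body>
    </html>
    """
    extractor = DataExtractor()
    data = extractor.extract_structured_data(sample_html, "https://example.com/page")
    print(data["metadata"]["title"])          # "Example Page"
    print(len(data["content"]), "content block(s) extracted")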