import re
from typing import Dict, List
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment

from settings import settings


class DataExtractor:
    def __init__(self):
        self.config = settings.extraction
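        # Fields read from settings.extraction by the methods below:
        # ignore_selectors (CSS selectors to strip before extraction),
        # content_selectors (CSS selectors for main content blocks),
        # and min_text_length (minimum text length for a content block).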

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')
        # Remove unwanted elements
        self._clean_html(soup)
        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction."""
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()
        # Remove HTML comments
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks."""
        content_blocks = []
        for selector in self.config.content_selectors:
            for elem in soup.select(selector):
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })
        # Note: overlapping content_selectors can produce nested or duplicate
        # blocks; dedupe downstream if the selectors are not disjoint.
        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy for structure."""
        headings = []
        # Walk headings in document order so the hierarchy is preserved
        # (iterating h1..h6 separately would group them by level instead)
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            headings.append({
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "id": heading.get('id', '')
            })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure counts for relationships."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract all links for relationship mapping."""
        links = []
        base_netloc = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == base_netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                # No true <figcaption> lookup here; the title attribute
                # serves as a caption fallback
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing."""
        text = soup.get_text()
        # Collapse runs of whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
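

# Minimal usage sketch (illustrative only): assumes a settings module whose
# settings.extraction exposes the fields noted in __init__, and that the
# lxml parser is installed. The sample HTML and URL are hypothetical.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>Example</title>
        <meta name="description" content="A sample page">
      </head>
      <body>
        <h1 id="intro">Intro</h1>
        <article><p>Body text long enough to pass the min_text_length filter.</p></article>
        <a href="/about">About</a>
        <img src="/logo.png" alt="Logo" title="Site logo">
      </body>
    </html>
    """
    extractor = DataExtractor()
    data = extractor.extract_structured_data(sample_html, "https://example.com/")
    print(data["metadata"]["title"])   # -> Example
    print(data["links"])               # one internal link to https://example.com/about
    print(data["images"][0]["src"])    # -> https://example.com/logo.png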