# Spaces: WD101 / Gradio_chat (Sleeping)
# File size: 6,339 Bytes
# 69a077e

from bs4 import BeautifulSoup
from typing import Dict, List
import hashlib

class DOMAnalyzer:
    """Summarise the structure of an HTML document parsed with BeautifulSoup.

    ``analyze_structure`` is the public entry point. Each underscore-prefixed
    method computes one part of the report and relies only on the element
    interface used in its body (``name``, ``attrs``, ``children``, ``get``,
    ``get_text``, ``find_all``, ``select``).
    """

    # Landmark tags counted by ``_analyze_semantic_structure``.
    SEMANTIC_TAGS = ('header', 'nav', 'main', 'article', 'section', 'aside', 'footer')
    # CSS selectors probed, in this order, by ``_identify_content_blocks``.
    CONTENT_SELECTORS = ('article', 'main', '.content', '#content', '.post', '.entry')
    # Substrings of a class name or id that earn a priority bonus.
    CONTENT_INDICATORS = ('content', 'article', 'post', 'main', 'body')
    # Heading tags collected by ``_analyze_heading_hierarchy``.
    HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # Maximum number of content blocks returned by ``_identify_content_blocks``.
    MAX_CONTENT_BLOCKS = 5

    def __init__(self):
        """Create an analyzer; no per-instance state is kept."""
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Parse *html* with the ``lxml`` builder and return a structure report.

        Returns:
            A dict with four keys:
            ``"tree"`` (nested node dicts rooted at ``<body>``, or at the
            document itself when there is no body),
            ``"statistics"``, ``"semantic_structure"`` and
            ``"content_blocks"``, each produced by the matching helper.
        """
        soup = BeautifulSoup(html, 'lxml')

        return {
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5, max_children=10) -> Dict:
        """Build a size-bounded, nested dict representation of *element*.

        Args:
            element: Tag-like node to describe.
            depth: Depth of *element* in the output tree (0 for the root).
            max_depth: Nodes deeper than this are not emitted.
            max_children: Maximum number of child nodes kept per node.

        Returns:
            ``{}`` when *element* is falsy, has no ``name`` attribute or lies
            beyond *max_depth*; otherwise a dict with the keys ``tag``, ``id``,
            ``classes``, ``text_content`` (first 100 characters),
            ``children``, ``attributes``, ``depth`` and ``node_id``.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        # Extract the text once; it walks the whole subtree.
        text = element.get_text()

        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100] if text else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint derived from the first 500 characters of the
            # serialised element.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (bounded in both depth and breadth to prevent huge trees).
        if depth < max_depth and hasattr(element, 'children'):
            for child in element.children:
                if len(node["children"]) >= max_children:
                    break
                # Skip nodes without a tag name (e.g. bare text).
                if not getattr(child, 'name', None):
                    continue
                child_node = self._build_dom_tree(
                    child, depth + 1, max_depth, max_children
                )
                if child_node:
                    node["children"].append(child_node)

        return node

    def _get_dom_statistics(self, soup: "BeautifulSoup") -> Dict:
        """Return element counts, per-tag distribution, max depth and text ratio.

        Returns:
            A dict with ``total_elements``, ``tag_distribution`` (tag name to
            count), ``max_depth`` and ``text_content_ratio``.
        """
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: "BeautifulSoup") -> Dict:
        """Count semantic landmark tags and collect the heading outline.

        Returns:
            A dict with ``semantic_elements`` (count per tag in
            ``SEMANTIC_TAGS``), ``has_semantic_structure`` (True when any
            count is non-zero) and ``content_hierarchy`` (see
            ``_analyze_heading_hierarchy``).
        """
        semantic_elements = {
            tag: len(soup.find_all(tag)) for tag in self.SEMANTIC_TAGS
        }

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: "BeautifulSoup") -> List[Dict]:
        """Return the highest-priority content containers, best first.

        Every selector in ``CONTENT_SELECTORS`` is queried in order. An
        element matched by several selectors is reported once, under the
        first selector that matched it, so duplicates cannot crowd out other
        candidates. Elements whose text is empty or whitespace-only are
        skipped.

        Returns:
            At most ``MAX_CONTENT_BLOCKS`` dicts with the keys ``selector``,
            ``tag``, ``text_length``, ``element_id``, ``classes`` and
            ``priority``, sorted by descending priority (ties keep discovery
            order).
        """
        content_blocks = []
        # Track object identity rather than equality: two distinct elements
        # with identical markup must both be kept.
        seen_ids = set()

        for selector in self.CONTENT_SELECTORS:
            for elem in soup.select(selector):
                if id(elem) in seen_ids:
                    continue
                text = elem.get_text()
                if not text.strip():
                    continue
                seen_ids.add(id(elem))
                content_blocks.append({
                    "selector": selector,
                    "tag": elem.name,
                    "text_length": len(text),
                    "element_id": elem.get('id', ''),
                    "classes": elem.get('class', []),
                    "priority": self._calculate_content_priority(elem)
                })

        content_blocks.sort(key=lambda block: block['priority'], reverse=True)
        return content_blocks[:self.MAX_CONTENT_BLOCKS]

    def _calculate_max_depth(self, soup: "BeautifulSoup") -> int:
        """Return the deepest tag nesting level below *soup*.

        *soup* itself counts as depth 0; only children with a truthy ``name``
        are followed. The walk uses an explicit stack so that very deeply
        nested markup cannot exhaust Python's recursion limit.
        """
        deepest = 0
        pending = [(soup, 0)]

        while pending:
            node, depth = pending.pop()
            if depth > deepest:
                deepest = depth
            for child in getattr(node, 'children', ()):
                if getattr(child, 'name', None):
                    pending.append((child, depth + 1))

        return deepest

    def _calculate_text_ratio(self, soup: "BeautifulSoup") -> float:
        """Return ``len(text) / len(serialised markup)``, or 0.0 for empty markup."""
        html_length = len(str(soup))
        if html_length == 0:
            return 0.0
        return len(soup.get_text()) / html_length

    def _analyze_heading_hierarchy(self, soup: "BeautifulSoup") -> List[Dict]:
        """Return the h1-h6 headings of *soup* in document order.

        All heading tags are fetched with a single ``find_all`` call so the
        result keeps the order in which headings appear; querying level by
        level would group them by level and lose the outline.

        Returns:
            A list of dicts with ``level`` (1-6), ``text`` (stripped heading
            text) and ``position`` (index of the heading in the list).
        """
        headings = []
        for position, heading in enumerate(soup.find_all(list(self.HEADING_TAGS))):
            headings.append({
                # The tag name is "h" followed by the level digit.
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "position": position
            })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Score how likely *element* is to hold the main page content.

        Scoring:
            * 1 point per 100 characters of text, capped at 10;
            * +5 for ``article``/``main`` tags, +2 for ``section``/``div``;
            * for each entry of ``CONTENT_INDICATORS``: +3 if it occurs in any
              class name and +3 if it occurs in the id (case-insensitive).
        """
        text_length = len(element.get_text())

        # Text length scoring
        score = min(text_length // 100, 10)

        # Semantic tag bonus
        if element.name in ('article', 'main'):
            score += 5
        elif element.name in ('section', 'div'):
            score += 2

        # Class/ID based scoring
        classes = element.get('class', []) or []
        if isinstance(classes, str):
            # Some parsers expose ``class`` as one space-separated string.
            classes = classes.split()
        class_names = [str(name).lower() for name in classes]
        element_id = str(element.get('id', '') or '').lower()

        for indicator in self.CONTENT_INDICATORS:
            if any(indicator in name for name in class_names):
                score += 3
            if indicator in element_id:
                score += 3

        return score