import hashlib
from collections import Counter
from typing import Dict, List

from bs4 import BeautifulSoup


class DOMAnalyzer:
    """Summarize the DOM of an HTML document.

    Produces a bounded tree representation, tag statistics, a summary of
    semantic landmark elements and headings, and a ranked list of likely
    main-content containers.
    """

    # Landmark elements counted by _analyze_semantic_structure.
    _SEMANTIC_TAGS = ('header', 'nav', 'main', 'article', 'section', 'aside', 'footer')
    # CSS selectors probed by _identify_content_blocks, in precedence order.
    _CONTENT_SELECTORS = ('article', 'main', '.content', '#content', '.post', '.entry')
    # Substrings that raise a block's priority when found in its class or id.
    _CONTENT_INDICATORS = ('content', 'article', 'post', 'main', 'body')
    # Heading tags collected by _analyze_heading_hierarchy.
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        """Create an analyzer; it holds no per-instance state."""

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation.

        Args:
            html: Markup to parse. It is parsed with the ``lxml`` parser.

        Returns:
            A dict with the keys ``tree``, ``statistics``,
            ``semantic_structure`` and ``content_blocks``.
        """
        soup = BeautifulSoup(html, 'lxml')
        body = soup.body
        # Fall back to the document itself when no <body> was produced.
        root = body if body is not None else soup
        return {
            "tree": self._build_dom_tree(root),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup),
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5, max_children=10) -> Dict:
        """Build hierarchical DOM tree structure.

        Args:
            element: Element to convert into a node.
            depth: Depth of ``element`` relative to the tree root.
            max_depth: Deepest level that is still turned into a node.
            max_children: Maximum number of tag children kept per node.

        Returns:
            A dict describing ``element`` and (recursively) its tag children,
            or ``{}`` when ``element`` is falsy, has no ``name`` attribute,
            or lies beyond ``max_depth``.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        # Extract the text once; it covers the whole subtree.
        text = element.get_text()
        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100],
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint of the first 500 characters of the markup.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8],
        }

        # Add children (limited to prevent huge trees).
        if depth < max_depth and hasattr(element, 'children'):
            children = node["children"]
            for child in element.children:
                if len(children) >= max_children:
                    break
                # Skip anything without a tag name (text nodes, comments, ...).
                if not getattr(child, 'name', None):
                    continue
                child_node = self._build_dom_tree(
                    child, depth + 1, max_depth, max_children
                )
                if child_node:
                    children.append(child_node)

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics for analysis.

        Returns:
            A dict with ``total_elements``, ``tag_distribution`` (tag name to
            occurrence count), ``max_depth`` and ``text_content_ratio``.
        """
        all_tags = soup.find_all()
        tag_counts = Counter(tag.name for tag in all_tags)
        return {
            "total_elements": len(all_tags),
            "tag_distribution": dict(tag_counts),
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup),
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure.

        Returns:
            A dict with ``semantic_elements`` (count per landmark tag),
            ``has_semantic_structure`` (True when any landmark tag occurs)
            and ``content_hierarchy`` (the heading outline).
        """
        semantic_elements = {
            tag: len(soup.find_all(tag)) for tag in self._SEMANTIC_TAGS
        }
        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup),
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks for LLM processing.

        Each element is reported at most once, under the first selector that
        matches it, so an element matching several selectors cannot occupy
        more than one of the returned slots.

        Returns:
            Up to five block descriptions, highest priority first.
        """
        content_blocks = []
        # Identity-based: bs4 tags compare equal by content, not by node.
        seen = set()

        for selector in self._CONTENT_SELECTORS:
            for elem in soup.select(selector):
                if id(elem) in seen:
                    continue
                if not elem.get_text(strip=True):
                    continue
                seen.add(id(elem))
                content_blocks.append({
                    "selector": selector,
                    "tag": elem.name,
                    "text_length": len(elem.get_text()),
                    "element_id": elem.get('id', ''),
                    "classes": elem.get('class', []),
                    "priority": self._calculate_content_priority(elem),
                })

        # The sort is stable, so ties keep selector/document order.
        content_blocks.sort(key=lambda block: block['priority'], reverse=True)
        return content_blocks[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth.

        The passed element counts as depth 0 and only tag children add a
        level. An explicit stack is used instead of recursion so deeply
        nested markup cannot exhaust the interpreter's recursion limit.
        """
        deepest = 0
        pending = [(soup, 0)]
        while pending:
            element, depth = pending.pop()
            if depth > deepest:
                deepest = depth
            if not hasattr(element, 'children'):
                continue
            for child in element.children:
                if getattr(child, 'name', None):
                    pending.append((child, depth + 1))
        return deepest

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Calculate ratio of text content to HTML tags.

        Returns:
            Length of the extracted text divided by the length of the
            serialized markup, or ``0.0`` when the markup is empty.
        """
        html_length = len(str(soup))
        if html_length == 0:
            return 0.0
        return len(soup.get_text()) / html_length

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """Analyze heading structure for content organization.

        Headings are returned in document order so the sequence of levels
        reflects the page outline; ``position`` is the heading's index in
        that order.

        Returns:
            One dict per ``h1``-``h6`` element with ``level``, ``text`` and
            ``position``.
        """
        found = soup.find_all(list(self._HEADING_TAGS))
        return [
            {
                # Tag names are 'h1'..'h6'; the digit is the level.
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "position": position,
            }
            for position, heading in enumerate(found)
        ]

    def _calculate_content_priority(self, element) -> int:
        """Calculate priority score for content blocks.

        Scoring:
            * one point per 100 characters of text, capped at 10;
            * +5 for ``article``/``main`` tags, +2 for ``section``/``div``;
            * +3 per indicator substring found in any class name, and
              +3 per indicator substring found in the id.
        """
        score = min(len(element.get_text()) // 100, 10)

        # Semantic tag bonus.
        if element.name in ('article', 'main'):
            score += 5
        elif element.name in ('section', 'div'):
            score += 2

        # Class/ID based scoring; lowercase once instead of per indicator.
        class_names = [str(c).lower() for c in element.get('class', [])]
        element_id = element.get('id', '').lower()
        for indicator in self._CONTENT_INDICATORS:
            if any(indicator in name for name in class_names):
                score += 3
            if indicator in element_id:
                score += 3

        return score