Spaces:
Sleeping
Sleeping
import hashlib
from typing import Dict, List

from bs4 import BeautifulSoup
class DOMAnalyzer:
    """Analyze an HTML document's DOM.

    Produces a bounded tree representation, per-tag statistics, a summary
    of semantic HTML5 usage, and a ranked list of likely main-content
    blocks suitable for downstream LLM processing.
    """

    def __init__(self):
        # Stateless analyzer; nothing to initialize.
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create a tree representation.

        Args:
            html: Raw HTML markup.

        Returns:
            Dict with keys "tree", "statistics", "semantic_structure",
            and "content_blocks".
        """
        # NOTE: the 'lxml' parser is a third-party backend and must be
        # installed alongside bs4.
        soup = BeautifulSoup(html, 'lxml')
        # Root the tree at <body> when present; fall back to the whole
        # document for fragments without one.
        root = soup.body if soup.body else soup
        return {
            "tree": self._build_dom_tree(root),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup),
        }

    def _build_dom_tree(self, element, depth: int = 0, max_depth: int = 5) -> Dict:
        """Build a hierarchical DOM tree structure.

        The tree is bounded: recursion stops past *max_depth* and at most
        10 element children are kept per node, to keep output small.

        Returns:
            A node dict, or {} when the element is missing/too deep.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}
        # Fetch the text once instead of calling get_text() twice.
        text = element.get_text()
        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100] if text else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint of the element's serialized prefix; md5 is
            # used for identification only, not security.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8],
        }
        # Add children (limited to prevent huge trees).
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # limit children per node
                    break
                # Skip bare text nodes (NavigableString has no tag name).
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1
        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Return element count, tag distribution, max depth and text ratio."""
        all_tags = soup.find_all()
        tag_counts: Dict[str, int] = {}
        for tag in all_tags:
            tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1
        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup),
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Count HTML5 semantic elements and summarize heading usage."""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {tag: len(soup.find_all(tag)) for tag in semantic_tags}
        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup),
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks for LLM processing.

        Returns:
            Up to 5 candidate blocks, sorted by descending priority score.
        """
        content_blocks: List[Dict] = []
        # Track elements already captured so that a node matching several
        # selectors (e.g. <article id="content">) is only reported once.
        seen_elements = set()
        # Common content containers, roughly most-specific first.
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']
        for selector in selectors:
            for elem in soup.select(selector):
                if id(elem) in seen_elements:
                    continue  # already recorded under an earlier selector
                if elem.get_text(strip=True):
                    seen_elements.add(id(elem))
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem),
                    })
        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth, counting only element nodes."""
        def get_depth(element, current_depth: int = 0) -> int:
            if not hasattr(element, 'children'):
                return current_depth
            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    depth = get_depth(child, current_depth + 1)
                    max_child_depth = max(max_child_depth, depth)
            return max_child_depth
        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Calculate the ratio of extracted text length to total markup length."""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        # Guard against division by zero on an empty document.
        return text_length / html_length if html_length > 0 else 0.0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """Return headings (h1-h6) in document order.

        Each entry carries the heading level (1-6), stripped text, and
        its 0-based position within the returned sequence.
        """
        headings: List[Dict] = []
        # A single find_all with all tags yields document order, so
        # "position" reflects where the heading appears on the page.
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            headings.append({
                "level": int(heading.name[1]),  # 'h3' -> 3
                "text": heading.get_text().strip(),
                "position": len(headings),
            })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Calculate a heuristic priority score for a content block.

        Scoring: up to 10 points for text length (1 per 100 chars),
        a bonus for semantic container tags, and 3 points per content
        keyword found in the element's classes and again in its id.
        """
        score = 0
        text_length = len(element.get_text())
        # Text length scoring, capped at 10.
        score += min(text_length // 100, 10)
        # Semantic tag bonus.
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2
        # Class/ID based scoring.
        classes = element.get('class', [])
        element_id = element.get('id', '')
        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3
        return score