Spaces:
Sleeping
Sleeping
File size: 6,339 Bytes
69a077e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from bs4 import BeautifulSoup
from typing import Dict, List
import hashlib
class DOMAnalyzer:
    """Summarise the structure of an HTML document parsed with BeautifulSoup.

    The analyzer is stateless: every method works only on the soup / element
    passed to it, so a single instance can be reused for many documents.
    """

    # Tags counted by _analyze_semantic_structure.
    SEMANTIC_TAGS = ('header', 'nav', 'main', 'article', 'section', 'aside', 'footer')
    # CSS selectors probed (in this order) by _identify_content_blocks.
    CONTENT_SELECTORS = ('article', 'main', '.content', '#content', '.post', '.entry')
    # Substrings that earn a bonus when found in a class name or id.
    CONTENT_INDICATORS = ('content', 'article', 'post', 'main', 'body')
    # Maximum number of content blocks returned by _identify_content_blocks.
    MAX_CONTENT_BLOCKS = 5

    def __init__(self):
        """Create an analyzer; it holds no per-instance state."""

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation.

        Args:
            html: Markup to parse (parsed with BeautifulSoup's 'lxml' parser).

        Returns:
            A dict with the keys ``tree``, ``statistics``,
            ``semantic_structure`` and ``content_blocks``.
        """
        soup = BeautifulSoup(html, 'lxml')
        # Start the tree at <body> when there is one, else at the document.
        root = soup.body if soup.body else soup

        return {
            "tree": self._build_dom_tree(root),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup),
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5, max_children=10) -> Dict:
        """Build hierarchical DOM tree structure.

        Args:
            element: Element to describe.
            depth: Depth of ``element`` in the generated tree (root is 0).
            max_depth: Deepest level that is still described; nodes at this
                depth get an empty ``children`` list.
            max_children: Maximum number of child nodes recorded per node,
                to keep the tree small.

        Returns:
            A dict describing ``element`` and (recursively) its tag children,
            or ``{}`` when ``element`` is falsy, has no ``name`` attribute, or
            ``depth`` exceeds ``max_depth``.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            # Slicing an empty string yields "", so one get_text() call suffices.
            "text_content": element.get_text()[:100],
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # md5 is used only as a short fingerprint of the first 500
            # characters of the markup, not for anything security related.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8],
        }

        if depth >= max_depth:
            return node

        for child in getattr(element, 'children', ()):
            if len(node["children"]) >= max_children:
                break
            # Only children with a truthy name (i.e. tags) are recorded.
            if getattr(child, 'name', None):
                child_node = self._build_dom_tree(
                    child, depth + 1, max_depth, max_children
                )
                if child_node:
                    node["children"].append(child_node)

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics for analysis.

        Returns:
            A dict with ``total_elements``, ``tag_distribution`` (tag name ->
            count), ``max_depth`` and ``text_content_ratio``.
        """
        all_tags = soup.find_all()

        tag_counts = {}
        for tag in all_tags:
            tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup),
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure.

        Returns:
            A dict with ``semantic_elements`` (count per tag in
            ``SEMANTIC_TAGS``), ``has_semantic_structure`` (True when any of
            those counts is non-zero) and ``content_hierarchy`` (the heading
            list from ``_analyze_heading_hierarchy``).
        """
        semantic_elements = {
            tag: len(soup.find_all(tag)) for tag in self.SEMANTIC_TAGS
        }

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": any(semantic_elements.values()),
            "content_hierarchy": self._analyze_heading_hierarchy(soup),
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks for LLM processing.

        Each selector in ``CONTENT_SELECTORS`` is tried in order. An element
        that matches several selectors is reported once, under the first
        selector that matched it, so duplicates cannot crowd out other blocks.

        Returns:
            Up to ``MAX_CONTENT_BLOCKS`` block descriptions, highest
            ``priority`` first; elements without non-whitespace text are
            skipped.
        """
        content_blocks = []
        seen = set()  # id() of elements already reported

        for selector in self.CONTENT_SELECTORS:
            for elem in soup.select(selector):
                if id(elem) in seen:
                    continue
                text = elem.get_text()
                if not text.strip():
                    continue
                seen.add(id(elem))
                content_blocks.append({
                    "selector": selector,
                    "tag": elem.name,
                    "text_length": len(text),
                    "element_id": elem.get('id', ''),
                    "classes": elem.get('class', []),
                    "priority": self._calculate_content_priority(elem),
                })

        # sorted() is stable, so equal priorities keep discovery order.
        ranked = sorted(content_blocks, key=lambda block: block['priority'], reverse=True)
        return ranked[:self.MAX_CONTENT_BLOCKS]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth.

        Depth counts nested named children below ``soup`` (``soup`` itself is
        depth 0). An explicit stack is used instead of recursion so that very
        deeply nested markup cannot exhaust Python's recursion limit.
        """
        deepest = 0
        pending = [(soup, 0)]

        while pending:
            node, depth = pending.pop()
            if depth > deepest:
                deepest = depth
            for child in getattr(node, 'children', ()):
                # Children without a truthy name (text nodes) add no depth.
                if getattr(child, 'name', None):
                    pending.append((child, depth + 1))

        return deepest

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Calculate ratio of text content to HTML tags.

        Returns:
            ``len(soup.get_text()) / len(str(soup))``, or ``0.0`` when the
            serialized markup is empty.
        """
        html_length = len(str(soup))
        if html_length == 0:
            return 0.0
        return len(soup.get_text()) / html_length

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """Analyze heading structure for content organization.

        Returns:
            One dict per ``h1``-``h6`` element in document order, with
            ``level`` (1-6), stripped ``text`` and ``position`` (index of the
            heading within the returned list).
        """
        heading_tags = [f'h{level}' for level in range(1, 7)]

        # A single find_all over all heading names keeps document order;
        # querying level by level would group headings by level instead.
        return [
            {
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "position": position,
            }
            for position, heading in enumerate(soup.find_all(heading_tags))
        ]

    def _calculate_content_priority(self, element) -> int:
        """Calculate priority score for content blocks.

        Scoring:
            * 1 point per 100 characters of text, capped at 10;
            * +5 for ``article``/``main`` tags, +2 for ``section``/``div``;
            * for each entry of ``CONTENT_INDICATORS``: +3 if it occurs in
              any class name and +3 if it occurs in the id
              (case-insensitive substring match).
        """
        score = min(len(element.get_text()) // 100, 10)

        if element.name in ('article', 'main'):
            score += 5
        elif element.name in ('section', 'div'):
            score += 2

        classes = element.get('class', []) or []
        if isinstance(classes, str):
            # A plain-string class attribute would otherwise be iterated
            # character by character and never match an indicator.
            classes = classes.split()
        class_names = [str(name).lower() for name in classes]
        element_id = str(element.get('id', '') or '').lower()

        for indicator in self.CONTENT_INDICATORS:
            if any(indicator in name for name in class_names):
                score += 3
            if indicator in element_id:
                score += 3

        return score