# Gradio_chat / dom_analyzer.py
# princeta3011's picture
# Upload 11 files
# 69a077e verified
import hashlib
from collections import Counter
from typing import Dict, List

from bs4 import BeautifulSoup
class DOMAnalyzer:
    """Summarise the structure of an HTML document parsed with BeautifulSoup.

    The public entry point is ``analyze_structure``; each underscore helper
    computes one part of its result. No state is kept on the instance.
    """

    # Tags counted by _analyze_semantic_structure.
    _SEMANTIC_TAGS = ('header', 'nav', 'main', 'article', 'section', 'aside', 'footer')
    # CSS selectors probed by _identify_content_blocks, in match order.
    _CONTENT_SELECTORS = ('article', 'main', '.content', '#content', '.post', '.entry')
    # Substrings of a class name or id that raise a block's priority score.
    _CONTENT_INDICATORS = ('content', 'article', 'post', 'main', 'body')

    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Parse *html* with the ``lxml`` parser and describe its DOM.

        Returns:
            A dict with four keys: ``"tree"`` (nested node dicts rooted at
            ``<body>``, or at the document when there is no body),
            ``"statistics"``, ``"semantic_structure"`` and
            ``"content_blocks"``.
        """
        soup = BeautifulSoup(html, 'lxml')
        root = soup.body if soup.body else soup
        return {
            "tree": self._build_dom_tree(root),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup),
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5, max_children=10) -> Dict:
        """Build a nested dict describing *element* and its tag descendants.

        Args:
            element: The node to describe.
            depth: Depth recorded on the returned node.
            max_depth: Nodes deeper than this are not built; a node sitting
                exactly at ``max_depth`` is returned without children.
            max_children: Maximum number of child nodes kept per node.

        Returns:
            The node dict, or ``{}`` when *element* is falsy, has no
            ``name`` attribute, or *depth* exceeds *max_depth*.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        text = element.get_text()
        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100],
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint of the first 500 characters of the markup.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8],
        }

        if depth < max_depth:
            children = node["children"]
            for child in getattr(element, 'children', ()):
                # Cap the fan-out so very wide nodes do not bloat the tree.
                if len(children) >= max_children:
                    break
                # Children without a truthy name (text, comments) are skipped.
                if getattr(child, 'name', None):
                    child_node = self._build_dom_tree(
                        child, depth + 1, max_depth, max_children
                    )
                    if child_node:
                        children.append(child_node)
        return node

    def _get_dom_statistics(self, soup: "BeautifulSoup") -> Dict:
        """Return element count, per-tag counts, max depth and text ratio."""
        all_tags = soup.find_all()
        return {
            "total_elements": len(all_tags),
            "tag_distribution": dict(Counter(tag.name for tag in all_tags)),
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup),
        }

    def _analyze_semantic_structure(self, soup: "BeautifulSoup") -> Dict:
        """Count semantic HTML5 tags and collect the heading outline."""
        semantic_elements = {
            tag: len(soup.find_all(tag)) for tag in self._SEMANTIC_TAGS
        }
        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": any(semantic_elements.values()),
            "content_hierarchy": self._analyze_heading_hierarchy(soup),
        }

    def _identify_content_blocks(self, soup: "BeautifulSoup") -> List[Dict]:
        """Return up to five likely main-content containers, best first.

        Each element is reported once, under the first selector that matched
        it, even when it satisfies several selectors (for example
        ``<main id="content">``). Elements with only whitespace text are
        ignored.
        """
        content_blocks = []
        # Track object identity rather than equality so that two distinct
        # elements with identical markup are still reported separately.
        seen = set()
        for selector in self._CONTENT_SELECTORS:
            for elem in soup.select(selector):
                if id(elem) in seen:
                    continue
                seen.add(id(elem))
                if not elem.get_text(strip=True):
                    continue
                content_blocks.append({
                    "selector": selector,
                    "tag": elem.name,
                    "text_length": len(elem.get_text()),
                    "element_id": elem.get('id', ''),
                    "classes": elem.get('class', []),
                    "priority": self._calculate_content_priority(elem),
                })
        content_blocks.sort(key=lambda block: block['priority'], reverse=True)
        return content_blocks[:5]

    def _calculate_max_depth(self, soup: "BeautifulSoup") -> int:
        """Return the deepest tag nesting level below *soup* (root is 0).

        Walks the tree with an explicit stack so the result does not depend
        on the interpreter's recursion limit.
        """
        deepest = 0
        pending = [(soup, 0)]
        while pending:
            node, depth = pending.pop()
            if depth > deepest:
                deepest = depth
            for child in getattr(node, 'children', ()):
                # Only named children (tags) add a nesting level.
                if getattr(child, 'name', None):
                    pending.append((child, depth + 1))
        return deepest

    def _calculate_text_ratio(self, soup: "BeautifulSoup") -> float:
        """Return ``len(text) / len(serialized markup)``, or 0.0 if empty."""
        html_length = len(str(soup))
        if html_length == 0:
            return 0.0
        return len(soup.get_text()) / html_length

    def _analyze_heading_hierarchy(self, soup: "BeautifulSoup") -> List[Dict]:
        """List every ``h1``-``h6`` heading in document order.

        Returns:
            One dict per heading with ``"level"`` (1-6), the stripped
            ``"text"`` and ``"position"``, the heading's index in document
            order.
        """
        # A single find_all over all six names yields document order, so
        # "position" reflects where the heading appears on the page.
        found = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        return [
            {
                "level": int(heading.name[1]),
                "text": heading.get_text().strip(),
                "position": position,
            }
            for position, heading in enumerate(found)
        ]

    def _calculate_content_priority(self, element) -> int:
        """Score how likely *element* is to hold the page's main content.

        The score is the sum of: one point per 100 characters of text
        (capped at 10); 5 for ``article``/``main`` or 2 for
        ``section``/``div``; and, for each content indicator, 3 if any class
        name contains it plus 3 if the id contains it.
        """
        score = min(len(element.get_text()) // 100, 10)

        if element.name in ('article', 'main'):
            score += 5
        elif element.name in ('section', 'div'):
            score += 2

        classes = element.get('class', []) or []
        if isinstance(classes, str):
            # Some parsers keep "class" as one string; split it so the check
            # below sees class names instead of single characters.
            classes = classes.split()
        class_names = [str(name).lower() for name in classes]
        element_id = str(element.get('id') or '').lower()

        for indicator in self._CONTENT_INDICATORS:
            if any(indicator in name for name in class_names):
                score += 3
            if indicator in element_id:
                score += 3
        return score