"""
GEO Scorer Data Integration Fix
Normalizes the various data formats returned by web scrapers into the structure expected by GEOScorer
"""
import logging
import traceback
from typing import Dict, Any, List, Union, Optional
class GEODataAdapter:
"""Adapter to handle different data formats from web scrapers"""
def __init__(self, logger: Optional[logging.Logger] = None):
self.logger = logger or logging.getLogger(__name__)
def normalize_scraped_data(self, scraped_data: Union[Dict, List]) -> List[Dict[str, Any]]:
"""
Normalize scraped data to the format expected by GEOScorer
Args:
scraped_data: Raw data from web scraper (various formats)
Returns:
List[Dict]: Normalized data ready for GEO analysis
"""
try:
# Handle different input formats
if isinstance(scraped_data, dict):
# Single page data
normalized = [self._normalize_single_page(scraped_data)]
elif isinstance(scraped_data, list):
# Multiple pages
normalized = [self._normalize_single_page(page) for page in scraped_data]
else:
raise ValueError(f"Unsupported data type: {type(scraped_data)}")
# Filter out invalid entries
valid_pages = [page for page in normalized if page.get('content')]
            self.logger.info(f"Normalized {len(valid_pages)} valid pages from {len(normalized)} total")
return valid_pages
except Exception as e:
self.logger.error(f"Data normalization failed: {e}")
return []
def _normalize_single_page(self, page_data: Dict[str, Any]) -> Dict[str, Any]:
"""Normalize a single page's data structure"""
# Common field mappings from different scrapers
content_fields = ['content', 'text', 'body', 'html_content', 'page_content', 'main_content']
title_fields = ['title', 'page_title', 'heading', 'h1', 'name']
url_fields = ['url', 'link', 'page_url', 'source_url', 'href']
# Extract content (try multiple possible field names)
content = ""
for field in content_fields:
if field in page_data and page_data[field]:
content = str(page_data[field])
break
# Extract title
title = "Untitled Page"
for field in title_fields:
if field in page_data and page_data[field]:
title = str(page_data[field])
break
# Extract URL
url = ""
for field in url_fields:
if field in page_data and page_data[field]:
url = str(page_data[field])
break
# Create normalized structure
normalized = {
'content': content,
'title': title,
'url': url,
'word_count': len(content.split()) if content else 0,
'original_data': page_data # Keep original for debugging
}
# Add any additional metadata
metadata_fields = ['description', 'keywords', 'author', 'date', 'meta_description']
for field in metadata_fields:
if field in page_data:
normalized[field] = page_data[field]
return normalized
def validate_normalized_data(self, normalized_data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Validate normalized data and provide diagnostics"""
validation_results = {
'total_pages': len(normalized_data),
'valid_pages': 0,
'invalid_pages': 0,
'issues': [],
'summary': {}
}
for i, page in enumerate(normalized_data):
issues = []
# Check required fields
if not page.get('content'):
issues.append(f"Page {i}: Missing or empty content")
elif len(page['content'].strip()) < 50:
                issues.append(f"Page {i}: Content too short ({len(page['content'].strip())} chars)")
if not page.get('title'):
issues.append(f"Page {i}: Missing title")
if issues:
validation_results['invalid_pages'] += 1
validation_results['issues'].extend(issues)
else:
validation_results['valid_pages'] += 1
# Generate summary
content_lengths = [len(page.get('content', '')) for page in normalized_data if page.get('content')]
if content_lengths:
validation_results['summary'] = {
'avg_content_length': sum(content_lengths) / len(content_lengths),
'min_content_length': min(content_lengths),
'max_content_length': max(content_lengths),
'pages_with_titles': len([p for p in normalized_data if p.get('title') and p['title'] != 'Untitled Page']),
'pages_with_urls': len([p for p in normalized_data if p.get('url')])
}
return validation_results
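# Illustrative sketch (hypothetical data): exercises the adapter in isolation on
# a scraper payload that uses non-standard field names. The field values and URL
# below are made up for demonstration only.
def _example_adapter_usage() -> Dict[str, Any]:
    adapter = GEODataAdapter()
    raw_page = {
        'text': 'A short page explaining generative engine optimization.',
        'page_title': 'GEO Notes',
        'link': 'https://example.com/geo-notes'
    }
    normalized = adapter.normalize_scraped_data(raw_page)
    # Each normalized entry exposes 'content', 'title', 'url', 'word_count'
    # and keeps the raw payload under 'original_data' for debugging.
    return adapter.validate_normalized_data(normalized)
# NOTE: GEOScorerWithAdapter below assumes that GEOScorer and GEOConfig are
# defined or imported elsewhere in the project; neither name is defined in this
# snippet.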
class GEOScorerWithAdapter(GEOScorer):
"""Extended GEOScorer with built-in data adaptation"""
def __init__(self, llm, config: Optional[GEOConfig] = None, logger: Optional[logging.Logger] = None):
super().__init__(llm, config, logger)
self.data_adapter = GEODataAdapter(logger)
def analyze_scraped_data(self, scraped_data: Union[Dict, List], detailed: bool = True) -> Dict[str, Any]:
"""
Analyze scraped data with automatic format detection and normalization
Args:
scraped_data: Raw scraped data in any format
detailed: Whether to perform detailed analysis
Returns:
Dict: Complete analysis results with diagnostics
"""
self.logger.info("Starting analysis of scraped data")
try:
# Step 1: Normalize the data
normalized_data = self.data_adapter.normalize_scraped_data(scraped_data)
if not normalized_data:
return {
'error': 'No valid data could be extracted from scraped content',
'error_type': 'data_normalization',
'original_data_type': str(type(scraped_data)),
'original_data_sample': str(scraped_data)[:200] if scraped_data else None
}
# Step 2: Validate normalized data
validation_results = self.data_adapter.validate_normalized_data(normalized_data)
# Step 3: Analyze valid pages
analysis_results = self.analyze_multiple_pages(normalized_data, detailed)
# Step 4: Calculate aggregate scores
aggregate_results = self.calculate_aggregate_scores(analysis_results)
# Step 5: Combine all results
complete_results = {
'data_validation': validation_results,
'individual_analyses': analysis_results,
'aggregate_scores': aggregate_results,
'processing_summary': {
'pages_scraped': validation_results['total_pages'],
'pages_analyzed': len([r for r in analysis_results if not r.get('error')]),
'overall_success_rate': validation_results['valid_pages'] / max(validation_results['total_pages'], 1),
'analysis_type': 'detailed' if detailed else 'quick'
}
}
self.logger.info(f"Analysis completed: {complete_results['processing_summary']}")
return complete_results
except Exception as e:
self.logger.error(f"Scraped data analysis failed: {e}")
return {
'error': f'Analysis failed: {str(e)}',
'error_type': 'system',
'original_data_type': str(type(scraped_data)),
                'traceback': traceback.format_exc()
}
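# Hedged usage sketch: one way a caller might wire the adapter-backed scorer into
# a scraping flow. The 'llm' object and the scraped payload are placeholders
# supplied by the surrounding application; nothing here runs at import time.
def _example_scorer_usage(llm, scraped_pages: Union[Dict, List]) -> Dict[str, Any]:
    scorer = GEOScorerWithAdapter(llm)
    results = scorer.analyze_scraped_data(scraped_pages, detailed=True)
    if results.get('error'):
        # Fall back to the debug helper to inspect why normalization failed
        return debug_scraped_data(scraped_pages)
    return results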
# Debugging utility functions
def debug_scraped_data(scraped_data: Union[Dict, List]) -> Dict[str, Any]:
"""
Debug utility to understand the structure of scraped data
Args:
scraped_data: The raw scraped data causing issues
Returns:
Dict: Detailed breakdown of the data structure
"""
debug_info = {
'data_type': str(type(scraped_data)),
'data_structure': {},
'sample_content': {},
'recommendations': []
}
try:
if isinstance(scraped_data, dict):
debug_info['data_structure'] = {
'is_dict': True,
'keys': list(scraped_data.keys()),
'key_count': len(scraped_data.keys())
}
# Sample first few key-value pairs
            for key, value in list(scraped_data.items())[:5]:
debug_info['sample_content'][key] = {
'type': str(type(value)),
'length': len(str(value)) if value else 0,
'sample': str(value)[:100] if value else None
}
# Check for common content fields
            content_fields = ['content', 'text', 'body', 'html_content', 'page_content', 'main_content']
found_content_fields = [field for field in content_fields if field in scraped_data]
if found_content_fields:
debug_info['recommendations'].append(f"Found potential content fields: {found_content_fields}")
else:
debug_info['recommendations'].append("No standard content fields found. Check field names.")
elif isinstance(scraped_data, list):
debug_info['data_structure'] = {
'is_list': True,
'length': len(scraped_data),
'first_item_type': str(type(scraped_data[0])) if scraped_data else 'empty'
}
if scraped_data and isinstance(scraped_data[0], dict):
debug_info['sample_content']['first_item_keys'] = list(scraped_data[0].keys())
else:
debug_info['recommendations'].append(f"Unexpected data type: {type(scraped_data)}")
except Exception as e:
debug_info['error'] = f"Debug analysis failed: {str(e)}"
return debug_info
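# Small illustration (hypothetical input): the debug helper applied to an
# obviously malformed payload, e.g. a bare string instead of a dict or list.
def _example_debug_usage() -> Dict[str, Any]:
    info = debug_scraped_data("not a dict or list")
    # 'data_type' reports str and 'recommendations' flags the unexpected type,
    # since only dict and list inputs are inspected in detail.
    return info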
def create_test_scraped_data() -> List[Dict[str, Any]]:
"""Create test data in various formats that scrapers might return"""
# Format 1: Standard format
format1 = {
'content': 'This is the main content of the page about AI optimization.',
'title': 'AI Optimization Guide',
'url': 'https://example.com/ai-guide'
}
# Format 2: Different field names
format2 = {
'text': 'Content about machine learning best practices.',
'page_title': 'ML Best Practices',
'link': 'https://example.com/ml-practices'
}
# Format 3: Nested structure
format3 = {
'page_data': {
'body': 'Deep learning techniques for content optimization.',
'heading': 'Deep Learning Guide'
},
'metadata': {
'source_url': 'https://example.com/deep-learning'
}
}
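    # Note: format3 nests its text under 'page_data', and _normalize_single_page
    # only maps top-level keys, so this entry is expected to come back with empty
    # content and be dropped during normalization.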
return [format1, format2, format3]
# Usage example and testing
def test_data_integration():
"""Test the data integration fixes"""
# Test with various data formats
test_data = create_test_scraped_data()
# Debug the data first
for i, data in enumerate(test_data):
print(f"\n--- Debug Info for Test Data {i+1} ---")
debug_info = debug_scraped_data(data)
print(f"Data type: {debug_info['data_type']}")
print(f"Keys: {debug_info['data_structure'].get('keys', 'N/A')}")
print(f"Recommendations: {debug_info['recommendations']}")
# Test normalization
adapter = GEODataAdapter()
normalized = adapter.normalize_scraped_data(test_data)
    print("\n--- Normalization Results ---")
print(f"Original items: {len(test_data)}")
print(f"Normalized items: {len(normalized)}")
for i, item in enumerate(normalized):
print(f"Item {i+1}: Title='{item['title']}', Content length={len(item['content'])}")
# Test validation
validation = adapter.validate_normalized_data(normalized)
    print("\n--- Validation Results ---")
print(f"Valid pages: {validation['valid_pages']}/{validation['total_pages']}")
print(f"Issues: {validation['issues']}")
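def test_error_handling():
    """Illustrative extra check (hypothetical inputs): unsupported input types
    are caught by normalize_scraped_data and yield an empty list rather than
    raising."""
    adapter = GEODataAdapter()
    assert adapter.normalize_scraped_data("plain string") == []
    assert adapter.normalize_scraped_data(12345) == []
    print("test_error_handling passed")
# test_error_handling is not wired into the __main__ block below; run it
# manually or via pytest if desired.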
if __name__ == "__main__":
test_data_integration()