# geo-platform/server/competitor_analysis.py
# 3v324v23's picture
# initial: geo-platform full stack
# 5c429d4
"""Competitor Detection and Analysis."""
import re
from urllib.parse import urlparse
from typing import List, Dict, Set
from collections import defaultdict
def extract_domain(url: str) -> str:
    """Extract the clean, lowercase host name from a URL.

    Scheme, credentials, port, path, query and fragment are dropped, as is a
    single leading ``www.`` label.  Scheme-less input such as
    ``example.com/pricing`` is treated as a host followed by a path.

    Args:
        url: Absolute URL, protocol-relative URL, or bare ``host[/path]``.

    Returns:
        The normalized host, or ``''`` when no host can be determined:
        non-string or empty input, relative paths such as ``/about``,
        host-less schemes such as ``mailto:`` / ``tel:`` / ``javascript:``,
        or input that ``urlparse`` rejects.
    """
    if not isinstance(url, str):
        return ''

    candidate = url.strip()
    try:
        parsed = urlparse(candidate)
        if not parsed.netloc:
            if parsed.scheme:
                # A scheme without a network location (mailto:, tel:,
                # javascript:, ...) does not name a host.
                return ''
            # No scheme and no "//": re-parse as a network-path reference so
            # "example.com/pricing" yields its host rather than the whole path.
            parsed = urlparse('//' + candidate)
        # .hostname is already lowercased and has credentials/port removed.
        host = parsed.hostname or ''
    except ValueError:
        # urlparse raises ValueError for malformed network locations.
        return ''

    # Strip "www." only after lowercasing so "WWW.Example.com" normalizes too.
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host
# Domains that are never treated as competitors, grouped by purpose.
_NON_COMPETITOR_DOMAINS = (
    # Social media
    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
    'youtube.com', 'tiktok.com', 'pinterest.com', 'snapchat.com',
    # CDNs and services
    'cloudflare.com', 'amazonaws.com', 'googleusercontent.com',
    'cloudfront.net', 'akamai.net', 'fastly.net',
    # Analytics and ads
    'google-analytics.com', 'googletagmanager.com', 'doubleclick.net',
    'facebook.net', 'googlesyndication.com', 'googleadservices.com',
    # Payment and services
    'paypal.com', 'stripe.com', 'shopify.com',
    # Generic services
    'google.com', 'bing.com', 'yahoo.com', 'wikipedia.org',
    'w3.org', 'schema.org', 'creativecommons.org',
    # Fonts and assets
    'fonts.googleapis.com', 'fonts.gstatic.com',
    # Maps
    'maps.google.com', 'openstreetmap.org',
)

# Match an excluded domain only as a whole run of labels: the left edge must be
# the start of the host or a dot, so "bing.com" no longer matches
# "climbing.com"; the right edge may be the end or a dot, so subdomains and
# "google.com.au"-style hosts are still excluded.
_NON_COMPETITOR_RE = re.compile(
    r'(?:^|\.)(?:'
    + '|'.join(re.escape(name) for name in _NON_COMPETITOR_DOMAINS)
    + r')(?:\.|$)'
)


def is_valid_competitor(url: str, source_domain: str) -> bool:
    """Check if URL is a valid competitor (not internal, not social, not CDN).

    Args:
        url: Link to classify; its host is obtained with ``extract_domain``.
        source_domain: Host of the site being analysed.  When empty, the
            same-site check is skipped instead of rejecting every link.

    Returns:
        ``False`` when no host can be extracted, when the host is the source
        domain or one of its subdomains, or when it belongs to one of the
        excluded service domains; ``True`` otherwise.
    """
    domain = extract_domain(url)
    if not domain:
        return False

    # Same site = not a competitor.  Compare on label boundaries: a plain
    # substring test would reject "notexample.com" for source "example.com"
    # and would reject everything when the source domain is empty.
    source = (source_domain or '').lower()
    if source and (domain == source or domain.endswith('.' + source)):
        return False

    return _NON_COMPETITOR_RE.search(domain) is None
def _context_snippet(text: str, domain: str, radius: int = 100) -> str:
    """Return the text around the first mention of ``domain``, or ``''``.

    The match is case-insensitive; the window spans ``radius`` characters on
    either side of the start of the match, with newlines turned into spaces.
    """
    match = re.search(re.escape(domain), text, re.IGNORECASE)
    if match is None:
        return ''
    start = max(0, match.start() - radius)
    end = min(len(text), match.start() + radius)
    return text[start:end].strip().replace('\n', ' ')


def detect_competitors(pages: List[Dict], source_url: str, min_mentions: int = 1) -> List[Dict]:
    """
    Detect competitor domains from crawled pages with contextual snippets.

    Args:
        pages: List of page objects with 'links' and 'text' fields.  A missing
            or ``None`` field is treated as empty.
        source_url: Source domain URL
        min_mentions: Minimum number of mentions to be considered competitor

    Returns:
        List of competitor dicts with keys 'domain', 'mentions' (number of
        qualifying links), 'sample_urls' (first 3 distinct links seen) and
        'contexts' (first 5 distinct text snippets seen), sorted by mentions
        in descending order.  Ties keep first-seen order.
    """
    source_domain = extract_domain(source_url)

    mention_counts = defaultdict(int)
    # Dicts are used as insertion-ordered sets so the reported samples are
    # deterministic; slicing a plain set picks arbitrary members per run.
    urls_by_domain = defaultdict(dict)
    contexts_by_domain = defaultdict(dict)

    for page in pages:
        links = page.get('links') or []
        page_text = page.get('text') or ''
        # The snippet for a domain is the same for every link on this page,
        # so search the page text once per domain.
        snippets_on_page = {}

        for link in links:
            if not is_valid_competitor(link, source_domain):
                continue

            domain = extract_domain(link)
            mention_counts[domain] += 1
            urls_by_domain[domain][link] = None

            # Simple text heuristic; no HTML structure is available here.
            if not page_text:
                continue
            if domain not in snippets_on_page:
                snippets_on_page[domain] = _context_snippet(page_text, domain)
            snippet = snippets_on_page[domain]
            if snippet:
                contexts_by_domain[domain][snippet] = None

    competitors = [
        {
            'domain': domain,
            'mentions': count,
            'sample_urls': list(urls_by_domain[domain])[:3],
            'contexts': list(contexts_by_domain.get(domain, ()))[:5],
        }
        for domain, count in mention_counts.items()
        if count >= min_mentions
    ]

    # Sort by mentions (descending); the sort is stable.
    competitors.sort(key=lambda entry: entry['mentions'], reverse=True)
    return competitors
def analyze_competitor_keywords(competitor_domain: str, pages: List[Dict]) -> Dict:
    """
    Summarize the pages that link to a competitor.

    A page is relevant when any of its 'links' contains ``competitor_domain``
    as a substring.  The "context" collected for a relevant page is its
    'title'; this is a simplified version that could be enhanced with NLP.

    Args:
        competitor_domain: Domain to look for inside each link.  An empty
            value matches nothing (it would otherwise match every link).
        pages: Page dicts with optional 'links' and 'title' fields.  Missing
            or ``None`` links and non-string link entries are ignored.

    Returns:
        Dict that always has 'pages_mentioned' (number of relevant pages) and
        'contexts' (up to 5 non-empty titles of relevant pages).  When no page
        is relevant it also carries the legacy empty 'keywords' and 'context'
        lists so existing callers keep working.
    """
    relevant_pages = []
    if competitor_domain:
        relevant_pages = [
            page for page in pages
            if any(
                isinstance(link, str) and competitor_domain in link
                for link in (page.get('links') or [])
            )
        ]

    if not relevant_pages:
        # 'pages_mentioned' and 'contexts' are included so both outcomes can
        # be read through the same keys.
        return {
            'pages_mentioned': 0,
            'contexts': [],
            'keywords': [],
            'context': [],
        }

    titles = [page.get('title', '') for page in relevant_pages]
    contexts = [title for title in titles if title]

    return {
        'pages_mentioned': len(relevant_pages),
        'contexts': contexts[:5],
    }
def format_competitor_report(competitors: List[Dict], source_url: str) -> str:
    """Render the competitor analysis as a plain-text report.

    Args:
        competitors: Competitor dicts with 'domain', 'mentions' and
            'sample_urls' keys.  Only the first 10 entries are listed.
        source_url: URL of the analysed site; its domain goes in the header.

    Returns:
        The report as one newline-joined string.  With no competitors, a short
        list of possible reasons replaces the table.
    """
    rule = "=" * 80
    divider = "-" * 80
    source_domain = extract_domain(source_url)

    report = [
        rule,
        "COMPETITOR ANALYSIS REPORT",
        rule,
        f"\n🎯 Source Domain: {source_domain}",
        f"📊 Competitors Found: {len(competitors)}",
    ]

    if not competitors:
        report.extend([
            "\n❌ No competitors detected",
            "\nPossible reasons:",
            " • Page has no external links",
            " • All external links are to social media/CDNs",
            " • Minimum mention threshold not met",
        ])
        return "\n".join(report)

    report.extend([
        "\n🏆 TOP COMPETITORS",
        divider,
        f"{'Domain':<40} {'Mentions':<10} {'Sample URL'}",
        divider,
    ])

    for comp in competitors[:10]:
        urls = comp['sample_urls']
        sample = urls[0] if urls else 'N/A'
        # Keep the sample column narrow: cap long URLs at 35 characters.
        if len(sample) > 35:
            sample = sample[:32] + '...'
        report.append(f"{comp['domain']:<40} {comp['mentions']:<10} {sample}")

    report.append("\n" + rule)
    return "\n".join(report)
def get_competitor_summary(competitors: List[Dict]) -> Dict:
    """Get summary statistics for competitors.

    Args:
        competitors: Competitor dicts with 'domain' and 'mentions' keys, in
            any order.

    Returns:
        Dict with 'total' (number of competitors), 'avg_mentions' (mean
        mentions rounded to one decimal), 'top_competitor' (domain with the
        most mentions, or ``None``) and 'top_mentions' (its mention count, or
        0).  All four keys are present even for empty input.
    """
    if not competitors:
        return {
            'total': 0,
            'avg_mentions': 0,
            'top_competitor': None,
            'top_mentions': 0,
        }

    total = len(competitors)
    avg_mentions = sum(c['mentions'] for c in competitors) / total
    # max() returns the first maximal entry, so for input already sorted by
    # mentions (descending) this is still competitors[0].
    top_competitor = max(competitors, key=lambda c: c['mentions'])

    return {
        'total': total,
        'avg_mentions': round(avg_mentions, 1),
        'top_competitor': top_competitor['domain'],
        'top_mentions': top_competitor['mentions'],
    }