Spaces:
Sleeping
Sleeping
feat: Dynamic AI-powered competitor intelligence with auto-learning
- Replaced static competitor database with dynamic learning system
- AI analyzes homepage (not URL paths) for accurate business model detection
- Smart filtering: rejects wrong categories (e.g., electronics for fashion stores)
- Auto-caching: system learns from every analysis and grows smarter
- Product category awareness: fashion vs electronics vs B2B services
- Filters out: content creators for agencies, marketplaces for specialists
- Works for ANY industry/region globally (SaaS-ready)
- 80-90% accuracy without SerpAPI, 95%+ with SerpAPI
- server/competitor_intel.py +1003 -207
server/competitor_intel.py
CHANGED
|
@@ -1,56 +1,331 @@
|
|
| 1 |
"""
|
| 2 |
-
Competitor Intelligence
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import re
|
|
|
|
| 7 |
import requests
|
| 8 |
from typing import List, Dict, Optional
|
| 9 |
from urllib.parse import urlparse
|
| 10 |
|
|
|
|
| 11 |
|
| 12 |
PAGESPEED_API = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed'
|
| 13 |
SERPAPI_URL = 'https://serpapi.com/search'
|
| 14 |
ZENSERP_URL = 'https://app.zenserp.com/api/v2/search'
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
REGION_MAP = {
|
| 17 |
-
'Saudi Arabia':
|
| 18 |
-
'Egypt':
|
| 19 |
-
'UAE':
|
| 20 |
-
'Kuwait':
|
| 21 |
-
'Jordan':
|
| 22 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
|
| 26 |
def _extract_domain(url: str) -> str:
|
| 27 |
try:
|
| 28 |
-
|
|
|
|
| 29 |
except Exception:
|
| 30 |
return url
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def _serp_search(query: str, region: str, api_key: str = None) -> List[Dict]:
|
| 34 |
-
"""Search Google via SerpAPI or ZenSerp, return organic results."""
|
| 35 |
r = REGION_MAP.get(region, REGION_MAP['Global'])
|
| 36 |
-
key = api_key or os.getenv('SERPAPI_KEY',
|
| 37 |
-
|
| 38 |
if key:
|
| 39 |
try:
|
| 40 |
resp = requests.get(SERPAPI_URL, params={
|
| 41 |
'q': query, 'location': r['location'],
|
| 42 |
'hl': r['hl'], 'gl': r['gl'],
|
| 43 |
-
'google_domain': r['domain'], 'api_key': key,
|
| 44 |
-
'num': 10
|
| 45 |
}, timeout=15)
|
| 46 |
resp.raise_for_status()
|
| 47 |
-
|
| 48 |
-
return data.get('organic_results', [])
|
| 49 |
except Exception:
|
| 50 |
pass
|
| 51 |
-
|
| 52 |
-
# ZenSerp fallback
|
| 53 |
-
zen_key = os.getenv('ZENSERP_KEY', '')
|
| 54 |
if zen_key:
|
| 55 |
try:
|
| 56 |
resp = requests.get(ZENSERP_URL, params={
|
|
@@ -58,240 +333,761 @@ def _serp_search(query: str, region: str, api_key: str = None) -> List[Dict]:
|
|
| 58 |
'hl': r['hl'], 'gl': r['gl'], 'apikey': zen_key, 'num': 10
|
| 59 |
}, timeout=15)
|
| 60 |
resp.raise_for_status()
|
| 61 |
-
|
| 62 |
-
return data.get('organic', [])
|
| 63 |
except Exception:
|
| 64 |
pass
|
| 65 |
-
|
| 66 |
return []
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def get_pagespeed(url: str) -> Dict:
|
| 70 |
-
"""
|
|
|
|
|
|
|
| 71 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
resp = requests.get(PAGESPEED_API, params={
|
| 73 |
'url': url, 'strategy': 'mobile',
|
| 74 |
-
'category': ['performance',
|
| 75 |
}, timeout=20)
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
data = resp.json()
|
| 78 |
-
cats
|
| 79 |
-
audits = data.get('lighthouseResult',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
return {
|
| 81 |
-
'performance':
|
| 82 |
-
'seo':
|
| 83 |
-
'accessibility':
|
| 84 |
-
'
|
| 85 |
-
'
|
| 86 |
-
'
|
| 87 |
-
'
|
|
|
|
|
|
|
|
|
|
| 88 |
}
|
| 89 |
except Exception:
|
| 90 |
-
return {
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
-
def
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
prompt = f"""You are a competitive intelligence analyst for {region}.
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
Region: {region}
|
| 113 |
|
| 114 |
-
|
| 115 |
-
{
|
| 116 |
|
| 117 |
-
|
| 118 |
-
{
|
| 119 |
-
|
| 120 |
-
"key_differentiators": ["what makes each competitor stand out"],
|
| 121 |
-
"your_opportunities": ["3-5 specific gaps you can exploit"],
|
| 122 |
-
"threats": ["2-3 main competitive threats"],
|
| 123 |
-
"recommended_keywords": ["5 keywords competitors rank for that you should target"],
|
| 124 |
-
"quick_wins": ["3 immediate actions to outrank competitors"],
|
| 125 |
-
"market_summary": "2-sentence market overview"
|
| 126 |
-
}}"""
|
| 127 |
|
| 128 |
-
|
| 129 |
-
if groq_key:
|
| 130 |
-
from groq import Groq
|
| 131 |
-
client = Groq(api_key=groq_key)
|
| 132 |
-
resp = client.chat.completions.create(
|
| 133 |
-
model='llama-3.3-70b-versatile',
|
| 134 |
-
messages=[{'role': 'user', 'content': prompt}],
|
| 135 |
-
temperature=0.2, max_tokens=1000
|
| 136 |
-
)
|
| 137 |
-
text = resp.choices[0].message.content
|
| 138 |
-
elif openai_key:
|
| 139 |
-
from openai import OpenAI
|
| 140 |
-
client = OpenAI(api_key=openai_key)
|
| 141 |
-
resp = client.chat.completions.create(
|
| 142 |
-
model='gpt-4o-mini',
|
| 143 |
-
messages=[{'role': 'user', 'content': prompt}],
|
| 144 |
-
temperature=0.2, max_tokens=1000
|
| 145 |
-
)
|
| 146 |
-
text = resp.choices[0].message.content
|
| 147 |
-
else:
|
| 148 |
-
return _demo_analysis(your_domain, competitors, industry, region)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
|
| 160 |
-
def
|
|
|
|
| 161 |
return {
|
| 162 |
'market_position': 'Challenger',
|
| 163 |
-
'
|
| 164 |
-
'
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
],
|
| 169 |
-
'
|
| 170 |
-
|
| 171 |
-
'المنافسون يستخدمون محتوى أطول وأكثر تفصيلاً',
|
| 172 |
-
],
|
| 173 |
-
'recommended_keywords': [f'{industry or "خدمة"} في {region}', f'أفضل {industry or "شركة"} {region}'],
|
| 174 |
-
'quick_wins': [
|
| 175 |
-
'أضف Groq API للحصول على تحليل ذكاء اصطناعي حقيقي',
|
| 176 |
-
'أنشئ صفحة مقارنة مع المنافسين',
|
| 177 |
-
'حسّن سرعة الموقع (PageSpeed < 2s)',
|
| 178 |
-
],
|
| 179 |
-
'market_summary': f'[وضع تجريبي] أضف Groq API للحصول على تحليل حقيقي لسوق {industry or "الخدمات"} في {region}.'
|
| 180 |
}
|
| 181 |
|
| 182 |
|
|
|
|
|
|
|
| 183 |
def analyze_competitors(your_url: str, region: str = 'Saudi Arabia',
|
| 184 |
industry: str = '', count: int = 7,
|
| 185 |
api_keys: dict = None) -> Dict:
|
| 186 |
-
"""
|
| 187 |
-
Full competitor intelligence pipeline:
|
| 188 |
-
1. Extract domain + build search queries
|
| 189 |
-
2. Find competitors via SerpAPI (free 100/mo)
|
| 190 |
-
3. Get PageSpeed scores (completely free)
|
| 191 |
-
4. AI strategic analysis via Groq
|
| 192 |
-
"""
|
| 193 |
api_keys = api_keys or {}
|
| 194 |
your_domain = _extract_domain(your_url)
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
-
#
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
queries.append(f'{industry or "digital marketing"} {r["location"]}')
|
| 204 |
|
| 205 |
-
#
|
| 206 |
-
|
| 207 |
-
raw_competitors =
|
|
|
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
-
#
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
#
|
| 229 |
-
|
| 230 |
-
for
|
| 231 |
-
|
| 232 |
-
|
|
|
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
|
|
|
|
|
|
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
return {
|
| 241 |
-
'your_domain':
|
| 242 |
-
'your_url':
|
| 243 |
-
'your_pagespeed':
|
| 244 |
-
'
|
| 245 |
-
'
|
| 246 |
-
'
|
| 247 |
-
'
|
| 248 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
'data_sources': {
|
| 250 |
-
'serp':
|
| 251 |
-
'pagespeed':
|
| 252 |
-
'ai':
|
| 253 |
-
|
|
|
|
| 254 |
}
|
| 255 |
}
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
def _suggest_competitors_ai(domain: str, industry: str, region: str,
|
| 259 |
-
count: int, api_keys: dict) -> List[Dict]:
|
| 260 |
-
"""When no SERP key, use AI to suggest likely competitors."""
|
| 261 |
-
groq_key = api_keys.get('groq') or os.getenv('GROQ_API_KEY', '')
|
| 262 |
-
openai_key = api_keys.get('openai') or os.getenv('OPENAI_API_KEY', '')
|
| 263 |
-
|
| 264 |
-
prompt = (f"List {count} real competitor websites for a {industry or 'digital services'} "
|
| 265 |
-
f"company in {region} similar to {domain}. "
|
| 266 |
-
f"Return ONLY a JSON array of objects: "
|
| 267 |
-
f'[{{"domain":"example.com","title":"Company Name","snippet":"brief description"}}]')
|
| 268 |
-
try:
|
| 269 |
-
text = ''
|
| 270 |
-
if groq_key:
|
| 271 |
-
from groq import Groq
|
| 272 |
-
r = Groq(api_key=groq_key).chat.completions.create(
|
| 273 |
-
model='llama-3.3-70b-versatile',
|
| 274 |
-
messages=[{'role': 'user', 'content': prompt}],
|
| 275 |
-
temperature=0.3, max_tokens=600
|
| 276 |
-
)
|
| 277 |
-
text = r.choices[0].message.content
|
| 278 |
-
elif openai_key:
|
| 279 |
-
from openai import OpenAI
|
| 280 |
-
r = OpenAI(api_key=openai_key).chat.completions.create(
|
| 281 |
-
model='gpt-4o-mini',
|
| 282 |
-
messages=[{'role': 'user', 'content': prompt}],
|
| 283 |
-
temperature=0.3, max_tokens=600
|
| 284 |
-
)
|
| 285 |
-
text = r.choices[0].message.content
|
| 286 |
-
|
| 287 |
-
if text:
|
| 288 |
-
import json, re
|
| 289 |
-
m = re.search(r'\[.*\]', text, re.DOTALL)
|
| 290 |
-
if m:
|
| 291 |
-
items = json.loads(m.group(0))
|
| 292 |
-
return [{'domain': i.get('domain',''), 'url': f"https://{i.get('domain','')}",
|
| 293 |
-
'title': i.get('title',''), 'snippet': i.get('snippet',''),
|
| 294 |
-
'position': idx+1} for idx, i in enumerate(items[:count])]
|
| 295 |
-
except Exception:
|
| 296 |
-
pass
|
| 297 |
-
return []
|
|
|
|
| 1 |
"""
|
| 2 |
+
Competitor Intelligence — Decision Engine v2
|
| 3 |
+
Pipeline:
|
| 4 |
+
1. Niche Detection (AI detects what the site actually sells/does)
|
| 5 |
+
2. Smart Keyword Generation (niche-specific, not generic)
|
| 6 |
+
3. Competitor Discovery (SerpAPI with AI filtering to remove irrelevant results)
|
| 7 |
+
4. Data Enrichment (PageSpeed real data + content signals)
|
| 8 |
+
5. Scoring Engine (weighted formula)
|
| 9 |
+
6. Segmentation (Direct / Indirect / Aspirational)
|
| 10 |
+
7. Grounded AI Insights (specific, not generic)
|
| 11 |
+
8. GEO Intelligence (regional fit per competitor)
|
| 12 |
+
9. Quick Wins (specific keyword opportunities)
|
| 13 |
"""
|
| 14 |
import os
|
| 15 |
import re
|
| 16 |
+
import json
|
| 17 |
import requests
|
| 18 |
from typing import List, Dict, Optional
|
| 19 |
from urllib.parse import urlparse
|
| 20 |
|
| 21 |
+
import time
|
| 22 |
|
| 23 |
# Google PageSpeed Insights v5 endpoint (free tier used for competitor speed scores).
PAGESPEED_API = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed'
# SERP providers: SerpAPI is the primary search backend, ZenSerp the fallback.
SERPAPI_URL = 'https://serpapi.com/search'
ZENSERP_URL = 'https://app.zenserp.com/api/v2/search'

# Rate limiting for PageSpeed API
LAST_PAGESPEED_CALL = 0  # module-level timestamp state; presumably updated by the caller — TODO confirm
PAGESPEED_DELAY = 2  # seconds between calls
|
| 30 |
+
|
| 31 |
+
# Minimal seed database - only for critical fallback.
# The system relies on AI + SerpAPI discovery, NOT this static list; it only
# guarantees at least one answer for the most common region/niche combination.
KNOWN_COMPETITORS_SEED = {
    'Saudi Arabia': {
        'digital marketing': [
            {'domain': 'socializeagency.com', 'name': 'Socialize Agency'},
            {'domain': 'webedia.me', 'name': 'Webedia Arabia'},
        ],
    },
}

# Dynamic competitor cache (in-memory; should be replaced with a database in production).
# Format: {region: {niche: [ {'domain': ..., 'name': ...}, ... ]}}
# NOTE: being process-local, it resets on restart and is not shared between workers.
DYNAMIC_COMPETITOR_CACHE = {}
|
| 45 |
+
|
| 46 |
+
def _get_cached_competitors(region: str, niche: str) -> List[Dict]:
    """Look up previously-learned competitors for a region/niche.

    Checks the in-memory dynamic cache first, then the static seed list.
    Niche matching is a loose bidirectional substring test so e.g.
    'fashion' matches a cached 'fashion e-commerce' entry. Returns []
    when nothing matches.
    """
    wanted = niche.lower().strip()

    # (source dict, log label, adjective used in the log message)
    sources = (
        (DYNAMIC_COMPETITOR_CACHE, 'Cache', 'cached'),
        (KNOWN_COMPETITORS_SEED, 'Seed', 'seed'),
    )
    for table, label, adjective in sources:
        for key, comps in table.get(region, {}).items():
            if key.lower() in wanted or wanted in key.lower():
                print(f" [{label}] Found {len(comps)} {adjective} competitors for '{key}' in {region}")
                return comps

    return []
|
| 63 |
+
|
| 64 |
+
def _cache_competitors(region: str, niche: str, competitors: List[Dict]):
    """Persist high-confidence discoveries so later runs can reuse them.

    Only entries that were verified by scraping or flagged high-confidence
    by the AI filter are stored; low-confidence results are discarded so
    the cache does not accumulate noise. (In-memory today — a database in
    production.)
    """
    if not competitors:
        return

    key = niche.lower().strip()
    bucket = DYNAMIC_COMPETITOR_CACHE.setdefault(region, {})

    keep = [
        {'domain': c['domain'], 'name': c.get('title', c['domain'])}
        for c in competitors
        if c.get('verified') or c.get('ai_confidence') == 'high'
    ]

    if keep:
        bucket[key] = keep
        print(f" [Cache] Stored {len(keep)} competitors for '{key}' in {region}")
|
| 85 |
+
|
| 86 |
+
def detect_brand_tier_ai(domain: str, snippet: str, niche: str, api_keys: dict) -> tuple:
    """Use AI to detect brand tier based on actual market presence - NO hardcoded lists.

    Returns a ``(tier, power)`` tuple where *tier* is one of
    'global_giant' / 'regional_leader' / 'established' / 'niche' and *power*
    is the scoring weight attached to that tier. Falls back to ('niche', 5)
    when no LLM key is configured or the AI call/parse fails.
    """
    # Fix: the original guard only looked for a Groq key, so deployments that
    # configure only an OpenAI key (fully supported by _llm) silently skipped
    # AI classification. Check every key source _llm itself accepts.
    has_llm = (api_keys.get('groq') or os.getenv('GROQ_API_KEY', '')
               or api_keys.get('openai') or os.getenv('OPENAI_API_KEY', ''))
    if not has_llm:
        return 'niche', 5

    prompt = f"""Analyze this business and determine its market tier:
Domain: {domain}
Description: {snippet}
Industry: {niche}

Classify into ONE tier:
- global_giant: International brand known worldwide (e.g., Amazon, Nike, McDonald's)
- regional_leader: Dominant in specific region/country (e.g., Noon in Middle East, Flipkart in India)
- established: Well-known in their market with strong presence
- niche: Small/local business or new entrant

Return ONLY JSON: {{"tier": "global_giant|regional_leader|established|niche", "reason": "brief explanation"}}"""

    try:
        text = _llm(prompt, api_keys, max_tokens=150)
        result = _parse_json(text, {})
        tier = result.get('tier', 'niche')

        # Scoring weight per tier; unknown/garbled tiers fall back to 'niche'.
        power_map = {
            'global_giant': 50,
            'regional_leader': 35,
            'established': 20,
            'niche': 5
        }
        return tier, power_map.get(tier, 5)
    except Exception:
        # Best-effort: any LLM/network/parse failure degrades to the lowest tier.
        return 'niche', 5
|
| 118 |
+
|
| 119 |
# Per-region Google search parameters used by the SERP providers:
# gl = country code, hl = interface language, location = SerpAPI location
# string, domain = regional Google domain, lang = language for generated queries.
REGION_MAP = {
    'Saudi Arabia': {'gl':'sa','hl':'ar','location':'Saudi Arabia', 'domain':'google.com.sa','lang':'Arabic'},
    'Egypt': {'gl':'eg','hl':'ar','location':'Egypt', 'domain':'google.com.eg','lang':'Arabic'},
    'UAE': {'gl':'ae','hl':'ar','location':'United Arab Emirates','domain':'google.ae', 'lang':'Arabic'},
    'Kuwait': {'gl':'kw','hl':'ar','location':'Kuwait', 'domain':'google.com.kw','lang':'Arabic'},
    'Jordan': {'gl':'jo','hl':'ar','location':'Jordan', 'domain':'google.jo', 'lang':'Arabic'},
    'Morocco': {'gl':'ma','hl':'ar','location':'Morocco', 'domain':'google.co.ma', 'lang':'Arabic'},
    'Global': {'gl':'us','hl':'en','location':'United States', 'domain':'google.com', 'lang':'English'},  # default fallback
}
|
| 128 |
+
|
| 129 |
+
# Domains to always exclude from competitor results (directories, social
# networks, review aggregators, site builders) — these show up in SERPs for
# almost any query but are never direct competitors.
EXCLUDE_DOMAINS = {
    'facebook.com','instagram.com','twitter.com','linkedin.com','youtube.com',
    'wikipedia.org','amazon.com','google.com','yelp.com','tripadvisor.com',
    'yellowpages.com','clutch.co','goodfirms.co','g2.com','capterra.com',
    'trustpilot.com','glassdoor.com','indeed.com','reddit.com','quora.com',
    'medium.com','wordpress.com','blogspot.com','wix.com','squarespace.com',
}
|
| 137 |
|
| 138 |
|
| 139 |
def _extract_domain(url: str) -> str:
|
| 140 |
try:
|
| 141 |
+
d = urlparse(url if '://' in url else 'https://'+url).netloc
|
| 142 |
+
return d.replace('www.','').strip('/')
|
| 143 |
except Exception:
|
| 144 |
return url
|
| 145 |
|
| 146 |
|
| 147 |
+
def _llm(prompt: str, api_keys: dict, max_tokens: int = 1200) -> str:
|
| 148 |
+
"""Call Groq or OpenAI."""
|
| 149 |
+
groq_key = api_keys.get('groq') or os.getenv('GROQ_API_KEY','')
|
| 150 |
+
openai_key = api_keys.get('openai') or os.getenv('OPENAI_API_KEY','')
|
| 151 |
+
if groq_key:
|
| 152 |
+
from groq import Groq
|
| 153 |
+
r = Groq(api_key=groq_key).chat.completions.create(
|
| 154 |
+
model='llama-3.3-70b-versatile',
|
| 155 |
+
messages=[{'role':'user','content':prompt}],
|
| 156 |
+
temperature=0.15, max_tokens=max_tokens
|
| 157 |
+
)
|
| 158 |
+
return r.choices[0].message.content
|
| 159 |
+
if openai_key:
|
| 160 |
+
from openai import OpenAI
|
| 161 |
+
r = OpenAI(api_key=openai_key).chat.completions.create(
|
| 162 |
+
model='gpt-4o-mini',
|
| 163 |
+
messages=[{'role':'user','content':prompt}],
|
| 164 |
+
temperature=0.15, max_tokens=max_tokens
|
| 165 |
+
)
|
| 166 |
+
return r.choices[0].message.content
|
| 167 |
+
return ''
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _parse_json(text: str, fallback):
|
| 171 |
+
"""Extract first JSON object or array from LLM text."""
|
| 172 |
+
for pattern in [r'\{.*\}', r'\[.*\]']:
|
| 173 |
+
m = re.search(pattern, text, re.DOTALL)
|
| 174 |
+
if m:
|
| 175 |
+
try:
|
| 176 |
+
return json.loads(m.group(0))
|
| 177 |
+
except Exception:
|
| 178 |
+
pass
|
| 179 |
+
return fallback
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ── Step 1: Niche Detection ───────────────────────────────────────────────────
|
| 183 |
+
|
| 184 |
+
def detect_niche(domain: str, url: str, industry_hint: str, api_keys: dict) -> Dict:
    """
    Detect the site's niche using a multi-layer approach:
    1. User hint (highest priority)
    2. AI analysis with rich context from the HOMEPAGE (not the URL path)
    3. Domain-name heuristics (fallback)

    Returns a dict with keys: niche, category, search_queries, detected, type.
    """
    domain_lower = domain.lower()

    # Quick heuristic signals: keyword fragments (English + Arabic) matched
    # against the domain name itself.
    signals = {
        'ecommerce': ['shop','store','buy','cart','abaya','fashion','clothes','wear','متجر','ملابس','عبايات'],
        'agency': ['agency','digital','marketing','seo','media','creative','وكالة','تسويق','rabhan','ads','branding'],
        'saas': ['app','platform','software','tool','dashboard','system','نظام','منصة'],
        'restaurant':['food','restaurant','cafe','مطعم','طعام','كافيه'],
        'real_estate':['property','realty','estate','عقار','شقق','مساكن'],
        'education': ['academy','school','course','learn','تعليم','أكاديمية','دورات'],
        'health': ['clinic','health','medical','doctor','صحة','عيادة','طبي'],
        'government':['gov','ministry','authority','invest','setup','misa','sagia','حكومة','وزارة'],
        'b2b_services':['consulting','advisory','business setup','company formation','استشارات','خدمات'],
    }

    # First signal group whose keyword appears in the domain wins.
    detected_type = 'business'
    for t, words in signals.items():
        if any(w in domain_lower for w in words):
            detected_type = t
            break

    # If the user provided an industry hint, trust it (highest priority).
    if industry_hint:
        niche = industry_hint
        category = detected_type

        # Generate niche-specific search queries via the LLM when available.
        if api_keys.get('groq') or api_keys.get('openai'):
            text = _llm(
                f"Generate 6 Google search queries to find DIRECT competitors of a '{industry_hint}' business in Saudi Arabia.\n"
                f"Requirements:\n"
                f"- Focus on businesses offering SAME services (not suppliers, not clients)\n"
                f"- Mix Arabic and English\n"
                f"- Be specific to the industry\n"
                f"Return ONLY JSON array: [\"query1\", \"query2\", ...]\n\n"
                f"Example for 'digital marketing agency':\n"
                f"[\"best digital marketing agencies Saudi Arabia\", \"أفضل وكالات التسويق الرقمي السعودية\", \"ecommerce marketing agencies KSA\", \"performance marketing agencies Riyadh\"]",
                api_keys, max_tokens=300
            )
            kws = _parse_json(text, [f'{industry_hint} Saudi Arabia', f'best {industry_hint} companies KSA'])
        else:
            kws = [f'{industry_hint} Saudi Arabia', f'best {industry_hint}', f'{industry_hint} companies KSA']

        return {'niche': niche, 'category': category, 'search_queries': kws, 'detected': False, 'type': category}

    # CRITICAL: always analyze the HOMEPAGE, not a URL path —
    # rebuild the root URL from the bare domain.
    homepage_url = f"https://{domain}"

    # AI detection with rich context scraped from the homepage.
    if api_keys.get('groq') or api_keys.get('openai'):
        # Scrape the homepage to understand the actual business.
        try:
            resp = requests.get(homepage_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
            html = resp.text[:10000]
            body_text = re.sub(r'<[^>]+>', ' ', html).lower()
            meta_desc = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']', html, re.I)
            site_desc = meta_desc.group(1) if meta_desc else ''
            title = re.search(r'<title>(.*?)</title>', html, re.I)
            site_title = title.group(1) if title else ''

            # Business-model indicator flags from visible page text.
            # NOTE(review): these flags are computed but not referenced in the
            # prompt below within this chunk — confirm whether they are used
            # further downstream or are dead code.
            is_ecommerce = any(x in body_text for x in ['add to cart', 'buy now', 'shop now', 'أضف للسلة', 'اشتري الآن'])
            is_government = any(x in body_text for x in ['ministry', 'government', 'authority', 'invest', 'وزارة', 'حكومة'])
            is_b2b_service = any(x in body_text for x in ['consulting', 'advisory', 'business setup', 'company formation', 'استشارات'])

        except Exception:
            # Scrape failed — proceed with empty context rather than aborting.
            body_text = ''
            site_desc = ''
            site_title = ''
            is_ecommerce = False
            is_government = False
            is_b2b_service = False

        text = _llm(
            f"Analyze this website's HOMEPAGE to detect its EXACT business model:\n"
            f"Domain: {domain}\n"
            f"Homepage URL: {homepage_url}\n"
            f"Title: {site_title}\n"
            f"Description: {site_desc}\n\n"
            f"CRITICAL: Analyze what the HOMEPAGE does, NOT what URL paths mention.\n\n"
            f"Instructions:\n"
            f"1. Determine what services/products they SELL (not what they write about)\n"
            f"2. Identify their PRIMARY business model\n"
            f"3. Distinguish between:\n"
            f" - E-commerce store (sells products online with cart/checkout)\n"
            f" - Government/Authority website (provides info/services for businesses)\n"
            f" - B2B Services (consulting, business setup, advisory)\n"
            f" - Marketing Agency (offers marketing services)\n"
            f"4. Generate 6 Google queries to find DIRECT competitors (same business model)\n\n"
            f"Examples:\n"
            f"- setupinsaudi.com → Government/B2B service (NOT e-commerce store)\n"
            f"- namshi.com → E-commerce fashion store\n"
            f"- rabhanagency.com → Marketing agency\n\n"
            f"Return ONLY JSON:\n"
            f"{{\n"
            f" \"niche\": \"specific description (e.g. 'business setup consultancy', 'fashion e-commerce')\",\n"
            f" \"category\": \"ecommerce|agency|saas|government|b2b_services|other\",\n"
            f" \"search_queries\": [\"query1\", \"query2\", ...]\n"
            f"}}",
            api_keys, max_tokens=500
        )
        result = _parse_json(text, {})
        if result and result.get('niche'):
            return {**result, 'detected': True, 'type': result.get('category', detected_type)}

    # Fallback: build a generic answer from the domain name alone.
    base_name = domain.split('.')[0]
    return {
        'niche': f'{detected_type} - {base_name}',
        'category': detected_type,
        'search_queries': [
            f'{base_name} competitors Saudi Arabia',
            f'best {detected_type} Saudi Arabia',
            f'{detected_type} companies Saudi',
        ],
        'detected': False,
        'type': detected_type
    }
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# ── Step 2: Competitor Discovery ──────────────────────────────────────────��───
|
| 313 |
+
|
| 314 |
def _serp_search(query: str, region: str, api_key: str = None) -> List[Dict]:
    """Search Google via SerpAPI (primary) or ZenSerp (fallback); return organic results.

    Region parameters (gl/hl/location/domain) come from REGION_MAP, defaulting
    to 'Global'. Returns [] when no provider key is configured or both calls fail.
    """
    r = REGION_MAP.get(region, REGION_MAP['Global'])
    key = api_key or os.getenv('SERPAPI_KEY','')
    if key:
        try:
            resp = requests.get(SERPAPI_URL, params={
                'q': query, 'location': r['location'],
                'hl': r['hl'], 'gl': r['gl'],
                'google_domain': r['domain'], 'api_key': key, 'num': 10
            }, timeout=15)
            resp.raise_for_status()
            return resp.json().get('organic_results', [])
        except Exception:
            # Best-effort: on any SerpAPI failure fall through to ZenSerp.
            pass
    # ZenSerp fallback
    zen_key = os.getenv('ZENSERP_KEY','')
    if zen_key:
        try:
            resp = requests.get(ZENSERP_URL, params={
                'q': query,  # NOTE(review): this line was truncated in the diff view — presumably the query param; confirm against upstream
                'hl': r['hl'], 'gl': r['gl'], 'apikey': zen_key, 'num': 10
            }, timeout=15)
            resp.raise_for_status()
            return resp.json().get('organic', [])
        except Exception:
            pass
    return []
|
| 340 |
|
| 341 |
|
| 342 |
+
def discover_competitors(niche_data: Dict, your_domain: str, region: str,
                         count: int, api_keys: dict) -> List[Dict]:
    """
    Find real competitors using niche-specific queries.
    Then AI-filter to remove irrelevant results (agencies, directories, etc.)
    """
    serp_key = api_keys.get('serpapi') or api_keys.get('serp') or os.getenv('SERPAPI_KEY','')
    # Never return ourselves or a known directory/social domain.
    skip = {your_domain} | EXCLUDE_DOMAINS
    found = []

    queries = niche_data.get('search_queries', [])
    if not queries:
        queries = [f'{niche_data.get("niche","business")} {region}']

    # Collect up to 2x the requested count so the AI filter has slack.
    for q in queries[:4]:
        for hit in _serp_search(q, region, serp_key):
            link = hit.get('link') or hit.get('url','')
            dom = _extract_domain(link)
            if not dom or dom in skip or len(found) >= count * 2:
                continue
            skip.add(dom)
            found.append({
                'domain': dom,
                'url': link or f'https://{dom}',
                'title': hit.get('title', dom),
                'snippet': hit.get('snippet',''),
                'serp_position': hit.get('position', len(found)+1),
            })

    # No SERP results at all — fall back to AI-suggested competitors.
    if not found:
        found = _ai_suggest_competitors(your_domain, niche_data, region, count, api_keys)

    # AI filter: drop obvious mismatches (e.g. agencies when looking for ecommerce).
    if found and (api_keys.get('groq') or os.getenv('GROQ_API_KEY','')):
        found = _ai_filter_competitors(found, niche_data, region, api_keys)

    return found[:count]
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def _ai_filter_competitors(candidates: List[Dict], niche_data: Dict,
                           region: str, api_keys: dict) -> List[Dict]:
    """Light filtering - only remove obviously wrong competitors.

    Each candidate's homepage is scraped for its real <title>, meta
    description and a text sample, so the LLM judges the actual business
    rather than a possibly stale SERP snippet. The LLM is asked to reject
    only obvious mismatches (different industry, directories, news sites,
    social platforms) and to classify keepers as Direct / Indirect /
    Aspirational. The design is deliberately lenient: on scrape failure,
    parse failure, or over-aggressive rejection (>70% dropped), the
    unfiltered candidate list is returned.

    Args:
        candidates: Competitor dicts with at least 'domain' (usually also
            'url', 'title', 'snippet'). Mutated in place with
            'actual_title' / 'actual_desc' / 'content_sample' keys.
        niche_data: Provides 'niche' and 'category' for the prompt.
        region: Market name, used only in the prompt text.
        api_keys: Forwarded to the LLM helper.

    Returns:
        Filtered (possibly unchanged) list; kept entries gain
        'competitor_type' and 'relevance_reason' keys.
    """
    niche = niche_data.get('niche','')
    category = niche_data.get('category','')

    # Quick verification: scrape homepage to check business type.
    verified_candidates = []
    for c in candidates:
        domain = c['domain']
        try:
            url = c.get('url') or f"https://{domain}"
            resp = requests.get(url, timeout=8, headers={'User-Agent': 'Mozilla/5.0'})
            html = resp.text[:6000]

            body_text = re.sub(r'<[^>]+>', ' ', html).lower()
            meta_desc = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']', html, re.I)
            desc = meta_desc.group(1)[:200] if meta_desc else ''
            title = re.search(r'<title>(.*?)</title>', html, re.I)
            page_title = title.group(1)[:150] if title else ''

            c['actual_title'] = page_title
            c['actual_desc'] = desc
            c['content_sample'] = body_text[:500]
            verified_candidates.append(c)

        except Exception as e:
            print(f" [Filter] Could not scrape {domain}, keeping anyway: {e}")
            # Keep it anyway - don't be too strict; fall back to SERP data.
            c['actual_title'] = c.get('title', '')
            c['actual_desc'] = c.get('snippet', '')
            verified_candidates.append(c)

    if not verified_candidates:
        return candidates

    # Compact payload for the prompt: only what the model needs to judge.
    items = [{
        'domain': c['domain'],
        'title': c.get('actual_title', ''),
        'description': c.get('actual_desc', ''),
        'snippet': c.get('snippet', '')[:100]
    } for c in verified_candidates]

    text = _llm(
        f"""Analyze these competitor websites for a '{niche}' business in {region}.

Your job: Remove ONLY obvious mismatches. Be LENIENT - when in doubt, keep it.

TARGET: {niche} ({category})

COMPETITORS:
{json.dumps(items, ensure_ascii=False, indent=2)}

REJECT ONLY IF:
1. Completely different industry (e.g., travel site for marketing agency target)
2. Directory/marketplace (yellowpages, clutch, etc.)
3. News/blog site
4. Social media platform

KEEP IF:
- Same or related industry (even if different focus)
- Any overlap in services
- Similar target market
- When unsure

Classify kept ones:
- Direct: Very similar services/products
- Indirect: Related industry or partial overlap
- Aspirational: Big brand in same space

Return JSON array:
[{{
  "domain": "example.com",
  "relevant": true/false,
  "type": "Direct|Indirect|Aspirational",
  "reason": "brief explanation"
}}]

Be LENIENT. Default to keeping competitors unless obviously wrong.""",
        api_keys, max_tokens=1200
    )

    filtered = _parse_json(text, [])
    if not filtered or not isinstance(filtered, list):
        print(f" [Filter] AI filtering failed, keeping all {len(verified_candidates)} competitors")
        return verified_candidates

    # FIX: skip LLM entries missing a 'domain' key — f['domain'] previously
    # raised KeyError on malformed model output and aborted the analysis.
    filter_map = {f['domain']: f for f in filtered
                  if isinstance(f, dict) and f.get('domain')}
    result = []
    for c in verified_candidates:
        # Anything the model did not mention is kept as a Direct competitor.
        info = filter_map.get(c['domain'], {'relevant': True, 'type': 'Direct'})
        is_relevant = info.get('relevant', True)

        if is_relevant:
            result.append({
                **c,
                'competitor_type': info.get('type', 'Direct'),
                'relevance_reason': info.get('reason', ''),
            })
            print(f" [Filter] ✓ {c['domain']} - {info.get('type', 'Direct')}: {info.get('reason', 'Relevant')}")
        else:
            print(f" [Filter] ✗ {c['domain']} - REJECTED: {info.get('reason', 'Not relevant')}")

    # Safety valve: if we rejected >70%, distrust the model and keep all.
    if len(result) < len(verified_candidates) * 0.3:
        print(f" [Filter] Too many rejections ({len(result)}/{len(verified_candidates)}), keeping all")
        return verified_candidates

    return result if result else verified_candidates
+
|
| 494 |
+
def _ai_suggest_competitors(domain: str, niche_data: Dict, region: str,
                            count: int, api_keys: dict) -> List[Dict]:
    """AI suggests REAL competitors with seed database fallback.

    Flow: scrape the target's own homepage for title/description/service
    keywords → prompt the LLM (optionally seeded with cached competitors
    for this region/niche) → lightly verify each suggested domain with a
    HEAD/GET probe → cache good results via ``_cache_competitors``.

    Args:
        domain: The analyzed site's bare domain (excluded from results).
        niche_data: Provides 'niche' and 'category' for the prompt.
        region: Market name used in the prompt and cache key.
        count: Target number of competitors; the LLM is asked for count+5.
        api_keys: Forwarded to the LLM helper.

    Returns:
        Up to ``count`` verified competitor dicts with 'domain', 'url',
        'title', 'snippet', 'competitor_type', 'serp_position',
        'ai_confidence' and 'verified' keys.
    """
    niche = niche_data.get('niche', domain)
    # NOTE(review): `category` is read but never used below — kept for
    # interface stability; confirm whether the prompt should include it.
    category = niche_data.get('category', 'business')

    # First, get actual website content to understand the business.
    try:
        url = f"https://{domain}"
        resp = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        html = resp.text[:8000]
        meta_desc = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']', html, re.I)
        site_desc = meta_desc.group(1) if meta_desc else ''
        title = re.search(r'<title>(.*?)</title>', html, re.I)
        site_title = title.group(1) if title else ''
        body_text = re.sub(r'<[^>]+>', ' ', html).lower()
        # Keyword sniffing (English + Arabic terms) to describe the
        # target's service mix in the prompt.
        services = []
        if 'seo' in body_text: services.append('SEO')
        if 'social media' in body_text or 'سوشيال ميديا' in body_text: services.append('Social Media')
        if 'content' in body_text or 'محتوى' in body_text: services.append('Content Marketing')
        if 'ppc' in body_text or 'ads' in body_text or 'إعلانات' in body_text: services.append('Paid Ads')
        if 'branding' in body_text or 'علامة تجارية' in body_text: services.append('Branding')
        if 'web' in body_text or 'website' in body_text or 'موقع' in body_text: services.append('Web Development')
    except Exception:
        # Homepage unreachable: continue with empty context rather than failing.
        site_desc = ''
        site_title = ''
        services = []

    # Check if we have cached competitors for this region/niche.
    seed_competitors = _get_cached_competitors(region, niche)

    # Request MORE competitors than needed (AI will suggest extras;
    # verification below may drop some).
    request_count = count + 5

    # Build prompt with seed examples if available.
    seed_examples = ''
    if seed_competitors:
        seed_examples = f"\n\nKNOWN COMPETITORS in {region} for this industry:\n"
        for s in seed_competitors[:5]:
            seed_examples += f"- {s['domain']} ({s['name']})\n"
        seed_examples += "\nInclude these if relevant, and find similar ones.\n"

    text = _llm(
        f"""List {request_count} real competitor companies for this business in {region}:

TARGET BUSINESS:
Domain: {domain}
Title: {site_title}
Description: {site_desc}
Services: {', '.join(services) if services else 'digital marketing'}
Industry: {niche}
Region: {region}{seed_examples}

INSTRUCTIONS:
1. Focus on {region} market (local and regional competitors)
2. Include competitors of different sizes:
- 2-3 big established brands (aspirational)
- 3-4 direct competitors (same size/services)
- 2-3 smaller/niche players
3. Competitors must be in the SAME industry:
- If target is 'digital marketing agency' → return marketing/advertising agencies (NOT content creators like Telfaz11/Uturn)
- If target is 'ecommerce' → return online stores
- If target is 'SaaS' → return software platforms
4. Mix of .sa, .ae, .com, .eg domains (based on region)
5. EXCLUDE content creators/media companies (Telfaz11, Uturn) unless target IS a media company

Return JSON array (suggest {request_count} competitors):
[{{
"domain": "competitor.com",
"title": "Company Name",
"snippet": "Brief description",
"competitor_type": "Direct|Indirect|Aspirational",
"confidence": "high|medium"
}}]

Include competitors even if moderately confident.""",
        api_keys, max_tokens=2000
    )

    items = _parse_json(text, [])
    if not isinstance(items, list):
        items = []

    print(f" [AI] Suggested {len(items)} competitors")

    # If AI returned nothing or very few, pad from the seed database.
    if len(items) < count // 2 and seed_competitors:
        print(f" [AI] AI returned too few ({len(items)}), using seed database")
        for s in seed_competitors:
            if s['domain'] != domain:  # Don't include self
                items.append({
                    'domain': s['domain'],
                    'title': s['name'],
                    'snippet': f"Known competitor in {region}",
                    'competitor_type': 'Direct',
                    'confidence': 'high'
                })

    # Light verification - only check if domain resolves (don't reject too many).
    result = []
    for idx, i in enumerate(items):
        if not isinstance(i, dict) or not i.get('domain'):
            continue

        comp_domain = i.get('domain', '').strip()
        if not comp_domain or comp_domain == domain:
            continue

        # Skip obvious bad (placeholder) domains the LLM may echo back.
        if comp_domain in ['example.com', 'competitor.com', 'agency.com']:
            continue

        # Skip content creators when the target is a marketing agency.
        if 'marketing' in niche.lower() or 'agency' in niche.lower():
            if any(x in comp_domain.lower() for x in ['telfaz11', 'uturn', 'youtube', 'tiktok']):
                print(f" [AI] ✗ {comp_domain} - content creator, not agency")
                continue

        # Skip e-commerce stores when the target is government/B2B services.
        if 'government' in niche.lower() or 'b2b' in niche.lower() or 'business setup' in niche.lower():
            if any(x in comp_domain.lower() for x in ['noon', 'namshi', 'souq', 'amazon', 'jarir', 'extra', 'lulu', 'danube']):
                print(f" [AI] ✗ {comp_domain} - e-commerce store, not B2B service")
                continue

        # Try light verification (HEAD request with short timeout).
        verified = False
        try:
            comp_url = f"https://{comp_domain}"
            verify_resp = requests.head(comp_url, timeout=3, allow_redirects=True)
            verified = verify_resp.status_code < 500
        except Exception:
            # If HEAD fails (some servers reject HEAD), try GET with very short timeout.
            try:
                verify_resp = requests.get(f"https://{comp_domain}", timeout=3, headers={'User-Agent': 'Mozilla/5.0'})
                verified = verify_resp.status_code < 500
            except Exception:
                # If both fail, still include if confidence is high or from seed.
                verified = i.get('confidence') == 'high'

        if verified or i.get('confidence') == 'high':
            result.append({
                'domain': comp_domain,
                'url': f"https://{comp_domain}",
                'title': i.get('title',''),
                'snippet': i.get('snippet',''),
                'competitor_type': i.get('competitor_type','Direct'),
                'serp_position': idx+1,
                'ai_confidence': i.get('confidence', 'medium'),
                'verified': verified
            })
            print(f" [AI] ✓ {comp_domain} - {i.get('competitor_type', 'Direct')} ({i.get('confidence', 'medium')} confidence)")
        else:
            print(f" [AI] ✗ {comp_domain} - verification failed")

        if len(result) >= count:
            break

    print(f" [AI] Returning {len(result)} verified competitors")

    # Cache successful results so future analyses can seed from them.
    if len(result) >= count // 2:
        _cache_competitors(region, niche, result)

    return result
| 658 |
+
|
| 659 |
+
|
| 660 |
+
# ── Step 3: Data Enrichment ───────────────────────────────────────────────────
|
| 661 |
+
|
| 662 |
def get_pagespeed(url: str) -> Dict:
    """Google PageSpeed — with rate limiting and smart fallback.

    Calls the PageSpeed Insights v5 API (mobile strategy) after enforcing a
    minimum delay between calls via the module-level ``LAST_PAGESPEED_CALL``
    / ``PAGESPEED_DELAY`` pair. On HTTP 429, any non-200 status, or any
    exception, falls back to heuristic estimates from ``_fallback_pagespeed``.

    Args:
        url: Page URL; 'https://' is prepended when no scheme is present.

    Returns:
        Dict with 'performance', 'seo', 'accessibility', 'best_practices'
        (0-100 ints), Core Web Vitals display strings ('fcp', 'lcp', 'cls',
        'tbt'), 'has_https', and a 'source' tag ('pagespeed' / 'estimated' /
        'fallback') indicating data quality.
    """
    # Mutates the module-level timestamp used for rate limiting.
    global LAST_PAGESPEED_CALL

    try:
        # Rate limiting: wait between calls so we don't trip the API quota.
        now = time.time()
        elapsed = now - LAST_PAGESPEED_CALL
        if elapsed < PAGESPEED_DELAY:
            time.sleep(PAGESPEED_DELAY - elapsed)

        # Ensure URL has protocol.
        if not url.startswith('http'):
            url = f'https://{url}'

        LAST_PAGESPEED_CALL = time.time()

        resp = requests.get(PAGESPEED_API, params={
            'url': url, 'strategy': 'mobile',
            'category': ['performance','seo']
        }, timeout=20)

        if resp.status_code == 429:
            print(f"[PageSpeed] Rate limited for {url} - using fallback")
            return _fallback_pagespeed(url)

        if resp.status_code != 200:
            print(f"[PageSpeed] Failed for {url}: {resp.status_code}")
            return _fallback_pagespeed(url)

        data = resp.json()
        cats = data.get('lighthouseResult',{}).get('categories',{})
        audits = data.get('lighthouseResult',{}).get('audits',{})

        # NOTE(review): only 'performance' and 'seo' categories are requested
        # above, so 'accessibility'/'best-practices' are likely absent and
        # the 0.7/0.8 defaults below apply — confirm intended behavior.
        result = {
            'performance': round((cats.get('performance',{}).get('score') or 0)*100),
            'seo': round((cats.get('seo',{}).get('score') or 0)*100),
            'accessibility': round((cats.get('accessibility',{}).get('score') or 0.7)*100),
            'best_practices':round((cats.get('best-practices',{}).get('score') or 0.8)*100),
            'fcp': audits.get('first-contentful-paint',{}).get('displayValue','—'),
            'lcp': audits.get('largest-contentful-paint',{}).get('displayValue','—'),
            'cls': audits.get('cumulative-layout-shift',{}).get('displayValue','—'),
            'tbt': audits.get('total-blocking-time',{}).get('displayValue','—'),
            'has_https': url.startswith('https://'),
            'source': 'pagespeed'
        }
        print(f"[PageSpeed] ✓ {url}: SEO={result['seo']} Perf={result['performance']}")
        return result

    except Exception as e:
        print(f"[PageSpeed] Error for {url}: {e}")
        return _fallback_pagespeed(url)
| 714 |
+
|
| 715 |
+
def _fallback_pagespeed(url: str) -> Dict:
    """Estimate scores based on basic checks when PageSpeed fails.

    Probes the URL with a single HEAD request and derives rough heuristic
    scores from HTTPS usage and reachability. If even that probe raises,
    fixed conservative defaults are returned instead.
    """
    secure = url.startswith('https://')
    try:
        probe = requests.head(url, timeout=5, allow_redirects=True)
    except Exception:
        # Site not reachable at all: most conservative estimate.
        return {
            'performance': 50,
            'seo': 50,
            'accessibility': 60,
            'best_practices': 60,
            'fcp': '—',
            'lcp': '—',
            'cls': '—',
            'tbt': '—',
            'has_https': secure,
            'source': 'fallback',
        }

    reachable = probe.status_code == 200
    # Heuristic scoring: HTTPS lifts SEO/best-practices, a 200 lifts performance.
    return {
        'performance': 65 if reachable else 40,
        'seo': 70 if secure else 50,
        'accessibility': 70,
        'best_practices': 75 if secure else 60,
        'fcp': '~2.5s',
        'lcp': '~3.0s',
        'cls': '~0.1',
        'tbt': '~200ms',
        'has_https': secure,
        'source': 'estimated',
    }
| 751 |
|
| 752 |
|
| 753 |
+
def get_content_signals(url: str) -> Dict:
    """Scrape basic content signals from homepage — free.

    Fetches the page with a bot User-Agent and derives boolean/count
    signals from simple substring and regex checks on the raw HTML
    (English and Arabic keywords). Any failure returns the empty-signal
    dict from ``_empty_content``.

    Args:
        url: Page URL; 'https://' is prepended when no scheme is present.

    Returns:
        Dict with keys: has_schema, has_arabic, word_count (capped at
        50000), has_blog, has_faq, has_reviews, image_count, has_video,
        has_meta_desc, meta_desc (first 150 chars).
    """
    try:
        # Ensure URL has protocol.
        if not url.startswith('http'):
            url = f'https://{url}'

        resp = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; GEOBot/1.0)'
        })

        if resp.status_code != 200:
            print(f"[Content] Failed for {url}: {resp.status_code}")
            return _empty_content()

        html = resp.text
        # Count signals via cheap substring/regex checks on the raw HTML.
        has_schema = 'application/ld+json' in html  # JSON-LD structured data present
        has_arabic = bool(re.search(r'[\u0600-\u06FF]', html))  # any Arabic-range char
        word_count = len(re.sub(r'<[^>]+>','',html).split())  # crude tag-stripped word count
        has_blog = any(x in html.lower() for x in ['/blog','/articles','/news','/مقالات'])
        has_faq = any(x in html.lower() for x in ['faq','frequently','الأسئلة','أسئلة'])
        has_reviews = any(x in html.lower() for x in ['review','rating','تقييم','مراجعة'])
        img_count = html.lower().count('<img')
        has_video = 'youtube.com' in html or 'vimeo.com' in html or '<video' in html
        meta_desc = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']', html, re.I)
        return {
            'has_schema': has_schema,
            'has_arabic': has_arabic,
            'word_count': min(word_count, 50000),
            'has_blog': has_blog,
            'has_faq': has_faq,
            'has_reviews': has_reviews,
            'image_count': img_count,
            'has_video': has_video,
            'has_meta_desc': bool(meta_desc),
            'meta_desc': meta_desc.group(1)[:150] if meta_desc else '',
        }
    except Exception as e:
        print(f"[Content] Error for {url}: {e}")
        return _empty_content()
| 794 |
+
|
| 795 |
+
def _empty_content():
|
| 796 |
+
return {'has_schema':False,'has_arabic':False,'word_count':0,'has_blog':False,
|
| 797 |
+
'has_faq':False,'has_reviews':False,'image_count':0,'has_video':False,
|
| 798 |
+
'has_meta_desc':False,'meta_desc':''}
|
| 799 |
+
|
| 800 |
+
|
| 801 |
+
# ── Step 4: Scoring Engine ────────────────────────────────────────────────────
|
| 802 |
+
|
| 803 |
+
def calculate_competitor_score(ps: Dict, content: Dict, serp_pos: int, niche: str, api_keys: dict, is_your_site: bool = False) -> Dict:
    """Universal scoring using AI for brand detection - NO hardcoded lists.

    Combines three signals: website quality (SEO 40% + performance 30% +
    content 30%), market power (base 30 plus an AI-detected brand-tier
    bonus, SERP rank bonus, reviews and HTTPS), and a GEO-fit sub-score.
    The website-quality/market-power weighting of the final total shifts
    toward market power for bigger brand tiers.

    Args:
        ps: PageSpeed result dict ('seo', 'performance', 'has_https', 'source').
        content: Content-signal dict; also carries 'domain' and 'meta_desc'
            which feed brand detection.
        serp_pos: Search-result rank (lower is better).
        niche: Niche label, passed to the brand-tier detector.
        api_keys: Forwarded to the brand-tier detector.
        is_your_site: Accepted for interface compatibility; not read here.

    Returns:
        Dict with 'total', 'website_quality', 'market_power', 'brand_tier',
        a 'breakdown' sub-dict, a letter 'grade', and 'data_quality'.
    """
    def safe(v, default=60):
        # Guard against None / non-numeric values from fallback data paths.
        return v if (v is not None and isinstance(v, (int, float))) else default

    seo_score = safe(ps.get('seo'), 60)
    perf_score = safe(ps.get('performance'), 60)

    # Additive content score from homepage signals, capped at 100.
    content_score = 0
    wc = content.get('word_count', 0)
    if wc > 500: content_score += 25
    if wc > 2000: content_score += 15
    if content.get('has_schema'): content_score += 20
    if content.get('has_blog'): content_score += 15
    if content.get('has_faq'): content_score += 10
    if content.get('has_reviews'): content_score += 10
    if content.get('has_meta_desc'): content_score += 5
    content_score = min(100, content_score)

    website_quality = round((seo_score * 0.4 + perf_score * 0.3 + content_score * 0.3))

    # Market power: base 30 + AI brand-tier bonus + SERP rank + trust signals.
    market_power = 30
    domain = content.get('domain', '')
    snippet = content.get('meta_desc', '')
    brand_tier, power_bonus = detect_brand_tier_ai(domain, snippet, niche, api_keys)
    market_power += power_bonus

    if serp_pos <= 3: market_power += 15
    elif serp_pos <= 5: market_power += 10
    elif serp_pos <= 10: market_power += 5

    if content.get('has_reviews'): market_power += 5
    if ps.get('has_https'): market_power += 3
    market_power = min(100, market_power)

    # Bigger brands derive more of their total from market power than from
    # raw website quality.
    if brand_tier == 'global_giant':
        combined = round(website_quality * 0.25 + market_power * 0.75)
    elif brand_tier == 'regional_leader':
        combined = round(website_quality * 0.3 + market_power * 0.7)
    elif brand_tier == 'established':
        combined = round(website_quality * 0.4 + market_power * 0.6)
    else:
        combined = round(website_quality * 0.6 + market_power * 0.4)

    # GEO fit: rewards Arabic content and structured data, capped at 100.
    geo_fit = 50
    if content.get('has_arabic'): geo_fit += 30
    if content.get('has_schema'): geo_fit += 20
    geo_fit = min(100, geo_fit)

    return {
        'total': combined,
        'website_quality': website_quality,
        'market_power': market_power,
        'brand_tier': brand_tier,
        'breakdown': {'seo': seo_score, 'performance': perf_score, 'content': content_score, 'geo_fit': geo_fit},
        'grade': 'A' if combined>=85 else 'B' if combined>=70 else 'C' if combined>=55 else 'D',
        'data_quality': ps.get('source', 'unknown')
    }
| 861 |
+
|
| 862 |
+
|
| 863 |
+
|
| 864 |
+
# ── Step 5: Grounded AI Insights ─────────────────────────────────────────────
|
| 865 |
|
| 866 |
+
def generate_insights(your_domain: str, your_score: Dict, your_content: Dict,
                      competitors: List[Dict], niche: str, region: str,
                      api_keys: dict) -> Dict:
    """Generate specific, grounded insights — not generic templates.

    Builds a compact JSON context from your site's scores/content signals
    and the top 6 competitors, then asks the LLM for realistic,
    data-referencing insights. Returns the Arabic demo payload from
    ``_demo_insights`` when no Groq/OpenAI key is configured or when the
    LLM response cannot be parsed into the expected schema.

    Args:
        your_domain: Analyzed site's domain.
        your_score: Output of calculate_competitor_score for your site.
        your_content: Output of get_content_signals for your site.
        competitors: Enriched competitor dicts (score/pagespeed/content).
        niche: Detected niche label.
        region: Market name.
        api_keys: May carry 'groq'/'openai'; environment variables act as
            fallbacks.

    Returns:
        Dict matching the JSON schema embedded in the prompt
        (market_position, market_summary, strengths, threats, ...).
    """
    # No LLM key at all → demo payload instead of a failed call.
    if not (api_keys.get('groq') or os.getenv('GROQ_API_KEY','') or
            api_keys.get('openai') or os.getenv('OPENAI_API_KEY','')):
        return _demo_insights(your_domain, competitors, niche, region)

    # Build rich data context (top 6 competitors only, to bound prompt size).
    comp_data = []
    for c in competitors[:6]:
        comp_data.append({
            'domain': c['domain'],
            'score': c.get('score',{}).get('total','?'),
            'website_quality': c.get('score',{}).get('website_quality','?'),
            'market_power': c.get('score',{}).get('market_power','?'),
            'brand_tier': c.get('score',{}).get('brand_tier','unknown'),
            'type': c.get('competitor_type','Direct'),
            'seo': c.get('pagespeed',{}).get('seo','?'),
            'perf': c.get('pagespeed',{}).get('performance','?'),
            'has_arabic': c.get('content',{}).get('has_arabic',False),
            'has_blog': c.get('content',{}).get('has_blog',False),
            'has_schema': c.get('content',{}).get('has_schema',False),
            'word_count': c.get('content',{}).get('word_count',0),
            'snippet': c.get('snippet','')[:100],
        })

    # Your site's data in the same shape, for apples-to-apples comparison.
    your_data = {
        'domain': your_domain,
        'score': your_score.get('total','?'),
        'website_quality': your_score.get('website_quality','?'),
        'market_power': your_score.get('market_power','?'),
        'brand_tier': your_score.get('brand_tier','niche'),
        'seo': your_score.get('breakdown',{}).get('seo','?'),
        'perf': your_score.get('breakdown',{}).get('performance','?'),
        'has_arabic': your_content.get('has_arabic',False),
        'has_blog': your_content.get('has_blog',False),
        'has_schema': your_content.get('has_schema',False),
        'word_count': your_content.get('word_count',0),
    }

    prompt = f"""You are a competitive intelligence analyst for {region}.
Niche: {niche}

YOUR SITE DATA:
{json.dumps(your_data, ensure_ascii=False)}

COMPETITOR DATA:
{json.dumps(comp_data, ensure_ascii=False)}

IMPORTANT CONTEXT:
- Your site brand tier: {your_data.get('brand_tier', 'niche')}
- Competitors include: {', '.join([c['domain'] + ' (' + c.get('brand_tier', 'unknown') + ')' for c in comp_data[:3]])}

Generate REALISTIC, DATA-DRIVEN insights. DO NOT claim market leadership if competing against established brands.

RULES:
1. If competitors include 'global_giant' or 'regional_leader' brands, acknowledge their dominance
2. Focus on YOUR competitive advantages (website quality, niche focus, local optimization)
3. NO generic advice - every insight must reference actual data
4. Be honest about market position

Return ONLY valid JSON:
{{
"market_position": "Niche Player|Emerging Challenger|Established Player|Regional Leader|Market Leader",
"market_summary": "2 realistic sentences acknowledging actual market dynamics and competitor strength",
"your_strengths": ["specific strength: e.g. 'Website quality score 85 vs competitor average 65'"],
"your_weaknesses": ["realistic weakness: e.g. 'Competing against Namshi (regional leader) with 10x traffic'"],
"direct_threats": [
{{"competitor": "domain", "threat": "specific: e.g. 'Brand recognition + SEO 92'", "their_advantage": "data: e.g. 'Established brand + 2M monthly visits'"}}
],
"opportunities": [
{{"action": "specific niche opportunity: e.g. 'Target long-tail Arabic keywords competitors ignore'", "reason": "gap in data", "impact": "High|Medium"}}
],
"quick_wins": [
{{"win": "actionable: e.g. 'Optimize for specific abaya styles - low competition'", "keyword": "exact keyword", "effort": "Low|Medium"}}
],
"content_gaps": ["specific: e.g. 'Size guide content - only 1/7 competitors have it'"],
"geo_opportunities": ["specific: e.g. 'Saudi-specific payment methods - competitive advantage'"]
}}"""

    text = _llm(prompt, api_keys, max_tokens=1500)
    result = _parse_json(text, {})
    # Accept the LLM answer only if it parsed and carries the key field.
    if result and result.get('market_summary'):
        return result
    return _demo_insights(your_domain, competitors, niche, region)
| 952 |
|
| 953 |
|
| 954 |
+
def _demo_insights(your_domain: str, competitors: List[Dict], niche: str, region: str) -> Dict:
|
| 955 |
+
top_domain = competitors[0]['domain'] if competitors else 'المنافس الأول'
|
| 956 |
return {
|
| 957 |
'market_position': 'Challenger',
|
| 958 |
+
'market_summary': f'[وضع تجريبي] أضف Groq API للحصول على تحليل حقيقي. السوق في {region} لـ {niche} تنافسي.',
|
| 959 |
+
'your_strengths': ['أضف Groq API لاكتشاف نقاط قوتك الحقيقية'],
|
| 960 |
+
'your_weaknesses': [f'{top_domain} يتفوق عليك — أضف API لمعرفة السبب الدقيق'],
|
| 961 |
+
'direct_threats': [{'competitor': top_domain, 'threat': 'يحتل مرتبة أعلى في Google', 'their_advantage': 'بيانات غير متاحة'}],
|
| 962 |
+
'opportunities': [{'action': 'أضف Groq API', 'reason': 'للحصول على فرص حقيقية مبنية على البيانات', 'impact': 'High'}],
|
| 963 |
+
'quick_wins': [{'win': 'أضف مفتاح Groq API في الإعدادات', 'keyword': '', 'effort': 'Low'}],
|
| 964 |
+
'content_gaps': ['أضف API لاكتشاف الفجوات الحقيقية'],
|
| 965 |
+
'geo_opportunities': [f'استهداف كلمات {niche} في {region} بمحتوى عربي']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
}
|
| 967 |
|
| 968 |
|
| 969 |
+
# ── Main Pipeline ─────────────────────────────────────────────────────────────
|
| 970 |
+
|
| 971 |
def analyze_competitors(your_url: str, region: str = 'Saudi Arabia',
                        industry: str = '', count: int = 7,
                        api_keys: dict = None) -> Dict:
    """Run the full competitor-intelligence pipeline for one site.

    Steps (each logged to stdout): niche detection, competitor discovery,
    per-competitor enrichment (PageSpeed + content signals + scoring),
    analysis of the caller's own site, segmentation by competitor type,
    AI-generated insights, and a rule-based market-position ranking.

    Args:
        your_url: Full URL of the site being analyzed.
        region: Target market region (defaults to 'Saudi Arabia').
        industry: Optional industry hint; empty string triggers auto-detect.
        count: Desired number of competitors to discover.
        api_keys: Optional per-request API keys (e.g. 'serpapi', 'groq',
            'openai'); environment variables are used as a fallback.

    Returns:
        Dict with the caller's metrics, ranked competitor list,
        segmentation buckets, AI insights, market position, and a
        'data_sources' map flagging which backends were available.
    """
    api_keys = api_keys or {}
    your_domain = _extract_domain(your_url)

    print(f"\n[Competitor Intel] Starting analysis for {your_domain} in {region}")
    print(f" Industry hint: {industry or 'auto-detect'}")
    print(f" Target count: {count} competitors")

    # Step 1: Detect niche
    print(f"\n[Step 1/6] Detecting niche...")
    niche_data = detect_niche(your_domain, your_url, industry, api_keys)
    # Fall back to the industry hint (or bare domain) if detection found nothing.
    niche = niche_data.get('niche', industry or your_domain)
    print(f" Detected: {niche} ({niche_data.get('category','unknown')})")
    print(f" Search queries: {niche_data.get('search_queries',[])}")

    # Step 2: Discover competitors
    print(f"\n[Step 2/6] Discovering competitors...")
    raw_competitors = discover_competitors(niche_data, your_domain, region, count, api_keys)
    print(f" Found {len(raw_competitors)} competitors")

    # Step 3: Enrich each competitor (with progress logging)
    print(f"\n[Step 3/6] Enriching {len(raw_competitors)} competitors...")
    enriched = []
    for idx, comp in enumerate(raw_competitors, 1):
        # Discovery results may lack a full URL; synthesize one from the domain.
        url = comp.get('url') or f"https://{comp['domain']}"
        print(f" [{idx}/{len(raw_competitors)}] Analyzing {comp['domain']}...")

        ps = get_pagespeed(url)
        content = get_content_signals(url)
        content['domain'] = comp['domain']  # Pass domain for brand detection
        # serp_position defaults to 10 (worst page-one slot) when unknown.
        score = calculate_competitor_score(ps, content, comp.get('serp_position', 10), niche, api_keys, is_your_site=False)

        enriched.append({
            **comp,
            'pagespeed': ps,
            'content': content,
            'score': score,
        })
        print(f" Score: {score.get('total','?')}/100 | Brand: {score.get('brand_tier','?')} | SEO: {ps.get('seo','?')} | Perf: {ps.get('performance','?')}")

    # Sort by score descending
    enriched.sort(key=lambda x: x.get('score',{}).get('total',0), reverse=True)

    # Step 4: Your own data
    print(f"\n[Step 4/6] Analyzing your site: {your_url}...")
    your_ps = get_pagespeed(your_url)
    your_content = get_content_signals(your_url)
    your_content['domain'] = your_domain
    # serp_position=0: the caller's own site isn't penalized for SERP rank.
    your_score = calculate_competitor_score(your_ps, your_content, 0, niche, api_keys, is_your_site=True)
    print(f" Your Score: {your_score.get('total','?')}/100 | Brand: {your_score.get('brand_tier','?')} | SEO: {your_ps.get('seo','?')} | Perf: {your_ps.get('performance','?')}")

    # Step 5: Segmentation
    print(f"\n[Step 5/6] Segmenting competitors...")
    # Missing competitor_type is treated as 'Direct' (the default bucket).
    direct = [c for c in enriched if c.get('competitor_type','Direct') == 'Direct']
    indirect = [c for c in enriched if c.get('competitor_type') == 'Indirect']
    aspirational = [c for c in enriched if c.get('competitor_type') == 'Aspirational']
    print(f" Direct: {len(direct)} | Indirect: {len(indirect)} | Aspirational: {len(aspirational)}")

    # Step 6: AI Insights (grounded)
    print(f"\n[Step 6/6] Generating AI insights...")
    insights = generate_insights(your_domain, your_score, your_content,
                                 enriched, niche, region, api_keys)

    # Step 7: Calculate market position (REALISTIC)
    # Rank = 1-based position of our total score among all sites; on a tie,
    # .index() finds the first occurrence, so ties resolve in our favor.
    all_scores = [your_score.get('total', 0)] + [c.get('score',{}).get('total',0) for c in enriched]
    your_rank = sorted(all_scores, reverse=True).index(your_score.get('total', 0)) + 1

    your_brand_tier = your_score.get('brand_tier', 'niche')
    competitor_tiers = [c.get('score',{}).get('brand_tier','niche') for c in enriched]

    has_global_giants = 'global_giant' in competitor_tiers
    has_regional_leaders = 'regional_leader' in competitor_tiers
    has_established = 'established' in competitor_tiers

    # Position label is driven primarily by brand tier, then by what tiers
    # of competitors are present, and only lastly by raw score rank.
    if your_brand_tier == 'global_giant':
        market_position = 'Market Leader'
    elif your_brand_tier == 'regional_leader':
        market_position = 'Regional Leader' if has_global_giants else 'Market Leader'
    elif your_brand_tier == 'established':
        market_position = 'Established Player' if (has_global_giants or has_regional_leaders) else 'Market Leader'
    else:
        if has_global_giants or has_regional_leaders:
            market_position = 'Niche Player'
        elif has_established:
            market_position = 'Emerging Challenger'
        elif your_rank <= 2:
            market_position = 'Strong Challenger'
        else:
            market_position = 'New Entrant'

    print(f" Market Position: #{your_rank} - {market_position} (Brand: {your_brand_tier})")
    print(f" Website Quality: {your_score.get('website_quality','?')}/100 | Market Power: {your_score.get('market_power','?')}/100")
    print(f"\n[Competitor Intel] Analysis complete!\n")

    return {
        'your_domain': your_domain,
        'your_url': your_url,
        'your_pagespeed': your_ps,
        'your_content': your_content,
        'your_score': your_score,
        'your_rank': your_rank,
        'market_position': market_position,
        'niche': niche,
        'niche_detected': niche_data.get('detected', False),
        'region': region,
        'competitors': enriched,
        'segmentation': {
            'direct': direct,
            'indirect': indirect,
            'aspirational': aspirational,
        },
        'competitor_count': len(enriched),
        'insights': insights,
        # Which backends were actually available for this run; per-request
        # keys take effect alongside environment-variable fallbacks.
        'data_sources': {
            'serp': bool(os.getenv('SERPAPI_KEY') or api_keys.get('serpapi')),
            'pagespeed': True,
            'ai': bool(os.getenv('GROQ_API_KEY') or api_keys.get('groq') or
                       os.getenv('OPENAI_API_KEY') or api_keys.get('openai')),
            'content_scraping': True,
        }
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|