yashgori20 committed on
Commit 7da164e · 1 Parent(s): 5f0cfa7
Files changed (1)
  1. modules/keywords.py +484 -245
modules/keywords.py CHANGED
@@ -1,64 +1,114 @@
"""
Keywords Rankings Module for SEO Report Generator
-Supports Google Search Console API (primary) and SERP API (fallback)
"""

import os
import requests
import json
-from typing import Dict, Any, List, Optional
from urllib.parse import urlparse
from datetime import datetime, timedelta


class ModuleResult:
    """Standard result object for SEO modules"""
-    def __init__(self, success: bool, data: Dict[str, Any], error: str = None):
-        self.success = success
-        self.data = data
-        self.error = error


class KeywordsModule:
    def __init__(self):
-        self.gsc_api_key = os.getenv('GOOGLE_SEARCH_CONSOLE_API_KEY')
-        self.serp_api_key = os.getenv('SERP_API_KEY')  # SerpAPI or similar
-        self.data_for_seo_key = os.getenv('DATAFORSEO_API_KEY')

-    def analyze(self, url: str, quick_scan: bool = False) -> ModuleResult:
        """
-        Analyze keyword rankings for the given URL

        Args:
            url: Target website URL
-            quick_scan: If True, use limited data for competitor analysis

        Returns:
-            ModuleResult with keywords data
        """
        try:
            domain = self._extract_domain(url)

-            # Try Google Search Console first (if credentials available)
-            if self.gsc_api_key:
-                result = self._analyze_with_gsc(domain, quick_scan)
-                if result.success:
-                    return result

-            # Fallback to SERP API
-            if self.serp_api_key:
-                result = self._analyze_with_serp_api(domain, quick_scan)
-                if result.success:
-                    return result

-            # Fallback to DataForSEO
-            if self.data_for_seo_key:
-                result = self._analyze_with_dataforseo(domain, quick_scan)
-                if result.success:
-                    return result

-            # No API keys available - return placeholder data
-            return self._generate_placeholder_data(domain)

        except Exception as e:
            return ModuleResult(
@@ -73,243 +123,432 @@ class KeywordsModule:
            url = 'https://' + url
        return urlparse(url).netloc.replace('www.', '')

-    def _analyze_with_gsc(self, domain: str, quick_scan: bool) -> ModuleResult:
-        """Analyze with Google Search Console API"""
        try:
-            # Note: GSC API requires site verification and proper setup
-            # This is a simplified implementation - real GSC API needs OAuth2
-
-            # GSC API endpoint (simplified)
-            base_url = "https://searchconsole.googleapis.com/webmasters/v3/sites"
-            site_url = f"https://{domain}/"
-
-            # Get search analytics data
-            analytics_url = f"{base_url}/{site_url}/searchAnalytics/query"

-            # Date range (last 90 days)
-            end_date = datetime.now().date()
-            start_date = end_date - timedelta(days=90)

-            payload = {
-                "startDate": start_date.isoformat(),
-                "endDate": end_date.isoformat(),
-                "dimensions": ["query", "page"],
-                "rowLimit": 1000 if not quick_scan else 100
            }

-            headers = {
-                "Authorization": f"Bearer {self.gsc_api_key}",
-                "Content-Type": "application/json"
-            }
-
-            response = requests.post(analytics_url, json=payload, headers=headers, timeout=30)
-
-            if response.status_code != 200:
-                raise Exception(f"GSC API error: {response.status_code}")
-
-            data = response.json()
-            return self._process_gsc_data(data, domain)
-
        except Exception as e:
-            return ModuleResult(success=False, data={}, error=str(e))

-    def _analyze_with_serp_api(self, domain: str, quick_scan: bool) -> ModuleResult:
-        """Analyze with SERP API (SerpAPI, etc.)"""
-        try:
-            # Using SerpAPI as example
-            url = "https://serpapi.com/search"
-
-            params = {
-                "engine": "google",
-                "q": f"site:{domain}",
-                "api_key": self.serp_api_key,
-                "num": 100 if not quick_scan else 20
            }
-
-            response = requests.get(url, params=params, timeout=30)
-
-            if response.status_code != 200:
-                raise Exception(f"SERP API error: {response.status_code}")
-
-            data = response.json()
-            return self._process_serp_data(data, domain)
-
-        except Exception as e:
-            return ModuleResult(success=False, data={}, error=str(e))

-    def _analyze_with_dataforseo(self, domain: str, quick_scan: bool) -> ModuleResult:
-        """Analyze with DataForSEO API"""
-        try:
-            # DataForSEO implementation
-            auth = (self.data_for_seo_key, os.getenv('DATAFORSEO_API_PASSWORD', ''))
-
-            # Get domain keywords
-            url = "https://api.dataforseo.com/v3/dataforseo_labs/google/ranked_keywords/live"
-
-            payload = {
-                "target": domain,
-                "limit": 1000 if not quick_scan else 100,
-                "offset": 0,
-                "filters": [
-                    ["metrics.organic.pos", "<=", 100]
-                ]
            }
-
-            response = requests.post(url, json=[payload], auth=auth, timeout=60)
-
-            if response.status_code != 200:
-                raise Exception(f"DataForSEO API error: {response.status_code}")
-
-            data = response.json()
-            return self._process_dataforseo_data(data, domain)
-
-        except Exception as e:
-            return ModuleResult(success=False, data={}, error=str(e))

-    def _process_gsc_data(self, data: Dict, domain: str) -> ModuleResult:
-        """Process Google Search Console data"""
-        if 'rows' not in data:
-            return ModuleResult(success=False, data={}, error="No GSC data available")

-        rows = data['rows']
-        total_keywords = len(rows)

-        # Position distribution
-        top_3 = sum(1 for row in rows if row.get('position', 100) <= 3)
-        top_10 = sum(1 for row in rows if row.get('position', 100) <= 10)
-        top_50 = sum(1 for row in rows if row.get('position', 100) <= 50)
-
-        # Best and worst performing
-        sorted_by_position = sorted(rows, key=lambda x: x.get('position', 100))
-        best_keywords = sorted_by_position[:10]
-        worst_keywords = sorted_by_position[-10:]
-
-        # High opportunity keywords (high impressions, low clicks)
-        opportunity_keywords = []
-        for row in rows:
-            impressions = row.get('impressions', 0)
-            clicks = row.get('clicks', 0)
-            ctr = (clicks / impressions * 100) if impressions > 0 else 0

-            if impressions > 100 and ctr < 2 and row.get('position', 100) > 10:
-                opportunity_keywords.append({
-                    'keyword': row.get('keys', [''])[0],
-                    'position': row.get('position', 0),
-                    'impressions': impressions,
-                    'clicks': clicks,
-                    'ctr': round(ctr, 2)
                })

-        opportunity_keywords = sorted(opportunity_keywords, key=lambda x: x['impressions'], reverse=True)[:10]
-
-        keywords_data = {
-            'total_keywords': total_keywords,
-            'position_distribution': {
-                'top_3': top_3,
-                'top_10': top_10,
-                'top_50': top_50,
-                'beyond_50': total_keywords - top_50
-            },
-            'best_keywords': [
-                {
-                    'keyword': row.get('keys', [''])[0],
-                    'position': row.get('position', 0),
-                    'clicks': row.get('clicks', 0),
-                    'impressions': row.get('impressions', 0)
-                } for row in best_keywords
-            ],
-            'worst_keywords': [
-                {
-                    'keyword': row.get('keys', [''])[0],
-                    'position': row.get('position', 0),
-                    'clicks': row.get('clicks', 0),
-                    'impressions': row.get('impressions', 0)
-                } for row in worst_keywords
-            ],
-            'opportunity_keywords': opportunity_keywords,
-            'data_source': 'Google Search Console',
-            'last_updated': datetime.now().isoformat()
-        }

-        return ModuleResult(success=True, data=keywords_data)

-    def _process_serp_data(self, data: Dict, domain: str) -> ModuleResult:
-        """Process SERP API data"""
-        # Simplified SERP data processing
-        organic_results = data.get('organic_results', [])
-
-        keywords_data = {
-            'total_keywords': len(organic_results),
-            'position_distribution': {
-                'top_3': len([r for r in organic_results if r.get('position', 100) <= 3]),
-                'top_10': len([r for r in organic_results if r.get('position', 100) <= 10]),
-                'top_50': len([r for r in organic_results if r.get('position', 100) <= 50]),
-                'beyond_50': len([r for r in organic_results if r.get('position', 100) > 50])
-            },
-            'best_keywords': [
-                {
-                    'keyword': r.get('title', ''),
-                    'position': r.get('position', 0),
-                    'url': r.get('link', '')
-                } for r in organic_results[:10]
-            ],
-            'data_source': 'SERP API',
-            'last_updated': datetime.now().isoformat()
-        }

-        return ModuleResult(success=True, data=keywords_data)

-    def _process_dataforseo_data(self, data: Dict, domain: str) -> ModuleResult:
-        """Process DataForSEO data"""
-        if not data.get('tasks') or not data['tasks'][0].get('result'):
-            return ModuleResult(success=False, data={}, error="No DataForSEO data available")

-        results = data['tasks'][0]['result']
-        total_keywords = len(results)

-        # Position distribution
-        top_3 = sum(1 for r in results if r.get('metrics', {}).get('organic', {}).get('pos', 100) <= 3)
-        top_10 = sum(1 for r in results if r.get('metrics', {}).get('organic', {}).get('pos', 100) <= 10)
-        top_50 = sum(1 for r in results if r.get('metrics', {}).get('organic', {}).get('pos', 100) <= 50)
-
-        keywords_data = {
-            'total_keywords': total_keywords,
-            'position_distribution': {
-                'top_3': top_3,
-                'top_10': top_10,
-                'top_50': top_50,
-                'beyond_50': total_keywords - top_50
-            },
-            'best_keywords': [
-                {
-                    'keyword': r.get('keyword', ''),
-                    'position': r.get('metrics', {}).get('organic', {}).get('pos', 0),
-                    'search_volume': r.get('keyword_info', {}).get('search_volume', 0)
-                } for r in sorted(results, key=lambda x: x.get('metrics', {}).get('organic', {}).get('pos', 100))[:10]
-            ],
-            'data_source': 'DataForSEO',
-            'last_updated': datetime.now().isoformat()
-        }

-        return ModuleResult(success=True, data=keywords_data)

-    def _generate_placeholder_data(self, domain: str) -> ModuleResult:
-        """Generate placeholder data when no API keys are available"""
-        keywords_data = {
-            'total_keywords': 0,
-            'position_distribution': {
-                'top_3': 0,
-                'top_10': 0,
-                'top_50': 0,
-                'beyond_50': 0
-            },
-            'best_keywords': [],
-            'worst_keywords': [],
-            'opportunity_keywords': [],
-            'data_source': 'No API credentials',
-            'last_updated': datetime.now().isoformat(),
-            'placeholder': True,
-            'message': 'Connect Google Search Console or SERP API to unlock keyword data'
-        }

-        return ModuleResult(success=True, data=keywords_data)
@@ -1,64 +1,114 @@
"""
Keywords Rankings Module for SEO Report Generator
+Implements PRD requirements with Competitors Ranking Keywords API and Google Keyword Insight API
"""

import os
import requests
import json
+import time
+import hashlib
+from typing import Dict, Any, List, Optional, Tuple
from urllib.parse import urlparse
from datetime import datetime, timedelta
+from dataclasses import dataclass
+from concurrent.futures import ThreadPoolExecutor, as_completed


+@dataclass
class ModuleResult:
    """Standard result object for SEO modules"""
+    success: bool
+    data: Dict[str, Any]
+    error: str = None


class KeywordsModule:
    def __init__(self):
+        # API Configuration
+        self.rapidapi_key = os.getenv('RAPIDAPI_KEY')
+        self.primary_api_host = "seo-get-competitors-ranking-keywords.p.rapidapi.com"
+        self.enrichment_api_host = "google-keyword-insight1.p.rapidapi.com"

+        # Performance Configuration
+        self.timeout = int(os.getenv('KEYWORD_API_TIMEOUT', 30))
+        self.max_retries = int(os.getenv('KEYWORD_MAX_RETRIES', 3))
+        self.pagination_limit = int(os.getenv('KEYWORD_PAGINATION_LIMIT', 1000))
+        self.enrichment_batch_size = int(os.getenv('ENRICHMENT_BATCH_SIZE', 50))
+        self.enrichment_cache_ttl = int(os.getenv('ENRICHMENT_CACHE_TTL', 86400))
+
+        # Rate limiting
+        self.primary_api_calls = 0
+        self.enrichment_api_calls = 0
+        self.last_primary_call = 0
+        self.last_enrichment_call = 0
+
+        # In-memory cache for enrichment data
+        self.enrichment_cache = {}
+        self.cache_timestamps = {}
+
+    def analyze(self, url: str, competitor_domains: List[str] = None, quick_scan: bool = False) -> ModuleResult:
        """
+        Analyze keyword rankings for the given URL and competitors

        Args:
            url: Target website URL
+            competitor_domains: List of competitor domains to analyze
+            quick_scan: If True, limit to 1000 keywords per domain

        Returns:
+            ModuleResult with comprehensive keywords data
        """
+        start_time = time.time()
+
        try:
+            if not self.rapidapi_key:
+                return ModuleResult(
+                    success=False,
+                    data={},
+                    error="RAPIDAPI_KEY environment variable is required"
+                )
+
            domain = self._extract_domain(url)
+            competitor_domains = competitor_domains or []

+            # Limit competitors for demo performance
+            if len(competitor_domains) > 3:
+                competitor_domains = competitor_domains[:3]

+            # Fetch main domain data
+            main_domain_data = self._fetch_domain_keywords(domain, quick_scan)
+            if not main_domain_data['success']:
+                return ModuleResult(
+                    success=False,
+                    data={},
+                    error=f"Failed to fetch data for main domain: {main_domain_data['error']}"
+                )

+            # Fetch competitor data
+            competitor_data = {}
+            for comp_domain in competitor_domains:
+                comp_result = self._fetch_domain_keywords(comp_domain, quick_scan)
+                if comp_result['success']:
+                    competitor_data[comp_domain] = comp_result['data']

+            # Process and enrich data
+            result_data = self._process_keywords_data(
+                main_domain_data['data'],
+                competitor_data,
+                domain,
+                competitor_domains
+            )
+
+            # Add metadata
+            processing_time = time.time() - start_time
+            result_data['meta'] = {
+                'last_updated': datetime.now().isoformat(),
+                'processing_time': round(processing_time, 2),
+                'locale': 'en-US'
+            }
+
+            return ModuleResult(success=True, data=result_data)

        except Exception as e:
            return ModuleResult(
@@ -73,243 +123,432 @@ class KeywordsModule:
            url = 'https://' + url
        return urlparse(url).netloc.replace('www.', '')

+    def _fetch_domain_keywords(self, domain: str, quick_scan: bool) -> Dict[str, Any]:
+        """Fetch keywords data for a domain using Competitors Ranking Keywords API"""
        try:
+            all_keywords = []
+            offset = 0
+            max_keywords = 1000 if quick_scan else 5000

+            while len(all_keywords) < max_keywords:
+                # Rate limiting
+                self._rate_limit_primary_api()
+
+                url = "https://seo-get-competitors-ranking-keywords.p.rapidapi.com/"
+                headers = {
+                    "x-rapidapi-key": self.rapidapi_key,
+                    "x-rapidapi-host": self.primary_api_host
+                }
+                params = {
+                    "domain": domain,
+                    "offset": offset,
+                    "order_by": "position",
+                    "sort_by": "desc",
+                    "limit": min(self.pagination_limit, max_keywords - len(all_keywords))
+                }
+
+                response = requests.get(url, headers=headers, params=params, timeout=self.timeout)
+                self.primary_api_calls += 1
+                self.last_primary_call = time.time()
+
+                if response.status_code != 200:
+                    raise Exception(f"API error {response.status_code}: {response.text}")
+
+                data = response.json()
+
+                # Extract keywords
+                keywords = data.get('keywords', [])
+                if not keywords:
+                    break
+
+                all_keywords.extend(keywords)
+                offset += len(keywords)
+
+                # Check if we have domain statistics (should be in first response)
+                if offset == len(keywords) and 'domain_statistics' in data:
+                    domain_stats = data['domain_statistics']
+                elif 'domain_statistics' not in locals():
+                    domain_stats = self._calculate_domain_statistics(all_keywords)
+
+                # Break if no more data
+                if len(keywords) < self.pagination_limit:
+                    break

+            return {
+                'success': True,
+                'data': {
+                    'domain': domain,
+                    'statistics': domain_stats,
+                    'keywords': all_keywords[:max_keywords]
+                }
            }

        except Exception as e:
+            return {'success': False, 'error': str(e)}

+    def _calculate_domain_statistics(self, keywords: List[Dict]) -> Dict[str, Any]:
+        """Calculate domain statistics from keywords data"""
+        total_keywords = len(keywords)
+
+        # Position distribution
+        pos_1 = sum(1 for k in keywords if k.get('rank', 100) == 1)
+        pos_2_3 = sum(1 for k in keywords if 2 <= k.get('rank', 100) <= 3)
+        pos_4_10 = sum(1 for k in keywords if 4 <= k.get('rank', 100) <= 10)
+        pos_11_20 = sum(1 for k in keywords if 11 <= k.get('rank', 100) <= 20)
+
+        # Movement tracking
+        new_keywords = sum(1 for k in keywords if k.get('previous_rank') is None)
+        up_keywords = sum(1 for k in keywords if k.get('rank', 100) < k.get('previous_rank', 100))
+        down_keywords = sum(1 for k in keywords if k.get('rank', 100) > k.get('previous_rank', 100))
+
+        # Traffic estimation
+        estimated_traffic = sum(k.get('estimated_traffic_volume', 0) for k in keywords)
+
+        return {
+            'organic': {
+                'keywords_in_pos_1': pos_1,
+                'keywords_in_pos_2_3': pos_2_3,
+                'keywords_in_pos_4_10': pos_4_10,
+                'keywords_in_pos_11_20': pos_11_20,
+                'total_keywords_count': total_keywords,
+                'Estimated_traffic_volume': estimated_traffic,
+                'is_new': new_keywords,
+                'is_up': up_keywords,
+                'is_down': down_keywords,
+                'is_lost': 0
            }
+        }

+    def _process_keywords_data(self, main_data: Dict, competitor_data: Dict,
+                               domain: str, competitor_domains: List[str]) -> Dict[str, Any]:
+        """Process and structure the keywords data"""
+        stats = main_data['statistics']['organic']
+        keywords = main_data['keywords']
+
+        # Calculate totals
+        totals = {
+            'keywords': stats['total_keywords_count'],
+            'estimated_traffic': stats['Estimated_traffic_volume']
+        }
+
+        # Calculate position distribution
+        top3 = stats['keywords_in_pos_1'] + stats['keywords_in_pos_2_3']
+        top10 = top3 + stats['keywords_in_pos_4_10']
+        top50 = top10 + stats['keywords_in_pos_11_20']  # Approximate
+
+        distribution = {
+            'top3': top3,
+            'top10': top10,
+            'top50': top50,
+            'percentages': {
+                'top3': round(top3 / stats['total_keywords_count'] * 100, 1) if stats['total_keywords_count'] > 0 else 0,
+                'top10': round(top10 / stats['total_keywords_count'] * 100, 1) if stats['total_keywords_count'] > 0 else 0,
+                'top50': round(top50 / stats['total_keywords_count'] * 100, 1) if stats['total_keywords_count'] > 0 else 0
            }
+        }
+
+        # Movement tracking
+        movement = {
+            'new': stats['is_new'],
+            'up': stats['is_up'],
+            'down': stats['is_down'],
+            'lost': stats['is_lost']
+        }
+
+        # Identify best keywords
+        best_keywords = self._identify_best_keywords(keywords)
+
+        # Identify declining keywords
+        declining_keywords = self._identify_declining_keywords(keywords)
+
+        # Competitor gap analysis
+        opportunities, competitor_summary = self._analyze_competitor_gaps(
+            keywords, competitor_data, domain, competitor_domains
+        )
+
+        # Enrich keywords with volume/CPC data
+        enriched_keywords = self._enrich_keywords_data(keywords)
+
+        # Data sources tracking
+        data_sources = {
+            'positions': 'Competitors Ranking Keywords API',
+            'volume': 'Google Keyword Insight API',
+            'enrichment_rate': self._calculate_enrichment_rate(enriched_keywords)
+        }
+
+        return {
+            'totals': totals,
+            'distribution': distribution,
+            'movement': movement,
+            'best_keywords': best_keywords,
+            'declining_keywords': declining_keywords,
+            'opportunities': opportunities,
+            'competitor_summary': competitor_summary,
+            'data_sources': data_sources
+        }

+    def _identify_best_keywords(self, keywords: List[Dict]) -> List[Dict]:
+        """Identify best performing keywords"""
+        best_candidates = [
+            k for k in keywords
+            if k.get('rank', 100) <= 3 and k.get('estimated_traffic_volume', 0) > 10
+        ]

+        # Sort by estimated traffic volume
+        best_candidates.sort(key=lambda x: x.get('estimated_traffic_volume', 0), reverse=True)

+        return [
+            {
+                'keyword': k.get('keyword', ''),
+                'rank': k.get('rank', 0),
+                'url': k.get('url', ''),
+                'volume': k.get('avg_search_volume', 0),
+                'estimated_traffic': k.get('estimated_traffic_volume', 0),
+                'trend': self._determine_trend(k)
+            }
+            for k in best_candidates[:15]
+        ]
+
+    def _identify_declining_keywords(self, keywords: List[Dict]) -> List[Dict]:
+        """Identify keywords with declining performance"""
+        declining_candidates = []
+
+        for k in keywords:
+            current_rank = k.get('rank', 100)
+            previous_rank = k.get('previous_rank', 100)

+            if current_rank > previous_rank and (current_rank - previous_rank) >= 5:
+                declining_candidates.append({
+                    'keyword': k.get('keyword', ''),
+                    'rank': current_rank,
+                    'previous_rank': previous_rank,
+                    'rank_delta': current_rank - previous_rank,
+                    'volume': k.get('avg_search_volume', 0)
                })

+        # Sort by rank delta (biggest drops first)
+        declining_candidates.sort(key=lambda x: x['rank_delta'], reverse=True)

+        return declining_candidates[:15]

+    def _analyze_competitor_gaps(self, main_keywords: List[Dict], competitor_data: Dict,
+                                 domain: str, competitor_domains: List[str]) -> Tuple[List[Dict], List[Dict]]:
+        """Analyze competitor gaps and opportunities"""
+        opportunities = []
+        competitor_summary = []
+
+        # Normalize main domain keywords
+        main_keyword_set = {k.get('keyword', '').lower().strip() for k in main_keywords}
+
+        for comp_domain, comp_data in competitor_data.items():
+            comp_keywords = comp_data.get('keywords', [])
+            comp_stats = comp_data.get('statistics', {}).get('organic', {})
+
+            # Find gaps
+            gaps = []
+            for k in comp_keywords:
+                keyword = k.get('keyword', '').lower().strip()
+                comp_rank = k.get('rank', 100)
+
+                # Keyword where competitor ranks well but main domain doesn't
+                if keyword not in main_keyword_set and comp_rank <= 20:
+                    gaps.append({
+                        'keyword': k.get('keyword', ''),
+                        'competitor_rank': comp_rank,
+                        'competitor_domain': comp_domain,
+                        'volume': k.get('avg_search_volume', 0),
+                        'difficulty': self._estimate_difficulty(comp_rank, k.get('avg_search_volume', 0))
+                    })
+
+            # Calculate opportunity scores
+            for gap in gaps:
+                score = self._calculate_opportunity_score(
+                    gap['competitor_rank'],
+                    gap['volume'],
+                    gap['difficulty']
+                )
+                gap['priority_score'] = score
+
+            # Sort by priority score
+            gaps.sort(key=lambda x: x['priority_score'], reverse=True)
+            opportunities.extend(gaps[:20])  # Top 20 per competitor
+
+            # Competitor summary
+            overlapping = len([k for k in comp_keywords if k.get('keyword', '').lower().strip() in main_keyword_set])
+            competitor_summary.append({
+                'domain': comp_domain,
+                'total_keywords': comp_stats.get('total_keywords_count', 0),
+                'overlapping_keywords': overlapping,
+                'gaps_identified': len(gaps)
+            })
+
+        # Sort all opportunities by priority score
+        opportunities.sort(key=lambda x: x['priority_score'], reverse=True)

+        return opportunities[:50], competitor_summary  # Top 50 overall

+    def _calculate_opportunity_score(self, competitor_rank: int, search_volume: int, difficulty: int) -> float:
+        """Calculate opportunity score using the PRD algorithm"""
+        position_ctr = {1: 28, 2: 15, 3: 11, 4: 8, 5: 7, 10: 2, 20: 1}

+        # Find closest CTR value
+        ctr_value = 1
+        for pos, ctr in position_ctr.items():
+            if competitor_rank <= pos:
+                ctr_value = ctr
+                break

+        traffic_potential = ctr_value * search_volume / 100
+        competition_factor = max(competitor_rank, 1)
+        difficulty_factor = max(difficulty, 10) / 100

+        score = traffic_potential / (competition_factor * difficulty_factor)
+        return min(round(score, 1), 100)

+    def _estimate_difficulty(self, rank: int, volume: int) -> int:
+        """Estimate keyword difficulty based on rank and volume"""
+        # Simple heuristic - in practice, this would come from a keyword difficulty API
+        if rank <= 3:
+            return 20 + (volume // 1000) * 5
+        elif rank <= 10:
+            return 35 + (volume // 1000) * 3
+        else:
+            return 50 + (volume // 1000) * 2
+
+    def _enrich_keywords_data(self, keywords: List[Dict]) -> List[Dict]:
+        """Enrich keywords with volume and CPC data"""
+        # Identify keywords needing enrichment
+        keywords_to_enrich = [
+            k for k in keywords
+            if not k.get('avg_search_volume') or k.get('avg_search_volume', 0) == 0
+        ]
+
+        if not keywords_to_enrich:
+            return keywords
+
+        # Batch enrichment
+        enriched_data = self._batch_enrich_keywords(
+            [k.get('keyword', '') for k in keywords_to_enrich]
+        )
+
+        # Merge enriched data back
+        enriched_keywords = keywords.copy()
+        for i, keyword_data in enumerate(keywords_to_enrich):
+            keyword = keyword_data.get('keyword', '')
+            if keyword in enriched_data:
+                # Find the keyword in the original list and update it
+                for j, k in enumerate(enriched_keywords):
+                    if k.get('keyword', '') == keyword:
+                        enriched_keywords[j].update(enriched_data[keyword])
+                        break
+
+        return enriched_keywords
+
+    def _batch_enrich_keywords(self, keywords: List[str]) -> Dict[str, Dict]:
+        """Batch enrich keywords using Google Keyword Insight API"""
+        enriched_data = {}

+        # Process in batches
+        for i in range(0, len(keywords), self.enrichment_batch_size):
+            batch = keywords[i:i + self.enrichment_batch_size]
+
+            # Check cache first
+            uncached_keywords = []
+            for keyword in batch:
+                cache_key = self._get_cache_key(keyword)
+                if cache_key in self.enrichment_cache:
+                    cache_age = time.time() - self.cache_timestamps.get(cache_key, 0)
+                    if cache_age < self.enrichment_cache_ttl:
+                        enriched_data[keyword] = self.enrichment_cache[cache_key]
+                    else:
+                        uncached_keywords.append(keyword)
+                else:
+                    uncached_keywords.append(keyword)
+
+            if not uncached_keywords:
+                continue
+
+            # Enrich uncached keywords
+            try:
+                self._rate_limit_enrichment_api()
+
+                url = "https://google-keyword-insight1.p.rapidapi.com/globalkey/"
+                headers = {
+                    "x-rapidapi-key": self.rapidapi_key,
+                    "x-rapidapi-host": self.enrichment_api_host
+                }
+
+                for keyword in uncached_keywords:
+                    params = {
+                        "keyword": keyword,
+                        "lang": "en"
+                    }
+
+                    response = requests.get(url, headers=headers, params=params, timeout=self.timeout)
+                    self.enrichment_api_calls += 1
+                    self.last_enrichment_call = time.time()
+
+                    if response.status_code == 200:
+                        data = response.json()
+                        if data and isinstance(data, list) and len(data) > 0:
+                            insight = data[0]
+                            enriched_info = {
+                                'avg_search_volume': insight.get('volume', 0),
+                                'cpc_low': insight.get('low_bid', 0),
+                                'cpc_high': insight.get('high_bid', 0),
+                                'competition_level': insight.get('competition_level', 'UNKNOWN'),
+                                'trend': insight.get('trend', 0)
+                            }
+
+                            enriched_data[keyword] = enriched_info
+
+                            # Cache the result
+                            cache_key = self._get_cache_key(keyword)
+                            self.enrichment_cache[cache_key] = enriched_info
+                            self.cache_timestamps[cache_key] = time.time()
+
+                    # Small delay to respect rate limits
+                    time.sleep(0.1)
+
+            except Exception as e:
+                # Continue processing even if enrichment fails
+                print(f"Enrichment error: {e}")
+                continue
+
+        return enriched_data
+
+    def _get_cache_key(self, keyword: str) -> str:
+        """Generate cache key for keyword"""
+        return hashlib.md5(keyword.lower().encode()).hexdigest()
+
+    def _calculate_enrichment_rate(self, keywords: List[Dict]) -> float:
+        """Calculate the percentage of keywords with volume data"""
+        enriched = sum(1 for k in keywords if k.get('avg_search_volume', 0) > 0)
+        total = len(keywords)
+        return round(enriched / total * 100, 1) if total > 0 else 0
+
+    def _determine_trend(self, keyword_data: Dict) -> str:
+        """Determine keyword trend based on rank changes"""
+        current_rank = keyword_data.get('rank', 100)
+        previous_rank = keyword_data.get('previous_rank', 100)
+
+        if previous_rank is None:
+            return 'new'
+        elif current_rank < previous_rank:
+            return 'up'
+        elif current_rank > previous_rank:
+            return 'down'
+        else:
+            return 'stable'
+
+    def _rate_limit_primary_api(self):
+        """Rate limiting for primary API (60 requests/minute)"""
+        current_time = time.time()
+        if current_time - self.last_primary_call < 1:  # 1 second between calls
+            time.sleep(1)
+
+    def _rate_limit_enrichment_api(self):
+        """Rate limiting for enrichment API (100 requests/minute)"""
+        current_time = time.time()
+        if current_time - self.last_enrichment_call < 0.6:  # 0.6 seconds between calls
+            time.sleep(0.6)
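A minimal usage sketch of the reworked module, assuming RAPIDAPI_KEY is set in the environment; the domains shown are placeholders, not values from this commit.

# Hypothetical driver for the new KeywordsModule; domains below are placeholders.
import os
from modules.keywords import KeywordsModule

os.environ.setdefault('KEYWORD_PAGINATION_LIMIT', '500')  # optional tuning via env vars

module = KeywordsModule()
result = module.analyze(
    "https://example.com",
    competitor_domains=["competitor-a.com", "competitor-b.com"],
    quick_scan=True,  # caps each domain at 1000 keywords per the docstring
)

if result.success:
    totals = result.data['totals']
    print(f"Keywords: {totals['keywords']}, est. traffic: {totals['estimated_traffic']}")
    for opp in result.data['opportunities'][:5]:
        print(opp['keyword'], opp['priority_score'])
else:
    print("Keyword analysis failed:", result.error)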