| import json
|
| import time
|
| from datetime import datetime
|
| from typing import Dict, List
|
| import logging
|
| from serpapi import GoogleSearch
|
| from pathlib import Path
|
|
|
| from config import (
|
| SERP_API_KEY, SERP_MONTHLY_LIMIT, SEARCH_QUERIES,
|
| RAW_DIR, LOG_DIR
|
| )
|
|
|
class BloomingtonScraper:
    """Scrape Google results for Bloomington, IN via the SERP API.

    Tracks API usage against SERP_MONTHLY_LIMIT and persists raw responses,
    per-category results, and usage statistics as JSON files under RAW_DIR.
    """

    def __init__(self):
        # Number of SERP API calls made so far (quota tracking).
        self.search_count = 0
        # category name -> list of organic-result dicts collected so far.
        self.results_by_category = {}

        # One timestamped log file per scraper run.
        log_file = LOG_DIR / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        # NOTE: basicConfig configures the *root* logger and is a silent
        # no-op if logging was already configured elsewhere in the process.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename=log_file
        )

    def _make_serp_request(self, query: str, category: str) -> List[Dict]:
        """Make a single SERP API request.

        Saves the full raw response to RAW_DIR for later reprocessing and
        returns the list of organic results. Returns an empty list when the
        monthly quota is exhausted or the API call fails.
        """
        if self.search_count >= SERP_MONTHLY_LIMIT:
            logging.warning("Monthly SERP API limit reached")
            return []

        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "location": "Bloomington, Indiana, United States",
            "google_domain": "google.com",
            "num": 100,  # maximum results per page
            "start": 0
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            self.search_count += 1

            # Persist the raw response so results can be re-parsed later
            # without spending another API call.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            raw_file = RAW_DIR / f"raw_results_{category}_{timestamp}.json"
            # Explicit encoding: json.dump output must not depend on the
            # platform's locale default.
            with open(raw_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)

            # Lazy %-args: the message is only formatted if it is emitted.
            logging.info("SERP API calls used: %s/%s",
                         self.search_count, SERP_MONTHLY_LIMIT)
            return results.get('organic_results', [])

        except Exception:
            # Broad catch is deliberate: one failed query must not abort the
            # whole scrape. logging.exception records the full traceback.
            logging.exception("SERP API error for query '%s'", query)
            return []

    def scrape_all_categories(self) -> Dict[str, List[Dict]]:
        """Scrape data for all categories defined in SEARCH_QUERIES.

        Sleeps between requests to be polite to the API, stops early when the
        monthly quota is reached, and writes each category's combined results
        to RAW_DIR. Returns the accumulated results keyed by category.
        """
        for category, queries in SEARCH_QUERIES.items():
            logging.info("Starting scraping for category: %s", category)
            category_results = []

            for query in queries:
                if self.search_count >= SERP_MONTHLY_LIMIT:
                    logging.warning("Monthly limit reached during %s scraping",
                                    category)
                    break

                results = self._make_serp_request(query, category)
                category_results.extend(results)
                time.sleep(2)  # simple rate limiting between API calls

            self.results_by_category[category] = category_results

            # Persist the merged per-category results.
            category_file = RAW_DIR / f"{category}_results.json"
            with open(category_file, 'w', encoding='utf-8') as f:
                json.dump(category_results, f, indent=2)

            logging.info("Completed scraping for %s: %s results",
                         category, len(category_results))

        return self.results_by_category

    def get_search_stats(self) -> Dict:
        """Return usage statistics and persist them to RAW_DIR.

        The stats include total/remaining API calls and the result count per
        scraped category.
        """
        stats = {
            "total_searches": self.search_count,
            "remaining_searches": SERP_MONTHLY_LIMIT - self.search_count,
            "results_per_category": {
                category: len(results)
                for category, results in self.results_by_category.items()
            }
        }

        stats_file = RAW_DIR / "search_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2)

        return stats