|
|
|
|
|
import os |
|
import requests |
|
import time |
|
from typing import List |
|
from boilerpy3 import extractors |
|
|
|
# Single shared extractor instance, reused across all requests.
# (The original module created this twice; the duplicate assignment
# has been removed.)
article_extractor = extractors.ArticleExtractor()

# Browser-like User-Agent so news sites serve the full page instead of
# blocking the default python-requests client string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Google Custom Search credentials, read from the environment.
# Either may be None if unset — searches will then fail at request time.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
|
|
|
|
def extract_full_text(url: str) -> str:
    """Fetch *url* and return its main article text, or "" on any failure.

    Uses boilerpy3's ArticleExtractor to strip boilerplate (navigation,
    ads, footers) from the fetched HTML.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        The extracted article body, or an empty string if the fetch or
        extraction fails for any reason (errors are logged, not raised).
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Fail fast on HTTP errors (4xx/5xx) — otherwise an error page
        # would be extracted and silently returned as "article text".
        resp.raise_for_status()
        return article_extractor.get_content(resp.text) or ""
    except Exception as e:  # broad by design: scraping is best-effort
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""
|
|
|
|
|
def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
    """
    Run a Google Custom Search and return a list of dicts with:
    title, link, snippet, content (full article text)

    Args:
        keywords: Search terms; joined with spaces into one query.
        num_results: Maximum results to request (API caps this at 10).

    Returns:
        A list of result dicts, or [] if the search fails (errors are
        logged, not raised).
    """
    # Let requests build and percent-encode the query string. The old
    # f-string interpolation broke on spaces/special characters and
    # sent the literal text "None" when the env vars were unset.
    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CX_ID,
        "q": " ".join(keywords),
        # The Custom Search JSON API rejects num > 10.
        "num": min(num_results, 10),
    }

    try:
        res = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=10,
        )
        res.raise_for_status()
        data = res.json()

        results = []
        for item in data.get("items", []):
            link = item.get("link")
            # Fetch the full article body for each hit (best-effort;
            # extract_full_text returns "" on failure).
            results.append({
                "title": item.get("title"),
                "link": link,
                "snippet": item.get("snippet"),
                "content": extract_full_text(link),
            })

        return results

    except Exception as e:  # boundary: log and return a safe default
        print(f"[ERROR] Google search failed: {e}")
        return []
|
|