# Source: FastAPI / nuse_modules/google_search.py (author: raghavNCI)
# Commit 2e99a5a — "text extractor changes" (1.81 kB)
# nuse_modules/google_search.py
import os
import requests
import time
from typing import List

from boilerpy3 import extractors

# Browser-like User-Agent so news sites don't reject the scraper outright.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Credentials for the Google Custom Search JSON API (read once at import).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")

# Initialise the boilerplate-stripping extractor once (thread-safe).
# NOTE(review): the original instantiated this twice (L14 and L25); the
# duplicate has been removed — one module-level instance is sufficient.
article_extractor = extractors.ArticleExtractor()
def extract_full_text(url: str) -> str:
    """Download *url* and return the main article text, or "" on any failure.

    Uses boilerpy3's ArticleExtractor to strip navigation/boilerplate from
    the fetched HTML. Best-effort: all errors are logged and swallowed so a
    single bad link never aborts a search-results loop.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Bug fix: without this, 4xx/5xx error pages were passed to the
        # extractor and their boilerplate returned as "article text".
        resp.raise_for_status()
        return article_extractor.get_content(resp.text) or ""
    except Exception as e:  # network errors or extractor parse failures
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""
def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
    """
    Run a Google Custom Search and return a list of dicts with:
        title, link, snippet, content (full article text)

    Returns [] on any API failure (error is printed, not raised).
    """
    # Bug fix: the original interpolated the raw query into the URL string,
    # so spaces/special characters in keywords produced a malformed request.
    # `params=` lets requests percent-encode everything correctly.
    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CX_ID,
        "q": " ".join(keywords),
        # The Custom Search JSON API rejects num > 10 with a 400 error.
        "num": min(num_results, 10),
    }
    try:
        res = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=10,
        )
        res.raise_for_status()
        items = res.json().get("items", [])

        results = []
        for item in items:
            link = item.get("link")
            results.append({
                "title": item.get("title"),
                "link": link,
                "snippet": item.get("snippet"),
                # Best-effort full text; extract_full_text returns "" on failure.
                "content": extract_full_text(link),
            })
        return results

    except Exception as e:
        print(f"[ERROR] Google search failed: {e}")
        return []