# nuse_modules/google_search.py
import os
import requests
import time
from typing import List
from boilerpy3 import extractors
# Browser-like User-Agent so news sites are less likely to block the scraper.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Google Custom Search credentials; both are None if the env vars are unset
# (the search call will then fail and be reported by the caller's handler).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")

# initialise once (thread-safe) — the original instantiated this twice;
# a single shared extractor instance is sufficient.
article_extractor = extractors.ArticleExtractor()
def extract_full_text(url: str) -> str:
    """Fetch *url* and return the main article text, or "" on any failure.

    Uses boilerpy3's ArticleExtractor to strip boilerplate (navigation,
    ads, footers) from the fetched HTML. Scraping is best-effort: network
    errors, non-2xx responses and extraction errors are logged to stdout
    and yield an empty string rather than raising.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Bug fix: previously an HTTP 404/500 error page was silently fed
        # to the extractor and its text returned as "article content".
        # Treat non-2xx responses as failures instead.
        resp.raise_for_status()
        return article_extractor.get_content(resp.text) or ""
    except Exception as e:
        # Best-effort scraper: report and degrade to empty content.
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""
def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
    """
    Run a Google Custom Search and return a list of dicts with:
        title, link, snippet, content (full article text)

    Parameters
    ----------
    keywords : List[str]
        Search terms; joined with spaces into a single query string.
    num_results : int, default 5
        Number of results requested from the API (the API caps this at 10).

    Returns [] on any request/HTTP/parse error (logged to stdout).
    """
    query = " ".join(keywords)
    try:
        # Bug fix: the query was previously interpolated raw into the URL,
        # so keywords containing '&', '#', '?' or '+' corrupted the request.
        # Passing `params=` lets requests URL-encode every value correctly.
        res = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": GOOGLE_API_KEY,
                "cx": GOOGLE_CX_ID,
                "q": query,
                "num": num_results,
            },
            timeout=10,
        )
        res.raise_for_status()
        data = res.json()

        results = []
        for item in data.get("items", []):
            link = item.get("link")
            results.append({
                "title": item.get("title"),
                "link": link,
                "snippet": item.get("snippet"),
                # Full article body scraped from the result page; "" on failure.
                "content": extract_full_text(link),
            })
        return results
    except Exception as e:
        print(f"[ERROR] Google search failed: {e}")
        return []