File size: 1,813 Bytes
826a1b8
 
 
 
b029173
 
2e99a5a
 
 
 
 
 
 
 
 
 
 
826a1b8
206e141
2f96339
 
 
 
826a1b8
b029173
9c1bffa
 
2e99a5a
 
9c1bffa
 
2f96339
b029173
 
9c1bffa
2f96339
 
 
 
826a1b8
 
2f96339
826a1b8
 
 
9c1bffa
826a1b8
 
 
 
 
2f96339
826a1b8
9c1bffa
 
 
8121f99
2f96339
 
8121f99
2f96339
8121f99
826a1b8
 
9c1bffa
826a1b8
9c1bffa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# nuse_modules/google_search.py

import os
import requests
import time
from typing import List
from boilerpy3 import extractors

# Browser-like User-Agent so article hosts are less likely to block the
# scraper with a bot-detection 403.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Google Custom Search credentials, read once at import time.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID   = os.getenv("GOOGLE_CX_ID")

# Initialise the extractor exactly once (the original module built a second,
# redundant ArticleExtractor instance earlier in the file).
article_extractor = extractors.ArticleExtractor()


def extract_full_text(url: str) -> str:
    """Fetch *url* and return its main article text, or "" on any failure.

    Best-effort by design: network errors, HTTP errors, and extraction
    failures are logged and swallowed so one bad link cannot abort a batch
    of search results.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Bug fix: previously non-2xx responses (404/403 error pages) were
        # passed to the extractor and their HTML returned as "article text".
        # Fail into the except-branch instead so callers get "".
        resp.raise_for_status()
        return article_extractor.get_content(resp.text) or ""
    except Exception as e:
        # Deliberate broad catch: this helper must never raise.
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""


def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
    """
    Run a Google Custom Search and return a list of dicts with:
        title, link, snippet, content (full article text)

    Parameters:
        keywords:    search terms; joined with spaces into one query
        num_results: max results to request (Google caps this at 10 per call)

    Returns [] on any failure (network error, bad credentials, quota, ...).
    """
    try:
        # Bug fix: the query used to be interpolated raw into the URL
        # f-string with no URL-encoding, so keywords containing spaces,
        # '&', '#', or '+' corrupted or truncated the request. Passing
        # `params=` lets requests percent-encode every value correctly.
        res = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": GOOGLE_API_KEY,
                "cx": GOOGLE_CX_ID,
                "q": " ".join(keywords),
                "num": num_results,
            },
            timeout=10,
        )
        res.raise_for_status()
        data = res.json()

        results = []
        for item in data.get("items", []):  # "items" absent when no hits
            link = item.get("link")
            results.append({
                "title":   item.get("title"),
                "link":    link,
                "snippet": item.get("snippet"),
                # Full article scrape; extract_full_text is best-effort
                # and returns "" on failure, so this cannot raise.
                "content": extract_full_text(link),
            })

        return results

    except Exception as e:
        # Deliberate broad catch: search failures degrade to an empty list.
        print(f"[ERROR] Google search failed: {e}")
        return []