raghavNCI committed
Commit 011e38f · Parent(s): 2049c5d

using gnews

nuse_modules/headlines_generator.py CHANGED
@@ -7,106 +7,83 @@ import time
 from typing import List, Dict
 
 import requests
-import feedparser
 from boilerpy3 import extractors
 
 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate
 
 # ──────────────────────────────────────────────────────────────
-# CONFIG (Google News RSS, no external API keys needed)
+# CONFIG – GNews.io API
 # ──────────────────────────────────────────────────────────────
+GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
+assert GNEWS_API_KEY, "❌ GNEWS_API_KEY missing (add to Space secrets or .env)"
+
 _CATEGORIES: dict[str, str] = {
-    "world": "world news",
-    "india": "india top stories",
+    "world": "world",
+    "india": "india",
     "finance": "finance business economy",
-    "sports": "sports headlines",
-    "entertainment": "entertainment celebrity movies tv",
+    "sports": "sports",
+    "entertainment": "entertainment celebrity",
 }
 
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_RSS_TIMEOUT = 10  # seconds
-_ARTICLE_TIMEOUT = 10  # seconds
-_MIN_BODY_LENGTH = 120  # relaxed threshold so short briefs pass
-
-# Google News RSS search template
+_REQ_TIMEOUT = 10
+_MIN_BODY_LENGTH = 120
 
-def _rss_url(query: str) -> str:
-    query = requests.utils.quote(query)
-    return (
-        "https://news.google.com/rss/search?q=" + query +
-        "&hl=en-US&gl=US&ceid=US:en"
-    )
-
-# BoilerPy3 extractor (thread‑safe singleton)
 _bp_extractor = extractors.ArticleExtractor()
-
-# Common browser UA to avoid 403s
-_HEADERS = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114 Safari/537.36"
-    )
-}
+_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
 
 # ──────────────────────────────────────────────────────────────
-# FETCH RSS + ARTICLE BODY
+# HELPERS
 # ──────────────────────────────────────────────────────────────
 
-def _follow_google_redirect(html: str) -> str | None:
-    """Extract the real URL from a Google News redirect HTML page."""
-    match = re.search(r'url=(https?[^"\']+)', html, flags=re.I)
-    return match.group(1) if match else None
+def _gnews_url(query: str, max_res: int = 10) -> str:
+    q = requests.utils.quote(query)
+    return (
+        "https://gnews.io/api/v4/search?"  # paid plans allow /top-headlines but /search works on free
+        f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
+    )
 
 
 def _extract_fulltext(url: str) -> str:
     try:
-        resp = requests.get(url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT, allow_redirects=True)
-        html = resp.text
-
-        # If still on news.google.com and meta refresh present → follow manually
-        if "news.google.com" in resp.url and "http-equiv=\"refresh\"" in html.lower():
-            real_url = _follow_google_redirect(html)
-            if real_url:
-                html = requests.get(real_url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT).text
-
-        text = _bp_extractor.get_content(html)
-        return text or ""
+        html = requests.get(url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True).text
+        return _bp_extractor.get_content(html) or ""
     except Exception as e:
         print(f"[SCRAPE ERR] {url}: {e}")
         return ""
 
 
 def _fetch_articles(query: str, wanted: int) -> List[dict]:
-    feed_url = _rss_url(query)
+    url = _gnews_url(query, max_res=wanted * 2)  # fetch extra to account for skips
     try:
-        feed = feedparser.parse(feed_url, request_headers=_HEADERS)
+        data = requests.get(url, timeout=_REQ_TIMEOUT).json()
     except Exception as e:
-        print(f"[RSS ERR] {query}: {e}")
+        print(f"[GNEWS ERR] {query}: {e}")
         return []
 
     collected: List[dict] = []
-    seen_links: set[str] = set()
+    seen_urls: set[str] = set()
 
-    for entry in feed.entries:
-        link = entry.link
-        if link in seen_links:
+    for item in data.get("articles", []):
+        link = item.get("url")
+        if not link or link in seen_urls:
             continue
-        seen_links.add(link)
+        seen_urls.add(link)
 
         body = _extract_fulltext(link)
         if len(body) < _MIN_BODY_LENGTH:
-            continue  # skip very short pages/homepages
+            continue
 
         collected.append({
-            "title": entry.title,
+            "title": item.get("title"),
             "url": link,
             "content": body,
-            "pubDate": entry.get("published", ""),
-            "image": None,  # can scrape OG tag later
-            "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+            "pubDate": item.get("publishedAt"),
+            "image": item.get("image"),
+            "source_snippet": item.get("description", ""),
        })
         if len(collected) >= wanted:
             break
@@ -118,7 +95,6 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
 # ──────────────────────────────────────────────────────────────
 _RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
-
 def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
@@ -140,7 +116,7 @@ def _redis_key(date: str, cat: str) -> str:
 # ──────────────────────────────────────────────────────────────
 
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
-    """Fetches, summarises, and caches headlines via Google News RSS."""
+    """Fetch, summarise, and cache headlines via GNews API."""
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
     all_results: Dict[str, List[dict]] = {}
 
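Note on the new fetch path: _fetch_articles now assumes the GNews v4 /search endpoint returns a JSON object with an "articles" list whose items expose title, description, url, image, and publishedAt, since those are exactly the fields it copies into each collected entry. A minimal sketch of that contract follows; the sample values are invented for illustration, not real API output.

# Illustrative GNews /search response shape assumed by _fetch_articles; values are made up.
sample_response = {
    "articles": [
        {
            "title": "Example headline",
            "description": "One-line teaser, stored as source_snippet",
            "url": "https://example.com/story",
            "image": "https://example.com/story.jpg",
            "publishedAt": "2024-01-01T12:00:00Z",
        }
    ]
}

for item in sample_response.get("articles", []):
    # Same lookups the diff performs before scraping the full article text.
    print(item.get("title"), item.get("url"), item.get("publishedAt"))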
requirements.txt CHANGED
@@ -8,6 +8,3 @@ accelerate
 torch
 huggingface_hub
 boilerpy3==1.0.6
-feedparser
-newspaper3k
-nltk
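With feedparser, newspaper3k, and nltk dropped, the fetch path depends only on requests and boilerpy3, plus a GNEWS_API_KEY in the environment (the module asserts on import if it is missing). A rough usage sketch, assuming generate_and_store_headlines returns the all_results dict keyed by the _CATEGORIES names and that the per-category lists keep the title/url fields built in _fetch_articles (the return statement is outside this diff):

# Hypothetical local run; the token value is a placeholder and must be set before import.
import os
os.environ.setdefault("GNEWS_API_KEY", "your-gnews-token")

from nuse_modules.headlines_generator import generate_and_store_headlines

results = generate_and_store_headlines()  # e.g. {"world": [...], "finance": [...], ...}
for article in results.get("finance", []):
    print(article["title"], article["url"])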