# nuse_modules/headlines_generator.py
# Daily headlines generator: fetches articles via the GNews API, summarises
# them with Mistral, and caches the results in Redis.
from __future__ import annotations
import datetime as _dt
import json
import os
import re
from typing import List, Dict
import requests
from boilerpy3 import extractors
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG – GNews.io API
# ──────────────────────────────────────────────────────────────
GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
# An explicit check (rather than assert) still fires under `python -O`.
if not GNEWS_API_KEY:
    raise RuntimeError("❌ GNEWS_API_KEY missing (add to Space secrets or .env)")
_CATEGORIES: dict[str, str] = {
"world": "world",
"india": "india",
"finance": "finance business economy",
"sports": "sports",
"entertainment": "entertainment celebrity",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120           # max new tokens per summary
_REDIS_TTL_SECONDS = 24 * 3600  # cache summaries for one day
_REQ_TIMEOUT = 10               # seconds, per HTTP request
_MIN_BODY_LENGTH = 120          # skip articles whose extracted body is shorter (chars)
_bp_extractor = extractors.ArticleExtractor()
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
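# Browser-like User-Agent; some publishers block the default python-requests UA.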
# ──────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────
def _gnews_url(query: str, max_res: int = 10) -> str:
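    """Build a GNews /search URL for *query*, URL-encoding the query string."""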
q = requests.utils.quote(query)
return (
"https://gnews.io/api/v4/search?" # paid plans allow /top-headlines but /search works on free
f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
)
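# Example of the URL shape this builds (API key elided):
#   _gnews_url("finance business economy", 10) ->
#   "https://gnews.io/api/v4/search?q=finance%20business%20economy&lang=en&max=10&token=..."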
def _extract_fulltext(url: str) -> str:
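    """Fetch *url* and return the main article text extracted by boilerpy3."""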
try:
html = requests.get(url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True).text
return _bp_extractor.get_content(html) or ""
except Exception as e:
print(f"[SCRAPE ERR] {url}: {e}")
return ""
def _fetch_articles(query: str, wanted: int) -> List[dict]:
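    """Query GNews for *query* and return up to *wanted* articles with scraped full text."""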
url = _gnews_url(query, max_res=wanted * 2) # fetch extra to account for skips
    try:
        resp = requests.get(url, timeout=_REQ_TIMEOUT)
        resp.raise_for_status()  # surface HTTP errors (quota/auth) instead of silently parsing an error body
        data = resp.json()
    except Exception as e:
        print(f"[GNEWS ERR] {query}: {e}")
        return []
collected: List[dict] = []
seen_urls: set[str] = set()
for item in data.get("articles", []):
link = item.get("url")
if not link or link in seen_urls:
continue
seen_urls.add(link)
body = _extract_fulltext(link)
if len(body) < _MIN_BODY_LENGTH:
continue
collected.append({
"title": item.get("title"),
"url": link,
"content": body,
"pubDate": item.get("publishedAt"),
"image": item.get("image"),
"source_snippet": item.get("description", ""),
})
if len(collected) >= wanted:
break
return collected
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
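# The model sometimes echoes the instruction block ("You are … ARTICLE:") ahead
# of the summary; this pattern matches that echo so _summarise can strip it.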
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
def _summarise(text: str) -> str:
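    """Summarise *text* in one sentence via Mistral, stripping any echoed prompt."""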
prompt = (
"You are a concise news assistant. Summarise the following article "
"in one sentence (<=25 words). Omit source and author names.\n\n"
f"ARTICLE:\n{text}"
)
raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
return _RE_PROMPT_ECHO.sub("", raw).strip()
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
return f"headlines:{date}:{cat}"
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
"""Fetch, summarise, and cache headlines via GNews API."""
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")  # utcnow() is deprecated since 3.12
all_results: Dict[str, List[dict]] = {}
for cat, query in _CATEGORIES.items():
print(f"[HEADLINES] {cat.title()} …")
articles = _fetch_articles(query, _ARTICLES_PER_CAT)
summaries: List[dict] = []
for art in articles:
summary_txt = _summarise(art["content"])
summaries.append({
"title": art["title"],
"url": art["url"],
"summary": summary_txt,
"source_snippet": art["source_snippet"],
"image": art["image"],
"pubDate": art["pubDate"],
})
redis_key = _redis_key(date_str, cat)
_r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
all_results[cat] = summaries
print(f" ↳ stored {len(summaries)} items β†’ {redis_key}")
return all_results
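# ──────────────────────────────────────────────────────────────
# LOCAL SMOKE TEST (sketch; assumes GNEWS_API_KEY is set and Redis is reachable)
# ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Not part of the FastAPI app; runs one full fetch/summarise/cache cycle.
    results = generate_and_store_headlines()
    for cat, items in results.items():
        print(f"{cat}: {len(items)} summaries cached")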