# FastAPI / nuse_modules/headlines_generator.py
# (file-viewer scrape residue removed; last commit: "switching to newsdata", b2bd47e)
# nuse_modules/headlines_generator.py
from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
# `assert` statements are stripped under `python -O`, so a missing key would
# go unnoticed; raise explicitly to fail fast at import time.
if not NEWSDATA_API_KEY:
    raise RuntimeError("❌ NEWSDATA_API_KEY is not set in env / Space secrets")

# Newsdata supports these canonical categories:
#   'world', 'business', 'science', 'technology', 'entertainment',
#   'sports', 'environment', 'politics'
# Mapping: this app's logical category -> Newsdata API category.
_CATEGORIES = {
    "world": "world",
    "india": "world",  # same API category, narrowed with a query filter for India
    "finance": "business",
    "sports": "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT = 5           # articles fetched & summarised per category
_SUMMARY_TOKENS = 120           # max new tokens for each LLM summary
_REDIS_TTL_SECONDS = 24 * 3600  # cached headlines expire after one day
_REQUEST_TIMEOUT = 10           # seconds per Newsdata HTTP request
# ──────────────────────────────────────────────────────────────
# NEWSDATA FETCHER
# ──────────────────────────────────────────────────────────────
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    """
    Build a Newsdata.io "latest news" request URL.

    Parameters
    ----------
    category : str
        Canonical Newsdata category (e.g. "world", "business").
    query : Optional[str]
        Optional free-text filter; properly URL-encoded if present.
    page : int
        Zero-based page token for pagination.
    language : str
        ISO language code, defaults to English.
    size : int
        Articles per page requested from the API.

    Returns
    -------
    str
        Fully-encoded request URL including the API key.
    """
    params = {
        "apikey": NEWSDATA_API_KEY,
        "language": language,
        "category": category,
        "size": size,
        "page": page,
    }
    if query:
        # Fix: the query is free text and must be URL-encoded; the previous
        # raw f-string interpolation would corrupt the URL for queries
        # containing spaces, '&', '#', etc.
        params["q"] = query
    return "https://newsdata.io/api/1/news?" + urlencode(params)
def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).

    Pages through the Newsdata API (hard-capped at 5 pages), de-duplicates
    items by their `link`, and skips articles whose body text is missing or
    shorter than 300 characters. Network errors are logged and terminate
    paging early; whatever was collected so far is returned (best-effort).

    Parameters
    ----------
    cat_key : str
        Logical category key from ``_CATEGORIES``; when it is "india" an
        extra ``q=india`` filter is added to each request.
    category : str
        Canonical Newsdata category name sent to the API.
    wanted : int
        Maximum number of articles to return.

    Returns
    -------
    List[dict]
        Article dicts with keys: title, url, content, image,
        source_snippet, pubDate.
    """
    collected: List[dict] = []
    seen_links = set()  # links already accepted, for cross-page de-duplication
    page = 0
    while len(collected) < wanted and page < 5:  # safety cap on pagination
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            page=page,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            # Best-effort: log and stop paging; return what we have so far.
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {e}")
            break
        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue  # missing URL or duplicate across pages
            seen_links.add(link)
            # Prefer the full content; fall back to full_description.
            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies
            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break
        if not data.get("nextPage"):
            break  # no more pages
        page += 1
        time.sleep(0.4)  # gentle throttle between page requests
    return collected[:wanted]
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
# Strips a leaked instruction preamble ("you are ... article:") in case the
# model echoes the prompt back at the start of its output.
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
def _summarise_article(body: str) -> str:
    """Return a one-sentence (<=25 word) summary of *body* via Mistral.

    Any echoed prompt preamble is stripped from the model output with
    ``_CLEAN_RE`` before returning.
    """
    instructions = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
    )
    raw = mistral_generate(
        f"{instructions}ARTICLE:\n{body}",
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
    return _CLEAN_RE.sub("", raw).strip()
# ──────────────────────────────────────────────────────────────
# REDIS KEY HELPERS
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, category: str) -> str:
return f"headlines:{date}:{category}"
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY POINT
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.

    Parameters
    ----------
    today : str | None
        Date string "YYYY-MM-DD" used in the Redis key; defaults to the
        current UTC date.

    Returns
    -------
    Dict[str, List[dict]]
        Logical category -> list of summary dicts (title, url, summary,
        source_snippet, image, pubDate). Each list is also stored in Redis
        under ``headlines:<date>:<category>`` with a 24h TTL.
    """
    # datetime.utcnow() is deprecated (naive datetime); use an aware UTC now.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}
    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)
        summaries: List[dict] = []
        for art in articles:
            # One LLM call per article; keep only display-relevant fields.
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": _summarise_article(art["content"]),
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )
        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")
    return all_output