# nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List
import requests
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
if not NEWSDATA_API_KEY:
    raise RuntimeError("❌ NEWSDATA_API_KEY missing (add to Space secrets or .env)")
# Pure-query strings we’ll pass via &q=
_CATEGORIES: dict[str, str] = {
    "world": "world news top stories",
    "india": "india top headlines",
    "finance": "business finance economy",
    "sports": "sports news today",
    "entertainment": "celebrity movies tv music",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
_REQUEST_TIMEOUT = 10 # seconds
# ──────────────────────────────────────────────────────────────
# NEWSDATA HELPER
# ──────────────────────────────────────────────────────────────
def _newsdata_url(
    query: str,
    page: str | None = None,
    language: str = "en",
    size: int = 25,
) -> str:
    """
    Build a Newsdata /latest request that always uses q=.

    `page` is the opaque `nextPage` cursor returned by a previous response;
    it is omitted entirely on the first request.
    """
    url = (
        "https://newsdata.io/api/1/latest"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&size={size}"
        f"&q={requests.utils.quote(query)}"
    )
    if page:
        url += f"&page={page}"
    return url
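# Example of a first-page request this helper builds (the apikey value below
# is a placeholder, not a real credential):
#   https://newsdata.io/api/1/latest?apikey=<KEY>&language=en&size=25&q=india%20top%20headlines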
def _fetch_articles(q: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` unique articles for the query string `q`.
    """
    collected: List[dict] = []
    seen_urls: set[str] = set()
    page: str | None = None  # Newsdata pagination is cursor-based, not numeric
    pages_fetched = 0
    while len(collected) < wanted and pages_fetched < 5:  # hard stop at 5 pages
        url = _newsdata_url(query=q, page=page)
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            print(f"[ERROR] Newsdata fetch failed ({q}, page {pages_fetched}): {e}")
            break
        for item in data.get("results", []):
            url_link = item.get("link")
            if not url_link or url_link in seen_urls:
                continue
            seen_urls.add(url_link)
            content = item.get("content") or item.get("full_description") or ""
            if len(content) < 300:
                continue  # skip short or empty articles
            collected.append(
                {
                    "title": item.get("title"),
                    "url": url_link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break
        page = data.get("nextPage")  # pass the cursor back on the next request
        if not page:
            break
        pages_fetched += 1
        time.sleep(0.4)  # gentle throttling
    return collected[:wanted]
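# For reference, each element returned above has this shape (values come
# straight from the Newsdata payload and may be None when the API omits a
# field):
#   {"title": ..., "url": ..., "content": ..., "image": ...,
#    "source_snippet": ..., "pubDate": ...}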
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
def _summarise(text: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    # Strip any echo of the instruction that the model may repeat back.
    return _RE_PROMPT_ECHO.sub("", raw).strip()
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"
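# Convenience reader for the cache written below. Illustrative only (nothing
# else in this module calls it); it assumes the value stored at the key is the
# JSON list produced by generate_and_store_headlines().
def get_cached_headlines(date: str, cat: str) -> List[dict]:
    raw = _r.get(_redis_key(date, cat))
    return json.loads(raw) if raw else []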
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetch, summarise, and cache today’s headlines for each category.
    """
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}
    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": summary_txt,
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )
        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"  ↳ stored {len(summaries)} items → {redis_key}")
    return all_results
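# Minimal manual-run sketch (assumes NEWSDATA_API_KEY and the Redis client are
# configured); prints per-category counts rather than the full payloads.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"{category}: {len(items)} summaries cached")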