|
|
|
from __future__ import annotations |
|
import datetime as _dt |
|
import json, os, re, time |
|
from typing import List, Dict, Optional |
|
|
|
import requests |
|
|
|
from clients.redis_client import redis_client as _r |
|
from models_initialization.mistral_registry import mistral_generate |
|
|
|
|
|
|
|
|
|
|
|
# API key for newsdata.io, read from the process environment.
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")

# Fail fast at import time when the key is missing or empty.  An explicit
# raise is used instead of a bare `assert` so the check is not stripped when
# Python runs with -O; AssertionError (and the message) are kept so anything
# that already handles the original failure keeps working.
if not NEWSDATA_API_KEY:
    raise AssertionError("β NEWSDATA_API_KEY is not set in env / Space secrets")
|
|
|
|
|
|
|
|
|
# Logical headline section -> newsdata.io `category` value used in requests.
# "india" maps to the "world" category; the article fetcher in this module
# additionally sends an "india" search query for that section.
_CATEGORIES = dict(
    world="world",
    india="world",
    finance="business",
    sports="sports",
    entertainment="entertainment",
)

# Number of articles requested from the fetcher for each section.
_ARTICLES_PER_CAT = 5

# Value passed as `max_new_tokens` when generating a summary.
_SUMMARY_TOKENS = 120

# Expiry (seconds) applied to each Redis entry: 24 hours.
_REDIS_TTL_SECONDS = 60 * 60 * 24

# Value passed as `timeout=` to `requests.get`.
_REQUEST_TIMEOUT = 10
|
|
|
|
|
|
|
|
|
|
|
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int | str | None = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    """Build the request URL for the newsdata.io ``/api/1/news`` endpoint.

    All values are percent-encoded, so search terms containing spaces, ``&``
    or other reserved characters cannot corrupt the query string.

    Args:
        category: newsdata.io category name, sent as ``category``.
        query: Optional search term, sent as ``q`` when non-empty.
        page: Pagination cursor.  A falsy value (``0``, ``None``, ``""``)
            means "first page" and the parameter is left out of the URL;
            anything else (a page number or a ``nextPage`` token taken from
            a previous response) is sent as ``page``.
        language: Language code, sent as ``language``.
        size: Number of results requested per page, sent as ``size``.

    Returns:
        The complete URL, including the API key as the ``apikey`` parameter.
    """
    # Local import keeps this block self-contained (the module does not
    # import urllib at file level).
    from urllib.parse import urlencode

    params = {
        "apikey": NEWSDATA_API_KEY,
        "language": language,
        "category": category,
        "size": size,
    }
    # NOTE(review): per the newsdata.io docs the first page is requested
    # without `page`, and later pages use the opaque `nextPage` token.
    if page:
        params["page"] = page
    if query:
        params["q"] = query
    return "https://newsdata.io/api/1/news?" + urlencode(params)
|
|
|
|
|
def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """Fetch up to ``wanted`` usable articles for one logical category.

    Issues at most five requests, following the ``nextPage`` cursor returned
    by each response.  Items are skipped when they have no link, repeat a
    link already seen, or carry fewer than 300 characters of body text
    (``content``, falling back to ``full_description``).

    Fetching is best-effort: when a request or its JSON decoding fails, the
    error is printed and whatever was collected so far is returned.

    Args:
        cat_key: Logical section name; ``"india"`` adds an ``"india"``
            search query to the request.
        category: newsdata.io category to request.
        wanted: Maximum number of articles to return.

    Returns:
        A list of at most ``wanted`` dicts with the keys ``title``, ``url``,
        ``content``, ``image``, ``source_snippet`` and ``pubDate``.
    """
    max_requests = 5
    min_content_chars = 300

    collected: List[dict] = []
    if wanted <= 0:
        return collected

    seen_links = set()
    query = "india" if cat_key == "india" else None
    # 0 selects the first page; afterwards this holds the cursor the API
    # returned (a token string, or a number on integer-paged responses).
    page = 0

    for _ in range(max_requests):
        url = _newsdata_url(category=category, query=query, page=page)
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as exc:  # deliberate best-effort boundary
            # requests embeds the request URL (API key included) in many of
            # its error messages, so mask the key before it reaches the logs.
            reason = str(exc)
            if NEWSDATA_API_KEY:
                reason = reason.replace(NEWSDATA_API_KEY, "***")
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {reason}")
            break

        if not isinstance(data, dict):
            break

        # A missing, null or non-list "results" value is treated as "no
        # articles on this page" instead of crashing while iterating it.
        results = data.get("results")
        if not isinstance(results, list):
            results = []

        for item in results:
            if not isinstance(item, dict):
                continue
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if len(content) < min_content_chars:
                continue

            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        next_page = data.get("nextPage")
        # Stop before sleeping when there is nothing further to request.
        if not next_page or len(collected) >= wanted:
            break
        page = next_page
        time.sleep(0.4)  # short pause between consecutive page requests

    return collected[:wanted]
|
|
|
|
|
|
|
|
|
|
|
# Matches an echoed instruction header in model output: a span that starts at
# "you are" and runs lazily (across newlines) to the first "article" followed
# by one or more colons/newlines, ignoring case.
_CLEAN_RE = re.compile(
    r"(you are.*?article[:\n]+)",
    flags=re.DOTALL | re.IGNORECASE,
)


def _summarise_article(body: str) -> str:
    """Return a one-sentence model-generated summary of ``body``.

    The article text is appended to a fixed instruction prompt and passed to
    ``mistral_generate``; any span of the output matching ``_CLEAN_RE`` is
    removed and surrounding whitespace is stripped.
    """
    prompt = "".join(
        (
            "You are a concise news assistant. Summarise the following article ",
            "in one sentence (<=25 words). Omit source and author names.\n\n",
            f"ARTICLE:\n{body}",
        )
    )
    generated = mistral_generate(
        prompt,
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
    without_echo = _CLEAN_RE.sub("", generated)
    return without_echo.strip()
|
|
|
|
|
|
|
|
|
|
|
def _redis_key(date: str, category: str) -> str:
    """Return the Redis key ``headlines:<date>:<category>``."""
    return "headlines:{}:{}".format(date, category)
|
|
|
|
|
|
|
|
|
|
|
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise and store the headlines for every configured category.

    For each entry in ``_CATEGORIES`` the articles are fetched, each article
    body is summarised, and the resulting list is written to Redis as JSON
    under ``headlines:<date>:<category>`` with an expiry of
    ``_REDIS_TTL_SECONDS``.

    Args:
        today: Date string used in the Redis keys.  When omitted or empty,
            the current UTC date formatted as ``YYYY-MM-DD`` is used.

    Returns:
        A mapping of category key to the list of stored items; each item has
        the keys ``title``, ``url``, ``summary``, ``source_snippet``,
        ``image`` and ``pubDate``.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in
    # UTC formats to exactly the same date string.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} β¦")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        # The article body itself is not stored, only its summary.
        summaries: List[dict] = [
            {
                "title": art["title"],
                "url": art["url"],
                "summary": _summarise_article(art["content"]),
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            }
            for art in articles
        ]

        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" β³ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output
|
|