from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
assert NEWSDATA_API_KEY, "NEWSDATA_API_KEY missing (add to Space secrets or .env)"


# Category name -> Newsdata search query.
_CATEGORIES: dict[str, str] = {
    "world": "world news top stories",
    "india": "india top headlines",
    "finance": "business finance economy",
    "sports": "sports news today",
    "entertainment": "celebrity movies tv music",
}

_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600  # cached headlines expire after one day
_REQUEST_TIMEOUT = 10  # seconds per Newsdata request

def _newsdata_url(
    query: str,
    page: str | None = None,
    language: str = "en",
    size: int = 25,
) -> str:
    """
    Build a Newsdata /latest request that always uses q=.

    Newsdata paginates with an opaque cursor: pass the `nextPage` token from
    the previous response as `page`; omit it for the first page.
    """
    url = (
        "https://newsdata.io/api/1/latest"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&size={size}"
        f"&q={requests.utils.quote(query)}"
    )
    if page:
        url += f"&page={page}"
    return url
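
# Illustrative example (not executed): with no page token,
#   _newsdata_url("india top headlines")
# yields
#   https://newsdata.io/api/1/latest?apikey=<key>&language=en&size=25&q=india%20top%20headlines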

def _fetch_articles(q: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` unique articles for the query string `q`.
    """
    collected: List[dict] = []
    seen_urls: set[str] = set()
    next_page: str | None = None
    pages_fetched = 0

    # Follow Newsdata's nextPage cursor, capped at five pages per query.
    while len(collected) < wanted and pages_fetched < 5:
        url = _newsdata_url(query=q, page=next_page)
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except (requests.RequestException, ValueError) as e:
            print(f"[ERROR] Newsdata fetch failed ({q}, page {pages_fetched}): {e}")
            break

        for item in data.get("results", []):
            url_link = item.get("link")
            if not url_link or url_link in seen_urls:
                continue
            seen_urls.add(url_link)

            # Skip stubs: we need enough body text to summarise.
            content = item.get("content") or item.get("full_description") or ""
            if len(content) < 300:
                continue

            collected.append(
                {
                    "title": item.get("title"),
                    "url": url_link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        next_page = data.get("nextPage")
        if not next_page:
            break
        pages_fetched += 1
        time.sleep(0.4)  # stay polite to the free-tier rate limit

    return collected[:wanted]
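
# Example shape (illustrative only): _fetch_articles("india top headlines", 5)
# returns at most five dicts with keys title/url/content/image/source_snippet/pubDate.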

# Strip any echo of the instruction prompt that the model may prepend.
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)


def _summarise(text: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()
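
# Illustrative only: if the model echoes the prompt, e.g.
#   "You are a concise news assistant ... ARTICLE:\nStocks rallied on Friday."
# _RE_PROMPT_ECHO strips everything through "ARTICLE:", leaving just the summary.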

def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"

def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetch, summarise, and cache today's headlines for each category.
    """
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        # Summarise each article, keeping only the fields the frontend needs.
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": summary_txt,
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"  ↳ stored {len(summaries)} items → {redis_key}")

    return all_results
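
# Minimal manual entry point, assuming direct execution is desired (the
# original module defines no CLI wiring): refresh today's cache once.
if __name__ == "__main__":
    generate_and_store_headlines()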