Spaces:

nuseAI
/

FastAPI

Sleeping

File size: 6,853 Bytes

# nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json, os, re, time
from typing import List, Dict, Optional

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate


# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY is not set in env / Space secrets"

# Newsdata supports these canonical categories:
#  'world', 'business', 'science', 'technology', 'entertainment',
#  'sports', 'environment', 'politics'
_CATEGORIES = {
    "world":         "world",
    "india":         "world",        # use query filter for India
    "finance":       "business",
    "sports":        "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT   = 5
_SUMMARY_TOKENS     = 120
_REDIS_TTL_SECONDS  = 24 * 3600
_REQUEST_TIMEOUT    = 10


# ──────────────────────────────────────────────────────────────
# NEWSDATA FETCHER
# ──────────────────────────────────────────────────────────────
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    base = (
        "https://newsdata.io/api/1/news"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&category={category}"
        f"&size={size}"
        f"&page={page}"
    )
    if query:
        base += f"&q={query}"
    return base


def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).
    """
    collected: List[dict] = []
    seen_links = set()
    page = 0

    while len(collected) < wanted and page < 5:  # safety cap
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            page=page,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {e}")
            break

        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies

            collected.append(
                {
                    "title":   item.get("title"),
                    "url":     link,
                    "content": content,
                    "image":   item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        if not data.get("nextPage"):
            break  # no more pages
        page += 1
        time.sleep(0.4)  # gentle throttle

    return collected[:wanted]


# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise_article(body: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{body}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _CLEAN_RE.sub("", raw).strip()


# ──────────────────────────────────────────────────────────────
# REDIS KEY HELPERS
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, category: str) -> str:
    return f"headlines:{date}:{category}"


# ──────────────────────────────────────────────────────────────
# MAIN ENTRY POINT
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.
    """
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary = _summarise_article(art["content"])
            summaries.append(
                {
                    "title":   art["title"],
                    "url":     art["url"],
                    "summary": summary,
                    "source_snippet": art["source_snippet"],
                    "image":   art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f"  ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output