File size: 6,853 Bytes
b2bd47e
89716e4
 
b2bd47e
 
 
 
89716e4
3611f6f
89716e4
 
 
3611f6f
b2bd47e
3611f6f
b2bd47e
 
 
 
 
 
89716e4
b2bd47e
 
 
 
 
89716e4
 
 
 
3611f6f
b2bd47e
89716e4
3611f6f
 
b2bd47e
3611f6f
b2bd47e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588f923
b2bd47e
588f923
b2bd47e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588f923
b2bd47e
 
 
 
 
 
 
 
 
 
 
 
89716e4
b2bd47e
 
 
 
 
 
 
 
 
 
 
 
 
 
89716e4
 
 
b2bd47e
89716e4
b2bd47e
 
89716e4
 
b2bd47e
 
 
89716e4
 
 
 
3611f6f
b2bd47e
3611f6f
89716e4
 
b2bd47e
 
89716e4
b2bd47e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89716e4
b2bd47e
89716e4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json, os, re, time
from typing import List, Dict, Optional

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate


# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY is not set in env / Space secrets"

# Newsdata supports these canonical categories:
#  'world', 'business', 'science', 'technology', 'entertainment',
#  'sports', 'environment', 'politics'
_CATEGORIES = {
    "world":         "world",
    "india":         "world",        # use query filter for India
    "finance":       "business",
    "sports":        "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT   = 5
_SUMMARY_TOKENS     = 120
_REDIS_TTL_SECONDS  = 24 * 3600
_REQUEST_TIMEOUT    = 10


# ──────────────────────────────────────────────────────────────
# NEWSDATA FETCHER
# ──────────────────────────────────────────────────────────────
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    base = (
        "https://newsdata.io/api/1/news"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&category={category}"
        f"&size={size}"
        f"&page={page}"
    )
    if query:
        base += f"&q={query}"
    return base


def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).
    """
    collected: List[dict] = []
    seen_links = set()
    page = 0

    while len(collected) < wanted and page < 5:  # safety cap
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            page=page,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {e}")
            break

        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies

            collected.append(
                {
                    "title":   item.get("title"),
                    "url":     link,
                    "content": content,
                    "image":   item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        if not data.get("nextPage"):
            break  # no more pages
        page += 1
        time.sleep(0.4)  # gentle throttle

    return collected[:wanted]


# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise_article(body: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{body}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _CLEAN_RE.sub("", raw).strip()


# ──────────────────────────────────────────────────────────────
# REDIS KEY HELPERS
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, category: str) -> str:
    return f"headlines:{date}:{category}"


# ──────────────────────────────────────────────────────────────
# MAIN ENTRY POINT
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.
    """
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary = _summarise_article(art["content"])
            summaries.append(
                {
                    "title":   art["title"],
                    "url":     art["url"],
                    "summary": summary,
                    "source_snippet": art["source_snippet"],
                    "image":   art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f"  ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output