# nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json, os, re, time
from typing import List, Dict, Optional
import requests
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY is not set in env / Space secrets"
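# Provide the key via the environment (e.g. a Hugging Face Space secret):
#   export NEWSDATA_API_KEY=<your-key>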
# Newsdata supports these canonical categories:
# 'world', 'business', 'science', 'technology', 'entertainment',
# 'sports', 'environment', 'politics'
_CATEGORIES = {
    "world": "world",
    "india": "world",  # use query filter for India
    "finance": "business",
    "sports": "sports",
    "entertainment": "entertainment",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
_REQUEST_TIMEOUT = 10

# ──────────────────────────────────────────────────────────────
# NEWSDATA FETCHER
# ──────────────────────────────────────────────────────────────
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: Optional[str] = None,
    language: str = "en",
    size: int = 25,
) -> str:
    base = (
        "https://newsdata.io/api/1/news"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&category={category}"
        f"&size={size}"
    )
    if page:
        # `page` is the opaque nextPage token returned by the previous call.
        base += f"&page={page}"
    if query:
        base += f"&q={query}"
    return base
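
# Illustrative example (key elided): _newsdata_url("world", query="india")
# builds a URL of the form
#   https://newsdata.io/api/1/news?apikey=<KEY>&language=en&category=world&size=25&q=india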

def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).
    """
    collected: List[dict] = []
    seen_links = set()
    page: Optional[str] = None  # Newsdata paginates via opaque nextPage tokens
    pages_fetched = 0
    while len(collected) < wanted and pages_fetched < 5:  # safety cap
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            page=page,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {pages_fetched}): {e}")
            break
        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies
            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break
        # Newsdata's `page` parameter expects the nextPage token from the
        # previous response, not an integer index.
        page = data.get("nextPage")
        if not page:
            break  # no more pages
        pages_fetched += 1
        time.sleep(0.4)  # gentle throttle
    return collected[:wanted]

# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
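# Strips any echoed instruction text ("You are ... article:") that the model
# might repeat at the start of its output before the actual summary.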
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise_article(body: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{body}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _CLEAN_RE.sub("", raw).strip()

# ──────────────────────────────────────────────────────────────
# REDIS KEY HELPERS
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, category: str) -> str:
    return f"headlines:{date}:{category}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY POINT
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.
    """
    # timezone-aware replacement for the deprecated datetime.utcnow()
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}
    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)
        summaries: List[dict] = []
        for art in articles:
            summary = _summarise_article(art["content"])
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": summary,
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )
        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")
    return all_output
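
if __name__ == "__main__":
    # Illustrative local smoke test (not part of the original module's public
    # surface): fetch, summarise, and print today's headline counts. Assumes
    # NEWSDATA_API_KEY, the Redis client, and the Mistral registry are all
    # configured in this environment.
    payload = generate_and_store_headlines()
    for cat, items in payload.items():
        print(f"{cat}: {len(items)} summaries")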