Spaces:

nuseAI
/

FastAPI

Sleeping

FastAPI / nuse_modules /headlines_generator.py

raghavNCI

switching to newsdata

b2bd47e about 2 months ago

6.85 kB

	# nuse_modules/headlines_generator.py
	from __future__ import annotations
	import datetime as _dt
	import json, os, re, time
	from typing import List, Dict, Optional

	import requests

	from clients.redis_client import redis_client as _r
	from models_initialization.mistral_registry import mistral_generate


	# ──────────────────────────────────────────────────────────────
	# CONFIG
	# ──────────────────────────────────────────────────────────────
	# API key for newsdata.io, read once from the environment at import time.
	NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
	# Checked with an explicit raise instead of `assert`: assert statements are
	# removed when Python runs with -O, which would let a missing key through and
	# have it formatted into request URLs as the text "None". AssertionError is
	# kept so the failure type is the same as before.
	if not NEWSDATA_API_KEY:
	    raise AssertionError("❌ NEWSDATA_API_KEY is not set in env / Space secrets")

	# Logical category key -> category value sent to Newsdata.
	# Newsdata's canonical categories: 'world', 'business', 'science',
	# 'technology', 'entertainment', 'sports', 'environment', 'politics'.
	_CATEGORIES = dict(
	    world="world",
	    india="world",  # fetched as 'world' and narrowed with a query filter
	    finance="business",
	    sports="sports",
	    entertainment="entertainment",
	)

	_ARTICLES_PER_CAT = 5  # articles requested per logical category
	_SUMMARY_TOKENS = 120  # passed as max_new_tokens to the summariser
	_REDIS_TTL_SECONDS = 24 * 60 * 60  # expiry given to each Redis entry (one day)
	_REQUEST_TIMEOUT = 10  # passed as `timeout` to requests.get


	# ──────────────────────────────────────────────────────────────
	# NEWSDATA FETCHER
	# ──────────────────────────────────────────────────────────────
	def _newsdata_url(
	    category: str,
	    query: Optional[str] = None,
	    page: int = 0,
	    language: str = "en",
	    size: int = 25,
	) -> str:
	    """
	    Build the request URL for the Newsdata.io `/api/1/news` endpoint.

	    Args:
	        category: Value sent as the `category` parameter.
	        query: Optional search text sent as `q`; omitted when empty or None.
	        page: Value sent as the `page` parameter.
	        language: Value sent as the `language` parameter.
	        size: Value sent as the `size` parameter.

	    Returns:
	        The full URL, including the API key taken from `NEWSDATA_API_KEY`.
	    """
	    # Imported locally because the file's top-level imports do not include it.
	    from urllib.parse import urlencode

	    params = {
	        "apikey": NEWSDATA_API_KEY,
	        "language": language,
	        "category": category,
	        "size": size,
	        "page": page,
	    }
	    if query:
	        params["q"] = query
	    # urlencode percent-escapes every value, so a query containing spaces,
	    # '&' or '#' can no longer break the URL or smuggle in extra parameters.
	    return "https://newsdata.io/api/1/news?" + urlencode(params)


	def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
	    """
	    Fetch up to `wanted` articles for a given logical category (cat_key).

	    Makes at most five requests. Items are skipped when they have no link,
	    repeat a link already seen, or have a body shorter than 300 characters.
	    A failed request is logged and ends the fetch; whatever was collected up
	    to that point is returned.

	    Args:
	        cat_key: Logical category key; "india" adds an "india" query filter.
	        category: Category value passed on to `_newsdata_url`.
	        wanted: Maximum number of articles to return.

	    Returns:
	        A list of dicts with the keys "title", "url", "content", "image",
	        "source_snippet" and "pubDate".
	    """
	    collected: List[dict] = []
	    if wanted <= 0:
	        return collected

	    seen_links = set()
	    query = "india" if cat_key == "india" else None
	    # Value sent as `page`. The first request keeps the original page=0; later
	    # requests forward the `nextPage` value from the previous response instead
	    # of a locally incremented counter.
	    # NOTE(review): confirm against the Newsdata docs that page=0 is accepted
	    # for the first request.
	    page_arg = 0

	    for page_no in range(5):  # safety cap on requests per category
	        url = _newsdata_url(category=category, query=query, page=page_arg)
	        try:
	            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
	            res.raise_for_status()
	            data = res.json()
	        except Exception as e:  # deliberate best-effort: log and keep what we have
	            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page_no}): {e}")
	            break

	        if not isinstance(data, dict):
	            print(f"[ERROR] Newsdata returned an unexpected payload ({cat_key}, page {page_no})")
	            break

	        # A missing, null or non-list "results" is treated as an empty page
	        # rather than raising outside the try block above.
	        results = data.get("results")
	        if not isinstance(results, list):
	            results = []

	        for item in results:
	            if not isinstance(item, dict):
	                continue
	            link = item.get("link")
	            if not link or link in seen_links:
	                continue
	            seen_links.add(link)

	            content = item.get("content") or item.get("full_description") or ""
	            if len(content) < 300:
	                continue  # skip short / empty bodies

	            collected.append(
	                {
	                    "title": item.get("title"),
	                    "url": link,
	                    "content": content,
	                    "image": item.get("image_url"),
	                    "source_snippet": item.get("description") or "",
	                    "pubDate": item.get("pubDate"),
	                }
	            )
	            if len(collected) >= wanted:
	                break

	        if len(collected) >= wanted:
	            break  # done: no need to throttle or look for another page

	        next_page = data.get("nextPage")
	        if not next_page:
	            break  # no more pages
	        page_arg = next_page
	        time.sleep(0.4)  # gentle throttle

	    return collected[:wanted]


	# ──────────────────────────────────────────────────────────────
	# SUMMARISER
	# ──────────────────────────────────────────────────────────────
	# Matches from "you are" through the first "article" that is followed by ':'
	# or a newline -- the shape of the instruction preamble in the summarisation
	# prompt, presumably so an echoed prompt can be removed from model output.
	# IGNORECASE covers "You are" / "ARTICLE"; DOTALL lets `.` cross newlines.
	_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

	def _summarise_article(body: str) -> str:
	    """
	    Ask the model for a one-sentence summary of `body`.

	    The generated text has anything matched by `_CLEAN_RE` removed and is
	    stripped of surrounding whitespace before being returned.
	    """
	    instructions = (
	        "You are a concise news assistant. Summarise the following article "
	        "in one sentence (<=25 words). Omit source and author names.\n\n"
	    )
	    prompt = instructions + f"ARTICLE:\n{body}"
	    generated = mistral_generate(
	        prompt,
	        max_new_tokens=_SUMMARY_TOKENS,
	        temperature=0.3,
	    )
	    cleaned = _CLEAN_RE.sub("", generated)
	    return cleaned.strip()


	# ──────────────────────────────────────────────────────────────
	# REDIS KEY HELPERS
	# ──────────────────────────────────────────────────────────────
	def _redis_key(date: str, category: str) -> str:
	    """Compose the Redis key for one category's headlines on `date`."""
	    return "headlines:{}:{}".format(date, category)


	# ──────────────────────────────────────────────────────────────
	# MAIN ENTRY POINT
	# ──────────────────────────────────────────────────────────────
	def generate_and_store_headlines(today: Optional[str] = None) -> Dict[str, List[dict]]:
	    """
	    Fetches top articles per category via Newsdata.io, summarises them,
	    stores in Upstash Redis, and returns the payload for logging/tests.

	    Args:
	        today: Date string used in the Redis keys. When omitted or empty,
	            the current UTC date formatted as YYYY-MM-DD is used.

	    Returns:
	        Mapping of each key in `_CATEGORIES` to its list of summary dicts
	        (keys: "title", "url", "summary", "source_snippet", "image",
	        "pubDate").
	    """
	    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
	    # formatted date string is the same.
	    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
	    all_output: Dict[str, List[dict]] = {}

	    for cat_key, newsdata_cat in _CATEGORIES.items():
	        print(f"[HEADLINES] {cat_key.title()} …")
	        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

	        summaries: List[dict] = [
	            {
	                "title": art["title"],
	                "url": art["url"],
	                "summary": _summarise_article(art["content"]),
	                "source_snippet": art["source_snippet"],
	                "image": art["image"],
	                "pubDate": art["pubDate"],
	            }
	            for art in articles
	        ]

	        # Stored even when `summaries` is empty, as in the original code.
	        redis_key = _redis_key(date_str, cat_key)
	        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
	        all_output[cat_key] = summaries
	        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")

	    return all_output