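"""Generate daily news headlines.

Fetches Google News RSS results for a fixed set of categories, extracts each
article's full text, summarises it with Mistral, and caches the summaries in
Redis keyed by date and category.
"""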
from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List

import feedparser
import requests
from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate


_CATEGORIES: dict[str, str] = {
    "world": "world news",
    "india": "india top stories",
    "finance": "finance business economy",
    "sports": "sports headlines",
    "entertainment": "entertainment celebrity movies tv",
}

_ARTICLES_PER_CAT = 5  # articles kept per category
_SUMMARY_TOKENS = 120  # max new tokens for each summary
_REDIS_TTL_SECONDS = 24 * 3600  # cached headlines expire after one day
_RSS_TIMEOUT = 10  # seconds; currently unused -- feedparser handles the RSS request itself
_ARTICLE_TIMEOUT = 10  # seconds per article download


def _rss_url(query: str) -> str:
    """Return a Google News RSS search URL for *query* (US English edition)."""
    query = requests.utils.quote(query)
    return (
        "https://news.google.com/rss/search?q=" + query +
        "&hl=en-US&gl=US&ceid=US:en"
    )


# Reusable boilerpy3 extractor for pulling the main article text out of raw HTML.
_bp_extractor = extractors.ArticleExtractor()


def _extract_fulltext(url: str) -> str:
    """Download *url* and return the extracted article text (empty string on failure)."""
    try:
        html = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_ARTICLE_TIMEOUT,
        ).text
        text = _bp_extractor.get_content(html)
        return text or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""


def _fetch_articles(query: str, wanted: int) -> List[dict]:
    """Collect up to *wanted* articles with usable full text for a search query."""
    feed_url = _rss_url(query)
    try:
        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
    except Exception as e:
        print(f"[RSS ERR] {query}: {e}")
        return []
    # feedparser normally swallows download/parse errors and reports them via `bozo`,
    # so check that flag as well as catching the (rare) hard exception above.
    if feed.bozo and not feed.entries:
        print(f"[RSS ERR] {query}: {feed.get('bozo_exception', 'unparsable feed')}")
        return []

    collected: List[dict] = []
    seen_links: set[str] = set()

    for entry in feed.entries:
        link = entry.get("link")
        if not link or link in seen_links:
            continue
        seen_links.add(link)

        body = _extract_fulltext(link)
        if len(body) < 300:  # skip pages where extraction yielded too little text
            continue

        collected.append(
            {
                "title": entry.get("title", ""),
                "url": link,
                "content": body,
                "pubDate": entry.get("published", ""),
                "image": None,
                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
            }
        )
        if len(collected) >= wanted:
            break

    return collected


# Strips any echoed instruction prefix (e.g. "You are ... ARTICLE:") from the model output.
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)


def _summarise(text: str) -> str:
    """Ask Mistral for a one-sentence summary and strip any echoed prompt text."""
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()


def _redis_key(date: str, cat: str) -> str:
    """Redis key for one category's summaries on one day, e.g. headlines:2024-01-01:world."""
    return f"headlines:{date}:{cat}"


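# Illustrative counterpart to the writer below: a minimal sketch of how a caller
# could read the cached summaries back. `get_cached_headlines` is a hypothetical
# helper, not part of the original pipeline; it only assumes the same redis client
# and the key layout produced by _redis_key.
def get_cached_headlines(date: str, cat: str) -> List[dict]:
    raw = _r.get(_redis_key(date, cat))
    return json.loads(raw) if raw else []

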
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title": art["title"],
                "url": art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f" ↳ stored {len(summaries)} items → {redis_key}")

    return all_results
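

# Minimal manual-run sketch (an illustrative assumption, not part of the original
# pipeline): running the module directly refreshes today's cache. It presumes the
# redis client and the Mistral model registry are already configured and reachable.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for cat, items in results.items():
        print(f"{cat}: {len(items)} summaries cached")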