# nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json
import re
from typing import List, Dict
import requests
import feedparser
from boilerpy3 import extractors
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG (Google News RSS, no external API keys needed)
# ──────────────────────────────────────────────────────────────
# Query strings passed to the Google News RSS search feed
_CATEGORIES: dict[str, str] = {
    "world": "world news",
    "india": "india top stories",
    "finance": "finance business economy",
    "sports": "sports headlines",
    "entertainment": "entertainment celebrity movies tv",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
_RSS_TIMEOUT = 10 # seconds
_ARTICLE_TIMEOUT = 10 # seconds
# Google News RSS search template
def _rss_url(query: str) -> str:
    query = requests.utils.quote(query)
    return (
        "https://news.google.com/rss/search?q=" + query +
        "&hl=en-US&gl=US&ceid=US:en"
    )
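# For example, _rss_url("world news") expands to:
#   https://news.google.com/rss/search?q=world%20news&hl=en-US&gl=US&ceid=US:en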
# BoilerPy3 extractor (thread-safe singleton)
_bp_extractor = extractors.ArticleExtractor()
# ──────────────────────────────────────────────────────────────
# FETCH RSS + ARTICLE BODY
# ──────────────────────────────────────────────────────────────
def _extract_fulltext(url: str) -> str:
    try:
        html = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT
        ).text
        text = _bp_extractor.get_content(html)
        return text or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""
def _fetch_articles(query: str, wanted: int) -> List[dict]:
    feed_url = _rss_url(query)
    try:
        # NOTE: feedparser rarely raises; parse errors usually surface via feed.bozo.
        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
    except Exception as e:
        print(f"[RSS ERR] {query}: {e}")
        return []
    collected: List[dict] = []
    seen_links: set[str] = set()
    for entry in feed.entries:
        link = entry.link
        if link in seen_links:
            continue
        seen_links.add(link)
        body = _extract_fulltext(link)
        if len(body) < 300:
            continue  # skip trivial pages/homepages
        collected.append(
            {
                "title": entry.title,
                "url": link,
                "content": body,
                "pubDate": entry.get("published", ""),
                "image": None,  # RSS search feed rarely returns an image; can scrape OG tag later
                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
            }
        )
        if len(collected) >= wanted:
            break
    return collected
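# Sketch of the "scrape OG tag later" idea noted above: pull the og:image meta
# tag out of the article HTML with a regex. Hypothetical helper, not wired into
# _fetch_articles; a real implementation would use a proper HTML parser and
# handle reversed attribute order.
_OG_IMAGE_RE = re.compile(
    r'<meta[^>]+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']',
    re.IGNORECASE,
)

def _extract_og_image(url: str) -> str | None:
    try:
        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
    except Exception:
        return None
    match = _OG_IMAGE_RE.search(html)
    return match.group(1) if match else None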
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
# Strip any echo of the instruction prompt that the model may prepend to its output.
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise(text: str) -> str:
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise, and cache headlines from Google News RSS (one Redis key per category per day)."""
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}
    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title": art["title"],
                "url": art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            })
        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"  ↳ stored {len(summaries)} items → {redis_key}")
    return all_results
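# Manual-run sketch: executing the module directly runs one full pipeline pass.
# Assumes the Redis client and Mistral registry imported above are configured;
# the per-category count print is illustrative only.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"{category}: {len(items)} summaries cached")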