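"""Generate daily news headlines.

Fetches Google News RSS results for a fixed set of categories, extracts each
article's full text, summarises it with Mistral, and caches the summaries in
Redis keyed by date and category.
"""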
from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List

import feedparser
import requests
from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate


_CATEGORIES: dict[str, str] = {
    "world": "world news",
    "india": "india top stories",
    "finance": "finance business economy",
    "sports": "sports headlines",
    "entertainment": "entertainment celebrity movies tv",
}

_ARTICLES_PER_CAT = 5  # articles kept per category
_SUMMARY_TOKENS = 120  # max new tokens for each summary
_REDIS_TTL_SECONDS = 24 * 3600  # cached headlines expire after one day
_RSS_TIMEOUT = 10  # seconds; currently unused -- feedparser handles the RSS request itself
_ARTICLE_TIMEOUT = 10  # seconds per article download


def _rss_url(query: str) -> str:
    """Return a Google News RSS search URL for *query* (US English edition)."""
    query = requests.utils.quote(query)
    return (
        "https://news.google.com/rss/search?q=" + query +
        "&hl=en-US&gl=US&ceid=US:en"
    )


# Reusable boilerpy3 extractor for pulling the main article text out of raw HTML.
_bp_extractor = extractors.ArticleExtractor()


def _extract_fulltext(url: str) -> str:
    """Download *url* and return the extracted article text (empty string on failure)."""
    try:
        html = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_ARTICLE_TIMEOUT,
        ).text
        text = _bp_extractor.get_content(html)
        return text or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""


def _fetch_articles(query: str, wanted: int) -> List[dict]:
    """Collect up to *wanted* articles with usable full text for a search query."""
    feed_url = _rss_url(query)
    try:
        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
    except Exception as e:
        print(f"[RSS ERR] {query}: {e}")
        return []
    # feedparser normally swallows download/parse errors and reports them via `bozo`,
    # so check that flag as well as catching the (rare) hard exception above.
    if feed.bozo and not feed.entries:
        print(f"[RSS ERR] {query}: {feed.get('bozo_exception', 'unparsable feed')}")
        return []

    collected: List[dict] = []
    seen_links: set[str] = set()

    for entry in feed.entries:
        link = entry.get("link")
        if not link or link in seen_links:
            continue
        seen_links.add(link)

        body = _extract_fulltext(link)
        if len(body) < 300:  # skip pages where extraction yielded too little text
            continue

        collected.append(
            {
                "title": entry.get("title", ""),
                "url": link,
                "content": body,
                "pubDate": entry.get("published", ""),
                "image": None,
                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
            }
        )
        if len(collected) >= wanted:
            break

    return collected


# Strips any echoed instruction prefix (e.g. "You are ... ARTICLE:") from the model output.
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)


def _summarise(text: str) -> str:
    """Ask Mistral for a one-sentence summary and strip any echoed prompt text."""
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()


def _redis_key(date: str, cat: str) -> str:
    """Redis key for one category's summaries on one day, e.g. headlines:2024-01-01:world."""
    return f"headlines:{date}:{cat}"


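# Illustrative counterpart to the writer below: a minimal sketch of how a caller
# could read the cached summaries back. `get_cached_headlines` is a hypothetical
# helper, not part of the original pipeline; it only assumes the same redis client
# and the key layout produced by _redis_key.
def get_cached_headlines(date: str, cat: str) -> List[dict]:
    raw = _r.get(_redis_key(date, cat))
    return json.loads(raw) if raw else []

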
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title": art["title"],
                "url": art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f" ↳ stored {len(summaries)} items → {redis_key}")

    return all_results
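

# Minimal manual-run sketch (an illustrative assumption, not part of the original
# pipeline): running the module directly refreshes today's cache. It presumes the
# redis client and the Mistral model registry are already configured and reachable.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for cat, items in results.items():
        print(f"{cat}: {len(items)} summaries cached")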