from __future__ import annotations

import datetime as _dt
import json
import os
import re
from typing import Dict, List

import requests
from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
assert GNEWS_API_KEY, "❌ GNEWS_API_KEY missing (add to Space secrets or .env)"

_CATEGORIES: dict[str, str] = {
    "world": "world",
    "india": "india",
    "finance": "finance business economy",
    "sports": "sports",
    "entertainment": "entertainment celebrity",
}

_ARTICLES_PER_CAT = 5            # headlines kept per category
_SUMMARY_TOKENS = 120            # max new tokens per generated summary
_REDIS_TTL_SECONDS = 24 * 3600   # cache entries expire after one day
_REQ_TIMEOUT = 10                # seconds per HTTP request
_MIN_BODY_LENGTH = 120           # skip articles whose extracted text is shorter

_bp_extractor = extractors.ArticleExtractor()
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def _gnews_url(query: str, max_res: int = 10) -> str:
    """Build a GNews v4 search URL for *query*."""
    q = requests.utils.quote(query)
    return (
        "https://gnews.io/api/v4/search?"
        f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
    )


def _extract_fulltext(url: str) -> str:
    """Download *url* and return its main article text, or "" on failure."""
    try:
        html = requests.get(
            url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True
        ).text
        return _bp_extractor.get_content(html) or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""


def _fetch_articles(query: str, wanted: int) -> List[dict]:
    """Query GNews for *query* and return up to *wanted* scrapeable articles."""
    # Over-fetch so articles whose body fails to extract can be skipped.
    url = _gnews_url(query, max_res=wanted * 2)
    try:
        data = requests.get(url, timeout=_REQ_TIMEOUT).json()
    except Exception as e:
        print(f"[GNEWS ERR] {query}: {e}")
        return []

    collected: List[dict] = []
    seen_urls: set[str] = set()

    for item in data.get("articles", []):
        link = item.get("url")
        if not link or link in seen_urls:
            continue
        seen_urls.add(link)

        body = _extract_fulltext(link)
        if len(body) < _MIN_BODY_LENGTH:
            continue

        collected.append({
            "title": item.get("title"),
            "url": link,
            "content": body,
            "pubDate": item.get("publishedAt"),
            "image": item.get("image"),
            "source_snippet": item.get("description", ""),
        })
        if len(collected) >= wanted:
            break

    return collected


# Strip any echo of the instruction block that the model may repeat verbatim.
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)


def _summarise(text: str) -> str:
    """Summarise *text* in a single sentence via the Mistral model."""
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()


def _redis_key(date: str, cat: str) -> str:
    """Cache key for one category's headlines on one day, e.g. headlines:2024-01-01:world."""
    return f"headlines:{date}:{cat}"
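

# Illustrative sketch (not part of the original module): how a caller might read
# cached summaries back out of Redis using the same key scheme. The name
# `get_cached_headlines` is an assumption, not an existing API.
def get_cached_headlines(date: str, cat: str) -> List[dict]:
    """Return the cached summaries for *date*/*cat*, or [] on a cache miss."""
    cached = _r.get(_redis_key(date, cat))
    if not cached:
        return []
    return json.loads(cached)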


def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise, and cache one day's headlines per category via the GNews API."""
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title": art["title"],
                "url": art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f" ↳ stored {len(summaries)} items → {redis_key}")

    return all_results
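

# Minimal usage sketch (an assumption, not in the original file): running the
# module directly refreshes today's cache and prints a per-category count.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"{category}: {len(items)} summaries cached")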