from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
from typing import List, Dict
import requests
import feedparser
from boilerpy3 import extractors
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG (Google News RSS, no external API keys needed)
# ──────────────────────────────────────────────────────────────
# Query strings passed into the Google News RSS search feed
_CATEGORIES: dict[str, str] = {
"world": "world news",
"india": "india top stories",
"finance": "finance business economy",
"sports": "sports headlines",
"entertainment": "entertainment celebrity movies tv",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
_RSS_TIMEOUT = 10 # seconds
_ARTICLE_TIMEOUT = 10 # seconds
# Google News RSS search template
def _rss_url(query: str) -> str:
query = requests.utils.quote(query)
return (
"https://news.google.com/rss/search?q=" + query +
"&hl=en-US&gl=US&ceid=US:en"
)
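# Example (illustrative): _rss_url("india top stories") ->
# https://news.google.com/rss/search?q=india%20top%20stories&hl=en-US&gl=US&ceid=US:en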
# BoilerPy3 extractor (thread-safe singleton)
_bp_extractor = extractors.ArticleExtractor()
# ──────────────────────────────────────────────────────────────
# FETCH RSS + ARTICLE BODY
# ──────────────────────────────────────────────────────────────
def _extract_fulltext(url: str) -> str:
try:
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
text = _bp_extractor.get_content(html)
return text or ""
except Exception as e:
print(f"[SCRAPE ERR] {url}: {e}")
return ""
def _fetch_articles(query: str, wanted: int) -> List[dict]:
    feed_url = _rss_url(query)
    feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
    # feedparser swallows network/parse errors instead of raising, flagging
    # them via `bozo`, so inspect the flag rather than wrapping in try/except.
    if feed.bozo and not feed.entries:
        print(f"[RSS ERR] {query}: {feed.get('bozo_exception', 'unknown error')}")
        return []
collected: List[dict] = []
seen_links: set[str] = set()
for entry in feed.entries:
link = entry.link
if link in seen_links:
continue
seen_links.add(link)
body = _extract_fulltext(link)
if len(body) < 300:
continue # skip trivial pages/homepages
collected.append(
{
"title": entry.title,
"url": link,
"content": body,
"pubDate": entry.get("published", ""),
"image": None, # RSS search feed rarely returns image; can scrape OG tag later
"source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
}
)
if len(collected) >= wanted:
break
return collected
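# Minimal sketch of the og:image scrape referenced above. This helper is
# hypothetical (not wired into _fetch_articles) and uses a rough regex
# heuristic rather than a real HTML parser, so treat it as illustrative only.
_RE_OG_IMAGE = re.compile(
    r'<meta[^>]+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']',
    re.IGNORECASE,
)
def _extract_og_image(url: str) -> str | None:
    try:
        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
        match = _RE_OG_IMAGE.search(html)
        return match.group(1) if match else None
    except Exception as e:
        print(f"[OG ERR] {url}: {e}")
        return None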
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
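# Some instruction-tuned models echo the prompt before answering; this pattern
# strips any leading "You are ... ARTICLE:" echo from the generated summary.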
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
def _summarise(text: str) -> str:
prompt = (
"You are a concise news assistant. Summarise the following article "
"in one sentence (<=25 words). Omit source and author names.\n\n"
f"ARTICLE:\n{text}"
)
raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
return _RE_PROMPT_ECHO.sub("", raw).strip()
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
return f"headlines:{date}:{cat}"
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
"""Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
all_results: Dict[str, List[dict]] = {}
for cat, query in _CATEGORIES.items():
print(f"[HEADLINES] {cat.title()} β¦")
articles = _fetch_articles(query, _ARTICLES_PER_CAT)
summaries: List[dict] = []
for art in articles:
summary_txt = _summarise(art["content"])
summaries.append({
"title": art["title"],
"url": art["url"],
"summary": summary_txt,
"source_snippet": art["source_snippet"],
"image": art["image"],
"pubDate": art["pubDate"],
})
redis_key = _redis_key(date_str, cat)
_r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
all_results[cat] = summaries
print(f" β³ stored {len(summaries)} items β {redis_key}")
return all_results
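# Ad-hoc smoke test (illustrative); assumes Redis and the Mistral model are
# reachable from this environment.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"{category}: {len(items)} summaries cached")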