from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
from typing import List, Dict

import requests
import feedparser
from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
# CONFIG (Google News RSS, no external API keys needed)
# ──────────────────────────────────────────────────────────────
# Query strings passed into the Google News RSS search feed
_CATEGORIES: dict[str, str] = {
    "world":         "world news",
    "india":         "india top stories",
    "finance":       "finance business economy",
    "sports":        "sports headlines",
    "entertainment": "entertainment celebrity movies tv",
}

_ARTICLES_PER_CAT   = 5
_SUMMARY_TOKENS     = 120
_REDIS_TTL_SECONDS  = 24 * 3600
_RSS_TIMEOUT        = 10  # seconds
_ARTICLE_TIMEOUT    = 10  # seconds

# Google News RSS search template
def _rss_url(query: str) -> str:
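    """Build a Google News RSS search URL for ``query``.

    Illustrative example: "india top stories" becomes
    https://news.google.com/rss/search?q=india%20top%20stories&hl=en-US&gl=US&ceid=US:en
    """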
    query = requests.utils.quote(query)
    return (
        "https://news.google.com/rss/search?q=" + query +
        "&hl=en-US&gl=US&ceid=US:en"
    )

# BoilerPy3 extractor (thread‑safe singleton)
_bp_extractor = extractors.ArticleExtractor()

# ──────────────────────────────────────────────────────────────
# FETCH RSS + ARTICLE BODY
# ──────────────────────────────────────────────────────────────

def _extract_fulltext(url: str) -> str:
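    """Download ``url`` and return the main article text extracted by BoilerPy3.

    Any network or extraction failure is logged and an empty string is returned
    so callers can simply skip the entry.
    """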
    try:
        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
        text = _bp_extractor.get_content(html)
        return text or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""


def _fetch_articles(query: str, wanted: int) -> List[dict]:
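    """Collect up to ``wanted`` articles for a Google News RSS search query.

    Each item carries the entry title, URL, scraped full text, publish date, and
    a plain-text snippet; pages whose extracted body is shorter than 300
    characters are skipped as likely stubs or homepages.
    """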
    feed_url = _rss_url(query)
    try:
        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
    except Exception as e:
        print(f"[RSS ERR] {query}: {e}")
        return []

    collected: List[dict] = []
    seen_links: set[str] = set()

    for entry in feed.entries:
        link = entry.link
        if link in seen_links:
            continue
        seen_links.add(link)

        body = _extract_fulltext(link)
        if len(body) < 300:
            continue  # skip trivial pages/homepages

        collected.append(
            {
                "title": entry.title,
                "url":   link,
                "content": body,
                "pubDate": entry.get("published", ""),
                "image":  None,  # RSS search feed rarely returns image; can scrape OG tag later
                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
            }
        )
        if len(collected) >= wanted:
            break

    return collected

# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise(text: str) -> str:
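    """Generate a one-sentence summary of ``text`` with the Mistral model.

    The regex strips any echo of the instruction prompt that the model may
    prepend to its output.
    """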
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()

# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────

def _redis_key(date: str, cat: str) -> str:
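    """Cache key per day and category, e.g. "headlines:2025-01-01:sports" (illustrative date)."""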
    return f"headlines:{date}:{cat}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────

def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title":   art["title"],
                "url":     art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image":   art["image"],
                "pubDate": art["pubDate"],
            })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"  ↳ stored {len(summaries)} items β†’ {redis_key}")

    return all_results
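
# ──────────────────────────────────────────────────────────────
# MANUAL RUN (illustrative sketch)
# ──────────────────────────────────────────────────────────────
# A minimal way to exercise the pipeline from the command line, assuming the
# Redis client and Mistral registry imported above are reachable in this
# environment; not part of the module's public behaviour.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"[DONE] {category}: {len(items)} summaries cached")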