from __future__ import annotations
import datetime as _dt
import json
import os
import re
from typing import List, Dict

import requests
from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
# CONFIG  – GNews.io API
# ──────────────────────────────────────────────────────────────
GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
assert GNEWS_API_KEY, "❌ GNEWS_API_KEY missing (add to Space secrets or .env)"

_CATEGORIES: dict[str, str] = {
    "world":         "world",
    "india":         "india",
    "finance":       "finance business economy",
    "sports":        "sports",
    "entertainment": "entertainment celebrity",
}

_ARTICLES_PER_CAT   = 5
_SUMMARY_TOKENS     = 120
_REDIS_TTL_SECONDS  = 24 * 3600
_REQ_TIMEOUT        = 10
_MIN_BODY_LENGTH    = 120

_bp_extractor = extractors.ArticleExtractor()
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# ──────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────

def _gnews_url(query: str, max_res: int = 10) -> str:
    """Build a GNews /search URL for *query* (English, capped at *max_res* results)."""
    q = requests.utils.quote(query)
    return (
        "https://gnews.io/api/v4/search?"  # paid plans allow /top-headlines but /search works on free
        f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
    )
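
# Example (illustrative; <KEY> stands in for the real token at runtime):
#   _gnews_url("finance business economy", 10) →
#   "https://gnews.io/api/v4/search?q=finance%20business%20economy&lang=en&max=10&token=<KEY>"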


def _extract_fulltext(url: str) -> str:
    """Fetch *url* and return the boilerplate-stripped article body ("" on failure)."""
    try:
        html = requests.get(url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True).text
        return _bp_extractor.get_content(html) or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""


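# The GNews /search JSON is consumed via these fields only (a partial,
# code-derived sketch — not the full API schema):
#   {"articles": [{"title", "description", "url", "image", "publishedAt"}, ...]}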
def _fetch_articles(query: str, wanted: int) -> List[dict]:
    """Query GNews for *query* and return up to *wanted* articles with scraped full text."""
    url = _gnews_url(query, max_res=wanted * 2)  # fetch extra to account for skips
    try:
        resp = requests.get(url, timeout=_REQ_TIMEOUT)
        resp.raise_for_status()  # surface HTTP errors (bad key, rate limit) instead of parsing an error body
        data = resp.json()
    except Exception as e:
        print(f"[GNEWS ERR] {query}: {e}")
        return []

    collected: List[dict] = []
    seen_urls: set[str] = set()

    for item in data.get("articles", []):
        link = item.get("url")
        if not link or link in seen_urls:
            continue
        seen_urls.add(link)

        body = _extract_fulltext(link)
        if len(body) < _MIN_BODY_LENGTH:
            continue

        collected.append({
            "title": item.get("title"),
            "url":   link,
            "content": body,
            "pubDate": item.get("publishedAt"),
            "image":  item.get("image"),
            "source_snippet": item.get("description", ""),
        })
        if len(collected) >= wanted:
            break

    return collected

# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise(text: str) -> str:
    """One-sentence summary via Mistral, with any echoed prompt stripped."""
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()

# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────

def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"
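

# ── Example (illustrative): reading the cache back ────────────
# A minimal consumer-side sketch, assuming the same redis_client; the
# helper name `fetch_cached_headlines` is hypothetical, not part of the
# pipeline above. Returns [] when nothing is cached for that date/category.
def fetch_cached_headlines(date: str, cat: str) -> List[dict]:
    raw = _r.get(_redis_key(date, cat))
    return json.loads(raw) if raw else []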

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────

def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise, and cache headlines via GNews API."""
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title":   art["title"],
                "url":     art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image":   art["image"],
                "pubDate": art["pubDate"],
            })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"  ↳ stored {len(summaries)} items β†’ {redis_key}")

    return all_results
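

# ── Usage (illustrative sketch) ────────────────────────────────
# Running the module directly refreshes today's cache. Assumes the
# GNEWS_API_KEY secret, the Redis client, and the Mistral registry are
# configured as above; this block is a hedged example, not part of any
# scheduler the project may already wire in.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for cat, items in results.items():
        print(f"{cat}: {len(items)} summaries cached")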