from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
from typing import List, Dict
import requests
from boilerpy3 import extractors
from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
# ──────────────────────────────────────────────────────────────
# CONFIG – GNews.io API
# ──────────────────────────────────────────────────────────────
GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
assert GNEWS_API_KEY, "GNEWS_API_KEY missing (add to Space secrets or .env)"
_CATEGORIES: dict[str, str] = {
    "world": "world",
    "india": "india",
    "finance": "finance business economy",
    "sports": "sports",
    "entertainment": "entertainment celebrity",
}
_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
_REQ_TIMEOUT = 10
_MIN_BODY_LENGTH = 120
_bp_extractor = extractors.ArticleExtractor()
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# ──────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────
def _gnews_url(query: str, max_res: int = 10) -> str:
    """Build a GNews /search URL for `query` (URL-encoded), capped at `max_res` results."""
    q = requests.utils.quote(query)
    return (
        "https://gnews.io/api/v4/search?"  # paid plans allow /top-headlines, but /search works on the free tier
        f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
    )
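# Illustrative example of the URL shape (token shown as a placeholder, not a real key):
#   _gnews_url("finance business economy", max_res=10)
#   -> "https://gnews.io/api/v4/search?q=finance%20business%20economy&lang=en&max=10&token=<GNEWS_API_KEY>"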
def _extract_fulltext(url: str) -> str:
    """Download `url` and return the main article text, or "" on any failure."""
    try:
        html = requests.get(url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True).text
        return _bp_extractor.get_content(html) or ""
    except Exception as e:
        print(f"[SCRAPE ERR] {url}: {e}")
        return ""
def _fetch_articles(query: str, wanted: int) -> List[dict]:
    """Query GNews for `query`, scrape each hit's full text, and return up to `wanted` usable articles."""
    url = _gnews_url(query, max_res=wanted * 2)  # fetch extra to account for skips
    try:
        data = requests.get(url, timeout=_REQ_TIMEOUT).json()
    except Exception as e:
        print(f"[GNEWS ERR] {query}: {e}")
        return []
    collected: List[dict] = []
    seen_urls: set[str] = set()
    for item in data.get("articles", []):
        link = item.get("url")
        if not link or link in seen_urls:
            continue
        seen_urls.add(link)
        body = _extract_fulltext(link)
        if len(body) < _MIN_BODY_LENGTH:
            continue  # skip pages whose scraped text is too short to summarise
        collected.append({
            "title": item.get("title"),
            "url": link,
            "content": body,
            "pubDate": item.get("publishedAt"),
            "image": item.get("image"),
            "source_snippet": item.get("description", ""),
        })
        if len(collected) >= wanted:
            break
    return collected
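# Each returned item is a plain dict with the keys consumed downstream:
#   {"title", "url", "content", "pubDate", "image", "source_snippet"}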
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
def _summarise(text: str) -> str:
    """Return a one-sentence summary of `text`, with any echoed prompt text stripped."""
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{text}"
    )
    raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
    return _RE_PROMPT_ECHO.sub("", raw).strip()
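# _RE_PROMPT_ECHO matches the instruction block ("You are ... ARTICLE:") so it can be
# removed if the model echoes the prompt back before its summary.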
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"
# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise, and cache headlines via the GNews API, one Redis key per category."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}
    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")
        articles = _fetch_articles(query, _ARTICLES_PER_CAT)
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
            summaries.append({
                "title": art["title"],
                "url": art["url"],
                "summary": summary_txt,
                "source_snippet": art["source_snippet"],
                "image": art["image"],
                "pubDate": art["pubDate"],
            })
        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_results[cat] = summaries
        print(f"   ↳ stored {len(summaries)} items → {redis_key}")
    return all_results
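# Minimal manual-run sketch: assumes GNEWS_API_KEY is set in the environment and that
# clients.redis_client points at a reachable Redis instance.
if __name__ == "__main__":
    results = generate_and_store_headlines()
    for category, items in results.items():
        print(f"{category}: {len(items)} summaries cached")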