raghavNCI
committed on
Commit
·
89716e4
1
Parent(s):
6a3e0a5
headline generator first trial
Browse files- app.py +6 -3
- nuse_modules/headlines_generator.py +83 -0
app.py
CHANGED
@@ -3,7 +3,8 @@ from routes.category import router # routes.py must be in same folder
|
|
3 |
from routes.question import askMe
|
4 |
from routes.wa_gateway import wa_router
|
5 |
from dotenv import load_dotenv
|
6 |
-
|
|
|
7 |
from fastapi.middleware.cors import CORSMiddleware
|
8 |
|
9 |
|
@@ -23,8 +24,10 @@ app.add_middleware(
|
|
23 |
)
|
24 |
|
25 |
@app.on_event("startup")
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
|
29 |
@app.get("/health")
|
30 |
def health_check():
|
|
|
3 |
from routes.question import askMe
|
4 |
from routes.wa_gateway import wa_router
|
5 |
from dotenv import load_dotenv
|
6 |
+
import asyncio
|
7 |
+
from nuse_modules.headlines_generator import generate_and_store_headlines
|
8 |
from fastapi.middleware.cors import CORSMiddleware
|
9 |
|
10 |
|
|
|
24 |
)
|
25 |
|
26 |
@app.on_event("startup")
def fetch_and_cache_articles() -> None:
    """Start headline generation in the background when the app starts.

    ``generate_and_store_headlines`` is handed to the event loop's default
    thread-pool executor so that startup does not wait for it to finish.
    The job is fire-and-forget: nothing awaits its result, so a done-callback
    is attached to log any exception raised in the worker thread.
    """
    import logging  # local import: keeps this handler self-contained

    def _report_failure(future) -> None:
        # Nothing awaits this future, so without the callback a failure in
        # the worker thread would never be reported explicitly.
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            logging.getLogger(__name__).error(
                "Headline generation failed", exc_info=exc
            )

    # Startup handlers run inside the server's event loop, so the running
    # loop is the one whose executor should be used.
    loop = asyncio.get_running_loop()
    # Run in default thread-pool executor
    future = loop.run_in_executor(None, generate_and_store_headlines)
    future.add_done_callback(_report_failure)
|
31 |
|
32 |
@app.get("/health")
|
33 |
def health_check():
|
nuse_modules/headlines_generator.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import datetime as _dt
|
3 |
+
import json, os
|
4 |
+
from typing import List, Dict
|
5 |
+
|
6 |
+
from clients.redis_client import redis_client as _r
|
7 |
+
from nuse_modules.google_search import search_google_news
|
8 |
+
from models_initialization.mistral_registry import mistral_generate
|
9 |
+
|
10 |
+
|
11 |
+
_CATEGORIES = {
|
12 |
+
"world": "world news top stories",
|
13 |
+
"india": "india top stories",
|
14 |
+
"finance": "business finance economy today",
|
15 |
+
"sports": "sports headlines today",
|
16 |
+
"entertainment": "entertainment celebrity movie tv",
|
17 |
+
}
|
18 |
+
|
19 |
+
_ARTICLES_PER_CAT = 5
|
20 |
+
_SUMMARY_TOKENS = 120
|
21 |
+
_REDIS_TTL_SECONDS = 24 * 3600
|
22 |
+
|
23 |
+
def _dedupe_urls(articles: List[dict]) -> List[dict]:
|
24 |
+
seen = set()
|
25 |
+
out = []
|
26 |
+
for art in articles:
|
27 |
+
if art["link"] not in seen:
|
28 |
+
seen.add(art["link"])
|
29 |
+
out.append(art)
|
30 |
+
return out
|
31 |
+
|
32 |
+
|
33 |
+
def _summarise_article(article: dict) -> str:
    """Ask the Mistral model for a one-sentence summary of *article*.

    The article's ``"content"`` value is appended to a fixed instruction and
    the prompt is sent to ``mistral_generate`` with at most
    ``_SUMMARY_TOKENS`` new tokens and a temperature of 0.3.
    """
    instructions = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names."
    )
    body = article["content"]
    prompt = f"{instructions}\n\nARTICLE:\n{body}"
    return mistral_generate(
        prompt,
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
|
40 |
+
|
41 |
+
|
42 |
+
def _redis_key(date: str, category: str) -> str:
|
43 |
+
return f"headlines:{date}:{category}"
|
44 |
+
|
45 |
+
|
46 |
+
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """Fetch, summarise and cache the top headlines for every category.

    For each entry in ``_CATEGORIES`` this searches Google News, drops
    articles with a repeated URL, summarises every article that has scraped
    content, and stores the resulting list as JSON in Redis under
    ``headlines:<date>:<category>`` with a ``_REDIS_TTL_SECONDS`` expiry.

    Args:
        today: Date string used in the Redis keys. When omitted or empty,
            the current UTC date formatted as ``YYYY-MM-DD`` is used.

    Returns:
        Mapping of category name to the list of items stored for it; each
        item has ``title``, ``url``, ``summary`` and ``source_snippet`` keys
        (useful for logging / testing).
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # yields the same formatted date.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat, query in _CATEGORIES.items():
        print(f"[HEADLINES] {cat.title()} …")

        # 1. Google -> list of {title, link, snippet, content}
        raw_articles = search_google_news([query], num_results=_ARTICLES_PER_CAT)
        raw_articles = _dedupe_urls(raw_articles)

        # 2. Summarise each article
        summaries = []
        for art in raw_articles:
            # Skip if the scraper failed. .get() also covers a result that
            # has no "content" key at all, not just an empty value.
            if not art.get("content"):
                continue
            summary = _summarise_article(art)
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["link"],
                    "summary": summary,
                    "source_snippet": art["snippet"],
                }
            )

        # 3. Store in Upstash Redis
        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)

        all_output[cat] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output
|