raghavNCI committed
Commit 011e38f · Parent(s): 2049c5d

using gnews

nuse_modules/headlines_generator.py CHANGED
@@ -7,106 +7,83 @@ import time
 from typing import List, Dict
 
 import requests
-import feedparser
 from boilerpy3 import extractors
 
 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate
 
 # ──────────────────────────────────────────────────────────────
-# CONFIG (Google News RSS, no external API keys needed)
+# CONFIG – GNews.io API
 # ──────────────────────────────────────────────────────────────
+GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
+assert GNEWS_API_KEY, "❌ GNEWS_API_KEY missing (add to Space secrets or .env)"
+
 _CATEGORIES: dict[str, str] = {
-    "world": "world news",
-    "india": "india top stories",
+    "world": "world",
+    "india": "india",
     "finance": "finance business economy",
-    "sports": "sports headlines",
-    "entertainment": "entertainment celebrity movies tv",
+    "sports": "sports",
+    "entertainment": "entertainment celebrity",
 }
 
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_RSS_TIMEOUT = 10  # seconds
-_ARTICLE_TIMEOUT = 10  # seconds
-_MIN_BODY_LENGTH = 120  # relaxed threshold so short briefs pass
-
-# Google News RSS search template
+_REQ_TIMEOUT = 10
+_MIN_BODY_LENGTH = 120
 
-def _rss_url(query: str) -> str:
-    query = requests.utils.quote(query)
-    return (
-        "https://news.google.com/rss/search?q=" + query +
-        "&hl=en-US&gl=US&ceid=US:en"
-    )
-
-# BoilerPy3 extractor (thread‑safe singleton)
 _bp_extractor = extractors.ArticleExtractor()
-
-# Common browser UA to avoid 403s
-_HEADERS = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114 Safari/537.36"
-    )
-}
+_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
 
 # ──────────────────────────────────────────────────────────────
-# FETCH RSS + ARTICLE BODY
+# HELPERS
 # ──────────────────────────────────────────────────────────────
 
-def _follow_google_redirect(html: str) -> str | None:
-    """Extract the real URL from a Google News redirect HTML page."""
-    match = re.search(r'url=(https?[^"\']+)', html, flags=re.I)
-    return match.group(1) if match else None
+def _gnews_url(query: str, max_res: int = 10) -> str:
+    q = requests.utils.quote(query)
+    return (
+        "https://gnews.io/api/v4/search?"  # paid plans allow /top-headlines but /search works on free
+        f"q={q}&lang=en&max={max_res}&token={GNEWS_API_KEY}"
+    )
 
 
 def _extract_fulltext(url: str) -> str:
     try:
-        resp = requests.get(url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT, allow_redirects=True)
-        html = resp.text
-
-        # If still on news.google.com and meta refresh present → follow manually
-        if "news.google.com" in resp.url and "http-equiv=\"refresh\"" in html.lower():
-            real_url = _follow_google_redirect(html)
-            if real_url:
-                html = requests.get(real_url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT).text
-
-        text = _bp_extractor.get_content(html)
-        return text or ""
+        html = requests.get(url, headers=_HEADERS, timeout=_REQ_TIMEOUT, allow_redirects=True).text
+        return _bp_extractor.get_content(html) or ""
     except Exception as e:
         print(f"[SCRAPE ERR] {url}: {e}")
         return ""
 
 
 def _fetch_articles(query: str, wanted: int) -> List[dict]:
-    feed_url = _rss_url(query)
+    url = _gnews_url(query, max_res=wanted * 2)  # fetch extra to account for skips
     try:
-        feed = feedparser.parse(feed_url, request_headers=_HEADERS)
+        data = requests.get(url, timeout=_REQ_TIMEOUT).json()
     except Exception as e:
-        print(f"[RSS ERR] {query}: {e}")
+        print(f"[GNEWS ERR] {query}: {e}")
         return []
 
     collected: List[dict] = []
-    seen_links: set[str] = set()
+    seen_urls: set[str] = set()
 
-    for entry in feed.entries:
-        link = entry.link
-        if link in seen_links:
+    for item in data.get("articles", []):
+        link = item.get("url")
+        if not link or link in seen_urls:
             continue
-        seen_links.add(link)
+        seen_urls.add(link)
 
         body = _extract_fulltext(link)
         if len(body) < _MIN_BODY_LENGTH:
-            continue  # skip very short pages/homepages
+            continue
 
         collected.append({
-            "title": entry.title,
+            "title": item.get("title"),
             "url": link,
             "content": body,
-            "pubDate": entry.get("published", ""),
-            "image": None,  # can scrape OG tag later
-            "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+            "pubDate": item.get("publishedAt"),
+            "image": item.get("image"),
+            "source_snippet": item.get("description", ""),
        })
         if len(collected) >= wanted:
             break
@@ -118,7 +95,6 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
 # ──────────────────────────────────────────────────────────────
 _RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
-
 def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
@@ -140,7 +116,7 @@ def _redis_key(date: str, cat: str) -> str:
 # ──────────────────────────────────────────────────────────────
 
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
-    """Fetches, summarises, and caches headlines via Google News RSS."""
+    """Fetch, summarise, and cache headlines via GNews API."""
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
     all_results: Dict[str, List[dict]] = {}
 
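Note on the new fetch path: _fetch_articles now assumes the GNews v4 /search endpoint returns a JSON object with an "articles" list whose items expose title, description, url, image, and publishedAt, since those are exactly the fields it copies into each collected entry. A minimal sketch of that contract follows; the sample values are invented for illustration, not real API output.

# Illustrative GNews /search response shape assumed by _fetch_articles; values are made up.
sample_response = {
    "articles": [
        {
            "title": "Example headline",
            "description": "One-line teaser, stored as source_snippet",
            "url": "https://example.com/story",
            "image": "https://example.com/story.jpg",
            "publishedAt": "2024-01-01T12:00:00Z",
        }
    ]
}

for item in sample_response.get("articles", []):
    # Same lookups the diff performs before scraping the full article text.
    print(item.get("title"), item.get("url"), item.get("publishedAt"))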
requirements.txt CHANGED
@@ -8,6 +8,3 @@ accelerate
 torch
 huggingface_hub
 boilerpy3==1.0.6
-feedparser
-newspaper3k
-nltk
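With feedparser, newspaper3k, and nltk dropped, the fetch path depends only on requests and boilerpy3, plus a GNEWS_API_KEY in the environment (the module asserts on import if it is missing). A rough usage sketch, assuming generate_and_store_headlines returns the all_results dict keyed by the _CATEGORIES names and that the per-category lists keep the title/url fields built in _fetch_articles (the return statement is outside this diff):

# Hypothetical local run; the token value is a placeholder and must be set before import.
import os
os.environ.setdefault("GNEWS_API_KEY", "your-gnews-token")

from nuse_modules.headlines_generator import generate_and_store_headlines

results = generate_and_store_headlines()  # e.g. {"world": [...], "finance": [...], ...}
for article in results.get("finance", []):
    print(article["title"], article["url"])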