ragV98 committed on
Commit 8cb2491 · 1 Parent(s): 315bd36

scraper refinements

components/fetchers/scraper.py CHANGED
@@ -3,6 +3,12 @@ import trafilatura
 from newspaper import Article
 from typing import Optional
 from bs4 import BeautifulSoup
+import logging
+import re  # For regex in clean_text
+# from tenacity import retry, wait_exponential, stop_after_attempt  # If you want to add retries
+
+# Configure logging at the beginning of your script or module
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 HEADERS = {
     "User-Agent": (
@@ -13,50 +19,145 @@ HEADERS = {
 }
 
 def clean_text(text: str) -> str:
-    # Remove HTML tags, collapse whitespace
+    """
+    Cleans extracted text by removing HTML tags, normalizing whitespace,
+    and optionally removing common non-content patterns.
+    """
+    if not text:
+        return ""
+
     soup = BeautifulSoup(text, "html.parser")
+
+    # Add double newlines after paragraphs to preserve some structure
+    for p in soup.find_all('p'):
+        p.append('\n\n')
+
     cleaned = soup.get_text(separator=" ", strip=True)
-    cleaned = " ".join(cleaned.split())
+
+    # Normalize all whitespace characters to single spaces, then strip leading/trailing
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+
+    # Optional: Remove common non-content patterns (e.g., "Image: ...", "Photo by ...")
+    # This might be too aggressive for some articles, test carefully
+    # cleaned = re.sub(r'(?:Image|Photo) by [^\n]*\n*', '', cleaned, flags=re.IGNORECASE)
+    # cleaned = re.sub(r'\[\s*\d+\s*[/\\-]\s*\d+\s*\]', '', cleaned)  # e.g., [1/5], [2-3]
+
     return cleaned
 
 def is_low_quality(text: str) -> bool:
-    """Detect navigation garbage, footers, or low-word-count dumps."""
-    if not text or len(text.split()) < 120:
+    """
+    Detect navigation garbage, footers, or low-word-count dumps.
+    Uses an expanded list of junk markers and word count checks.
+    """
+    if not text:
+        logging.debug("Text is empty, considered low quality.")
        return True
+
+    words = text.split()
+    if len(words) < 150:  # Increased minimum word count slightly for better content
+        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
+        return True
+
+    # Expanded list of common junk phrases/markers
     junk_markers = [
-        "subscribe", "click here", "latest headlines", "more from", "privacy policy",
-        "video", "terms of service", "back to top", "all rights reserved"
+        "subscribe to our newsletter", "cookie policy", "terms and conditions",
+        "privacy statement", "all rights reserved", "contact us", "about us",
+        "careers", "sitemap", "advertisement", "sponsored content",
+        "read more", "view all", "back to top", "connect with us",
+        "follow us on", "email us", "download our app", "footer",
+        "comments policy", "disclaimer", "affiliate links", "related posts",
+        "latest updates", "breaking news", "trending topics", "more news",
+        "featured stories", "sign up", "login", "register", "join us",
+        "newsletter signup", "skip to content", "navigation", "main menu",
+        "sidebar", "archive", "categories", "tags", "go to top", "licence",
+        "unlimited access", "support us", "exclusive content", "follow @",
+        "copyright", "imprint", "impressum", "legal notice"
     ]
-    return any(marker in text.lower() for marker in junk_markers)
 
-def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+    low_quality_score = 0
+    lower_text = text.lower()
+
+    for marker in junk_markers:
+        if marker in lower_text:
+            low_quality_score += 1
+
+    # Heuristic: if a significant portion of the text appears to be junk markers
+    # or if too many different markers are present
+    if low_quality_score >= 4:  # If 4 or more distinct markers are found
+        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
+        return True
+
+    # More advanced heuristic for very short lines, indicating lists/tables/boilerplate
+    lines = text.split('\n')
+    if len(lines) > 15:  # Only apply if there are enough lines to make sense
+        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)  # Lines with 1-6 words
+        if short_lines_count / len(lines) > 0.4:  # If more than 40% of lines are very short
+            logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, potential low quality.")
+            return True
+
+    return False
+
+# @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+# def _make_request_with_retry(url, timeout, headers):
+#     """Helper for retries if tenacity is enabled."""
+#     response = requests.get(url, timeout=timeout, headers=headers)
+#     response.raise_for_status()
+#     return response
+
+def scrape_url(url: str, timeout: int = 15) -> Optional[str]:  # Increased default timeout
+    """
+    Scrapes content from a given URL using Trafilatura and falls back to Newspaper3k.
+    Includes robust error handling and quality checks.
+    """
+    logging.info(f"Attempting to scrape: {url}")
+
     # Try Trafilatura first
     try:
+        # Use _make_request_with_retry if retries are enabled
         response = requests.get(url, timeout=timeout, headers=HEADERS)
-        if response.status_code == 200:
-            html = response.text
-            extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
-            if extracted:
-                text = clean_text(extracted)
-                if not is_low_quality(text):
-                    return text
-                else:
-                    print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
+        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+        # Handle encoding more robustly
+        try:
+            html = response.content.decode(response.apparent_encoding)
+        except UnicodeDecodeError:
+            html = response.content.decode('utf-8', errors='ignore')  # Fallback to UTF-8 with ignore
+
+        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+
+        if extracted:
+            text = clean_text(extracted)
+            if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
+                return text
+            else:
+                logging.warning(f"Trafilatura: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
+
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
     except Exception as e:
-        print(f"⚠️ Trafilatura failed for {url}: {e}")
+        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)  # Keep exc_info=False for less verbose logging unless deep debug is needed
 
     # Fallback to newspaper3k
     try:
-        article = Article(url)
+        article = Article(url, headers=HEADERS, keep_article_html=False)  # Pass headers, no need for raw HTML
         article.download()
         article.parse()
         if article.text:
             text = clean_text(article.text)
             if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                 return text
             else:
-                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
+                logging.warning(f"Newspaper3k: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Newspaper3k returned no main content for: {url}.")
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Newspaper3k (Requests) failed for {url}: {req_err}")
     except Exception as e:
-        print(f"⚠️ Newspaper3k failed for {url}: {e}")
+        logging.error(f"Newspaper3k (Parsing/Processing) failed for {url}: {e}", exc_info=False)
 
-    return None
+    logging.error(f"Failed to extract quality content from: {url} using both methods.")
+    return None
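For context, a minimal sketch of how the refined scrape_url could be exercised from elsewhere in the pipeline; the URLs below are placeholders and not part of this commit:

# Minimal usage sketch for the refined scraper; example URLs are placeholders.
from components.fetchers.scraper import scrape_url

urls = [
    "https://example.com/full-length-article",   # hypothetical article page
    "https://example.com/index",                 # likely rejected by is_low_quality
]

for url in urls:
    text = scrape_url(url, timeout=15)
    if text is None:
        # Both Trafilatura and Newspaper3k failed, or the text was judged low quality.
        print(f"No usable content for {url}")
    else:
        print(f"{url}: {len(text.split())} words extracted")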
components/generators/daily_feed.py CHANGED
@@ -5,179 +5,115 @@ import numpy as np
 from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
-from llama_index.core import StorageContext
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 
 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
-try:
-    redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
-except Exception as e:
-    print("❌ [Redis Init Error]", e)
-    raise
+redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
-# 📰 Topic list
+# 📰 Topics
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
-# This list correctly generates 'india', 'world', etc.
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
-# 🧠 Summarization prompt
+# 🧠 Prompt for summarization
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
-    "Return up to 3 punchy headlines, each under 20 words, written like a premium editorial bulletin."
+    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
 )
 
-# 📥 Load documents grouped by topic from Upstash
-def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
+# 📥 Load documents by topic and collect references
+def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
-
     try:
         vector_store = get_upstash_vector_store()
-        print("💡 Successfully retrieved Upstash vector store.")
-
-        # Debugging prints (keep them for now, they are useful)
-        print(f"DEBUG: TOPICS = {TOPICS}")
-        print(f"DEBUG: TOPIC_KEYS = {TOPIC_KEYS}")
-        print(f"DEBUG: Length of TOPICS = {len(TOPICS)}")
-        print(f"DEBUG: Length of TOPIC_KEYS = {len(TOPIC_KEYS)}")
-
-        for full_topic_name, topic_key_for_filter in zip(TOPICS, TOPIC_KEYS):
-            try:
-                # *** THE CRITICAL CHANGE IS HERE ***
-                # Use 'topic_key_for_filter' (e.g., "india") which matches your stored metadata
-                # instead of 'full_topic_name' (e.g., "India news").
-                filters = MetadataFilters(
-                    filters=[
-                        MetadataFilter(key="topic", value=topic_key_for_filter, operator=FilterOperator.EQ)
-                    ]
-                )
-
-                dummy_vector = np.random.rand(384).tolist()  # Assuming MiniLM embeddings
-                query = VectorStoreQuery(
-                    query_embedding=dummy_vector,
-                    similarity_top_k=50,  # Retrieve enough documents for summarization
-                    filters=filters  # Apply the metadata filter
-                )
-
-                print(f"🔎 Querying Upstash for topic: '{full_topic_name}' using filter value '{topic_key_for_filter}'")
-                result = vector_store.query(query)
-                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{full_topic_name}'.")
-
-                for node in result.nodes:
-                    content = node.get_content().strip()
-                    if content:
-                        topic_docs[topic_key_for_filter].append(content)
-                        # Optional: Print metadata to verify filtering
-                        # print(f"   Node metadata: {node.metadata}")
-            except Exception as e:
-                print(f"❌ [Topic Metadata Filter error for '{full_topic_name}']: {e}")
-
+        for full_topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+            filters = MetadataFilters(
+                filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
+            )
+            dummy_vector = np.random.rand(384).tolist()
+            query = VectorStoreQuery(query_embedding=dummy_vector, similarity_top_k=50, filters=filters)
+            result = vector_store.query(query)
+            for node in result.nodes:
+                content = node.get_content().strip()
+                ref_id = node.node_id or node.id_ or ""
+                if content and ref_id:
+                    topic_docs[topic_key].append({"text": content, "ref": ref_id})
     except Exception as e:
-        print("❌ [load_all_documents_grouped_by_topic Error]", e)
-
+        print("❌ [load_docs_by_topic_with_refs Error]", e)
     return topic_docs
 
-# 🧪 Summarize one topic at a time using OpenAI GPT-4
-def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
+# 🧪 Summarize topic with reference IDs
+def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
     if not docs:
-        print(f"⚠️ No docs found for topic: {topic_key}, skipping summarization.")
+        print(f"⚠️ No docs for topic: {topic_key}")
         return []
 
     try:
+        content = "\n\n---\n\n".join([d["text"] for d in docs])[:12000]
         client = OpenAI(api_key=OPENAI_API_KEY)
-        content = "\n\n---\n\n".join(docs)[:12000]
-
-        print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
-        completion = client.chat.completions.create(
-            model="gpt-4",
+        response = client.chat.completions.create(
+            model="gpt-4",
            messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,
-            temperature=0.7,
+            max_tokens=512,
+            temperature=0.7,
         )
-
-        text = completion.choices[0].message.content.strip()
-
-        summaries = []
-        for line in text.splitlines():
-            line = line.strip("-–• ")
-            if line:
-                summaries.append({
-                    "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",
-                    "article_link": f"https://google.com/search?q={topic_key}+news"
+        headlines = response.choices[0].message.content.strip().splitlines()
+        result = []
+        for i, line in enumerate(headlines):
+            clean_line = line.strip("-–• ")
+            if clean_line:
+                ref_id = docs[i]["ref"] if i < len(docs) else ""
+                result.append({
+                    "summary": f"{start_index + i}. {clean_line}",
+                    "ref": ref_id,
+                    "image_url": "https://source.unsplash.com/800x600/?news",
+                    "article_link": f"https://google.com/search?q={topic_key}+news"
                 })
-        return summaries
-
+        return result
     except Exception as e:
-        print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
+        print(f"❌ [Summarize topic '{topic_key}' Error]", e)
         return []
 
-# 🚀 Main callable
+# 🚀 Generate and cache full feed
 def generate_and_cache_daily_feed():
-    try:
-        print("🆕 Running OpenAI-powered daily feed generator....")
-        topic_docs = load_all_documents_grouped_by_topic()
-        feed_map = {}
-
-        for topic_key in TOPIC_KEYS:
-            try:
-                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
-                feed_map[topic_key] = summaries
-            except Exception as e:
-                print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
-                feed_map[topic_key] = []
-
-        # When creating final_feed, use TOPICS for the display name but TOPIC_KEYS for mapping
-        final_feed = [{"topic": display_name, "feed": feed_map[actual_key]}
-                      for display_name, actual_key in zip(TOPICS, TOPIC_KEYS)]
-
+    print("🆕 Starting daily feed generation with OpenAI...")
+    docs_by_topic = load_docs_by_topic_with_refs()
+    all_feed = []
+    counter = 1
+    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
         try:
-            cache_key_name = "daily_news_feed_cache"
-            redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
-            redis_client.expire(cache_key_name, 86400)
-            print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
+            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
+            counter += len(summaries)
+            all_feed.append({"topic": topic, "feed": summaries})
         except Exception as e:
-            print("❌ [Redis Cache Error]", e)
-
-        return final_feed
+            print(f"❌ [Feed generation error for {topic_key}]", e)
+            all_feed.append({"topic": topic, "feed": []})
 
+    try:
+        redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
+        redis_client.expire("daily_news_feed_cache", 86400)
+        print("✅ Cached final feed.")
     except Exception as e:
-        print("❌ [generate_and_cache_daily_feed Overall Error]", e)
-        return []
+        print("❌ [Redis caching error]", e)
+
+    return all_feed
 
-# 📦 Get cached data
+# 🗃️ Fetch from cache
 def get_cached_daily_feed():
     try:
-        cache_key_name = "daily_news_feed_cache"
-        cached = redis_client.get(cache_key_name)
-        if cached:
-            print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
-            return json.loads(cached)
-        else:
-            print(f"ℹ️ No cached data found under key '{cache_key_name}'.")
-            return []
+        data = redis_client.get("daily_news_feed_cache")
+        return json.loads(data) if data else []
     except Exception as e:
-        print("❌ [get_cached_daily_feed Error]", e)
+        print("❌ [Cache fetch error]", e)
         return []
 
-# Example of how to run it (for testing purposes, if this were the main script)
 if __name__ == "__main__":
-    # Ensure your environment variables are set before running
-    # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
-    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"
-    # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
-
-    generated_feed = generate_and_cache_daily_feed()
-    print("\n--- Generated and Cached Feed ---")
-    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))
-
-    cached_feed = get_cached_daily_feed()
-    print("\n--- Retrieved from Cache ---")
-    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))
+    feed = generate_and_cache_daily_feed()
+    print(json.dumps(feed, indent=2, ensure_ascii=False))
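To make the retrieval path above easier to follow, here is a small sketch of the filtered Upstash query that load_docs_by_topic_with_refs runs per topic. fetch_topic_nodes is an illustrative helper, not part of this commit; the 384-dimension random vector mirrors the code's MiniLM-sized embedding assumption, so the metadata filter, not similarity, does the selection.

# Sketch only: the metadata-filtered query pattern used in load_docs_by_topic_with_refs.
import numpy as np
from llama_index.core.vector_stores.types import (
    VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator,
)
from components.indexers.news_indexer import get_upstash_vector_store

def fetch_topic_nodes(topic_key: str, top_k: int = 50):
    """Return the nodes stored under a topic key such as 'india' or 'tech'."""
    vector_store = get_upstash_vector_store()
    filters = MetadataFilters(
        filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
    )
    query = VectorStoreQuery(
        query_embedding=np.random.rand(384).tolist(),  # placeholder vector; the filter drives retrieval
        similarity_top_k=top_k,
        filters=filters,
    )
    return vector_store.query(query).nodes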
pipeline/news_ingest.py CHANGED
@@ -58,6 +58,7 @@ async def main():
     print("🌍 Fetching news URLs from Google...")
 
     all_articles = []
+    counter = 1  # ✅ Initialize global counter
 
     for query in QUERIES:
         print(f"🔍 Searching for: {query}")
@@ -76,13 +77,15 @@
             article_text = scrape_url(url)
 
             if article_text:
+                numbered_title = f"{counter}. {title}"  # ✅ Add headline number
                 all_articles.append({
                     "topic": query,
-                    "title": title,
+                    "title": numbered_title,
                     "url": url,
                     "source": source,
                     "content": article_text
                 })
+                counter += 1
             else:
                 print(f"⚠️ Skipped: {url}")
 
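A quick illustration of what the new counter produces, assuming two accepted articles; the data below is made up for the example and the real pipeline builds it from search results plus scrape_url:

# Illustration only: the global counter numbers article titles across all queries.
counter = 1
all_articles = []
scraped = [  # stand-in data
    ("India news", "Example headline A", "https://example.com/a", "Example Source", "article body ..."),
    ("Tech news", "Example headline B", "https://example.com/b", "Example Source", "article body ..."),
]
for topic, title, url, source, content in scraped:
    all_articles.append({
        "topic": topic,
        "title": f"{counter}. {title}",  # numbered title, as added in this commit
        "url": url,
        "source": source,
        "content": content,
    })
    counter += 1
# all_articles[0]["title"] == "1. Example headline A"
# all_articles[1]["title"] == "2. Example headline B"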