ragV98 committed
Commit fbd9dbe · 1 Parent(s): 8cb2491

ref changes and scraper changes

components/fetchers/scraper.py CHANGED
@@ -4,8 +4,7 @@ from newspaper import Article
 from typing import Optional
 from bs4 import BeautifulSoup
 import logging
-import re # For regex in clean_text
-# from tenacity import retry, wait_exponential, stop_after_after_attempt # If you want to add retries
+import re
 
 # Configure logging at the beginning of your script or module
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -37,11 +36,6 @@ def clean_text(text: str) -> str:
     # Normalize all whitespace characters to single spaces, then strip leading/trailing
     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
 
-    # Optional: Remove common non-content patterns (e.g., "Image: ...", "Photo by ...")
-    # This might be too aggressive for some articles, test carefully
-    # cleaned = re.sub(r'(?:Image|Photo) by [^\n]*\n*', '', cleaned, flags=re.IGNORECASE)
-    # cleaned = re.sub(r'\[\s*\d+\s*[/\\-]\s*\d+\s*\]', '', cleaned) # e.g., [1/5], [2-3]
-
     return cleaned
 
 def is_low_quality(text: str) -> bool:
@@ -81,47 +75,31 @@ def is_low_quality(text: str) -> bool:
         if marker in lower_text:
             low_quality_score += 1
 
-    # Heuristic: if a significant portion of the text appears to be junk markers
-    # Or if too many different markers are present
-    if low_quality_score >= 4: # If 4 or more distinct markers are found
+    if low_quality_score >= 4:
         logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
         return True
 
-    # More advanced heuristic for very short lines, indicating lists/tables/boilerplate
     lines = text.split('\n')
-    if len(lines) > 15: # Only apply if there are enough lines to make sense
-        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7) # Lines with 1-6 words
-        if short_lines_count / len(lines) > 0.4: # If more than 40% of lines are very short
+    if len(lines) > 15:
+        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
+        if short_lines_count / len(lines) > 0.4:
             logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, potential low quality.")
             return True
 
     return False
 
-# @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
-# def _make_request_with_retry(url, timeout, headers):
-#     """Helper for retries if tenacity is enabled."""
-#     response = requests.get(url, timeout=timeout, headers=headers)
-#     response.raise_for_status()
-#     return response
-
-def scrape_url(url: str, timeout: int = 15) -> Optional[str]: # Increased default timeout
-    """
-    Scrapes content from a given URL using Trafilatura and falls back to Newspaper3k.
-    Includes robust error handling and quality checks.
-    """
+def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
     logging.info(f"Attempting to scrape: {url}")
 
     # Try Trafilatura first
     try:
-        # Use _make_request_with_retry if retries are enabled
         response = requests.get(url, timeout=timeout, headers=HEADERS)
-        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
+        response.raise_for_status()
 
-        # Handle encoding more robustly
         try:
             html = response.content.decode(response.apparent_encoding)
         except UnicodeDecodeError:
-            html = response.content.decode('utf-8', errors='ignore') # Fallback to UTF-8 with ignore
+            html = response.content.decode('utf-8', errors='ignore')
 
         extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
 
@@ -131,18 +109,19 @@ def scrape_url(url: str, timeout: int = 15) -> Optional[str]: # Increased defaul
                 logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                 return text
             else:
-                logging.warning(f"Trafilatura: Content identified as low quality for {url}.")
+                # Log when content is identified as low quality by Trafilatura
+                logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url} - Reason: Content identified as low quality.")
         else:
             logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
 
     except requests.exceptions.RequestException as req_err:
         logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
     except Exception as e:
-        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False) # Keep exc_info=False for less verbose logging unless deep debug is needed
+        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)
 
     # Fallback to newspaper3k
     try:
-        article = Article(url, headers=HEADERS, keep_article_html=False) # Pass headers, no need for raw HTML
+        article = Article(url, headers=HEADERS, keep_article_html=False)
         article.download()
         article.parse()
         if article.text:
@@ -151,7 +130,8 @@ def scrape_url(url: str, timeout: int = 15) -> Optional[str]: # Increased defaul
                 logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                 return text
             else:
-                logging.warning(f"Newspaper3k: Content identified as low quality for {url}.")
+                # Log when content is identified as low quality by Newspaper3k
+                logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url} - Reason: Content identified as low quality.")
         else:
            logging.info(f"Newspaper3k returned no main content for: {url}.")
    except requests.exceptions.RequestException as req_err:
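
The deleted comments in scraper.py referenced a tenacity-based retry helper that was never wired in. If transient fetch failures need handling later, it could be reinstated roughly as below; a minimal sketch, assuming tenacity is installed, with HEADERS standing in for the module's own constant and scrape_url calling _make_request_with_retry in place of the bare requests.get:

import requests
from tenacity import retry, stop_after_attempt, wait_exponential

# Stand-in for the module's HEADERS constant (assumption for this sketch).
HEADERS = {"User-Agent": "Mozilla/5.0"}

@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
def _make_request_with_retry(url: str, timeout: int = 15) -> requests.Response:
    """Fetch a URL, retrying up to 3 times with exponential backoff between attempts."""
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()  # 4xx/5xx raise HTTPError, which also triggers a retry
    return response

Whether HTTP 4xx responses should be retried at all is debatable; tenacity's retry_if_exception_type can restrict retries to connection or timeout errors if that is preferred.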
components/generators/daily_feed.py CHANGED
@@ -9,7 +9,6 @@ from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilte
 
 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
@@ -19,13 +18,13 @@ redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
-# 🧠 Prompt for summarization
+# 🧠 Summarization Prompt
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
-    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
+    "Return up to 3 punchy headlines, each under 20 words. Each headline should be followed by a short explanation of why the story matters."
 )
 
-# 📥 Load documents by topic and collect references
+# 📥 Load documents and metadata
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     try:
@@ -40,13 +39,13 @@ def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
             for node in result.nodes:
                 content = node.get_content().strip()
                 ref_id = node.node_id or node.id_ or ""
-                if content and ref_id:
+                if content:
                     topic_docs[topic_key].append({"text": content, "ref": ref_id})
     except Exception as e:
         print("❌ [load_docs_by_topic_with_refs Error]", e)
     return topic_docs
 
-# 🧪 Summarize topic with reference IDs
+# 🧪 Topic summarizer
 def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
     if not docs:
         print(f"⚠️ No docs for topic: {topic_key}")
@@ -67,11 +66,11 @@ def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[
         headlines = response.choices[0].message.content.strip().splitlines()
         result = []
         for i, line in enumerate(headlines):
-            clean_line = line.strip("-–• ")
-            if clean_line:
-                ref_id = docs[i]["ref"] if i < len(docs) else ""
+            line = line.strip("-–• ").strip()
+            if line:
+                ref_id = start_index + i
                 result.append({
-                    "summary": f"{start_index + i}. {clean_line}",
+                    "summary": line,
                     "ref": ref_id,
                     "image_url": "https://source.unsplash.com/800x600/?news",
                     "article_link": f"https://google.com/search?q={topic_key}+news"
@@ -81,39 +80,57 @@ def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[
         print(f"❌ [Summarize topic '{topic_key}' Error]", e)
         return []
 
-# 🚀 Generate and cache full feed
+# 🚀 Generate and cache feed
 def generate_and_cache_daily_feed():
-    print("🆕 Starting daily feed generation with OpenAI...")
-    docs_by_topic = load_docs_by_topic_with_refs()
-    all_feed = []
-    counter = 1
-    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+    try:
+        print("🆕 Generating daily feed...")
+        topic_docs = load_docs_by_topic_with_refs()
+        feed_map = {}
+        global_ref = 1
+
+        for topic_key in TOPIC_KEYS:
+            try:
+                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []), global_ref)
+                feed_map[topic_key] = summaries
+                global_ref += len(summaries)
+            except Exception as e:
+                print(f"❌ [Topic summarization error: {topic_key}]", e)
+                feed_map[topic_key] = []
+
+        final_feed = []
+        for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+            topic_feed = feed_map.get(topic_key, [])
+            final_feed.append({
+                "topic": topic,
+                "feed": topic_feed
+            })
+
+        # Cache to Redis
         try:
-            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
-            counter += len(summaries)
-            all_feed.append({"topic": topic, "feed": summaries})
+            cache_key = "daily_news_feed_cache"
+            redis_client.set(cache_key, json.dumps(final_feed, ensure_ascii=False))
+            redis_client.expire(cache_key, 86400)
+            print(f"✅ Cached feed under key '{cache_key}' with 24-hour expiry.")
        except Exception as e:
-            print(f"❌ [Feed generation error for {topic_key}]", e)
-            all_feed.append({"topic": topic, "feed": []})
+            print("❌ [Redis cache error]", e)
 
-    try:
-        redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
-        redis_client.expire("daily_news_feed_cache", 86400)
-        print("✅ Cached final feed.")
-    except Exception as e:
-        print("❌ [Redis caching error]", e)
+        return final_feed
 
-    return all_feed
+    except Exception as e:
+        print("❌ [generate_and_cache_daily_feed Error]", e)
+        return []
 
-# 🗃️ Fetch from cache
+# 📦 Retrieve from cache
 def get_cached_daily_feed():
     try:
-        data = redis_client.get("daily_news_feed_cache")
-        return json.loads(data) if data else []
+        cache_key = "daily_news_feed_cache"
+        cached = redis_client.get(cache_key)
+        return json.loads(cached) if cached else []
    except Exception as e:
-        print("❌ [Cache fetch error]", e)
+        print("❌ [get_cached_daily_feed Error]", e)
        return []
 
+# 🧪 Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
     print(json.dumps(feed, indent=2, ensure_ascii=False))
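
For anything reading the cache, the payload written by generate_and_cache_daily_feed is a JSON list of topic blocks, and after this change each feed item's ref is a running integer index (start_index + i) rather than a vector-store node id. A minimal consumer sketch, assuming the same UPSTASH_REDIS_URL and cache key as the module; print_cached_feed is an illustrative helper, not part of the codebase:

import json
import os

import redis

REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

def print_cached_feed() -> None:
    """Read the cached daily feed and print one line per headline."""
    cached = redis_client.get("daily_news_feed_cache")
    feed = json.loads(cached) if cached else []
    for block in feed:
        print(block["topic"])
        for item in block["feed"]:
            # Each item carries "summary", "ref", "image_url", "article_link".
            print(f"  [{item['ref']}] {item['summary']}")

if __name__ == "__main__":
    print_cached_feed()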