ref changes and scraper changes
- components/fetchers/scraper.py  +14 −34
- components/generators/daily_feed.py  +49 −32
components/fetchers/scraper.py
CHANGED

@@ -4,8 +4,7 @@ from newspaper import Article
 from typing import Optional
 from bs4 import BeautifulSoup
 import logging
 import re
-# from tenacity import retry, wait_exponential, stop_after_after_attempt # If you want to add retries
 
 # Configure logging at the beginning of your script or module
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@@ -37,11 +36,6 @@ def clean_text(text: str) -> str:
     # Normalize all whitespace characters to single spaces, then strip leading/trailing
     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
 
-    # Optional: Remove common non-content patterns (e.g., "Image: ...", "Photo by ...")
-    # This might be too aggressive for some articles, test carefully
-    # cleaned = re.sub(r'(?:Image|Photo) by [^\n]*\n*', '', cleaned, flags=re.IGNORECASE)
-    # cleaned = re.sub(r'\[\s*\d+\s*[/\\-]\s*\d+\s*\]', '', cleaned)  # e.g., [1/5], [2-3]
-
     return cleaned
 
 def is_low_quality(text: str) -> bool:

@@ -81,47 +75,31 @@ def is_low_quality(text: str) -> bool:
         if marker in lower_text:
             low_quality_score += 1
 
-    # Or if too many different markers are present
-    if low_quality_score >= 4:  # If 4 or more distinct markers are found
+    if low_quality_score >= 4:
         logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
         return True
 
-    # More advanced heuristic for very short lines, indicating lists/tables/boilerplate
     lines = text.split('\n')
     if len(lines) > 15:
         short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
         if short_lines_count / len(lines) > 0.4:
             logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, potential low quality.")
             return True
 
     return False
 
-# def _make_request_with_retry(url, timeout, headers):
-#     """Helper for retries if tenacity is enabled."""
-#     response = requests.get(url, timeout=timeout, headers=headers)
-#     response.raise_for_status()
-#     return response
-
-def scrape_url(url: str, timeout: int = 15) -> Optional[str]:  # Increased default timeout
-    """
-    Scrapes content from a given URL using Trafilatura and falls back to Newspaper3k.
-    Includes robust error handling and quality checks.
-    """
+def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
     logging.info(f"Attempting to scrape: {url}")
 
     # Try Trafilatura first
     try:
-        # Use _make_request_with_retry if retries are enabled
         response = requests.get(url, timeout=timeout, headers=HEADERS)
         response.raise_for_status()
 
-        # Handle encoding more robustly
         try:
             html = response.content.decode(response.apparent_encoding)
         except UnicodeDecodeError:
             html = response.content.decode('utf-8', errors='ignore')
 
         extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)

@@ -131,18 +109,19 @@ def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
                 logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                 return text
             else:
+                # Log when content is identified as low quality by Trafilatura
+                logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url} - Reason: Content identified as low quality.")
         else:
             logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
 
     except requests.exceptions.RequestException as req_err:
         logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
     except Exception as e:
         logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)
 
     # Fallback to newspaper3k
     try:
         article = Article(url, headers=HEADERS, keep_article_html=False)
         article.download()
         article.parse()
         if article.text:

@@ -151,7 +130,8 @@ def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
                 logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                 return text
             else:
+                # Log when content is identified as low quality by Newspaper3k
+                logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url} - Reason: Content identified as low quality.")
         else:
             logging.info(f"Newspaper3k returned no main content for: {url}.")
     except requests.exceptions.RequestException as req_err:
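The change also drops the commented-out _make_request_with_retry helper and the tenacity import hint. If request retries are ever reinstated, a minimal sketch of that helper, assuming the tenacity package is installed and reusing the module's existing HEADERS dict, could look like this; it is illustrative only and not part of this change:

# Hypothetical sketch, not part of the change above.
# Assumes `tenacity` is installed and HEADERS is the module-level header dict used by scrape_url().
import requests
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def _make_request_with_retry(url: str, timeout: int, headers: dict) -> requests.Response:
    # Raise on HTTP errors so tenacity treats them as retryable failures.
    response = requests.get(url, timeout=timeout, headers=headers)
    response.raise_for_status()
    return response

scrape_url() could then call this helper in place of the direct requests.get() call; the new LOW_QUALITY_CONTENT warnings still fire after extraction, so transport failures and quality rejections remain distinguishable in the logs.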
components/generators/daily_feed.py
CHANGED

@@ -9,7 +9,6 @@ from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilters
 
 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client

@@ -19,13 +18,13 @@ redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
-# 🧠 Prompt
+# 🧠 Summarization Prompt
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
-    "Return up to 3 punchy headlines, each under 20 words
+    "Return up to 3 punchy headlines, each under 20 words. Each headline should be followed by a short explanation of why the story matters."
 )
 
-# Load documents
+# Load documents and metadata
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     try:

@@ -40,13 +39,13 @@ def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
             for node in result.nodes:
                 content = node.get_content().strip()
                 ref_id = node.node_id or node.id_ or ""
-                if content
+                if content:
                     topic_docs[topic_key].append({"text": content, "ref": ref_id})
     except Exception as e:
         print("❌ [load_docs_by_topic_with_refs Error]", e)
     return topic_docs
 
-# 🧪
+# 🧪 Topic summarizer
 def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
     if not docs:
         print(f"⚠️ No docs for topic: {topic_key}")

@@ -67,11 +66,11 @@ def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
         headlines = response.choices[0].message.content.strip().splitlines()
         result = []
         for i, line in enumerate(headlines):
+            line = line.strip("-–• ").strip()
+            if line:
+                ref_id = start_index + i
                 result.append({
+                    "summary": line,
                     "ref": ref_id,
                     "image_url": "https://source.unsplash.com/800x600/?news",
                     "article_link": f"https://google.com/search?q={topic_key}+news"

@@ -81,39 +80,57 @@ def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
         print(f"❌ [Summarize topic '{topic_key}' Error]", e)
         return []
 
-# Generate and cache
+# Generate and cache feed
 def generate_and_cache_daily_feed():
+    try:
+        print("Generating daily feed...")
+        topic_docs = load_docs_by_topic_with_refs()
+        feed_map = {}
+        global_ref = 1
+
+        for topic_key in TOPIC_KEYS:
+            try:
+                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []), global_ref)
+                feed_map[topic_key] = summaries
+                global_ref += len(summaries)
+            except Exception as e:
+                print(f"❌ [Topic summarization error: {topic_key}]", e)
+                feed_map[topic_key] = []
-                all_feed.append({"topic": topic, "feed": []})
+
+        final_feed = []
+        for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+            topic_feed = feed_map.get(topic_key, [])
+            final_feed.append({
+                "topic": topic,
+                "feed": topic_feed
+            })
+
+        # Cache to Redis
         try:
-            redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
-            redis_client.expire("daily_news_feed_cache", 86400)
-            print("✅ Cached final feed.")
+            cache_key = "daily_news_feed_cache"
+            redis_client.set(cache_key, json.dumps(final_feed, ensure_ascii=False))
+            redis_client.expire(cache_key, 86400)
+            print(f"✅ Cached feed under key '{cache_key}' with 24-hour expiry.")
         except Exception as e:
-            print("❌ [Redis caching error]", e)
+            print("❌ [Redis cache error]", e)
+
+        return final_feed
 
+    except Exception as e:
+        print("❌ [generate_and_cache_daily_feed Error]", e)
+        return []
 
+# Retrieve from cache
 def get_cached_daily_feed():
     try:
+        cache_key = "daily_news_feed_cache"
+        cached = redis_client.get(cache_key)
+        return json.loads(cached) if cached else []
     except Exception as e:
+        print("❌ [get_cached_daily_feed Error]", e)
         return []
 
+# 🧪 Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
     print(json.dumps(feed, indent=2, ensure_ascii=False))
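For orientation, a minimal sketch of how a consumer might read the feed cached above. It assumes the components packages are importable from the caller's path and that the same Redis instance is reachable; the payload shape simply mirrors the fields assembled in summarize_topic() and generate_and_cache_daily_feed():

# Hypothetical consumer sketch; assumes `components` is importable and Redis is reachable.
from components.generators.daily_feed import get_cached_daily_feed

feed = get_cached_daily_feed()
# Expected shape, mirroring the fields built above:
# [
#   {"topic": "India news",
#    "feed": [{"summary": "<headline>", "ref": 1,
#              "image_url": "https://source.unsplash.com/800x600/?news",
#              "article_link": "https://google.com/search?q=india+news"}, ...]},
#   ...
# ]
for block in feed:
    print(f"{block['topic']}: {len(block['feed'])} headlines")

Because generate_and_cache_daily_feed() now returns final_feed directly, a caller can also use its return value and fall back to get_cached_daily_feed() only when that return value is empty.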