scraper refinements
- components/fetchers/scraper.py +123 -22
- components/generators/daily_feed.py +65 -129
- pipeline/news_ingest.py +4 -1
components/fetchers/scraper.py
CHANGED
@@ -3,6 +3,12 @@ import trafilatura
 from newspaper import Article
 from typing import Optional
 from bs4 import BeautifulSoup
+import logging
+import re  # For regex in clean_text
+# from tenacity import retry, wait_exponential, stop_after_attempt  # If you want to add retries
+
+# Configure logging at the beginning of your script or module
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 HEADERS = {
     "User-Agent": (
@@ -13,50 +19,145 @@ HEADERS = {
 }
 
 def clean_text(text: str) -> str:
+    """
+    Cleans extracted text by removing HTML tags, normalizing whitespace,
+    and optionally removing common non-content patterns.
+    """
+    if not text:
+        return ""
+
     soup = BeautifulSoup(text, "html.parser")
+
+    # Add double newlines after paragraphs to preserve some structure
+    for p in soup.find_all('p'):
+        p.append('\n\n')
+
     cleaned = soup.get_text(separator=" ", strip=True)
+
+    # Normalize all whitespace characters to single spaces, then strip leading/trailing
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+
+    # Optional: Remove common non-content patterns (e.g., "Image: ...", "Photo by ...")
+    # This might be too aggressive for some articles, test carefully
+    # cleaned = re.sub(r'(?:Image|Photo) by [^\n]*\n*', '', cleaned, flags=re.IGNORECASE)
+    # cleaned = re.sub(r'\[\s*\d+\s*[/\\-]\s*\d+\s*\]', '', cleaned)  # e.g., [1/5], [2-3]
+
     return cleaned
 
 def is_low_quality(text: str) -> bool:
+    """
+    Detect navigation garbage, footers, or low-word-count dumps.
+    Uses an expanded list of junk markers and word count checks.
+    """
+    if not text:
+        logging.debug("Text is empty, considered low quality.")
         return True
+
+    words = text.split()
+    if len(words) < 150:  # Increased minimum word count slightly for better content
+        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
+        return True
+
+    # Expanded list of common junk phrases/markers
     junk_markers = [
+        "subscribe to our newsletter", "cookie policy", "terms and conditions",
+        "privacy statement", "all rights reserved", "contact us", "about us",
+        "careers", "sitemap", "advertisement", "sponsored content",
+        "read more", "view all", "back to top", "connect with us",
+        "follow us on", "email us", "download our app", "footer",
+        "comments policy", "disclaimer", "affiliate links", "related posts",
+        "latest updates", "breaking news", "trending topics", "more news",
+        "featured stories", "sign up", "login", "register", "join us",
+        "newsletter signup", "skip to content", "navigation", "main menu",
+        "sidebar", "archive", "categories", "tags", "go to top", "licence",
+        "unlimited access", "support us", "exclusive content", "follow @",
+        "copyright", "imprint", "impressum", "legal notice"
     ]
-    return any(marker in text.lower() for marker in junk_markers)
 
+    low_quality_score = 0
+    lower_text = text.lower()
+
+    for marker in junk_markers:
+        if marker in lower_text:
+            low_quality_score += 1
+
+    # Heuristic: if a significant portion of the text appears to be junk markers,
+    # or if too many different markers are present
+    if low_quality_score >= 4:  # If 4 or more distinct markers are found
+        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
+        return True
+
+    # More advanced heuristic for very short lines, indicating lists/tables/boilerplate
+    lines = text.split('\n')
+    if len(lines) > 15:  # Only apply if there are enough lines to make sense
+        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)  # Lines with 1-6 words
+        if short_lines_count / len(lines) > 0.4:  # If more than 40% of lines are very short
+            logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, potential low quality.")
+            return True
+
+    return False
+
+# @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+# def _make_request_with_retry(url, timeout, headers):
+#     """Helper for retries if tenacity is enabled."""
+#     response = requests.get(url, timeout=timeout, headers=headers)
+#     response.raise_for_status()
+#     return response
+
+def scrape_url(url: str, timeout: int = 15) -> Optional[str]:  # Increased default timeout
+    """
+    Scrapes content from a given URL using Trafilatura and falls back to Newspaper3k.
+    Includes robust error handling and quality checks.
+    """
+    logging.info(f"Attempting to scrape: {url}")
+
     # Try Trafilatura first
     try:
+        # Use _make_request_with_retry if retries are enabled
         response = requests.get(url, timeout=timeout, headers=HEADERS)
+        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+        # Handle encoding more robustly
+        try:
+            html = response.content.decode(response.apparent_encoding)
+        except UnicodeDecodeError:
+            html = response.content.decode('utf-8', errors='ignore')  # Fallback to UTF-8 with ignore
+
+        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+
+        if extracted:
+            text = clean_text(extracted)
+            if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
+                return text
+            else:
+                logging.warning(f"Trafilatura: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
+
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
     except Exception as e:
+        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)  # Keep exc_info=False for less verbose logging unless deep debug is needed
 
     # Fallback to newspaper3k
     try:
-        article = Article(url)
+        article = Article(url, headers=HEADERS, keep_article_html=False)  # Pass headers, no need for raw HTML
         article.download()
         article.parse()
         if article.text:
             text = clean_text(article.text)
             if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                 return text
             else:
+                logging.warning(f"Newspaper3k: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Newspaper3k returned no main content for: {url}.")
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Newspaper3k (Requests) failed for {url}: {req_err}")
     except Exception as e:
+        logging.error(f"Newspaper3k (Parsing/Processing) failed for {url}: {e}", exc_info=False)
 
+    logging.error(f"Failed to extract quality content from: {url} using both methods.")
+    return None
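For quick manual checks, a minimal usage sketch of the refined helpers is below. It assumes the repository root is on PYTHONPATH so components.fetchers.scraper imports cleanly; the URL is a placeholder, not a tested fixture.

# Minimal sketch, not part of the change itself.
from components.fetchers.scraper import scrape_url, is_low_quality

# A short junk-only snippet trips the 150-word minimum in is_low_quality.
print(is_low_quality("Subscribe to our newsletter. Cookie policy. Sign up. Login."))  # True

# scrape_url tries Trafilatura first, falls back to Newspaper3k, and returns None on failure.
text = scrape_url("https://example.com/some-article", timeout=15)
if text:
    print(f"Extracted {len(text.split())} words")
else:
    print("No usable content extracted by either extractor.")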
components/generators/daily_feed.py
CHANGED
@@ -5,179 +5,115 @@ import numpy as np
 from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
-from llama_index.core import StorageContext
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 
 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
+redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
+# 📰 Topics
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
+# 🧠 Prompt for summarization
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
+    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
 )
 
+# 📥 Load documents by topic and collect references
+def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     try:
         vector_store = get_upstash_vector_store()
+        for full_topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+            filters = MetadataFilters(
+                filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
+            )
+            dummy_vector = np.random.rand(384).tolist()
+            query = VectorStoreQuery(query_embedding=dummy_vector, similarity_top_k=50, filters=filters)
+            result = vector_store.query(query)
+            for node in result.nodes:
+                content = node.get_content().strip()
+                ref_id = node.node_id or node.id_ or ""
+                if content and ref_id:
+                    topic_docs[topic_key].append({"text": content, "ref": ref_id})
     except Exception as e:
+        print("❌ [load_docs_by_topic_with_refs Error]", e)
     return topic_docs
 
+# 🧪 Summarize topic with reference IDs
+def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
     if not docs:
+        print(f"⚠️ No docs for topic: {topic_key}")
         return []
 
     try:
+        content = "\n\n---\n\n".join([d["text"] for d in docs])[:12000]
         client = OpenAI(api_key=OPENAI_API_KEY)
+        response = client.chat.completions.create(
+            model="gpt-4",
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
+            max_tokens=512,
+            temperature=0.7,
         )
+        headlines = response.choices[0].message.content.strip().splitlines()
+        result = []
+        for i, line in enumerate(headlines):
+            clean_line = line.strip("-–• ")
+            if clean_line:
+                ref_id = docs[i]["ref"] if i < len(docs) else ""
+                result.append({
+                    "summary": f"{start_index + i}. {clean_line}",
+                    "ref": ref_id,
+                    "image_url": "https://source.unsplash.com/800x600/?news",
+                    "article_link": f"https://google.com/search?q={topic_key}+news"
                 })
+        return result
     except Exception as e:
+        print(f"❌ [Summarize topic '{topic_key}' Error]", e)
         return []
 
+# Generate and cache full feed
 def generate_and_cache_daily_feed():
+    print("Starting daily feed generation with OpenAI...")
+    docs_by_topic = load_docs_by_topic_with_refs()
+    all_feed = []
+    counter = 1
+    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
         try:
+            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
+            counter += len(summaries)
+            all_feed.append({"topic": topic, "feed": summaries})
         except Exception as e:
+            print(f"❌ [Feed generation error for {topic_key}]", e)
+            all_feed.append({"topic": topic, "feed": []})
 
+    try:
+        redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
+        redis_client.expire("daily_news_feed_cache", 86400)
+        print("✅ Cached final feed.")
     except Exception as e:
+        print("❌ [Redis caching error]", e)
+
+    return all_feed
 
+# Fetch from cache
 def get_cached_daily_feed():
     try:
+        data = redis_client.get("daily_news_feed_cache")
+        return json.loads(data) if data else []
     except Exception as e:
+        print("❌ [Cache fetch error]", e)
         return []
 
 if __name__ == "__main__":
+    feed = generate_and_cache_daily_feed()
+    print(json.dumps(feed, indent=2, ensure_ascii=False))
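For reference, generate_and_cache_daily_feed() writes one entry per topic to the Redis key "daily_news_feed_cache"; a sketch of the payload shape is below (field values are illustrative, not real output).

# Illustrative shape of the cached payload (values are made up).
example_feed = [
    {
        "topic": "India news",
        "feed": [
            {
                "summary": "1. Example headline under 20 words, plus why it matters",
                "ref": "vector-store-node-id",
                "image_url": "https://source.unsplash.com/800x600/?news",
                "article_link": "https://google.com/search?q=india+news",
            }
        ],
    },
    # ...one dict per entry in TOPICS
]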
pipeline/news_ingest.py
CHANGED
@@ -58,6 +58,7 @@ async def main():
     print("Fetching news URLs from Google...")
 
     all_articles = []
+    counter = 1  # ✅ Initialize global counter
 
     for query in QUERIES:
         print(f"Searching for: {query}")
@@ -76,13 +77,15 @@ async def main():
             article_text = scrape_url(url)
 
             if article_text:
+                numbered_title = f"{counter}. {title}"  # ✅ Add headline number
                 all_articles.append({
                     "topic": query,
-                    "title": title,
+                    "title": numbered_title,
                     "url": url,
                     "source": source,
                     "content": article_text
                 })
+                counter += 1
             else:
                 print(f"⚠️ Skipped: {url}")
 