ref_id changes and prompt revision
components/generators/daily_feed.py  +113 -37
pipeline/news_ingest.py  +11 -5
components/generators/daily_feed.py
CHANGED
@@ -6,56 +6,106 @@ from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

 # Redis client
+try:
+    redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
+except Exception as e:
+    logging.error(f"[Redis Init Error]: {e}")
+    # Decide if you want to raise an exception or continue without Redis
+    raise  # It's critical for caching, so raising is appropriate here

 # Topics
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]

 # Summarization Prompt
+# Removed the "numbered headline" instruction as we want LLM to just give plain headlines
 BASE_PROMPT = (
     "You are Nuse's editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Return up to 3 punchy headlines, each under 20 words. Each headline should be followed by a short explanation of why the story matters."
+    "Don't include unnecessary text like 'this is important because' or 'why this matters because..' just state the logic and nothing else."
 )

 # Load documents and metadata
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
+    logging.info("Starting to load documents by topic from Upstash Vector Store...")
     try:
         vector_store = get_upstash_vector_store()
+        for full_topic_name, topic_key in zip(TOPICS, TOPIC_KEYS):
             filters = MetadataFilters(
                 filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
             )
+            # A random vector for querying, adjust similarity_top_k based on your data density
+            dummy_vector = np.random.rand(384).tolist()
             query = VectorStoreQuery(query_embedding=dummy_vector, similarity_top_k=50, filters=filters)
+
+            logging.info(f"Querying for topic '{topic_key}' with filter value '{topic_key}'.")
             result = vector_store.query(query)
+            logging.info(f"Found {len(result.nodes)} nodes for topic '{topic_key}'.")
+
             for node in result.nodes:
                 content = node.get_content().strip()
+                # *** IMPORTANT: Retrieve the 'headline_id' from node.metadata ***
+                headline_id = node.metadata.get("headline_id")
+
+                # You can also get other metadata like title, url, source if needed for the summary output or debugging
+                title = node.metadata.get("title", "No Title")
+                url = node.metadata.get("url", "#")
+                source = node.metadata.get("source", "Unknown Source")
+
+                if content and headline_id is not None:
+                    topic_docs[topic_key].append({
+                        "text": content,
+                        "headline_id": headline_id,  # Store the actual ID
+                        "title": title,
+                        "url": url,
+                        "source": source
+                    })
+                elif content and headline_id is None:
+                    logging.warning(f"Node found without 'headline_id' for topic '{topic_key}': URL {node.metadata.get('url', 'N/A')}")
+
     except Exception as e:
+        logging.error(f"[load_docs_by_topic_with_refs Error]: {e}", exc_info=True)
     return topic_docs

 # Topic summarizer
+def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:  # Removed start_index
     if not docs:
+        logging.warning(f"No docs for topic: {topic_key}, skipping summarization.")
+        return []
+
+    # Get the headline_id of the first document to use as a representative ID for the summary
+    # This assumes the summary is broadly about the topic, and we just need *a* representative ID.
+    representative_headline_id = docs[0].get("headline_id") if docs else None
+    representative_article_link = docs[0].get("url") if docs else f"https://google.com/search?q={topic_key}+news"
+    representative_title = docs[0].get("title") if docs else f"Summary for {topic_key}"
+
+    # Concatenate document texts for summarization
+    # Ensure all document texts are strings before joining
+    content = "\n\n---\n\n".join([str(d["text"]) for d in docs if "text" in d and d["text"] is not None])
+
+    if not content:
+        logging.warning(f"No valid text content found in docs for topic: {topic_key}, skipping summarization.")
         return []

+    content = content[:12000]  # Truncate to avoid excessive token usage
+
+    logging.info(f"Summarizing topic via OpenAI: '{topic_key}' ({len(docs)} documents)")
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
         response = client.chat.completions.create(
+            model="gpt-4",  # Consider "gpt-4o" or "gpt-3.5-turbo" for cost/speed
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
@@ -63,45 +113,60 @@ def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[
             max_tokens=512,
             temperature=0.7,
         )
+        llm_output = response.choices[0].message.content.strip()
+
+        # Parse headlines based on the expected format
+        headlines = []
+        for line in llm_output.splitlines():
+            line = line.strip("-–• ")  # Clean bullet points
+            if ":" in line:
+                parts = line.split(':', 1)  # Split only on the first colon
+                if len(parts) == 2:
+                    headline_text = parts[0].strip()
+                    explanation_text = parts[1].strip()
+                    if headline_text and explanation_text:  # Ensure both parts are non-empty
+                        headlines.append({"summary": headline_text, "explanation": explanation_text})
+
         result = []
+        # Assign the representative ID to each generated summary
+        for i, h_item in enumerate(headlines):
+            result.append({
+                "summary": h_item["summary"],
+                "explanation": h_item["explanation"],
+                "headline_id": representative_headline_id,  # Use the representative ID
+                "image_url": "https://source.unsplash.com/800x600/?news",
+                "article_link": representative_article_link,  # Use representative link
+                "representative_title": representative_title  # Add the representative title for context
+            })
+
+        logging.info(f"Successfully generated {len(result)} summaries for topic '{topic_key}'.")
         return result
     except Exception as e:
+        logging.error(f"[Summarize topic '{topic_key}' Error]: {e}", exc_info=True)
         return []

 # Generate and cache feed
 def generate_and_cache_daily_feed():
     try:
+        logging.info("Generating daily feed...")
+        topic_docs = load_docs_by_topic_with_refs()  # This now returns docs with 'headline_id'
         feed_map = {}
+        # global_ref is no longer needed in this context for generating summary IDs

         for topic_key in TOPIC_KEYS:
             try:
+                # Pass the documents directly to the summarizer
+                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
                 feed_map[topic_key] = summaries
             except Exception as e:
+                logging.error(f"[Topic summarization loop error for '{topic_key}']: {e}", exc_info=True)
                 feed_map[topic_key] = []

         final_feed = []
+        for topic_display_name, topic_key in zip(TOPICS, TOPIC_KEYS):
             topic_feed = feed_map.get(topic_key, [])
             final_feed.append({
+                "topic": topic_display_name,
                 "feed": topic_feed
             })

@@ -109,15 +174,15 @@ def generate_and_cache_daily_feed():
         try:
             cache_key = "daily_news_feed_cache"
             redis_client.set(cache_key, json.dumps(final_feed, ensure_ascii=False))
+            redis_client.expire(cache_key, 86400)  # Set expiry to 24 hours
+            logging.info(f"Cached feed under key '{cache_key}' with 24-hour expiry.")
         except Exception as e:
+            logging.error(f"[Redis cache error]: {e}", exc_info=True)

         return final_feed

     except Exception as e:
+        logging.critical(f"[generate_and_cache_daily_feed Overall Error]: {e}", exc_info=True)
         return []

 # Retrieve from cache
@@ -125,12 +190,23 @@ def get_cached_daily_feed():
     try:
         cache_key = "daily_news_feed_cache"
         cached = redis_client.get(cache_key)
+        if cached:
+            logging.info(f"Retrieved cached daily feed from '{cache_key}'.")
+            return json.loads(cached)
+        else:
+            logging.info(f"No cached data found under key '{cache_key}'.")
+            return []
     except Exception as e:
+        logging.error(f"[get_cached_daily_feed Error]: {e}", exc_info=True)
         return []

 # Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
+    print("\n--- Generated Daily Feed ---")
     print(json.dumps(feed, indent=2, ensure_ascii=False))
+
+    # Example of retrieving from cache
+    # cached_data = get_cached_daily_feed()
+    # print("\n--- Cached Daily Feed ---")
+    # print(json.dumps(cached_data, indent=2, ensure_ascii=False))
pipeline/news_ingest.py
CHANGED
@@ -37,10 +37,14 @@ def write_articles_jsonl(articles: List[Dict], file_path: str):

 # Convert raw scraped data into Document objects
 async def build_documents(data: List[Dict]) -> List[Document]:
+    # --- IMPORTANT CHANGE HERE ---
+    # The 'data' list from all_articles already contains the 'headline_id' (which was 'counter').
+    # We will use that directly.
     return [
         Document(
             text=entry["content"],
             metadata={
+                "headline_id": entry["headline_id"],  # Use the pre-assigned ID
                 "title": entry["title"],
                 "url": entry["url"],
                 "topic": entry["topic"].lower().replace(" news", ""),  # normalized topic key
@@ -58,7 +62,8 @@ async def main():
     print("Fetching news URLs from Google...")

     all_articles = []
+    # This counter will be used for your simple sequential ID
+    global_headline_id_counter = 1

     for query in QUERIES:
         print(f"Searching for: {query}")
@@ -77,15 +82,16 @@
             article_text = scrape_url(url)

             if article_text:
+                # Assign the simple sequential ID here
                 all_articles.append({
+                    "headline_id": global_headline_id_counter,  # Assign the unique ID
                     "topic": query,
+                    "title": title,  # Keep title clean, the numbering can be for display later
                     "url": url,
                     "source": source,
                     "content": article_text
                 })
+                global_headline_id_counter += 1  # Increment for the next article
             else:
                 print(f"Skipped: {url}")

@@ -107,4 +113,4 @@

 # Entrypoint
 if __name__ == "__main__":
+    asyncio.run(main())