Spaces:

nuseAI
/

fastAPIv2

Sleeping

App Files Files Community

ragV98 committed on Jul 19

Commit

0e7d7a3

1 Parent(s): 3464963

god knows

Browse files

Files changed (2) hide show

components/generators/daily_feed.py +19 -13
pipeline/news_ingest.py +10 -51

components/generators/daily_feed.py CHANGED Viewed

@@ -12,30 +12,34 @@ from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.schema import Document
 from llama_index.core.settings import Settings
-# ✅ Disable OpenAI LLM fallback
 Settings.llm = None
-# 🔐 Load environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
-MISTRAL_URL = os.environ.get("MISTRAL_URL")       # Mistral inference endpoint
-HF_TOKEN = os.environ.get("HF_TOKEN")             # Hugging Face access token
-# ✅ Connect to Redis
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
-# 🔍 Topics to query and summarize
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
-# 🧠 Prompt builder for summarization
 def build_prompt(content: str, topic: str) -> str:
     """Compose the summarization prompt for one piece of content.

     Args:
         content: Text to summarize; appended after a blank line.
         topic: Topic label embedded at the end of the instruction sentence.

     Returns:
         The fixed instruction text, the topic, two newlines, then ``content``.
     """
     instruction = "You are a news summarizer. Summarize the following content in 25-30 words. "
     details = f"Make it engaging and informative. Include appropriate emojis. Topic: {topic}\n\n{content}"
     return instruction + details
-# 🔗 Call Mistral API
 def call_mistral(prompt: str) -> str:
     headers = {
         "Authorization": f"Bearer {HF_TOKEN}",
         "Content-Type": "application/json"
@@ -45,6 +49,7 @@ def call_mistral(prompt: str) -> str:
             {"role": "user", "content": prompt}
         ]
     }
     try:
         response = requests.post(MISTRAL_URL, headers=headers, json=payload, timeout=20)
         response.raise_for_status()
@@ -53,7 +58,7 @@ def call_mistral(prompt: str) -> str:
         print(f"⚠️ Mistral error: {e}")
         return None
-# ✂️ Summarize a list of documents into a short news feed
 def summarize_topic(docs: List[str], topic: str) -> List[Dict]:
     feed = []
     for doc in docs[:5]:
@@ -67,30 +72,31 @@ def summarize_topic(docs: List[str], topic: str) -> List[Dict]:
             })
     return feed
-# 🚀 Main pipeline
 def generate_and_cache_daily_feed(documents: List[Document]):
     """Build a per-topic summary feed from ``documents`` and store it in Redis.

     For every entry in ``TOPICS`` the documents are queried through a
     retriever-backed query engine, the text of the returned source nodes is
     handed to ``summarize_topic``, and the result is collected. The complete
     feed is then JSON-encoded and written under ``REDIS_KEY``.

     Args:
         documents: Documents used to build the vector index.

     Returns:
         A list with one ``{"topic": ..., "feed": ...}`` dict per topic, in
         ``TOPICS`` order.
     """
     index = VectorStoreIndex.from_documents(documents)
     retriever = index.as_retriever()
     query_engine = RetrieverQueryEngine(retriever=retriever)
     final_feed = []
     for topic in TOPICS:
         print(f"\n🔍 Generating for: {topic}")
         response = query_engine.query(topic)
         # Keep only the text of the nodes the query returned.
         docs = [str(node.get_content()) for node in response.source_nodes]
-        print("Procured docs", docs)
         topic_feed = summarize_topic(docs, topic)
         # Topic label is the lowercased topic with " news" removed.
         final_feed.append({
             "topic": topic.lower().replace(" news", ""),
             "feed": topic_feed
         })
-    # 💾 Cache to Redis
     # NOTE(review): REDIS_KEY is read from an environment variable and is None
     # when that variable is unset — confirm the client accepts a None key.
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
-# 🧪 Redis fetch (for use in APIs)
 def get_cached_daily_feed():
     """Return the feed stored in Redis under ``REDIS_KEY``.

     Returns:
         The JSON-decoded value, or an empty list when the lookup yields
         nothing (or an empty value).
     """
     raw_feed = redis_client.get(REDIS_KEY)
     if not raw_feed:
         return []
     return json.loads(raw_feed)

 from llama_index.core.schema import Document
 from llama_index.core.settings import Settings
+# ✅ Disable implicit LLM usage (prevents OpenAI fallback)
 Settings.llm = None
+# 🔐 Environment variables
 # Redis connection URL; defaults to a local instance when the variable is unset.
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 # NOTE(review): this value is used further down as the Redis *key* for the
 # cached feed, yet it is read from a variable named ..._TOKEN and is None
 # when that variable is unset — confirm this is intended.
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+MISTRAL_URL = os.environ.get("MISTRAL_URL")       # Hugging Face endpoint
+HF_TOKEN = os.environ.get("HF_TOKEN")             # Hugging Face token
+# ✅ Redis client
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
+# 📰 Topics
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
+# ✍️ Build summarization prompt
 def build_prompt(content: str, topic: str) -> str:
     """Compose the summarization prompt for one piece of content.

     Args:
         content: Text to summarize; appended after a blank line.
         topic: Topic label embedded at the end of the instruction sentence.

     Returns:
         The fixed instruction text, the topic, two newlines, then ``content``.
     """
     instruction = "You are a news summarizer. Summarize the following content in 25-30 words. "
     details = f"Make it engaging and informative. Include appropriate emojis. Topic: {topic}\n\n{content}"
     return instruction + details
+# 🧠 Call Mistral via Hugging Face endpoint
 def call_mistral(prompt: str) -> str:
+    if not prompt or len(prompt.strip()) < 10:
+        print(f"⚠️ Skipping empty or invalid prompt:\n{prompt}\n")
+        return None
     headers = {
         "Authorization": f"Bearer {HF_TOKEN}",
         "Content-Type": "application/json"
             {"role": "user", "content": prompt}
         ]
     }
     try:
         response = requests.post(MISTRAL_URL, headers=headers, json=payload, timeout=20)
         response.raise_for_status()
         print(f"⚠️ Mistral error: {e}")
         return None
+# ✂️ Summarize documents for a given topic
 def summarize_topic(docs: List[str], topic: str) -> List[Dict]:
     feed = []
     for doc in docs[:5]:
             })
     return feed
+# 🔁 Main pipeline: generate and cache feed
 def generate_and_cache_daily_feed(documents: List[Document]):
     """Build a per-topic summary feed from ``documents`` and store it in Redis.

     For every entry in ``TOPICS`` the documents are queried through a
     retriever-backed query engine, the text of the returned source nodes is
     handed to ``summarize_topic``, and the result is collected. The complete
     feed is then JSON-encoded and written under ``REDIS_KEY``.

     Args:
         documents: Documents used to build the vector index.

     Returns:
         A list with one ``{"topic": ..., "feed": ...}`` dict per topic, in
         ``TOPICS`` order.
     """
     index = VectorStoreIndex.from_documents(documents)
     retriever = index.as_retriever()
     query_engine = RetrieverQueryEngine(retriever=retriever)
     final_feed = []
     for topic in TOPICS:
         print(f"\n🔍 Generating for: {topic}")
         response = query_engine.query(topic)
         # Keep only the text of the nodes the query returned.
         docs = [str(node.get_content()) for node in response.source_nodes]
         topic_feed = summarize_topic(docs, topic)
         # Topic label is the lowercased topic with " news" removed.
         final_feed.append({
             "topic": topic.lower().replace(" news", ""),
             "feed": topic_feed
         })
+    # 💾 Cache in Redis
     # NOTE(review): REDIS_KEY is read from an environment variable and is None
     # when that variable is unset — confirm the client accepts a None key.
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
+# 📦 For API or debugging
 def get_cached_daily_feed():
     """Return the feed stored in Redis under ``REDIS_KEY``.

     Returns:
         The JSON-decoded value, or an empty list when the lookup yields
         nothing (or an empty value).
     """
     raw_feed = redis_client.get(REDIS_KEY)
     if not raw_feed:
         return []
     return json.loads(raw_feed)

pipeline/news_ingest.py CHANGED Viewed

@@ -1,43 +1,5 @@
-import sys
 import os
-import json
-from typing import List, Dict
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from components.indexers.news_indexer import get_or_build_index_from_docs
-from components.fetchers.google_search import fetch_google_news
-from components.fetchers.scraper import scrape_url
-from components.generators.daily_feed import generate_and_cache_daily_feed
-from llama_index.core.settings import Settings
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.core.schema import Document
-# ✅ Set up local embedding model
-Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
-# 🔐 Environment variables
-API_KEY = os.environ.get("GOOGLE_API_KEY")
-CSE_ID = os.environ.get("GOOGLE_CX_ID")  # ✅ fixed typo
-# ✅ News topics to fetch
-QUERIES = [
-    "India news", "World news", "Tech news", "Finance news", "Sports news"
-]
-# ✅ Paths
-INDEX_DIR = "storage/index"
-DATA_DIR = "data/news"
-RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
-def write_articles_jsonl(articles: List[Dict], file_path: str):
-    os.makedirs(os.path.dirname(file_path), exist_ok=True)
-    with open(file_path, "w", encoding="utf-8") as f:
-        for article in articles:
-            f.write(json.dumps(article, ensure_ascii=False) + "\n")
 import sys
-import os
 import json
 import asyncio
 from typing import List, Dict
@@ -52,31 +14,29 @@ from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.core.schema import Document
-# ✅ Set up local embedding model
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
 # 🔐 Environment variables
 API_KEY = os.environ.get("GOOGLE_API_KEY")
 CSE_ID = os.environ.get("GOOGLE_CX_ID")
-# ✅ News topics to fetch
-QUERIES = [
-    "India news", "World news", "Tech news", "Finance news", "Sports news"
-]
-# ✅ Paths
-INDEX_DIR = "storage/index"
 DATA_DIR = "data/news"
 RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
 def write_articles_jsonl(articles: List[Dict], file_path: str):
     """Write ``articles`` to ``file_path`` as JSON Lines, one object per line.

     The parent directory is created when it is missing. The file is opened
     in write mode, so any existing content is replaced.

     Args:
         articles: JSON-serializable dictionaries, written in order.
         file_path: Destination path; may be a bare filename.
     """
     parent_dir = os.path.dirname(file_path)
     # os.makedirs("") raises FileNotFoundError, so only create a directory
     # when the path actually names one.
     if parent_dir:
         os.makedirs(parent_dir, exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as f:
         f.writelines(
             json.dumps(article, ensure_ascii=False) + "\n" for article in articles
         )
 async def build_documents(data: List[Dict]) -> List[Document]:
     return [
         Document(
@@ -91,7 +51,7 @@ async def build_documents(data: List[Dict]) -> List[Document]:
         for entry in data
     ]
 async def main():
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
@@ -142,11 +102,10 @@ async def main():
     get_or_build_index_from_docs(documents)
     print("⚡ Generating daily feed...")
-    await generate_and_cache_daily_feed(documents)
     print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")
 if __name__ == "__main__":
     asyncio.run(main())

 import os
 import sys
 import json
 import asyncio
 from typing import List, Dict
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.core.schema import Document
+# ✅ Use local embedding model
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
 # 🔐 Environment variables
 API_KEY = os.environ.get("GOOGLE_API_KEY")
 CSE_ID = os.environ.get("GOOGLE_CX_ID")
+# 📰 Topics
+QUERIES = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
+# 🗂️ Paths
 DATA_DIR = "data/news"
 RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
+INDEX_DIR = "storage/index"
+# 💾 Save articles to disk
 def write_articles_jsonl(articles: List[Dict], file_path: str):
     """Write ``articles`` to ``file_path`` as JSON Lines, one object per line.

     The parent directory is created when it is missing. The file is opened
     in write mode, so any existing content is replaced.

     Args:
         articles: JSON-serializable dictionaries, written in order.
         file_path: Destination path; may be a bare filename.
     """
     parent_dir = os.path.dirname(file_path)
     # os.makedirs("") raises FileNotFoundError, so only create a directory
     # when the path actually names one.
     if parent_dir:
         os.makedirs(parent_dir, exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as f:
         f.writelines(
             json.dumps(article, ensure_ascii=False) + "\n" for article in articles
         )
+# 📄 Convert raw scraped data into Document objects
 async def build_documents(data: List[Dict]) -> List[Document]:
     return [
         Document(
         for entry in data
     ]
+# 🚀 Main pipeline runner
 async def main():
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
     get_or_build_index_from_docs(documents)
     print("⚡ Generating daily feed...")
+    generate_and_cache_daily_feed(documents)  # ✅ SYNC CALL
     print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")
+# 🏁 Entrypoint
 if __name__ == "__main__":
     asyncio.run(main())