async/await for cache gen
pipeline/news_ingest.py  +62 -14
CHANGED
@@ -36,7 +36,48 @@ def write_articles_jsonl(articles: List[Dict], file_path: str):
         for article in articles:
             f.write(json.dumps(article, ensure_ascii=False) + "\n")
 
-def build_documents(data: List[Dict]) -> List[Document]:
+import sys
+import os
+import json
+import asyncio
+from typing import List, Dict
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+from components.indexers.news_indexer import get_or_build_index_from_docs
+from components.fetchers.google_search import fetch_google_news
+from components.fetchers.scraper import scrape_url
+from components.generators.daily_feed import generate_and_cache_daily_feed
+from llama_index.core.settings import Settings
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document
+
+# Set up local embedding model
+Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
+
+# Environment variables
+API_KEY = os.environ.get("GOOGLE_API_KEY")
+CSE_ID = os.environ.get("GOOGLE_CX_ID")
+
+# News topics to fetch
+QUERIES = [
+    "India news", "World news", "Tech news", "Finance news", "Sports news"
+]
+
+# Paths
+INDEX_DIR = "storage/index"
+DATA_DIR = "data/news"
+RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
+
+
+def write_articles_jsonl(articles: List[Dict], file_path: str):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w", encoding="utf-8") as f:
+        for article in articles:
+            f.write(json.dumps(article, ensure_ascii=False) + "\n")
+
+
+async def build_documents(data: List[Dict]) -> List[Document]:
     return [
         Document(
             text=entry["content"],
@@ -50,16 +91,17 @@ def build_documents(data: List[Dict]) -> List[Document]:
         for entry in data
     ]
 
-if __name__ == "__main__":
+
+async def main():
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
 
-    print("
+    print("Fetching news URLs from Google...")
 
     all_articles = []
 
     for query in QUERIES:
-        print(f"
+        print(f"Searching for: {query}")
         try:
             results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
             print(f" Found {len(results)} links for '{query}'.")
@@ -71,7 +113,7 @@ if __name__ == "__main__":
             if not url or not title:
                 continue
 
-            print(f"
+            print(f"Scraping: {url}")
             article_text = scrape_url(url)
 
             if article_text:
@@ -90,15 +132,21 @@ if __name__ == "__main__":
 
     if not all_articles:
         print("No content scraped. Exiting.")
-
-    print(f"Writing {len(all_articles)} articles to {RAW_JSON}...")
-    write_articles_jsonl(all_articles, RAW_JSON)
+        return
 
-
-
-    get_or_build_index_from_docs(documents)
+    print(f"Writing {len(all_articles)} articles to {RAW_JSON}...")
+    write_articles_jsonl(all_articles, RAW_JSON)
 
-
-
+    print("Building index...")
+    documents = await build_documents(all_articles)
+    get_or_build_index_from_docs(documents)
+
+    print("Generating daily feed...")
+    await generate_and_cache_daily_feed(documents)
+
+    print(f"Indexed, headlines generated, and stored at: {INDEX_DIR}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
 
-    print(f"Indexed, headlines generated, and stored at: {INDEX_DIR}")
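For context on the pattern this commit adopts: the body of generate_and_cache_daily_feed is not visible in the diff, so the sketch below is a hypothetical stand-in rather than the project's implementation. It only illustrates how an async def main() can await an async cache-generation coroutine and be driven by asyncio.run(), mirroring the new entry point above. The CACHE_PATH constant, the dict-shaped documents, and the asyncio.sleep(0) placeholder are assumptions made for the example.

# Sketch only: minimal stand-in for async cache generation (not part of the commit).
import asyncio
import json
import os
from typing import Dict, List

CACHE_PATH = "storage/daily_feed.json"  # hypothetical cache location, not taken from the diff


async def generate_and_cache_daily_feed(documents: List[Dict]) -> Dict:
    """Build a daily feed and persist it to disk; the real component in
    components/generators/daily_feed.py may look quite different."""
    await asyncio.sleep(0)  # placeholder for awaited I/O or model calls
    feed = {"headlines": [doc.get("title", "untitled") for doc in documents]}
    os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(feed, f, ensure_ascii=False, indent=2)
    return feed


async def main() -> None:
    # Mirrors the shape of the updated entry point: gather documents, then await the cache step.
    documents = [{"title": "Example headline", "content": "Example body"}]
    feed = await generate_and_cache_daily_feed(documents)
    print(f"Cached {len(feed['headlines'])} headlines to {CACHE_PATH}")


if __name__ == "__main__":
    asyncio.run(main())

In the committed pipeline itself, fetch_google_news and scrape_url remain synchronous; only document building and daily-feed generation are awaited, so asyncio.run(main()) mainly provides the scaffold that lets the cache-generation step (and, later, other slow steps) run as coroutines.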