Spaces:

nuseAI
/

fastAPIv2

Sleeping

App Files Files Community

ragV98 committed on Jul 17

Commit

989b675

1 Parent(s): 8e17b80

changes v1

Browse files

Files changed (2) hide show

components/indexers/news_indexer.py +10 -3
pipeline/news_ingest.py +2 -2

components/indexers/news_indexer.py CHANGED Viewed

@@ -4,33 +4,40 @@ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index
 from llama_index.core.node_parser import SimpleNodeParser
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 import os
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 def get_upstash_vector_store():
     return UpstashVectorStore(
         url=os.environ["UPSTASH_VECTOR_REST_URL"],
         token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
     )
 def build_news_index(data_dir: str) -> VectorStoreIndex:
     documents = SimpleDirectoryReader(data_dir).load_data()
-    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     index = VectorStoreIndex(nodes, storage_context=storage_context)
     return index
 def load_news_index() -> VectorStoreIndex:
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     return load_index_from_storage(storage_context)
 def get_or_build_index(data_dir: str) -> VectorStoreIndex:
-    # This should check if the index already exists in Upstash
     try:
         return load_news_index()
     except Exception:

 from llama_index.core.node_parser import SimpleNodeParser
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document
 import os
+# ✅ Setup embedding
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# ✅ Upstash vector store config
 def get_upstash_vector_store():
     return UpstashVectorStore(
         url=os.environ["UPSTASH_VECTOR_REST_URL"],
         token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
     )
+# ✅ File-based ingestion
 def build_news_index(data_dir: str) -> VectorStoreIndex:
     documents = SimpleDirectoryReader(data_dir).load_data()
+    return get_or_build_index_from_docs(documents)
+# ✅ Direct document ingestion
+def get_or_build_index_from_docs(documents: list[Document]) -> VectorStoreIndex:
+    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     index = VectorStoreIndex(nodes, storage_context=storage_context)
     return index
+# ✅ Load existing index (if no changes in docs)
 def load_news_index() -> VectorStoreIndex:
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     return load_index_from_storage(storage_context)
+# ✅ Preferred file-based entry point
 def get_or_build_index(data_dir: str) -> VectorStoreIndex:
     try:
         return load_news_index()
     except Exception:

pipeline/news_ingest.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import List, Dict
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from components.indexers.news_indexer import get_or_build_index
 from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
@@ -95,6 +95,6 @@ if __name__ == "__main__":
         print("🧠 Building index...")
         documents = build_documents(all_articles)
-        get_or_build_index(documents)
         print(f"✅ Indexed and stored at: {INDEX_DIR}")

 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from components.indexers.news_indexer import get_or_build_index_from_docs
 from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
         print("🧠 Building index...")
         documents = build_documents(all_articles)
+        get_or_build_index_from_docs(documents)
         print(f"✅ Indexed and stored at: {INDEX_DIR}")