ragV98 committed
Commit 989b675 · 1 Parent(s): 8e17b80

changes v1

components/indexers/news_indexer.py CHANGED
@@ -4,33 +4,40 @@ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index
 from llama_index.core.node_parser import SimpleNodeParser
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document
 import os
 
+# ✅ Setup embedding
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
+# ✅ Upstash vector store config
 def get_upstash_vector_store():
     return UpstashVectorStore(
         url=os.environ["UPSTASH_VECTOR_REST_URL"],
         token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
     )
 
+# ✅ File-based ingestion
 def build_news_index(data_dir: str) -> VectorStoreIndex:
     documents = SimpleDirectoryReader(data_dir).load_data()
-    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
+    return get_or_build_index_from_docs(documents)
 
+# ✅ Direct document ingestion
+def get_or_build_index_from_docs(documents: list[Document]) -> VectorStoreIndex:
+    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
     index = VectorStoreIndex(nodes, storage_context=storage_context)
     return index
 
+# ✅ Load existing index (if no changes in docs)
 def load_news_index() -> VectorStoreIndex:
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     return load_index_from_storage(storage_context)
 
+# ✅ Preferred file-based entry point
 def get_or_build_index(data_dir: str) -> VectorStoreIndex:
-    # This should check if the index already exists in Upstash
     try:
         return load_news_index()
     except Exception:
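
A minimal usage sketch for the refactored indexer, not part of the commit: it assumes the two Upstash environment variables are exported and that the repository root is on sys.path; the sample text, metadata, and the data/news path are placeholders.

# Usage sketch (hypothetical): exercises both entry points introduced in this commit.
from llama_index.core.schema import Document

from components.indexers.news_indexer import (
    build_news_index,
    get_or_build_index_from_docs,
)

# Direct document ingestion: wrap already-scraped text in Document objects.
docs = [
    Document(
        text="Example headline and article body.",  # placeholder text
        metadata={"source": "example.com"},          # placeholder metadata
    ),
]
index = get_or_build_index_from_docs(docs)

# File-based ingestion: SimpleDirectoryReader loads every file under the folder.
index = build_news_index("data/news")  # "data/news" is a placeholder path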
pipeline/news_ingest.py CHANGED
@@ -5,7 +5,7 @@ from typing import List, Dict
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 
-from components.indexers.news_indexer import get_or_build_index
+from components.indexers.news_indexer import get_or_build_index_from_docs
 from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
@@ -95,6 +95,6 @@ if __name__ == "__main__":
 
     print("🧠 Building index...")
     documents = build_documents(all_articles)
-    get_or_build_index(documents)
+    get_or_build_index_from_docs(documents)
 
     print(f"✅ Indexed and stored at: {INDEX_DIR}")