changes v1
Browse files
components/indexers/news_indexer.py
CHANGED
@@ -4,33 +4,40 @@ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index
|
|
4 |
from llama_index.core.node_parser import SimpleNodeParser
|
5 |
from llama_index.core.settings import Settings
|
6 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
|
7 |
import os
|
8 |
|
|
|
9 |
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
10 |
|
|
|
11 |
def get_upstash_vector_store():
|
12 |
return UpstashVectorStore(
|
13 |
url=os.environ["UPSTASH_VECTOR_REST_URL"],
|
14 |
token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
|
15 |
)
|
16 |
|
|
|
17 |
def build_news_index(data_dir: str) -> VectorStoreIndex:
|
18 |
documents = SimpleDirectoryReader(data_dir).load_data()
|
19 |
-
|
20 |
|
|
|
|
|
|
|
21 |
vector_store = get_upstash_vector_store()
|
22 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
23 |
-
|
24 |
index = VectorStoreIndex(nodes, storage_context=storage_context)
|
25 |
return index
|
26 |
|
|
|
27 |
def load_news_index() -> VectorStoreIndex:
|
28 |
vector_store = get_upstash_vector_store()
|
29 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
30 |
return load_index_from_storage(storage_context)
|
31 |
|
|
|
32 |
def get_or_build_index(data_dir: str) -> VectorStoreIndex:
|
33 |
-
# This should check if the index already exists in Upstash
|
34 |
try:
|
35 |
return load_news_index()
|
36 |
except Exception:
|
|
|
4 |
from llama_index.core.node_parser import SimpleNodeParser
|
5 |
from llama_index.core.settings import Settings
|
6 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
7 |
+
from llama_index.core.schema import Document
|
8 |
import os
|
9 |
|
10 |
+
# ✅ Setup embedding
|
11 |
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
12 |
|
13 |
+
# ✅ Upstash vector store config
|
14 |
def get_upstash_vector_store():
|
15 |
return UpstashVectorStore(
|
16 |
url=os.environ["UPSTASH_VECTOR_REST_URL"],
|
17 |
token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
|
18 |
)
|
19 |
|
20 |
+
# ✅ File-based ingestion
|
21 |
def build_news_index(data_dir: str) -> VectorStoreIndex:
|
22 |
documents = SimpleDirectoryReader(data_dir).load_data()
|
23 |
+
return get_or_build_index_from_docs(documents)
|
24 |
|
25 |
+
# ✅ Direct document ingestion
|
26 |
+
def get_or_build_index_from_docs(documents: list[Document]) -> VectorStoreIndex:
|
27 |
+
nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
|
28 |
vector_store = get_upstash_vector_store()
|
29 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
|
30 |
index = VectorStoreIndex(nodes, storage_context=storage_context)
|
31 |
return index
|
32 |
|
33 |
+
# ✅ Load existing index (if no changes in docs)
|
34 |
def load_news_index() -> VectorStoreIndex:
|
35 |
vector_store = get_upstash_vector_store()
|
36 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
37 |
return load_index_from_storage(storage_context)
|
38 |
|
39 |
+
# ✅ Preferred file-based entry point
|
40 |
def get_or_build_index(data_dir: str) -> VectorStoreIndex:
|
|
|
41 |
try:
|
42 |
return load_news_index()
|
43 |
except Exception:
|
pipeline/news_ingest.py
CHANGED
@@ -5,7 +5,7 @@ from typing import List, Dict
|
|
5 |
|
6 |
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
7 |
|
8 |
-
from components.indexers.news_indexer import
|
9 |
from components.fetchers.google_search import fetch_google_news
|
10 |
from components.fetchers.scraper import scrape_url
|
11 |
from llama_index.core.settings import Settings
|
@@ -95,6 +95,6 @@ if __name__ == "__main__":
|
|
95 |
|
96 |
print("🧠 Building index...")
|
97 |
documents = build_documents(all_articles)
|
98 |
-
|
99 |
|
100 |
print(f"✅ Indexed and stored at: {INDEX_DIR}")
|
|
|
5 |
|
6 |
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
7 |
|
8 |
+
from components.indexers.news_indexer import get_or_build_index_from_docs
|
9 |
from components.fetchers.google_search import fetch_google_news
|
10 |
from components.fetchers.scraper import scrape_url
|
11 |
from llama_index.core.settings import Settings
|
|
|
95 |
|
96 |
print("🧠 Building index...")
|
97 |
documents = build_documents(all_articles)
|
98 |
+
get_or_build_index_from_docs(documents)
|
99 |
|
100 |
print(f"✅ Indexed and stored at: {INDEX_DIR}")
|