# Source: PharmaBot / build_knowledge_base.py
# Uploaded by alperensn -- commit 25fcb73 ("Upload files"), verified
# =================================================================================
# build_knowledge_base.py: One-time script to build and save the vector store
# =================================================================================
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import config
import data_processing
import os
def build_vector_store():
    """
    Create the LlamaIndex vector store and write it to disk.

    Documents are obtained from ``data_processing.load_and_process_all()`` and
    handed to ``VectorStoreIndex.from_documents`` together with a
    ``HuggingFaceEmbedding`` for ``config.EMBEDDING_MODEL_NAME`` and a
    ``SentenceSplitter(chunk_size=1000, chunk_overlap=150)`` transformation.
    The resulting index's storage context is persisted under
    ``config.LLAMA_INDEX_STORE_PATH``.

    Returns:
        None. If the loader yields no documents, a message is printed and
        nothing is built or saved.
    """
    documents = data_processing.load_and_process_all()
    if not documents:
        # Nothing to index, so bail out before loading the embedding model.
        print("No documents were created. Exiting.")
        return

    # Embedding model used to vectorize the documents.
    model_name = config.EMBEDDING_MODEL_NAME
    print(f"Loading embedding model: {model_name}...")
    embedder = HuggingFaceEmbedding(model_name=model_name)

    # Build the index; the splitter is passed as the only transformation.
    print("Creating the LlamaIndex vector store...")
    splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=150)
    vector_index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embedder,
        transformations=[splitter],
    )

    # Write the index to the configured directory.
    store_path = config.LLAMA_INDEX_STORE_PATH
    print(f"Saving the vector store to: {store_path}")
    vector_index.storage_context.persist(persist_dir=store_path)
    print("Vector store built and saved successfully.")
def main():
    """
    Build the knowledge base unless a vector store is already present.

    The store location is ``config.LLAMA_INDEX_STORE_PATH``. If that path
    exists, the build is skipped -- except when it is an *empty directory*
    (e.g. a pre-created mount point or the leftover of an aborted run), which
    contains no persisted index and is therefore rebuilt instead of being
    mistaken for a finished store.

    Returns:
        None.
    """
    store_path = config.LLAMA_INDEX_STORE_PATH

    # An empty directory holds nothing that could be loaded later, so it must
    # not count as an existing vector store.
    is_empty_dir = os.path.isdir(store_path) and not os.listdir(store_path)

    if os.path.exists(store_path) and not is_empty_dir:
        print("Vector store already exists. Skipping build process.")
        return

    build_vector_store()
# Run the build only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()