Spaces:

alperensn
/

PharmaBot

Sleeping

App Files Files Community

PharmaBot / build_knowledge_base.py

alperensn

Upload files

25fcb73 verified 19 days ago

raw

history blame contribute delete

2.03 kB

	# =================================================================================
	# build_knowledge_base.py: One-time script to build and save the vector store
	# =================================================================================
	from llama_index.core import VectorStoreIndex, Document
	from llama_index.core.node_parser import SentenceSplitter
	from llama_index.embeddings.huggingface import HuggingFaceEmbedding
	import config
	import data_processing
	import os

	def build_vector_store(*, chunk_size: int = 1000, chunk_overlap: int = 150) -> None:
	    """Build a LlamaIndex vector store from the processed documents and save it.

	    Documents are obtained from ``data_processing.load_and_process_all()``,
	    split with a ``SentenceSplitter``, embedded with the Hugging Face model
	    named by ``config.EMBEDDING_MODEL_NAME`` and persisted to
	    ``config.LLAMA_INDEX_STORE_PATH``. Progress is reported with ``print``.

	    Args:
	        chunk_size: ``chunk_size`` passed to ``SentenceSplitter``.
	        chunk_overlap: ``chunk_overlap`` passed to ``SentenceSplitter``.

	    Returns:
	        None. When no documents are available a message is printed and
	        nothing is embedded or written to disk.
	    """
	    # Load and process documents from all sources.
	    documents = data_processing.load_and_process_all()

	    # Nothing to index: stop before the embedding model is loaded.
	    if not documents:
	        print("No documents were created. Exiting.")
	        return

	    # Initialize the embedding model.
	    print(f"Loading embedding model: {config.EMBEDDING_MODEL_NAME}...")
	    embed_model = HuggingFaceEmbedding(model_name=config.EMBEDDING_MODEL_NAME)

	    # The documents are handed to the index unchanged; load_and_process_all()
	    # is expected to return them already in LlamaIndex format.
	    print("Creating the LlamaIndex vector store...")
	    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	    index = VectorStoreIndex.from_documents(
	        documents,
	        embed_model=embed_model,
	        transformations=[splitter],
	    )

	    # Persist the index to disk.
	    print(f"Saving the vector store to: {config.LLAMA_INDEX_STORE_PATH}")
	    index.storage_context.persist(persist_dir=config.LLAMA_INDEX_STORE_PATH)
	    print("Vector store built and saved successfully.")

	def main():
	    """Build the knowledge base unless a vector store is already on disk.

	    The store at ``config.LLAMA_INDEX_STORE_PATH`` counts as present when the
	    path exists and is not an empty directory; in that case the build is
	    skipped. Otherwise ``build_vector_store()`` is run.
	    """
	    store_path = config.LLAMA_INDEX_STORE_PATH

	    # A directory that exists but holds no entries contains no index, so it
	    # must not short-circuit the build.
	    is_empty_dir = os.path.isdir(store_path) and not os.listdir(store_path)

	    if os.path.exists(store_path) and not is_empty_dir:
	        print("Vector store already exists. Skipping build process.")
	        return

	    build_vector_store()

	# Script entry point: run the build only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()