# boardpac_chat_app_test / chromaDb.py
# Boardpac/theekshanas — "upload files again" (commit 39de480)
"""
Python Backend API to chat with private data
08/14/2023
D.M. Theekshana Samaradiwakara
"""
import os
from dotenv import load_dotenv
import glob
import torch
import pickle
import io
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from chromadb.config import Settings
load_dotenv()
import streamlit as st
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
def does_chroma_vectorstore_exist(persist_directory: str) -> bool:
# Checks if vectorstore exists
if os.path.exists(os.path.join(persist_directory, 'index')):
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
# At least 3 documents are needed in a working vectorstore
if len(list_index_files) > 3:
return True
return False
def load_store(directory: str) -> Chroma:
index_path = "data/{0}".format(directory)
# index_exists = os.path.exists(index_path)
index_exists = does_chroma_vectorstore_exist(index_path)
if index_exists:
try:
CHROMA_SETTINGS = Settings(
chroma_db_impl='duckdb+parquet',
persist_directory=index_path,
anonymized_telemetry=False
)
# return Chroma.load(index_path)
vectorstore= Chroma(
persist_directory=index_path,
embedding_function=embeddings,
client_settings=CHROMA_SETTINGS
)
# with open("vectorstore.pkl", "wb") as f:
# pickle.dump(vectorstore, f)
return vectorstore
except Exception as e:
raise Exception(f"Error loading vector store: {e} ")
else:
# raise exception if model_type is not supported
raise Exception(f"A vector store in directory {directory} is not created. Please choose a valid one")
class CPU_Unpickler(pickle.Unpickler):
def find_class(self, module, name):
if module == 'torch.storage' and name == '_load_from_bytes':
return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
else:
return super().find_class(module, name)
def create_db(document_splits,persist_directory):
return Chroma.from_documents(
documents=document_splits,
embedding=embeddings,
persist_directory=persist_directory
)
def save_files(persist_directory, document_splits):
print(f"Saving document splits...")
if does_chroma_vectorstore_exist(persist_directory):
print(f"Updating esisting vector store. May take some minutes...")
#update function
db = Chroma(
persist_directory=index_path,
embedding_function=embeddings,
)
db.aadd_documents(document_splits)
else:
print(f"Creating new vector store. May take some minutes...")
index_path = "data/{0}".format(persist_directory)
db = create_db(document_splits,index_path)
db.persist()