"""
Python Backend API to chat with private data

08/14/2023
D.M. Theekshana Samaradiwakara
"""

import glob
import io
import os
import pickle

import torch
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma

load_dotenv()

import streamlit as st

# Embedding model is selected via the environment (.env) file; the same
# embeddings object is shared by every load/create/update operation below
# so stored vectors and query vectors always come from the same model.
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)


def does_chroma_vectorstore_exist(persist_directory: str) -> bool:
    """Return True if a usable Chroma (duckdb+parquet) vectorstore exists.

    A store is considered usable when the persist directory contains the
    legacy duckdb+parquet layout: an ``index/`` folder, the two collection
    parquet files, and more than 3 index files (``.bin``/``.pkl``).

    :param persist_directory: path of the Chroma persist directory to probe
    :return: True only when all layout checks pass
    """
    if os.path.exists(os.path.join(persist_directory, 'index')):
        has_collections = os.path.exists(
            os.path.join(persist_directory, 'chroma-collections.parquet'))
        has_embeddings = os.path.exists(
            os.path.join(persist_directory, 'chroma-embeddings.parquet'))
        if has_collections and has_embeddings:
            list_index_files = glob.glob(
                os.path.join(persist_directory, 'index/*.bin'))
            list_index_files += glob.glob(
                os.path.join(persist_directory, 'index/*.pkl'))
            # At least 3 documents are needed in a working vectorstore
            if len(list_index_files) > 3:
                return True
    return False


def load_store(directory: str) -> Chroma:
    """Load an existing Chroma vectorstore persisted under ``data/<directory>``.

    :param directory: sub-directory name below ``data/``
    :return: the loaded Chroma vectorstore
    :raises Exception: if no valid store exists there, or loading fails
    """
    index_path = "data/{0}".format(directory)

    # Guard clause: fail fast with a clear message when the store is missing.
    if not does_chroma_vectorstore_exist(index_path):
        raise Exception(
            f"A vector store in directory {directory} is not created. \n"
            "Please choose a valid one")

    try:
        CHROMA_SETTINGS = Settings(
            chroma_db_impl='duckdb+parquet',
            persist_directory=index_path,
            anonymized_telemetry=False,
        )
        vectorstore = Chroma(
            persist_directory=index_path,
            embedding_function=embeddings,
            client_settings=CHROMA_SETTINGS,
        )
        return vectorstore
    except Exception as e:
        raise Exception(f"Error loading vector store: {e} ")


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces torch tensors onto the CPU while loading.

    Pickles created on a CUDA machine embed GPU storage; this subclass
    intercepts ``torch.storage._load_from_bytes`` and reroutes it through
    ``torch.load(..., map_location='cpu')`` so they load on CPU-only hosts.
    """

    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)


def create_db(document_splits, persist_directory):
    """Create a new Chroma vectorstore from document splits.

    :param document_splits: chunked documents to embed and index
    :param persist_directory: directory where the store is persisted
    :return: the newly created Chroma vectorstore
    """
    return Chroma.from_documents(
        documents=document_splits,
        embedding=embeddings,
        persist_directory=persist_directory,
    )


def save_files(persist_directory, document_splits):
    """Add document splits to the store under ``data/<persist_directory>``.

    Updates the store in place when one already exists, otherwise creates a
    new one, and persists the result either way.

    :param persist_directory: sub-directory name below ``data/``
    :param document_splits: chunked documents to embed and index
    """
    print("Saving document splits...")
    # BUG FIX: index_path was previously only assigned in the else branch but
    # referenced in the update branch (NameError). It is now computed up front,
    # and the existence check probes the same data/ path the store actually
    # lives under (consistent with load_store and the create branch).
    index_path = "data/{0}".format(persist_directory)
    if does_chroma_vectorstore_exist(index_path):
        print("Updating existing vector store. May take some minutes...")
        db = Chroma(
            persist_directory=index_path,
            embedding_function=embeddings,
        )
        # BUG FIX: aadd_documents() is the async API and was called without
        # await, so the coroutine was discarded and nothing was ever added.
        db.add_documents(document_splits)
    else:
        print("Creating new vector store. May take some minutes...")
        db = create_db(document_splits, index_path)
    db.persist()