File size: 3,428 Bytes
39de480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
Python Backend API to chat with private data  

08/14/2023
D.M. Theekshana Samaradiwakara
"""

import os
from dotenv import load_dotenv
import glob

import torch
import pickle
import io

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain.embeddings import HuggingFaceEmbeddings

from chromadb.config import Settings

# Load variables from a local .env file BEFORE reading os.environ below.
load_dotenv()

import streamlit as st
# Name of the HuggingFace sentence-embedding model, configured via the
# EMBEDDINGS_MODEL_NAME environment variable (populated by load_dotenv above).
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
# Shared embedding function, built once at import time and reused by every
# vector-store helper in this module.
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

def does_chroma_vectorstore_exist(persist_directory: str) -> bool:
    """Return True when ``persist_directory`` holds a usable Chroma store.

    A usable store has an ``index`` sub-directory, both parquet metadata
    files (collections and embeddings), and more than three index files
    (``.bin`` / ``.pkl``).
    """
    if not os.path.exists(os.path.join(persist_directory, 'index')):
        return False

    metadata_files = (
        os.path.join(persist_directory, 'chroma-collections.parquet'),
        os.path.join(persist_directory, 'chroma-embeddings.parquet'),
    )
    if not all(os.path.exists(path) for path in metadata_files):
        return False

    index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
    index_files.extend(glob.glob(os.path.join(persist_directory, 'index/*.pkl')))
    # At least 3 documents are needed in a working vectorstore
    return len(index_files) > 3

def load_store(directory: str) -> Chroma:
    """Load an existing Chroma vector store from ``data/<directory>``.

    Args:
        directory: Name of the sub-directory under ``data/`` that holds the
            persisted vector store.

    Returns:
        The loaded :class:`Chroma` vector store, using the module-level
        HuggingFace embedding function.

    Raises:
        Exception: If no valid store exists at the path, or loading fails.
    """
    index_path = "data/{0}".format(directory)

    # Guard clause: fail fast when the on-disk store is missing/incomplete.
    if not does_chroma_vectorstore_exist(index_path):
        raise Exception(f"A vector store in directory {directory} is not created. Please choose a valid one")

    chroma_settings = Settings(
        chroma_db_impl='duckdb+parquet',
        persist_directory=index_path,
        anonymized_telemetry=False,
    )

    try:
        return Chroma(
            persist_directory=index_path,
            embedding_function=embeddings,
            client_settings=chroma_settings,
        )
    except Exception as e:
        # Chain the original exception so the full traceback is preserved.
        raise Exception(f"Error loading vector store: {e} ") from e

class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces torch storages onto the CPU.

    Pickles produced on a CUDA machine embed a call to
    ``torch.storage._load_from_bytes``; intercepting that name lets the
    payload be restored with ``map_location='cpu'`` on a CPU-only host.
    All other names resolve normally.
    """

    def find_class(self, module, name):
        # Only torch's byte-level storage loader needs remapping.
        if (module, name) == ('torch.storage', '_load_from_bytes'):
            return lambda raw: torch.load(io.BytesIO(raw), map_location='cpu')
        return super().find_class(module, name)
        
def create_db(document_splits, persist_directory):
    """Build a new Chroma store from *document_splits* and persist it.

    Uses the module-level ``embeddings`` function; the store is written
    under *persist_directory*.
    """
    store = Chroma.from_documents(
        documents=document_splits,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    return store

def save_files(persist_directory, document_splits):
    """Add *document_splits* to the vector store named *persist_directory*.

    The store lives under ``data/<persist_directory>`` (same layout as
    ``load_store``). An existing store is updated in place; otherwise a
    new one is created. Changes are persisted to disk either way.

    Args:
        persist_directory: Store name (sub-directory under ``data/``).
        document_splits: Document chunks to embed and add.
    """
    print("Saving document splits...")
    # Bug fix: compute the on-disk path up front. Previously index_path was
    # only assigned in the else-branch, so the update branch raised
    # NameError; the existence check also used the bare name instead of the
    # data/ path that load_store/create_db actually use.
    index_path = "data/{0}".format(persist_directory)

    if does_chroma_vectorstore_exist(index_path):
        print("Updating existing vector store. May take some minutes...")
        db = Chroma(
            persist_directory=index_path,
            embedding_function=embeddings,
        )
        # Bug fix: aadd_documents is the async coroutine and was never
        # awaited, so nothing was added; use the synchronous variant.
        db.add_documents(document_splits)
    else:
        print("Creating new vector store. May take some minutes...")
        db = create_db(document_splits, index_path)

    db.persist()