# boardpac_chat_app_test / chromaDb.py
# Boardpac/theekshanas — "upload files again" (commit 39de480)
"""
Python Backend API to chat with private data
08/14/2023
D.M. Theekshana Samaradiwakara
"""
import os
from dotenv import load_dotenv
import glob
import torch
import pickle
import io
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from chromadb.config import Settings
load_dotenv()
import streamlit as st
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
def does_chroma_vectorstore_exist(persist_directory: str) -> bool:
# Checks if vectorstore exists
if os.path.exists(os.path.join(persist_directory, 'index')):
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
# At least 3 documents are needed in a working vectorstore
if len(list_index_files) > 3:
return True
return False
def load_store(directory: str) -> Chroma:
index_path = "data/{0}".format(directory)
# index_exists = os.path.exists(index_path)
index_exists = does_chroma_vectorstore_exist(index_path)
if index_exists:
try:
CHROMA_SETTINGS = Settings(
chroma_db_impl='duckdb+parquet',
persist_directory=index_path,
anonymized_telemetry=False
)
# return Chroma.load(index_path)
vectorstore= Chroma(
persist_directory=index_path,
embedding_function=embeddings,
client_settings=CHROMA_SETTINGS
)
# with open("vectorstore.pkl", "wb") as f:
# pickle.dump(vectorstore, f)
return vectorstore
except Exception as e:
raise Exception(f"Error loading vector store: {e} ")
else:
# raise exception if model_type is not supported
raise Exception(f"A vector store in directory {directory} is not created. Please choose a valid one")
class CPU_Unpickler(pickle.Unpickler):
def find_class(self, module, name):
if module == 'torch.storage' and name == '_load_from_bytes':
return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
else:
return super().find_class(module, name)
def create_db(document_splits,persist_directory):
return Chroma.from_documents(
documents=document_splits,
embedding=embeddings,
persist_directory=persist_directory
)
def save_files(persist_directory, document_splits):
print(f"Saving document splits...")
if does_chroma_vectorstore_exist(persist_directory):
print(f"Updating esisting vector store. May take some minutes...")
#update function
db = Chroma(
persist_directory=index_path,
embedding_function=embeddings,
)
db.aadd_documents(document_splits)
else:
print(f"Creating new vector store. May take some minutes...")
index_path = "data/{0}".format(persist_directory)
db = create_db(document_splits,index_path)
db.persist()