# iprepbot / chatbot_functionalities / vectordb_operations.py
# Vector-database (ChromaDB) helpers for the iPrepBot app:
# building, listing, fetching, and deleting question/answer collections.
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import os
import chromadb
from chromadb.utils import embedding_functions
def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None:
"""This function processes the dataframe into the required format, and then creates the following collections in a ChromaDB instance
1. question_collection - Contains question embeddings, and the metadata as 'position' and 'interview_phase'
2. answer_collection - Contains the answer embeddings. No metadata (yet).
Args:
vdb_path (str): Relative path of the location of the ChromaDB instance.
df (pd.DataFrame): Question/answer dataset.
"""
chroma_client = chromadb.PersistentClient(path=vdb_path)
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("q_collection will be added")
q_collection = chroma_client.create_collection(
name="question_collection",
metadata={"hnsw:space": "cosine"},
embedding_function=huggingface_ef,
)
# Keep only question-related columns
df_questions = df[
["Position/Role", "Question", "Interview Phase"]
].drop_duplicates()
# df_questions = df_questions.drop_duplicates().reset_index(drop=True)
df_questions.columns = [
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_questions.columns
]
q_documents = [row.question for row in df_questions.itertuples()]
q_metadata = [
{"position": row.position_or_role, "interview_phase": row.interview_phase}
for row in df_questions.itertuples()
]
q_ids = ["q_id" + str(row.Index) for row in df_questions.itertuples()]
q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids)
print("q_collection added")
print("a_collection will be added")
a_collection = chroma_client.create_collection(
name="answer_collection",
metadata={"hnsw:space": "cosine"},
embedding_function=huggingface_ef,
)
df_answers = df[["Answer", "Answer Quality"]]
df_answers.columns = [
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_answers.columns
]
a_documents = [row.answer for row in df_answers.itertuples()]
a_metadata = [
{"answer_quality": row.answer_quality} for row in df_answers.itertuples()
]
a_ids = ["a_id" + str(row.Index) for row in df_answers.itertuples()]
a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata)
print("a_collection added")
return None
def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Remove one collection from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection to be deleted.
    """
    # Open the on-disk instance and drop the named collection.
    client = chromadb.PersistentClient(path=vdb_path)
    client.delete_collection(collection_name)
    return None
def list_collections_from_vector_db(vdb_path: str) -> list:
    """List all the available collections from the persistent ChromaDB instance.

    Prints the collections (preserving the original behavior) and also
    returns them so callers can use the result programmatically.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.

    Returns:
        list: The collections reported by ``chroma_client.list_collections()``.
    """
    chroma_client = chromadb.PersistentClient(path=vdb_path)
    collections = chroma_client.list_collections()
    print(collections)
    # Returning the value is backward-compatible: existing callers ignored None.
    return collections
def get_collection_from_vector_db(
    vdb_path: str, collection_name: str
) -> chromadb.Collection:
    """Fetch a ChromaDB collection object from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection which needs to be retrieved.

    Returns:
        chromadb.Collection: The requested collection, configured with the
        HuggingFace embedding function.
    """
    # Pull HUGGINGFACEHUB_API_TOKEN into the environment from a .env file.
    load_dotenv(find_dotenv())
    client = chromadb.PersistentClient(path=vdb_path)
    embedder = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    return client.get_collection(
        name=collection_name, embedding_function=embedder
    )