# iprepbot / chatbot_functionalities / vectordb_operations.py
# Vector-database (ChromaDB) helpers for the iPrepBot app:
# building, listing, fetching, and deleting question/answer collections.
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import os
import chromadb
from chromadb.utils import embedding_functions
def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None:
"""This function processes the dataframe into the required format, and then creates the following collections in a ChromaDB instance
1. question_collection - Contains question embeddings, and the metadata as 'position' and 'interview_phase'
2. answer_collection - Contains the answer embeddings. No metadata (yet).
Args:
vdb_path (str): Relative path of the location of the ChromaDB instance.
df (pd.DataFrame): Question/answer dataset.
"""
chroma_client = chromadb.PersistentClient(path=vdb_path)
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("q_collection will be added")
q_collection = chroma_client.create_collection(
name="question_collection",
metadata={"hnsw:space": "cosine"},
embedding_function=huggingface_ef,
)
# Keep only question-related columns
df_questions = df[
["Position/Role", "Question", "Interview Phase"]
].drop_duplicates()
# df_questions = df_questions.drop_duplicates().reset_index(drop=True)
df_questions.columns = [
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_questions.columns
]
q_documents = [row.question for row in df_questions.itertuples()]
q_metadata = [
{"position": row.position_or_role, "interview_phase": row.interview_phase}
for row in df_questions.itertuples()
]
q_ids = ["q_id" + str(row.Index) for row in df_questions.itertuples()]
q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids)
print("q_collection added")
print("a_collection will be added")
a_collection = chroma_client.create_collection(
name="answer_collection",
metadata={"hnsw:space": "cosine"},
embedding_function=huggingface_ef,
)
df_answers = df[["Answer", "Answer Quality"]]
df_answers.columns = [
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_answers.columns
]
a_documents = [row.answer for row in df_answers.itertuples()]
a_metadata = [
{"answer_quality": row.answer_quality} for row in df_answers.itertuples()
]
a_ids = ["a_id" + str(row.Index) for row in df_answers.itertuples()]
a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata)
print("a_collection added")
return None
def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Remove one collection from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection to be deleted.
    """
    # Open the on-disk instance and drop the named collection.
    client = chromadb.PersistentClient(path=vdb_path)
    client.delete_collection(collection_name)
    return None
def list_collections_from_vector_db(vdb_path: str) -> list:
    """List all the available collections from the persistent ChromaDB instance.

    Prints the collections (preserving the original behavior) and also
    returns them so callers can use the result programmatically.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.

    Returns:
        list: The collections reported by ``chroma_client.list_collections()``.
    """
    chroma_client = chromadb.PersistentClient(path=vdb_path)
    collections = chroma_client.list_collections()
    print(collections)
    # Returning the value is backward-compatible: existing callers ignored None.
    return collections
def get_collection_from_vector_db(
    vdb_path: str, collection_name: str
) -> chromadb.Collection:
    """Fetch a ChromaDB collection object from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection which needs to be retrieved.

    Returns:
        chromadb.Collection: The requested collection, configured with the
        HuggingFace embedding function.
    """
    # Pull HUGGINGFACEHUB_API_TOKEN into the environment from a .env file.
    load_dotenv(find_dotenv())
    client = chromadb.PersistentClient(path=vdb_path)
    embedder = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    return client.get_collection(
        name=collection_name, embedding_function=embedder
    )