Spaces:
Runtime error
Runtime error
| from dotenv import load_dotenv, find_dotenv | |
| import pandas as pd | |
| import os | |
| import chromadb | |
| from chromadb.utils import embedding_functions | |
| def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None: | |
| """This function processes the dataframe into the required format, and then creates the following collections in a ChromaDB instance | |
| 1. question_collection - Contains question embeddings, and the metadata as 'position' and 'interview_phase' | |
| 2. answer_collection - Contains the answer embeddings. No metadata (yet). | |
| Args: | |
| vdb_path (str): Relative path of the location of the ChromaDB instance. | |
| df (pd.DataFrame): Question/answer dataset. | |
| """ | |
| chroma_client = chromadb.PersistentClient(path=vdb_path) | |
| huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( | |
| api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"], | |
| model_name="sentence-transformers/all-MiniLM-L6-v2", | |
| ) | |
| print("q_collection will be added") | |
| q_collection = chroma_client.create_collection( | |
| name="question_collection", | |
| metadata={"hnsw:space": "cosine"}, | |
| embedding_function=huggingface_ef, | |
| ) | |
| # Keep only question-related columns | |
| df_questions = df[ | |
| ["Position/Role", "Question", "Interview Phase"] | |
| ].drop_duplicates() | |
| # df_questions = df_questions.drop_duplicates().reset_index(drop=True) | |
| df_questions.columns = [ | |
| x.replace(" ", "_").lower().replace("/", "_or_") for x in df_questions.columns | |
| ] | |
| q_documents = [row.question for row in df_questions.itertuples()] | |
| q_metadata = [ | |
| {"position": row.position_or_role, "interview_phase": row.interview_phase} | |
| for row in df_questions.itertuples() | |
| ] | |
| q_ids = ["q_id" + str(row.Index) for row in df_questions.itertuples()] | |
| q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids) | |
| print("q_collection added") | |
| print("a_collection will be added") | |
| a_collection = chroma_client.create_collection( | |
| name="answer_collection", | |
| metadata={"hnsw:space": "cosine"}, | |
| embedding_function=huggingface_ef, | |
| ) | |
| df_answers = df[["Answer", "Answer Quality"]] | |
| df_answers.columns = [ | |
| x.replace(" ", "_").lower().replace("/", "_or_") for x in df_answers.columns | |
| ] | |
| a_documents = [row.answer for row in df_answers.itertuples()] | |
| a_metadata = [ | |
| {"answer_quality": row.answer_quality} for row in df_answers.itertuples() | |
| ] | |
| a_ids = ["a_id" + str(row.Index) for row in df_answers.itertuples()] | |
| a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata) | |
| print("a_collection added") | |
| return None | |
| def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None: | |
| """Deletes a particular collection from the persistent ChromaDB instance. | |
| Args: | |
| vdb_path (str): Path of the persistent ChromaDB instance. | |
| collection_name (str): Name of the collection to be deleted. | |
| """ | |
| chroma_client = chromadb.PersistentClient(path=vdb_path) | |
| chroma_client.delete_collection(collection_name) | |
| return None | |
| def list_collections_from_vector_db(vdb_path: str) -> None: | |
| """Lists all the available collections from the persistent ChromaDB instance. | |
| Args: | |
| vdb_path (str): Path of the persistent ChromaDB instance. | |
| """ | |
| chroma_client = chromadb.PersistentClient(path=vdb_path) | |
| print(chroma_client.list_collections()) | |
| def get_collection_from_vector_db( | |
| vdb_path: str, collection_name: str | |
| ) -> chromadb.Collection: | |
| """Fetches a particular ChromaDB collection object from the persistent ChromaDB instance. | |
| Args: | |
| vdb_path (str): Path of the persistent ChromaDB instance. | |
| collection_name (str): Name of the collection which needs to be retrieved. | |
| """ | |
| load_dotenv(find_dotenv()) | |
| chroma_client = chromadb.PersistentClient(path=vdb_path) | |
| huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( | |
| api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"], | |
| model_name="sentence-transformers/all-MiniLM-L6-v2", | |
| ) | |
| collection = chroma_client.get_collection( | |
| name=collection_name, embedding_function=huggingface_ef | |
| ) | |
| return collection | |