import math

import chromadb
import pandas as pd
from chromadb.utils import embedding_functions


def create_domain_identification_database(vdb_path: str, collection_name: str, df: pd.DataFrame) -> None:
    """Processes the dataframe into the required format and creates the following
    collection in a ChromaDB instance:

    1. domain_identification_collection - contains the input text embeddings, with the
       remaining columns stored as metadata.

    Args:
        vdb_path (str): Relative path of the location of the ChromaDB instance.
        collection_name (str): Name of the database collection.
        df (pd.DataFrame): Task scheduling dataset.
    """
    # identify the saving location of the ChromaDB instance
    chroma_client = chromadb.PersistentClient(path=vdb_path)

    # load the embedding function (a Hugging Face sentence-transformers model)
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/LaBSE"
    )

    # create the collection
    domain_identification_collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_function,
    )

    # the main text "query" that will be embedded
    domain_identification_documents = [row.query for row in df.itertuples()]

    # the metadata
    domain_identification_metadata = [
        {"domain": row.domain, "label": row.label} for row in df.itertuples()
    ]

    # unique ids for each record
    domain_ids = ["domain_id " + str(row.Index) for row in df.itertuples()]

    # start adding the vectors in batches of 166 records
    length = len(df)
    num_iteration = math.ceil(length / 166)
    start = 0

    for i in range(num_iteration):
        if i == num_iteration - 1:
            domain_identification_collection.add(
                documents=domain_identification_documents[start:],
                metadatas=domain_identification_metadata[start:],
                ids=domain_ids[start:],
            )
        else:
            end = start + 166
            domain_identification_collection.add(
                documents=domain_identification_documents[start:end],
                metadatas=domain_identification_metadata[start:end],
                ids=domain_ids[start:end],
            )
            start = end
    return None


def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Deletes a particular collection from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection to be deleted.
    """
    chroma_client = chromadb.PersistentClient(path=vdb_path)
    chroma_client.delete_collection(collection_name)
    return None


def list_collections_from_vector_db(vdb_path: str) -> None:
    """Lists all the available collections from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
    """
    chroma_client = chromadb.PersistentClient(path=vdb_path)
    print(chroma_client.list_collections())


def get_collection_from_vector_db(vdb_path: str, collection_name: str) -> chromadb.Collection:
    """Fetches a particular ChromaDB collection object from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection which needs to be retrieved.
    """
    chroma_client = chromadb.PersistentClient(path=vdb_path)
    huggingface_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/LaBSE"
    )
    collection = chroma_client.get_collection(
        name=collection_name, embedding_function=huggingface_ef
    )
    return collection


def retrieval(input_text: str, num_results: int, collection: chromadb.Collection):
    """Fetches the domain name from the collection based on semantic similarity.

    Args:
        input_text (str): The received text, which can be news, posts, or tweets.
        num_results (int): Number of examples to fetch from the collection.
        collection (chromadb.Collection): The collection extracted from the database
            that examples will be fetched from.
    """
    fetched_domain = collection.query(
        query_texts=[input_text],
        n_results=num_results,
    )

    # extracting the domain name, label, and distance of the closest fetched match
    domain = fetched_domain["metadatas"][0][0]["domain"]
    label = fetched_domain["metadatas"][0][0]["label"]
    distance = fetched_domain["distances"][0][0]

    return domain, label, distance