### Notebook for creating/updating the dense and sparse indices

In [None]:
from ipynb.fs.defs.preprocess_data import preprocess_data
from ipynb.fs.defs.preprocess_data import get_documents_from_files
from ipynb.fs.defs.preprocess_data import split_docs
from ipynb.fs.defs.preprocess_data import clean_and_process_chunked_documents
from ipynb.fs.defs.preprocess_data import store_documents
import chromadb
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from typing import List
import os


def _add_to_collection_in_batches(collection, embeddings, texts, ids, metadatas, batch_size: int = 1000):
    """Insert the parallel sequences into ``collection`` in slices of ``batch_size``."""
    total = len(embeddings)
    for start_idx in range(0, total, batch_size):
        # Clamp the upper bound so the last (possibly shorter) batch is reported correctly
        end_idx = min(start_idx + batch_size, total)
        collection.add(
            embeddings=embeddings[start_idx:end_idx],
            documents=texts[start_idx:end_idx],
            ids=ids[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )
        print(f"Added embeddings from {start_idx} to {end_idx-1}")


def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):
    """
    Builds the index vector DB from documents with the specified embeddings and collection_name.
    If the collection already exists, updates the index with the new documents.

    The Chroma client is opened on the path given by the CHROMA_PATH environment variable.

    Args:
        documents: Documents whose ``page_content`` and ``metadata`` are stored.
        embeddings: One precomputed embedding per document, in the same order.
        collection_name: Name of the Chroma collection to create or extend.
        dist_function: Distance function stored as "hnsw:space"
            (either "l2" or "ip" or "cosine"). Only used when the collection is created.
        collection_metadata: Must contain the keys "embedding_model_provider",
            "embedding_model_name", "chunk_size" and "chunk_overlap".
            Only used when the collection is created.

    Returns:
        ``(client, vectordb)`` when a new collection was created;
        ``(client, 0)`` when an existing collection was updated (the ``0``
        placeholder is kept unchanged for existing callers).

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length.
        KeyError: If a required key is missing from ``collection_metadata``
            when a new collection is created.
    """
    # Every document needs exactly one embedding; otherwise texts, ids and
    # vectors would be misaligned (or trailing documents silently dropped).
    if len(documents) != len(embeddings):
        raise ValueError(
            f"Number of documents ({len(documents)}) does not match number of embeddings ({len(embeddings)})."
        )

    new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

    print("Starting to build index for: ", collection_metadata)

    # Check if collection already exists
    try:
        collection = new_client.get_collection(collection_name)
        collection_exists = True
    except ValueError:
        collection_exists = False

    # Store the text of the document and metadata separately in order to insert it into Chroma
    texts = [document.page_content for document in documents]
    metadata_docs = [document.metadata for document in documents]

    if not collection_exists:
        print("Collection is new")
        full_metadata = {
            "embedding_model_provider": collection_metadata["embedding_model_provider"],
            "embedding_model_name": collection_metadata["embedding_model_name"],
            "chunk_size": collection_metadata["chunk_size"],
            "chunk_overlap": collection_metadata["chunk_overlap"],
            "hnsw:space": dist_function,  # either "l2" or "ip" or "cosine"
        }
        # The metadata (in particular "hnsw:space") has to be supplied when the
        # collection is created; attaching it after the vectors were inserted
        # would leave the index on the default distance function.
        collection = new_client.create_collection(collection_name, metadata=full_metadata)
        # Each document needs an ID
        ids = [str(i) for i in range(1, len(documents) + 1)]

        # Add them in batches (otherwise Chroma error)
        _add_to_collection_in_batches(collection, embeddings, texts, ids, metadata_docs)

        vectordb = Chroma(
            client=new_client,
            collection_name=collection_name,
            collection_metadata=full_metadata,
        )
        print(f"Collection {collection_name} successfully created.")
        print("There are", vectordb._collection.count(), "entries in the collection.")

        return new_client, vectordb

    print("Collection already exists")
    vectordb = Chroma(client=new_client, collection_name=collection_name)

    collection_count = vectordb._collection.count()
    print(f"There are {collection_count} entries in the collection {collection_name} prior to updating.")

    # Continue the IDs from the last ID
    ids = [str(i) for i in range(collection_count + 1, collection_count + len(documents) + 1)]

    # Add them in batches (otherwise Chroma error)
    _add_to_collection_in_batches(collection, embeddings, texts, ids, metadata_docs)

    collection_count = vectordb._collection.count()
    print(f"There are {collection_count} entries in the collection {collection_name} after updating.")
    return new_client, 0

In [None]:
# Chunking parameters used for the dense index built in this cell
chunk_size = 1536
chunk_overlap = 264
# If update is needed, set to False
all_docs = True

documents, embedding_model, embeddings = preprocess_data(chunk_size, chunk_overlap, all_docs)

collection_name = "ISO_27001_Collection"
# Descriptive metadata for the collection; the chunking values are passed as strings
collection_metadata = dict(
    embedding_model_provider="Fine-tuned",
    embedding_model_name="finetuned-BGE-large-ISO-27001",
    chunk_size=str(chunk_size),
    chunk_overlap=str(chunk_overlap),
)

build_or_update_index_vector_db(
    documents=documents,
    embeddings=embeddings,
    collection_name=collection_name,
    dist_function="l2",
    collection_metadata=collection_metadata,
)

In [None]:
def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int):
    """
    Stores the documents for sparse retrieval in a basic text file.

    The documents are loaded with ``get_documents_from_files(True)``, split with
    the given chunking parameters, cleaned, and handed to ``store_documents``.

    Args:
        chunk_size: Chunk size passed to ``split_docs``; also part of the output name.
        chunk_overlap: Chunk overlap passed to ``split_docs``; also part of the output name.
    """
    documents = get_documents_from_files(True)
    chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)

    # Derive the target name from the actual chunking parameters so that an index
    # built with other settings does not overwrite (or masquerade as) the 1536/264 one.
    store_documents(chunked_cleaned_documents, f"./../sparse_index/sparse_{chunk_size}_{chunk_overlap}")

In [None]:
# Create the actual sparse index, reusing the chunk_size and chunk_overlap
# values defined in the dense-index cell so both indices use the same chunking
store_documents_for_sparse_retrieval(chunk_size, chunk_overlap)

#### Helper methods for Chroma

In [None]:
# Returns the vectorDB based on the collection name if it exists
def get_index_vector_db(collection_name: str):
    """
    Open the persistent Chroma store at CHROMA_PATH and wrap an existing collection.

    Returns:
        ``(client, vectordb)`` for the collection named ``collection_name``.

    Raises:
        Exception: If looking up the collection raises ``ValueError``,
            i.e. the collection does not exist.
    """
    chroma_client = chromadb.PersistentClient(path=os.getenv("CHROMA_PATH"))

    # Probe for the collection; only the lookup itself is guarded
    try:
        chroma_client.get_collection(collection_name)
    except ValueError:
        found = False
    else:
        found = True

    if found:
        return chroma_client, Chroma(client=chroma_client, collection_name=collection_name)

    raise Exception("Error, raised exception: Collection does not exist.")

In [None]:
def delete_collection(collection_name: str):
    """
    Delete the named collection from the persistent Chroma store at CHROMA_PATH.

    A ``ValueError`` from the client is not propagated; a notice is printed instead.
    """
    chroma_client = chromadb.PersistentClient(path=os.getenv("CHROMA_PATH"))

    try:
        chroma_client.delete_collection(collection_name)
    except ValueError:
        # Best effort: report the failure and carry on
        print("Collection could not be deleted.")

In [None]:
def return_collections():
    """Return the collections listed by the persistent Chroma client at CHROMA_PATH."""
    chroma_client = chromadb.PersistentClient(path=os.getenv("CHROMA_PATH"))
    return chroma_client.list_collections()