{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Notebook for creating/updating the dense and sparse indices" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from ipynb.fs.defs.preprocess_data import preprocess_data\n", "from ipynb.fs.defs.preprocess_data import get_documents_from_files\n", "from ipynb.fs.defs.preprocess_data import split_docs\n", "from ipynb.fs.defs.preprocess_data import clean_and_process_chunked_documents\n", "from ipynb.fs.defs.preprocess_data import store_documents\n", "import chromadb\n", "from langchain.vectorstores import Chroma\n", "from langchain.docstore.document import Document\n", "from typing import List\n", "import os\n", "\n", "\n", "def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):\n", " '''\n", " Builds the index vector DB from documents with the specified embeddings and collection_name\n", " If it already exists, updates the index with the new documents\n", " '''\n", " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n", "\n", " print(\"Starting to build index for: \", collection_metadata)\n", "\n", " # Check if collection already exists\n", " collection_exists = True\n", " try:\n", " collection = new_client.get_collection(collection_name)\n", " except ValueError as e:\n", " collection_exists = False\n", "\n", " if not collection_exists:\n", " print(\"Collection is new\")\n", " # If collection does not exist, create it\n", " collection = new_client.create_collection(collection_name)\n", " # Each document needs an ID\n", " ids = [str(i) for i in range(1, len(documents) + 1)]\n", "\n", " # Store the text of the document and metadata separately in order to insert it into Chroma\n", " texts = []\n", " metadata_docs = []\n", " for document in documents:\n", " texts.append(document.page_content)\n", " metadata_docs.append(document.metadata)\n", "\n", " # Add them in batches (otherwise Chroma error)\n", " for start_idx in range(0, len(embeddings), 1000):\n", " end_idx = start_idx + 1000\n", " # Ensure not to go out of bounds\n", " embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n", " texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n", " ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n", " metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n", "\n", " collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n", " print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n", "\n", " vectordb = Chroma(\n", " client=new_client,\n", " collection_name=collection_name,\n", " collection_metadata={\n", " \"embedding_model_provider\": collection_metadata[\"embedding_model_provider\"],\n", " \"embedding_model_name\": collection_metadata[\"embedding_model_name\"],\n", " \"chunk_size\": collection_metadata[\"chunk_size\"],\n", " \"chunk_overlap\": collection_metadata[\"chunk_overlap\"],\n", " \"hnsw:space\": dist_function, # either \"l2\" or \"ip\" or \"cosine\"\n", " },\n", " )\n", " print(f\"Collection {collection_name} successfully created.\")\n", " print(\"There are\", vectordb._collection.count(), \"entries in the collection.\")\n", "\n", " return new_client, vectordb\n", "\n", " else:\n", " print(\"Collection already exists\")\n", " vectordb = Chroma(client=new_client, collection_name=collection_name)\n", "\n", " collection_count = vectordb._collection.count()\n", " print(f\"There are {collection_count} entries in the collection {collection_name} prior to updating.\")\n", "\n", " # Continue the IDs from the last ID\n", " ids = [str(i) for i in range(collection_count + 1, collection_count + len(documents) + 1)]\n", " # Store the text of the document and metadata separately in order to insert it into Chroma\n", " texts = []\n", " metadata_docs = []\n", " for document in documents:\n", " texts.append(document.page_content)\n", " metadata_docs.append(document.metadata)\n", "\n", " # Add them in batches (otherwise Chroma error)\n", " for start_idx in range(0, len(embeddings), 1000):\n", " end_idx = start_idx + 1000\n", " # Ensure not to go out of bounds\n", " embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n", " texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n", " ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n", " metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n", "\n", " collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n", " print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n", "\n", " collection_count = vectordb._collection.count()\n", " print(f\"There are {collection_count} entries in the collection {collection_name} after updating.\")\n", " return new_client, 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chunk_size = 1536\n", "chunk_overlap = 264\n", "# If update is needed, set to False\n", "all_docs = True\n", "\n", "documents, embedding_model, embeddings = preprocess_data(chunk_size, chunk_overlap, all_docs)\n", "collection_name = \"ISO_27001_Collection\"\n", "collection_metadata = {\n", "\"embedding_model_provider\": \"Fine-tuned\",\n", "\"embedding_model_name\": \"finetuned-BGE-large-ISO-27001\",\n", "\"chunk_size\": str(chunk_size),\n", "\"chunk_overlap\": str(chunk_overlap),\n", "}\n", "\n", "build_or_update_index_vector_db(documents, embeddings, collection_name, \"l2\", collection_metadata)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int):\n", " \"\"\"\n", " Stores the documents for sparse retrieval in a basic text file\n", " \"\"\"\n", " documents = get_documents_from_files(True)\n", " chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", " chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n", "\n", " store_documents(chunked_cleaned_documents, f\"./../sparse_index/sparse_1536_264\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create the actual sparse index\n", "store_documents_for_sparse_retrieval(chunk_size, chunk_overlap)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Helper methods for Chroma" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Returns the vectorDB based on the collection name if it exists\n", "def get_index_vector_db(collection_name: str):\n", " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n", "\n", " # Check if collection already exists\n", " collection_exists = True\n", " try:\n", " new_client.get_collection(collection_name)\n", " except ValueError as e:\n", " collection_exists = False\n", "\n", " if not collection_exists:\n", " raise Exception(\"Error, raised exception: Collection does not exist.\")\n", " else:\n", " vectordb = Chroma(client=new_client, collection_name=collection_name)\n", "\n", " return new_client, vectordb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def delete_collection(collection_name: str):\n", " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n", "\n", " try:\n", " new_client.delete_collection(collection_name)\n", " except ValueError as e:\n", " print(\"Collection could not be deleted.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def return_collections():\n", " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n", " collections = new_client.list_collections()\n", " return collections" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }