Spaces:

Basti8499
/

RAG_ISO_27001_Chatbot

Sleeping

File size: 9,954 Bytes

579ab0b

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for creating/updating the dense and sparse indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipynb.fs.defs.preprocess_data import preprocess_data\n",
    "from ipynb.fs.defs.preprocess_data import get_documents_from_files\n",
    "from ipynb.fs.defs.preprocess_data import split_docs\n",
    "from ipynb.fs.defs.preprocess_data import clean_and_process_chunked_documents\n",
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "import chromadb\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from typing import List\n",
    "import os\n",
    "\n",
    "\n",
    "def _add_documents_in_batches(collection, documents: List[Document], embeddings, ids: List[str], batch_size: int = 1000):\n",
    "    '''\n",
    "    Inserts the documents with their precomputed embeddings and IDs into the given Chroma collection\n",
    "\n",
    "    The page content and the metadata of each document are passed to Chroma as separate, parallel lists.\n",
    "    The inserts are issued in slices of batch_size entries (adding everything at once leads to a Chroma error).\n",
    "    '''\n",
    "    texts = [document.page_content for document in documents]\n",
    "    metadata_docs = [document.metadata for document in documents]\n",
    "\n",
    "    for start_idx in range(0, len(embeddings), batch_size):\n",
    "        # Clamp the end of the slice so the last batch does not go out of bounds\n",
    "        end_idx = min(start_idx + batch_size, len(embeddings))\n",
    "        collection.add(\n",
    "            embeddings=embeddings[start_idx:end_idx],\n",
    "            documents=texts[start_idx:end_idx],\n",
    "            ids=ids[start_idx:end_idx],\n",
    "            metadatas=metadata_docs[start_idx:end_idx],\n",
    "        )\n",
    "        print(f\"Added embeddings from {start_idx} to {end_idx - 1}\")\n",
    "\n",
    "\n",
    "def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):\n",
    "    '''\n",
    "    Builds the index vector DB from documents with the specified embeddings and collection_name\n",
    "    If it already exists, updates the index with the new documents\n",
    "\n",
    "    Args:\n",
    "        documents: Documents whose page_content and metadata are stored in the collection\n",
    "        embeddings: Precomputed embeddings, one per document and in the same order\n",
    "        collection_name: Name of the Chroma collection to create or extend\n",
    "        dist_function: Either \"l2\" or \"ip\" or \"cosine\"; only used when the collection is created\n",
    "        collection_metadata: Must contain the keys \"embedding_model_provider\", \"embedding_model_name\",\n",
    "            \"chunk_size\" and \"chunk_overlap\"; only used when the collection is created\n",
    "\n",
    "    Returns:\n",
    "        (client, vectordb) if the collection was newly created, (client, 0) if an existing one was updated\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the number of documents differs from the number of embeddings\n",
    "    '''\n",
    "    # The batches are sliced by the number of embeddings, so a mismatch would silently drop documents\n",
    "    if len(documents) != len(embeddings):\n",
    "        raise ValueError(f\"Got {len(documents)} documents but {len(embeddings)} embeddings; both must have the same length.\")\n",
    "\n",
    "    new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
    "\n",
    "    print(\"Starting to build index for: \", collection_metadata)\n",
    "\n",
    "    # Check if collection already exists\n",
    "    try:\n",
    "        collection = new_client.get_collection(collection_name)\n",
    "        collection_exists = True\n",
    "    except ValueError:\n",
    "        collection_exists = False\n",
    "\n",
    "    if not collection_exists:\n",
    "        print(\"Collection is new\")\n",
    "        full_collection_metadata = {\n",
    "            \"embedding_model_provider\": collection_metadata[\"embedding_model_provider\"],\n",
    "            \"embedding_model_name\": collection_metadata[\"embedding_model_name\"],\n",
    "            \"chunk_size\": collection_metadata[\"chunk_size\"],\n",
    "            \"chunk_overlap\": collection_metadata[\"chunk_overlap\"],\n",
    "            \"hnsw:space\": dist_function,  # either \"l2\" or \"ip\" or \"cosine\"\n",
    "        }\n",
    "        # The metadata has to be passed when the collection is created: the distance function of the\n",
    "        # index is taken from it at that point and attaching it afterwards does not change the index\n",
    "        collection = new_client.create_collection(collection_name, metadata=full_collection_metadata)\n",
    "        # Each document needs an ID\n",
    "        ids = [str(i) for i in range(1, len(documents) + 1)]\n",
    "        _add_documents_in_batches(collection, documents, embeddings, ids)\n",
    "\n",
    "        vectordb = Chroma(\n",
    "            client=new_client,\n",
    "            collection_name=collection_name,\n",
    "            collection_metadata=full_collection_metadata,\n",
    "        )\n",
    "        print(f\"Collection {collection_name} successfully created.\")\n",
    "        print(\"There are\", vectordb._collection.count(), \"entries in the collection.\")\n",
    "\n",
    "        return new_client, vectordb\n",
    "\n",
    "    print(\"Collection already exists\")\n",
    "    vectordb = Chroma(client=new_client, collection_name=collection_name)\n",
    "\n",
    "    collection_count = vectordb._collection.count()\n",
    "    print(f\"There are {collection_count} entries in the collection {collection_name} prior to updating.\")\n",
    "\n",
    "    # Continue the IDs from the last ID\n",
    "    # NOTE(review): this assumes the existing IDs are exactly 1..collection_count - confirm no entries were deleted\n",
    "    ids = [str(i) for i in range(collection_count + 1, collection_count + len(documents) + 1)]\n",
    "    _add_documents_in_batches(collection, documents, embeddings, ids)\n",
    "\n",
    "    collection_count = vectordb._collection.count()\n",
    "    print(f\"There are {collection_count} entries in the collection {collection_name} after updating.\")\n",
    "    # NOTE(review): the update path returns 0 instead of vectordb; kept so existing callers see the same value\n",
    "    return new_client, 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chunking configuration (characters or tokens depends on split_docs - TODO confirm)\n",
    "chunk_size = 1536\n",
    "chunk_overlap = 264\n",
    "# If update is needed, set to False\n",
    "all_docs = True\n",
    "\n",
    "documents, embedding_model, embeddings = preprocess_data(chunk_size, chunk_overlap, all_docs)\n",
    "\n",
    "collection_name = \"ISO_27001_Collection\"\n",
    "# Descriptive metadata handed to the index builder together with the distance function\n",
    "collection_metadata = dict(\n",
    "    embedding_model_provider=\"Fine-tuned\",\n",
    "    embedding_model_name=\"finetuned-BGE-large-ISO-27001\",\n",
    "    chunk_size=str(chunk_size),\n",
    "    chunk_overlap=str(chunk_overlap),\n",
    ")\n",
    "\n",
    "build_or_update_index_vector_db(documents, embeddings, collection_name, \"l2\", collection_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int):\n",
    "    \"\"\"\n",
    "    Stores the documents for sparse retrieval in a basic text file\n",
    "\n",
    "    The documents are loaded, split with the given chunk_size and chunk_overlap, cleaned and then\n",
    "    stored under ./../sparse_index/sparse_<chunk_size>_<chunk_overlap>, so the file name always\n",
    "    states the chunking parameters the index was built with.\n",
    "\n",
    "    Args:\n",
    "        chunk_size: Chunk size passed on to split_docs\n",
    "        chunk_overlap: Chunk overlap passed on to split_docs\n",
    "    \"\"\"\n",
    "    documents = get_documents_from_files(True)\n",
    "    chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
    "    chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n",
    "\n",
    "    # Derive the file name from the parameters instead of hard-coding one chunking configuration\n",
    "    store_documents(chunked_cleaned_documents, f\"./../sparse_index/sparse_{chunk_size}_{chunk_overlap}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the actual sparse index with the chunking parameters defined for the dense index\n",
    "store_documents_for_sparse_retrieval(chunk_size=chunk_size, chunk_overlap=chunk_overlap)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Helper methods for Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_index_vector_db(collection_name: str):\n",
    "    \"\"\"\n",
    "    Returns the client and the vectorDB based on the collection name if it exists\n",
    "\n",
    "    Raises:\n",
    "        Exception: If looking up the collection raises a ValueError, i.e. it does not exist\n",
    "    \"\"\"\n",
    "    chroma_path = os.environ.get(\"CHROMA_PATH\")\n",
    "    client = chromadb.PersistentClient(path=chroma_path)\n",
    "\n",
    "    # Probe for the collection; the lookup result itself is not needed\n",
    "    try:\n",
    "        client.get_collection(collection_name)\n",
    "        found = True\n",
    "    except ValueError:\n",
    "        found = False\n",
    "\n",
    "    if found:\n",
    "        vector_store = Chroma(client=client, collection_name=collection_name)\n",
    "        return client, vector_store\n",
    "\n",
    "    raise Exception(\"Error, raised exception: Collection does not exist.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_collection(collection_name: str):\n",
    "    \"\"\"\n",
    "    Deletes the collection with the given name from the Chroma DB located at CHROMA_PATH\n",
    "\n",
    "    A ValueError raised by Chroma is not propagated; a message is printed instead.\n",
    "    \"\"\"\n",
    "    chroma_path = os.environ.get(\"CHROMA_PATH\")\n",
    "    client = chromadb.PersistentClient(path=chroma_path)\n",
    "\n",
    "    try:\n",
    "        client.delete_collection(collection_name)\n",
    "    except ValueError:\n",
    "        print(\"Collection could not be deleted.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def return_collections():\n",
    "    \"\"\"\n",
    "    Returns the result of list_collections() for the Chroma DB located at CHROMA_PATH\n",
    "    \"\"\"\n",
    "    chroma_path = os.environ.get(\"CHROMA_PATH\")\n",
    "    return chromadb.PersistentClient(path=chroma_path).list_collections()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}