{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "from langchain_community.document_loaders import TextLoader\n", "from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n", "from langchain.storage import InMemoryStore\n", "from langchain_community.document_loaders import TextLoader\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from langchain.retrievers import ParentDocumentRetriever\n", "from langchain_community.vectorstores import Chroma\n", "from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", "from langchain_community.document_loaders.csv_loader import CSVLoader\n", "import chromadb\n", "from chromadb.utils import embedding_functions\n", "import os\n", "\n", "# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kishoregajjala/anaconda3/envs/mhc_1/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "# create the open-source embedding function\n", "huggingface_ef = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "persist_directory=\"Data/chroma\"\n", "chroma_client = chromadb.PersistentClient(path=persist_directory)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever\n", "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n", "\n", "# This text splitter is used to create the child documents\n", "# It should create documents smaller than the parent\n", "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def get_file_paths_recursively(folder_path):\n", " file_paths = []\n", " for root, directories, files in os.walk(folder_path):\n", " for file in files:\n", " file_path = os.path.join(root, file)\n", " file_paths.append(file_path)\n", " return file_paths\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def vdb_csv_loader(file_paths):\n", " for i in range(len(file_paths)):\n", " loader = CSVLoader(file_path=file_paths[i], encoding=\"latin-1\")\n", " db = Chroma.from_documents(documents=loader.load(), embedding=huggingface_ef, collection_name= \"mental_health_csv_collection\", persist_directory=persist_directory) # pars to imclude (docs, emb_fun, col_name, direct_path)\n", "\n", "###\n", "def generate_csv_vector_db() -> None:\n", " \n", " # Get the directory path of the current script\n", " #script_dir = os.path.dirname(os.path.abspath(__file__))\n", " folder_path = \"Data/csv\"\n", " file_paths = get_file_paths_recursively(folder_path)\n", "\n", " #loaded all the files\n", " vdb_csv_loader(file_paths)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "pdf_collection = 
Chroma(collection_name=\"mental_health_pdf_collection\", embedding_function=huggingface_ef, persist_directory=persist_directory) \n", "def vdb_pdf_loader(file_paths):\n", " for i in range(len(file_paths)):\n", " loader = PyMuPDFLoader(file_path=file_paths[i])\n", " documents = loader.load()\n", " \n", " store = InMemoryStore()\n", " rag_retriever = ParentDocumentRetriever(\n", " vectorstore=pdf_collection,\n", " docstore=store,\n", " child_splitter=child_splitter,\n", " parent_splitter=parent_splitter,\n", " )\n", " rag_retriever.add_documents(documents)\n", "\n", "\n", "def generate_pdf_vector_db() -> None:\n", " \n", " # Get the directory path of the current script\n", " #script_dir = os.path.dirname(os.path.abspath(__file__))\n", " folder_path = \"Data/pdf\" #os.path.join(script_dir, '/Data/pdf') \n", " file_paths = get_file_paths_recursively(folder_path)\n", " vdb_pdf_loader(file_paths)\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " # call PDF loader\n", "generate_pdf_vector_db()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# call csv loader\n", "generate_csv_vector_db()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "def vectordb_load(): \n", " # call csv loader\n", " generate_csv_vector_db()\n", "\n", " # call PDF loader\n", " generate_pdf_vector_db()\n", "\n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# call vector db load\n", "vectordb_load()\n" ] } ], "metadata": { "kernelspec": { "display_name": "mhc_1", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }