Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Sleeping

App Files Files Community

Vamsikrishna Chemudupati committed on May 3

Commit

f755dcf

•

1 Parent(s): f52b26d

Using a Vector DB lesson modified notebook

Browse files

Files changed (1) hide show

notebooks/04-RAG_with_VectorStore.ipynb +436 -330

notebooks/04-RAG_with_VectorStore.ipynb CHANGED Viewed

@@ -1,345 +1,451 @@
 {
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab_type": "text",
-    "id": "view-in-github"
-   },
-   "outputs": [],
-   "source": [
-    "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "5BGJ3fxhOk2V"
-   },
-   "source": [
-    "# Install Packages and Setup Variables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "QPJzr-I9XQ7l",
-    "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
-   },
-   "outputs": [],
-   "source": [
-    "!pip install -q llama-index==0.10.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 chromadb==0.4.22"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "id": "riuXwpSPcvWC"
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
-    "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "I9JbAzFcjkpn"
-   },
-   "source": [
-    "# Load the Dataset (CSV)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "_Tif8-JoRH68"
-   },
-   "source": [
-    "## Download"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "4fQaa1LN1mXL"
-   },
-   "source": [
-    "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "id": "-QTUkdfJjY4N"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
-      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
-      "100  169k  100  169k    0     0   743k      0 --:--:-- --:--:-- --:--:--  747k\n"
-     ]
-    }
-   ],
-   "source": [
-    "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "zk-4alIxROo8"
-   },
-   "source": [
-    "## Read File"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "7CYwRT6R0o0I",
-    "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
-   },
-   "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "841"
       ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import csv\n",
-    "\n",
-    "text = \"\"\n",
-    "\n",
-    "# Load the file as a JSON\n",
-    "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
-    "  csv_reader = csv.reader(file)\n",
-    "\n",
-    "  for row in csv_reader:\n",
-    "    text += row[0]\n",
-    "\n",
-    "# The number of characters in the dataset.\n",
-    "len( text )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "S17g2RYOjmf2"
-   },
-   "source": [
-    "# Chunking"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "STACTMUR1z9N",
-    "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
-   },
-   "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "2"
       ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
     }
-   ],
-   "source": [
-    "chunk_size = 512\n",
-    "chunks = []\n",
-    "\n",
-    "# Split the long text into smaller manageable chunks of 512 characters.\n",
-    "for i in range(0, len(text), chunk_size):\n",
-    "    chunks.append(text[i:i + chunk_size])\n",
-    "\n",
-    "len( chunks )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {
-    "id": "CtdsIUQ81_hT"
-   },
-   "outputs": [],
-   "source": [
-    "from llama_index.core import Document\n",
-    "\n",
-    "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
-    "documents = [Document(text=t) for t in chunks]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "OWaT6rL7ksp8"
-   },
-   "source": [
-    "# Save on Chroma"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {
-    "id": "mXi56KTXk2sp"
-   },
-   "outputs": [],
-   "source": [
-    "import chromadb\n",
-    "\n",
-    "# create client and a new collection\n",
-    "# chromadb.EphemeralClient saves data in-memory.\n",
-    "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
-    "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {
-    "id": "jKXURvLtkuTS"
-   },
-   "outputs": [],
-   "source": [
-    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
-    "from llama_index.core import StorageContext\n",
-    "\n",
-    "# Define a storage context object using the created vector database.\n",
-    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
-    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {
-    "id": "WsD52wtrlESi"
-   },
-   "outputs": [],
-   "source": [
-    "from llama_index.core import VectorStoreIndex\n",
-    "\n",
-    "# Add the documents to the database and create Index / embeddings\n",
-    "index = VectorStoreIndex.from_documents(\n",
-    "    documents, storage_context=storage_context\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "8JPD8yAinVSq"
-   },
-   "source": [
-    "# Query Dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {
-    "id": "mzS13x1ZlZ5X"
-   },
-   "outputs": [],
-   "source": [
-    "from llama_index.llms.openai import OpenAI\n",
-    "# Define a query engine that is responsible for retrieving related pieces of text,\n",
-    "# and using a LLM to formulate the final answer.\n",
-    "\n",
-    "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
-    "query_engine = index.as_query_engine(llm=llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {
     "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "AYsQ4uLN_Oxg",
-    "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The LLaMA2 model has 7 billion parameters.\n"
-     ]
     }
-   ],
-   "source": [
-    "response = query_engine.query(\n",
-    "    \"How many parameters LLaMA2 model has?\"\n",
-    ")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "colab": {
-   "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
-   "include_colab_link": true,
-   "provenance": []
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
 }

 {
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
     },
     {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "5BGJ3fxhOk2V"
+      },
+      "source": [
+        "# Install Packages and Setup Variables"
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "QPJzr-I9XQ7l"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -q llama-index==0.10.5 llama-index-vector-stores-chroma==0.1.7 langchain==0.1.17 langchain-chroma==0.1.0 langchain_openai==0.1.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 chromadb==0.4.22"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "riuXwpSPcvWC"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
+        "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "I9JbAzFcjkpn"
+      },
+      "source": [
+        "# Load the Dataset (CSV)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_Tif8-JoRH68"
+      },
+      "source": [
+        "## Download"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4fQaa1LN1mXL"
+      },
+      "source": [
+        "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "-QTUkdfJjY4N",
+        "outputId": "a88b2f8a-0c84-45a0-9b32-5088fe596612"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+            "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+            "100  169k  100  169k    0     0   277k      0 --:--:-- --:--:-- --:--:--  281k\n"
+          ]
+        }
+      ],
+      "source": [
+        "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zk-4alIxROo8"
+      },
+      "source": [
+        "## Read File"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7CYwRT6R0o0I",
+        "outputId": "351f170f-9a00-4b09-ae08-b45c3c48fce5"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "841"
+            ]
+          },
+          "execution_count": 4,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "import csv\n",
+        "\n",
+        "text = \"\"\n",
+        "\n",
+        "# Read the CSV file and concatenate the first column of every row into one string.\n",
+        "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
+        "  csv_reader = csv.reader(file)\n",
+        "\n",
+        "  for row in csv_reader:\n",
+        "    text += row[0]\n",
+        "\n",
+        "# The number of characters in the dataset.\n",
+        "len( text )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "S17g2RYOjmf2"
+      },
+      "source": [
+        "# Chunking"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "STACTMUR1z9N",
+        "outputId": "15a61eac-8774-4cdb-db8d-e2eb5b07e517"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "2"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "chunk_size = 512\n",
+        "chunks = []\n",
+        "\n",
+        "# Split the long text into smaller manageable chunks of 512 characters.\n",
+        "for i in range(0, len(text), chunk_size):\n",
+        "    chunks.append(text[i:i + chunk_size])\n",
+        "\n",
+        "len( chunks )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9fOomeMGqu10"
+      },
+      "source": [
+        "# Interface of Chroma with LlamaIndex"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "id": "CtdsIUQ81_hT"
+      },
+      "outputs": [],
+      "source": [
+        "from llama_index.core import Document\n",
+        "\n",
+        "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
+        "documents = [Document(text=t) for t in chunks]"
       ]
     },
     {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "OWaT6rL7ksp8"
+      },
+      "source": [
+        "## Save on Chroma\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "id": "mXi56KTXk2sp"
+      },
+      "outputs": [],
+      "source": [
+        "import chromadb\n",
+        "\n",
+        "# Create a client and a new collection.\n",
+        "# PersistentClient is given an on-disk path here; chromadb.EphemeralClient would keep the data in memory instead.\n",
+        "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
+        "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "id": "jKXURvLtkuTS"
+      },
+      "outputs": [],
+      "source": [
+        "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+        "from llama_index.core import StorageContext\n",
+        "# Define a storage context object using the created vector database.\n",
+        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+        "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "id": "WsD52wtrlESi"
+      },
+      "outputs": [],
+      "source": [
+        "from llama_index.core import VectorStoreIndex\n",
+        "\n",
+        "# Add the documents to the database and create Index / embeddings\n",
+        "index = VectorStoreIndex.from_documents(\n",
+        "    documents, storage_context=storage_context\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8JPD8yAinVSq"
+      },
+      "source": [
+        "## Query Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "id": "mzS13x1ZlZ5X"
+      },
+      "outputs": [],
+      "source": [
+        "from llama_index.llms.openai import OpenAI\n",
+        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
+        "# and using a LLM to formulate the final answer.\n",
+        "\n",
+        "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
+        "query_engine = index.as_query_engine(llm=llm)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "AYsQ4uLN_Oxg",
+        "outputId": "5066a06c-77ff-48a2-ee61-3abe2e9755e2"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "The LLaMA2 model has 7 billion parameters.\n"
+          ]
+        }
+      ],
+      "source": [
+        "response = query_engine.query(\n",
+        "    \"How many parameters LLaMA2 model has?\"\n",
+        ")\n",
+        "print(response)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kWK571VNg-qR"
+      },
+      "source": [
+        "# Interface of Chroma with LangChain"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "id": "SMPAniL2e4NP"
+      },
+      "outputs": [],
+      "source": [
+        "from langchain.schema.document import Document\n",
+        "# Convert the chunks to Document objects so the LangChain framework can process them.\n",
+        "documents = [Document(page_content=t) for t in chunks]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QBt8qGxArUPD"
+      },
+      "source": [
+        "## Save on Chroma"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "id": "2xas7HkuhJ8A"
+      },
+      "outputs": [],
+      "source": [
+        "from langchain_chroma import Chroma\n",
+        "from langchain_openai import OpenAIEmbeddings\n",
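+        "# Embed the documents with the OpenAI embedding model and add them to a Chroma collection.\n",
+        "# NOTE(review): persist_directory and collection_name are the same values used by the LlamaIndex\n",
+        "# section above, so both sets of embeddings may end up in one collection - confirm this is intended.\n",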
+        "\n",
+        "embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n",
+        "chroma_db = Chroma.from_documents(\n",
+        "    documents=documents,\n",
+        "    embedding=embeddings,\n",
+        "    persist_directory=\"./mini-chunked-dataset\",\n",
+        "    collection_name=\"mini-chunked-dataset\"\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "P8AXJJyBrZWF"
+      },
+      "source": [
+        "## Query Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "id": "-H64YLxshM2b"
+      },
+      "outputs": [],
+      "source": [
+        "from langchain_openai import ChatOpenAI\n",
+        "# Define the chat LLM that will formulate the final answer;\n",
+        "# the retriever and the QA chain that use it are built in the next cell.\n",
+        "\n",
+        "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "AxBqPNtthPaa",
+        "outputId": "93c9ad64-1cd1-4f52-c51e-6f3ec5d6542d"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "The LLaMA-2 model has 7 billion parameters.\n"
+          ]
+        }
+      ],
+      "source": [
+        "from langchain.chains import RetrievalQA\n",
+        "query = \"How many parameters LLaMA2 model has?\"\n",
+        "retriever = chroma_db.as_retriever(search_kwargs={\"k\": 2})\n",
+        "chain = RetrievalQA.from_chain_type(llm=llm,\n",
+        "                                    chain_type=\"stuff\",\n",
+        "                                    retriever=retriever)\n",
+        "\n",
+        "response = chain(query)\n",
+        "print(response[\"result\"])"
       ]
     }
+  ],
+  "metadata": {
     "colab": {
+      "provenance": []
     },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.8"
     }
   },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }