Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Running

App Files Files Community

AlaFalaki committed on Jan 3

Commit

fbacdb1

•

1 Parent(s): 579611d

Created using Colaboratory

Browse files

Files changed (1) hide show

notebooks/03-RAG_with_LlamaIndex.ipynb +60 -26

notebooks/03-RAG_with_LlamaIndex.ipynb CHANGED Viewed

@@ -4,7 +4,7 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyNbBT3cLvlEHCfKEcPSqeML",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -26,9 +26,18 @@
         "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/03-RAG_with_LlamaIndex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -66,14 +75,42 @@
       "source": [
         "import os\n",
         "\n",
         "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
       ],
       "metadata": {
         "id": "XuzgSNqcABpV"
       },
-      "execution_count": 4,
       "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
@@ -86,7 +123,7 @@
         "id": "3ImRCP7pACaI",
         "outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
       },
-      "execution_count": 5,
       "outputs": [
         {
           "output_type": "stream",
@@ -110,7 +147,7 @@
     {
       "cell_type": "markdown",
       "source": [
-        "### Read JSON"
       ],
       "metadata": {
         "id": "bZZLK_wyEc-L"
@@ -121,18 +158,11 @@
       "source": [
         "import json\n",
         "\n",
         "with open('./mini-dataset.json', 'r') as file:\n",
-        "    data = json.load(file)"
-      ],
-      "metadata": {
-        "id": "PBk0zgq6ACXA"
-      },
-      "execution_count": 15,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
         "len( data['chunks'] )"
       ],
       "metadata": {
@@ -142,7 +172,7 @@
         "id": "miUqycqAEfr7",
         "outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
       },
-      "execution_count": 16,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -159,18 +189,19 @@
     {
       "cell_type": "code",
       "source": [
         "texts = [item['text'] for item in data['chunks']]"
       ],
       "metadata": {
         "id": "Mq5WKj0QEfpk"
       },
-      "execution_count": 18,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
-        "### Apply Embedding"
       ],
       "metadata": {
         "id": "f86yksB9K571"
@@ -181,12 +212,13 @@
       "source": [
         "from llama_index import Document\n",
         "\n",
         "documents = [Document(text=t) for t in texts]"
       ],
       "metadata": {
         "id": "iXrr5-tnEfm9"
       },
-      "execution_count": 24,
       "outputs": []
     },
     {
@@ -194,13 +226,13 @@
       "source": [
         "from llama_index import VectorStoreIndex\n",
         "\n",
-        "# build index / generate embeddings using OpenAI\n",
         "index = VectorStoreIndex.from_documents(documents)"
       ],
       "metadata": {
         "id": "qQit27lBEfkV"
       },
-      "execution_count": 25,
       "outputs": []
     },
     {
@@ -212,13 +244,13 @@
       "metadata": {
         "id": "xxB0A9ZYM-OD"
       },
-      "execution_count": 29,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
-        "### Query Dataset"
       ],
       "metadata": {
         "id": "3DoUxd8KK--Q"
@@ -227,12 +259,14 @@
     {
       "cell_type": "code",
       "source": [
         "query_engine = index.as_query_engine()"
       ],
       "metadata": {
         "id": "bUaNH97dEfh9"
       },
-      "execution_count": 27,
       "outputs": []
     },
     {
@@ -250,7 +284,7 @@
         "id": "tEgFx_aeFS5e",
         "outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
       },
-      "execution_count": 28,
       "outputs": [
         {
           "output_type": "stream",

   "metadata": {
     "colab": {
       "provenance": [],
+      "authorship_tag": "ABX9TyMcuy0u2XnwzWnARu0WjaRq",
       "include_colab_link": true
     },
     "kernelspec": {
         "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/03-RAG_with_LlamaIndex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Install Packages and Setup Variables"
+      ],
+      "metadata": {
+        "id": "v9bpz99INAc1"
+      }
+    },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
       "source": [
         "import os\n",
         "\n",
+        "# Set the \"OPENAI_API_KEY\" environment variable. It will be used by the OpenAI client later.\n",
         "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
       ],
       "metadata": {
         "id": "XuzgSNqcABpV"
       },
+      "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Load Dataset"
+      ],
+      "metadata": {
+        "id": "f5eV5EnvNCMM"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Download"
+      ],
+      "metadata": {
+        "id": "q-7mRQ-mNJlm"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model."
+      ],
+      "metadata": {
+        "id": "3PsdOdMUNmEi"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
         "id": "3ImRCP7pACaI",
         "outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
     {
       "cell_type": "markdown",
       "source": [
+        "## Read File"
       ],
       "metadata": {
         "id": "bZZLK_wyEc-L"
       "source": [
         "import json\n",
         "\n",
+        "# Load the file as JSON\n",
         "with open('./mini-dataset.json', 'r') as file:\n",
+        "    data = json.load(file)\n",
+        "\n",
+        "# The number of chunks in the dataset.\n",
         "len( data['chunks'] )"
       ],
       "metadata": {
         "id": "miUqycqAEfr7",
         "outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "execute_result",
     {
       "cell_type": "code",
       "source": [
+        "# Flatten the JSON variable to a list of texts.\n",
         "texts = [item['text'] for item in data['chunks']]"
       ],
       "metadata": {
         "id": "Mq5WKj0QEfpk"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
+        "# Generate Embedding"
       ],
       "metadata": {
         "id": "f86yksB9K571"
       "source": [
         "from llama_index import Document\n",
         "\n",
+        "# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
         "documents = [Document(text=t) for t in texts]"
       ],
       "metadata": {
         "id": "iXrr5-tnEfm9"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
       "source": [
         "from llama_index import VectorStoreIndex\n",
         "\n",
+        "# Build index / generate embeddings using OpenAI.\n",
         "index = VectorStoreIndex.from_documents(documents)"
       ],
       "metadata": {
         "id": "qQit27lBEfkV"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
       "metadata": {
         "id": "xxB0A9ZYM-OD"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
+        "# Query Dataset"
       ],
       "metadata": {
         "id": "3DoUxd8KK--Q"
     {
       "cell_type": "code",
       "source": [
+        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
+        "# and using an LLM to formulate the final answer.\n",
         "query_engine = index.as_query_engine()"
       ],
       "metadata": {
         "id": "bUaNH97dEfh9"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
         "id": "tEgFx_aeFS5e",
         "outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",