{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMcuy0u2XnwzWnARu0WjaRq",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/03-RAG_with_LlamaIndex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Install Packages and Setup Variables"
      ],
      "metadata": {
        "id": "v9bpz99INAc1"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BeuFJKlj9jKz",
        "outputId": "4c3a9772-cb7d-4fc1-d0e4-64186861e3e5"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m35.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m35.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.0/143.0 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0m"
          ]
        }
      ],
      "source": [
        "!pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
        "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
      ],
      "metadata": {
        "id": "XuzgSNqcABpV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Load Dataset"
      ],
      "metadata": {
        "id": "f5eV5EnvNCMM"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Download"
      ],
      "metadata": {
        "id": "q-7mRQ-mNJlm"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model."
      ],
      "metadata": {
        "id": "3PsdOdMUNmEi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.json"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3ImRCP7pACaI",
        "outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2023-12-25 17:33:36--  https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.json\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 25361 (25K) [text/plain]\n",
            "Saving to: ‘mini-dataset.json’\n",
            "\n",
            "mini-dataset.json   100%[===================>]  24.77K  --.-KB/s    in 0.006s  \n",
            "\n",
            "2023-12-25 17:33:37 (3.76 MB/s) - ‘mini-dataset.json’ saved [25361/25361]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Read File"
      ],
      "metadata": {
        "id": "bZZLK_wyEc-L"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import json\n",
        "\n",
        "# Load the file as a JSON\n",
        "with open('./mini-dataset.json', 'r') as file:\n",
        "    data = json.load(file)\n",
        "\n",
        "# The number of chunks in the dataset.\n",
        "len( data['chunks'] )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "miUqycqAEfr7",
        "outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "22"
            ]
          },
          "metadata": {},
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Flatten the JSON variable to a list of texts.\n",
        "texts = [item['text'] for item in data['chunks']]"
      ],
      "metadata": {
        "id": "Mq5WKj0QEfpk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Generate Embedding"
      ],
      "metadata": {
        "id": "f86yksB9K571"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index import Document\n",
        "\n",
        "# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
        "documents = [Document(text=t) for t in texts]"
      ],
      "metadata": {
        "id": "iXrr5-tnEfm9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index import VectorStoreIndex\n",
        "\n",
        "# Build index / generate embeddings using OpenAI.\n",
        "index = VectorStoreIndex.from_documents(documents)"
      ],
      "metadata": {
        "id": "qQit27lBEfkV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Save the generated embeddings.\n",
        "# index.storage_context.persist(persist_dir=\"indexes\")"
      ],
      "metadata": {
        "id": "xxB0A9ZYM-OD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Query Dataset"
      ],
      "metadata": {
        "id": "3DoUxd8KK--Q"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
        "# and using a LLM to formulate the final answer.\n",
        "query_engine = index.as_query_engine()"
      ],
      "metadata": {
        "id": "bUaNH97dEfh9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "response = query_engine.query(\n",
        "    \"How many parameters LLaMA2 model has?\"\n",
        ")\n",
        "print(response)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tEgFx_aeFS5e",
        "outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The Llama 2 model has four different model sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.\n"
          ]
        }
      ]
    }
  ]
}