{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNzaxKiokXX5SPot1IBiMhR",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/LlamaIndex_101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "y_GAV7-zos0Y",
        "outputId": "74d4a3c4-3576-455b-fbe2-1b8b67bf20d5"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for wikipedia (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.\n",
        "\n",
        "import nest_asyncio\n",
        "\n",
        "nest_asyncio.apply()"
      ],
      "metadata": {
        "id": "Ua0KNwgvyCaj"
      },
      "execution_count": 49,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "os.environ['OPENAI_API_KEY'] = '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'"
      ],
      "metadata": {
        "id": "--Q2zk06wElp"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import logging\n",
        "import sys\n",
        "\n",
        "#You can set the logging level to DEBUG for more verbose output,\n",
        "# or use level=logging.INFO for less detailed information.\n",
        "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n",
        "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
      ],
      "metadata": {
        "id": "tjwZjA8-wITr"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Wikipedia Example"
      ],
      "metadata": {
        "id": "HjI_gRaRutfj"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## LlamaHub Wikipedia Integration"
      ],
      "metadata": {
        "id": "PLUDcXpI41Q_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.readers.wikipedia import WikipediaReader\n",
        "\n",
        "# Initialize WikipediaReader\n",
        "reader = WikipediaReader()"
      ],
      "metadata": {
        "id": "2gko9Q3hrlMh"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load data from Wikipedia\n",
        "documents = reader.load_data(pages=['Natural Language Processing', 'Artificial Intelligence'])"
      ],
      "metadata": {
        "id": "Z35ot7P1wIO0"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len( documents )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0i9Zp6BJwILk",
        "outputId": "a6a1e0a7-98cf-4ba4-d48a-e4f5833b4967"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "2"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Save on DeepLake"
      ],
      "metadata": {
        "id": "03lff4VUTaN9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import chromadb\n",
        "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
        "\n",
        "# Load the vector store from the local storage.\n",
        "db = chromadb.PersistentClient(path=\"./wikipedia-articles\")\n",
        "chroma_collection = db.get_or_create_collection(\"wikipedia-articles\")\n",
        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
      ],
      "metadata": {
        "id": "eo8CTHSFTcaR"
      },
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create Nodes"
      ],
      "metadata": {
        "id": "qkKPAnIl44ss"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core.node_parser import SimpleNodeParser\n",
        "\n",
        "# Initialize the parser\n",
        "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
        "\n",
        "# Parse documents into nodes\n",
        "nodes = parser.get_nodes_from_documents(documents)\n",
        "print( len( nodes ) )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eB6Rc0U0wII_",
        "outputId": "ec338be1-deca-45a7-e6ba-9997e4b7e25a"
      },
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "45\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Storage Context"
      ],
      "metadata": {
        "id": "E8tHMS5ZucFE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import StorageContext\n",
        "\n",
        "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
      ],
      "metadata": {
        "id": "eWFtVpM_TcTQ"
      },
      "execution_count": 18,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create index from Documents"
      ],
      "metadata": {
        "id": "kCgdd197CTDt"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "\n",
        "index = VectorStoreIndex(\n",
        "    nodes=nodes, storage_context=storage_context\n",
        ")"
      ],
      "metadata": {
        "id": "g3GCf8LrULIW"
      },
      "execution_count": 24,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "query_engine = index.as_query_engine()\n",
        "response = query_engine.query(\"What does NLP stands for?\")\n",
        "response.response"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "G7BdNn-Q5AlG",
        "outputId": "a311ec41-6cdc-4fe1-fb59-ad338d0b6149"
      },
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'NLP stands for Natural Language Processing.'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Store/Load Vector Store"
      ],
      "metadata": {
        "id": "r6cGiUtxu5ga"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Index Storage Checks\n",
        "import os.path\n",
        "from llama_index.core import StorageContext, load_index_from_storage\n",
        "\n",
        "# Let's see if our index already exists in storage.\n",
        "if not os.path.exists(\"./storage\"):\n",
        "    index.storage_context.persist()\n",
        "\n",
        "else:\n",
        "    # If the index already exists, we'll just load it:\n",
        "    storage_context = StorageContext.from_defaults(persist_dir=\"./storage\")\n",
        "    index = load_index_from_storage(storage_context)"
      ],
      "metadata": {
        "id": "GHtB0C0mu7f6"
      },
      "execution_count": 28,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Paul Graham Essay"
      ],
      "metadata": {
        "id": "iF8hwfMKuzst"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!mkdir -p './paul_graham/'\n",
        "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O './paul_graham/paul_graham_essay.txt'"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "DrzbBAglwUo0",
        "outputId": "73f30202-a708-4112-8491-9152e228c6cb"
      },
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2024-07-24 17:20:40--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 75042 (73K) [text/plain]\n",
            "Saving to: ‘./paul_graham/paul_graham_essay.txt’\n",
            "\n",
            "\r          ./paul_gr   0%[                    ]       0  --.-KB/s               \r./paul_graham/paul_ 100%[===================>]  73.28K  --.-KB/s    in 0.02s   \n",
            "\n",
            "2024-07-24 17:20:40 (3.33 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import SimpleDirectoryReader\n",
        "\n",
        "# load documents\n",
        "documents = SimpleDirectoryReader(\"./paul_graham\").load_data()"
      ],
      "metadata": {
        "id": "S8-QmnkCwIiU"
      },
      "execution_count": 37,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import chromadb\n",
        "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
        "\n",
        "# Load the vector store from the local storage.\n",
        "db = chromadb.PersistentClient(path=\"./paul-graham\")\n",
        "chroma_collection = db.get_or_create_collection(\"paul-graham\")\n",
        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
      ],
      "metadata": {
        "id": "DfWglp75xc5f"
      },
      "execution_count": 38,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import StorageContext\n",
        "\n",
        "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
        "storage_context.docstore.add_documents(nodes)"
      ],
      "metadata": {
        "id": "-EVBlUC-xcj1"
      },
      "execution_count": 39,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "\n",
        "index = VectorStoreIndex.from_documents(\n",
        "    documents, storage_context=storage_context\n",
        ")"
      ],
      "metadata": {
        "id": "8lMa4h9Cwn8b"
      },
      "execution_count": 40,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "query_engine = index.as_query_engine(similarity_top_k=10)"
      ],
      "metadata": {
        "id": "tJsfskjHxj0e"
      },
      "execution_count": 44,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
        "from llama_index.core.query_engine import SubQuestionQueryEngine\n",
        "\n",
        "query_engine_tools = [\n",
        "    QueryEngineTool(\n",
        "        query_engine=query_engine,\n",
        "        metadata=ToolMetadata(\n",
        "            name=\"pg_essay\",\n",
        "            description=\"Paul Graham essay on What I Worked On\",\n",
        "        ),\n",
        "    ),\n",
        "]\n",
        "\n",
        "query_engine = SubQuestionQueryEngine.from_defaults(\n",
        "    query_engine_tools=query_engine_tools,\n",
        "    use_async=True,\n",
        ")"
      ],
      "metadata": {
        "id": "yL9TsFwxxuoA"
      },
      "execution_count": 50,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "response = query_engine.query(\n",
        "    \"How was Paul Grahams life different before, during, and after YC?\"\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JWc_n5Lhx2bq",
        "outputId": "cd992d20-c701-4eb7-aaf2-30f790d1ca24"
      },
      "execution_count": 51,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Generated 3 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?\n",
            "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?\n",
            "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
            "\u001b[0mGenerated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
            "\u001b[0mGenerated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
            "\u001b[0mGenerated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
            "\u001b[0mGenerated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
            "\u001b[0mGenerated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
            "\u001b[0m"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'run_async_tasks.<locals>._gather' was never awaited\n",
            "  return _abc_subclasscheck(cls, subclass)\n",
            "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
            "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'SubQuestionQueryEngine._aquery_subq' was never awaited\n",
            "  return _abc_subclasscheck(cls, subclass)\n",
            "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Generated 1 sub questions.\n",
            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
            "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
            "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various projects during his time at Y Combinator.\n",
            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on developing Viaweb before Y Combinator.\n",
            "\u001b[0m"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print( \">>> The final response:\\n\", response )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "T-ZC66Ltx5Za",
        "outputId": "d7a1c85d-d73c-467d-e0df-9e06078622e2"
      },
      "execution_count": 52,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            ">>> The final response:\n",
            " Paul Graham worked on developing Viaweb before Y Combinator, on various projects during his time at Y Combinator, and started painting after Y Combinator.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "27fS3JcDyFSj"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}