Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Running

App Files Files Community

AlaFalaki committed on Jul 24, 2024

Commit

7e62fee

1 Parent(s): 14a73a7

Created using Colab

Browse files

Files changed (1) hide show

notebooks/LlamaIndex_101.ipynb +586 -0

notebooks/LlamaIndex_101.ipynb ADDED Viewed

	@@ -0,0 +1,586 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyNzaxKiokXX5SPot1IBiMhR",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/LlamaIndex_101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the pinned package versions this notebook was written against.\n",
+        "!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "y_GAV7-zos0Y",
+        "outputId": "74d4a3c4-3576-455b-fbe2-1b8b67bf20d5"
+      },
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Building wheel for wikipedia (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Notebooks already have an event loop running; patch asyncio so that\n",
+        "# async code can be run from inside it.\n",
+        "import nest_asyncio\n",
+        "\n",
+        "nest_asyncio.apply()"
+      ],
+      "metadata": {
+        "id": "Ua0KNwgvyCaj"
+      },
+      "execution_count": 49,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "from getpass import getpass\n",
+        "\n",
+        "# Never store an API key in the notebook: anything typed here ends up in git history.\n",
+        "# Use the key already present in the environment, otherwise prompt for it without echo.\n",
+        "# NOTE: a key that was previously committed in this cell must be treated as leaked and revoked.\n",
+        "if not os.environ.get('OPENAI_API_KEY'):\n",
+        "    os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')"
+      ],
+      "metadata": {
+        "id": "--Q2zk06wElp"
+      },
+      "execution_count": 2,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import logging\n",
+        "import sys\n",
+        "\n",
+        "# Send log records to stdout so they appear in the cell output.\n",
+        "# DEBUG gives the most verbose output; use level=logging.INFO for less detail.\n",
+        "# force=True replaces any handlers already attached to the root logger, so this\n",
+        "# configuration takes effect on re-runs and each record is printed once\n",
+        "# (adding a second stdout StreamHandler would print every record twice).\n",
+        "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)"
+      ],
+      "metadata": {
+        "id": "tjwZjA8-wITr"
+      },
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Wikipedia Example"
+      ],
+      "metadata": {
+        "id": "HjI_gRaRutfj"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## LlamaHub Wikipedia Integration"
+      ],
+      "metadata": {
+        "id": "PLUDcXpI41Q_"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.readers.wikipedia import WikipediaReader\n",
+        "\n",
+        "# Reader object used by the next cell to load Wikipedia pages.\n",
+        "reader = WikipediaReader()"
+      ],
+      "metadata": {
+        "id": "2gko9Q3hrlMh"
+      },
+      "execution_count": 12,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Fetch the listed Wikipedia pages with the reader created above.\n",
+        "documents = reader.load_data(\n",
+        "    pages=['Natural Language Processing', 'Artificial Intelligence'],\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "Z35ot7P1wIO0"
+      },
+      "execution_count": 13,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# How many documents were loaded?\n",
+        "len(documents)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0i9Zp6BJwILk",
+        "outputId": "a6a1e0a7-98cf-4ba4-d48a-e4f5833b4967"
+      },
+      "execution_count": 14,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "2"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 14
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Save on Chroma"
+      ],
+      "metadata": {
+        "id": "03lff4VUTaN9"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import chromadb\n",
+        "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+        "\n",
+        "# Open the persistent Chroma client at ./wikipedia-articles and get the\n",
+        "# collection of the same name, creating it when it does not exist yet.\n",
+        "db = chromadb.PersistentClient(path=\"./wikipedia-articles\")\n",
+        "chroma_collection = db.get_or_create_collection(name=\"wikipedia-articles\")\n",
+        "\n",
+        "# Expose the collection through the LlamaIndex vector-store wrapper.\n",
+        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
+      ],
+      "metadata": {
+        "id": "eo8CTHSFTcaR"
+      },
+      "execution_count": 15,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Create Nodes"
+      ],
+      "metadata": {
+        "id": "qkKPAnIl44ss"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core.node_parser import SimpleNodeParser\n",
+        "\n",
+        "# Parser configured with chunk_size=512 and chunk_overlap=20.\n",
+        "parser = SimpleNodeParser.from_defaults(\n",
+        "    chunk_size=512,\n",
+        "    chunk_overlap=20,\n",
+        ")\n",
+        "\n",
+        "# Turn the loaded documents into nodes and print how many were produced.\n",
+        "nodes = parser.get_nodes_from_documents(documents)\n",
+        "print(len(nodes))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "eB6Rc0U0wII_",
+        "outputId": "ec338be1-deca-45a7-e6ba-9997e4b7e25a"
+      },
+      "execution_count": 20,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "45\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Storage Context"
+      ],
+      "metadata": {
+        "id": "E8tHMS5ZucFE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core import StorageContext\n",
+        "\n",
+        "# Storage context that uses the vector store defined earlier in the notebook.\n",
+        "storage_context = StorageContext.from_defaults(\n",
+        "    vector_store=vector_store,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "eWFtVpM_TcTQ"
+      },
+      "execution_count": 18,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Create index from Nodes"
+      ],
+      "metadata": {
+        "id": "kCgdd197CTDt"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core import VectorStoreIndex\n",
+        "\n",
+        "# Build the index from the parsed nodes, using the storage context defined earlier.\n",
+        "index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)"
+      ],
+      "metadata": {
+        "id": "g3GCf8LrULIW"
+      },
+      "execution_count": 24,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Query the index through its default query engine and display the answer text.\n",
+        "query_engine = index.as_query_engine()\n",
+        "response = query_engine.query(\n",
+        "    \"What does NLP stands for?\"\n",
+        ")\n",
+        "response.response"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "id": "G7BdNn-Q5AlG",
+        "outputId": "a311ec41-6cdc-4fe1-fb59-ad338d0b6149"
+      },
+      "execution_count": 25,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'NLP stands for Natural Language Processing.'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 25
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Store/Load Vector Store"
+      ],
+      "metadata": {
+        "id": "r6cGiUtxu5ga"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Index Storage Checks\n",
+        "import os.path\n",
+        "from llama_index.core import StorageContext, load_index_from_storage\n",
+        "\n",
+        "PERSIST_DIR = \"./storage\"\n",
+        "\n",
+        "# Let's see if our index already exists in storage.\n",
+        "if not os.path.exists(PERSIST_DIR):\n",
+        "    # First run: write the storage context to PERSIST_DIR.\n",
+        "    index.storage_context.persist(persist_dir=PERSIST_DIR)\n",
+        "else:\n",
+        "    # If the index already exists, we'll just load it.\n",
+        "    # The embeddings live in the Chroma collection rather than under PERSIST_DIR,\n",
+        "    # so the Chroma vector store is passed in explicitly; without it the storage\n",
+        "    # context would be built around a default vector store instead of Chroma.\n",
+        "    storage_context = StorageContext.from_defaults(\n",
+        "        vector_store=vector_store,\n",
+        "        persist_dir=PERSIST_DIR,\n",
+        "    )\n",
+        "    index = load_index_from_storage(storage_context)"
+      ],
+      "metadata": {
+        "id": "GHtB0C0mu7f6"
+      },
+      "execution_count": 28,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Paul Graham Essay"
+      ],
+      "metadata": {
+        "id": "iF8hwfMKuzst"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Create ./paul_graham/ if needed and download the essay text into it.\n",
+        "!mkdir -p './paul_graham/'\n",
+        "!wget -O './paul_graham/paul_graham_essay.txt' 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt'"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DrzbBAglwUo0",
+        "outputId": "73f30202-a708-4112-8491-9152e228c6cb"
+      },
+      "execution_count": 35,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2024-07-24 17:20:40--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
+            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
+            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: 75042 (73K) [text/plain]\n",
+            "Saving to: ‘./paul_graham/paul_graham_essay.txt’\n",
+            "\n",
+            "\r          ./paul_gr   0%[                    ]       0  --.-KB/s               \r./paul_graham/paul_ 100%[===================>]  73.28K  --.-KB/s    in 0.02s   \n",
+            "\n",
+            "2024-07-24 17:20:40 (3.33 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core import SimpleDirectoryReader\n",
+        "\n",
+        "# Load the contents of the ./paul_graham directory as documents.\n",
+        "documents = SimpleDirectoryReader(input_dir=\"./paul_graham\").load_data()"
+      ],
+      "metadata": {
+        "id": "S8-QmnkCwIiU"
+      },
+      "execution_count": 37,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import chromadb\n",
+        "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+        "\n",
+        "# Open the persistent Chroma client at ./paul-graham and get the collection\n",
+        "# of the same name, creating it when it does not exist yet.\n",
+        "db = chromadb.PersistentClient(path=\"./paul-graham\")\n",
+        "chroma_collection = db.get_or_create_collection(name=\"paul-graham\")\n",
+        "\n",
+        "# Expose the collection through the LlamaIndex vector-store wrapper.\n",
+        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
+      ],
+      "metadata": {
+        "id": "DfWglp75xc5f"
+      },
+      "execution_count": 38,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core import StorageContext\n",
+        "\n",
+        "# Storage context backed by the paul-graham vector store created above.\n",
+        "# Nothing is added to the docstore by hand here: the `nodes` variable still holds\n",
+        "# the Wikipedia chunks from the previous section, and the essay itself is ingested\n",
+        "# when the index is built from `documents` in the next cell.\n",
+        "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
+      ],
+      "metadata": {
+        "id": "-EVBlUC-xcj1"
+      },
+      "execution_count": 39,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core import VectorStoreIndex\n",
+        "\n",
+        "# Build the index directly from the loaded documents, using the storage context above.\n",
+        "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)"
+      ],
+      "metadata": {
+        "id": "8lMa4h9Cwn8b"
+      },
+      "execution_count": 40,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Query engine over the current index with similarity_top_k set to 10.\n",
+        "query_engine = index.as_query_engine(\n",
+        "    similarity_top_k=10,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "tJsfskjHxj0e"
+      },
+      "execution_count": 44,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
+        "from llama_index.core.query_engine import SubQuestionQueryEngine\n",
+        "\n",
+        "# Build the base engine here under its own name. This cell rebinds `query_engine`\n",
+        "# at the end, so using `query_engine` as the tool's engine would make a re-run\n",
+        "# wrap the sub-question engine inside itself and generate sub-questions recursively.\n",
+        "essay_query_engine = index.as_query_engine(similarity_top_k=10)\n",
+        "\n",
+        "query_engine_tools = [\n",
+        "    QueryEngineTool(\n",
+        "        query_engine=essay_query_engine,\n",
+        "        metadata=ToolMetadata(\n",
+        "            name=\"pg_essay\",\n",
+        "            description=\"Paul Graham essay on What I Worked On\",\n",
+        "        ),\n",
+        "    ),\n",
+        "]\n",
+        "\n",
+        "# Engine that splits a question into sub-questions answered by the tools above;\n",
+        "# the name `query_engine` is kept because the following cell queries it.\n",
+        "query_engine = SubQuestionQueryEngine.from_defaults(\n",
+        "    query_engine_tools=query_engine_tools,\n",
+        "    use_async=True,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "yL9TsFwxxuoA"
+      },
+      "execution_count": 50,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Send the question to the current query engine and keep the response object.\n",
+        "response = query_engine.query(\"How was Paul Grahams life different before, during, and after YC?\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "JWc_n5Lhx2bq",
+        "outputId": "cd992d20-c701-4eb7-aaf2-30f790d1ca24"
+      },
+      "execution_count": 51,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Generated 3 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?\n",
+            "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?\n",
+            "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
+            "\u001b[0mGenerated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
+            "\u001b[0mGenerated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
+            "\u001b[0mGenerated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
+            "\u001b[0mGenerated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
+            "\u001b[0mGenerated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
+            "\u001b[0m"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'run_async_tasks.<locals>._gather' was never awaited\n",
+            "  return _abc_subclasscheck(cls, subclass)\n",
+            "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
+            "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'SubQuestionQueryEngine._aquery_subq' was never awaited\n",
+            "  return _abc_subclasscheck(cls, subclass)\n",
+            "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Generated 1 sub questions.\n",
+            "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
+            "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
+            "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various projects during his time at Y Combinator.\n",
+            "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on developing Viaweb before Y Combinator.\n",
+            "\u001b[0m"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Print the response produced by the previous cell.\n",
+        "print(\">>> The final response:\\n\", response)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "T-ZC66Ltx5Za",
+        "outputId": "d7a1c85d-d73c-467d-e0df-9e06078622e2"
+      },
+      "execution_count": 52,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            ">>> The final response:\n",
+            " Paul Graham worked on developing Viaweb before Y Combinator, on various projects during his time at Y Combinator, and started painting after Y Combinator.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "27fS3JcDyFSj"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}