{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyNzaxKiokXX5SPot1IBiMhR", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y_GAV7-zos0Y", "outputId": "74d4a3c4-3576-455b-fbe2-1b8b67bf20d5" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for wikipedia (setup.py) ... \u001b[?25l\u001b[?25hdone\n" ] } ] }, { "cell_type": "code", "source": [ "# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.\n", "\n", "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ], "metadata": { "id": "Ua0KNwgvyCaj" }, "execution_count": 49, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "\n", "os.environ['OPENAI_API_KEY'] = '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'" ], "metadata": { "id": "--Q2zk06wElp" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "import logging\n", "import sys\n", "\n", "#You can set the logging level to DEBUG for more verbose output,\n", "# or use level=logging.INFO for less detailed information.\n", "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" ], "metadata": { "id": "tjwZjA8-wITr" }, "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Wikipedia Example" ], "metadata": { "id": "HjI_gRaRutfj" } }, { "cell_type": "markdown", "source": [ "## LlamaHub Wikipedia Integration" ], "metadata": { "id": "PLUDcXpI41Q_" } }, { "cell_type": "code", "source": [ "from llama_index.readers.wikipedia import WikipediaReader\n", "\n", "# Initialize WikipediaReader\n", "reader = WikipediaReader()" ], "metadata": { "id": "2gko9Q3hrlMh" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "# Load data from Wikipedia\n", "documents = reader.load_data(pages=['Natural Language Processing', 'Artificial Intelligence'])" ], "metadata": { "id": "Z35ot7P1wIO0" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "len( documents )" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0i9Zp6BJwILk", "outputId": "a6a1e0a7-98cf-4ba4-d48a-e4f5833b4967" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "2" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "markdown", "source": [ "## Save on DeepLake" ], "metadata": { "id": "03lff4VUTaN9" } }, { "cell_type": "code", "source": [ "import chromadb\n", "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "\n", "# Load the vector store from the local storage.\n", "db = chromadb.PersistentClient(path=\"./wikipedia-articles\")\n", "chroma_collection = db.get_or_create_collection(\"wikipedia-articles\")\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ], "metadata": { "id": "eo8CTHSFTcaR" }, "execution_count": 15, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Create Nodes" ], "metadata": { "id": "qkKPAnIl44ss" } }, { "cell_type": "code", "source": [ "from llama_index.core.node_parser import SimpleNodeParser\n", "\n", "# Initialize the parser\n", "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n", "\n", "# Parse documents into nodes\n", "nodes = parser.get_nodes_from_documents(documents)\n", "print( len( nodes ) )" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eB6Rc0U0wII_", "outputId": "ec338be1-deca-45a7-e6ba-9997e4b7e25a" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "45\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Storage Context" ], "metadata": { "id": "E8tHMS5ZucFE" } }, { "cell_type": "code", "source": [ "from llama_index.core import StorageContext\n", "\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ], "metadata": { "id": "eWFtVpM_TcTQ" }, "execution_count": 18, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Create index from Documents" ], "metadata": { "id": "kCgdd197CTDt" } }, { "cell_type": "code", "source": [ "from llama_index.core import VectorStoreIndex\n", "\n", "index = VectorStoreIndex(\n", " nodes=nodes, storage_context=storage_context\n", ")" ], "metadata": { "id": "g3GCf8LrULIW" }, "execution_count": 24, "outputs": [] }, { "cell_type": "code", "source": [ "query_engine = index.as_query_engine()\n", "response = query_engine.query(\"What does NLP stands for?\")\n", "response.response" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "G7BdNn-Q5AlG", "outputId": "a311ec41-6cdc-4fe1-fb59-ad338d0b6149" }, "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'NLP stands for Natural Language Processing.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "markdown", "source": [ "## Store/Load Vector Store" ], "metadata": { "id": "r6cGiUtxu5ga" } }, { "cell_type": "code", "source": [ "# Index Storage Checks\n", "import os.path\n", "from llama_index.core import StorageContext, load_index_from_storage\n", "\n", "# Let's see if our index already exists in storage.\n", "if not os.path.exists(\"./storage\"):\n", " index.storage_context.persist()\n", "\n", "else:\n", " # If the index already exists, we'll just load it:\n", " storage_context = StorageContext.from_defaults(persist_dir=\"./storage\")\n", " index = load_index_from_storage(storage_context)" ], "metadata": { "id": "GHtB0C0mu7f6" }, "execution_count": 28, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Paul Graham Essay" ], "metadata": { "id": "iF8hwfMKuzst" } }, { "cell_type": "code", "source": [ "!mkdir -p './paul_graham/'\n", "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O './paul_graham/paul_graham_essay.txt'" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DrzbBAglwUo0", "outputId": "73f30202-a708-4112-8491-9152e228c6cb" }, "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2024-07-24 17:20:40-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 75042 (73K) [text/plain]\n", "Saving to: ‘./paul_graham/paul_graham_essay.txt’\n", "\n", "\r ./paul_gr 0%[ ] 0 --.-KB/s \r./paul_graham/paul_ 100%[===================>] 73.28K --.-KB/s in 0.02s \n", "\n", "2024-07-24 17:20:40 (3.33 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from llama_index.core import SimpleDirectoryReader\n", "\n", "# load documents\n", "documents = SimpleDirectoryReader(\"./paul_graham\").load_data()" ], "metadata": { "id": "S8-QmnkCwIiU" }, "execution_count": 37, "outputs": [] }, { "cell_type": "code", "source": [ "import chromadb\n", "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "\n", "# Load the vector store from the local storage.\n", "db = chromadb.PersistentClient(path=\"./paul-graham\")\n", "chroma_collection = db.get_or_create_collection(\"paul-graham\")\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ], "metadata": { "id": "DfWglp75xc5f" }, "execution_count": 38, "outputs": [] }, { "cell_type": "code", "source": [ "from llama_index.core import StorageContext\n", "\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", "storage_context.docstore.add_documents(nodes)" ], "metadata": { "id": "-EVBlUC-xcj1" }, "execution_count": 39, "outputs": [] }, { "cell_type": "code", "source": [ "from llama_index.core import VectorStoreIndex\n", "\n", "index = VectorStoreIndex.from_documents(\n", " documents, storage_context=storage_context\n", ")" ], "metadata": { "id": "8lMa4h9Cwn8b" }, "execution_count": 40, "outputs": [] }, { "cell_type": "code", "source": [ "query_engine = index.as_query_engine(similarity_top_k=10)" ], "metadata": { "id": "tJsfskjHxj0e" }, "execution_count": 44, "outputs": [] }, { "cell_type": "code", "source": [ "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n", "from llama_index.core.query_engine import SubQuestionQueryEngine\n", "\n", "query_engine_tools = [\n", " QueryEngineTool(\n", " query_engine=query_engine,\n", " metadata=ToolMetadata(\n", " name=\"pg_essay\",\n", " description=\"Paul Graham essay on What I Worked On\",\n", " ),\n", " ),\n", "]\n", "\n", "query_engine = SubQuestionQueryEngine.from_defaults(\n", " query_engine_tools=query_engine_tools,\n", " use_async=True,\n", ")" ], "metadata": { "id": "yL9TsFwxxuoA" }, "execution_count": 50, "outputs": [] }, { "cell_type": "code", "source": [ "response = query_engine.query(\n", " \"How was Paul Grahams life different before, during, and after YC?\"\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JWc_n5Lhx2bq", "outputId": "cd992d20-c701-4eb7-aaf2-30f790d1ca24" }, "execution_count": 51, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Generated 3 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?\n", "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n", "\u001b[0mGenerated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n", "\u001b[0mGenerated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n", "\u001b[0mGenerated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n", "\u001b[0mGenerated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n", "\u001b[0mGenerated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n", "\u001b[0m" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'run_async_tasks.._gather' was never awaited\n", " return _abc_subclasscheck(cls, subclass)\n", "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n", "/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'SubQuestionQueryEngine._aquery_subq' was never awaited\n", " return _abc_subclasscheck(cls, subclass)\n", "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Generated 1 sub questions.\n", "\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n", "\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n", "\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various projects during his time at Y Combinator.\n", "\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on developing Viaweb before Y Combinator.\n", "\u001b[0m" ] } ] }, { "cell_type": "code", "source": [ "print( \">>> The final response:\\n\", response )" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "T-ZC66Ltx5Za", "outputId": "d7a1c85d-d73c-467d-e0df-9e06078622e2" }, "execution_count": 52, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ ">>> The final response:\n", " Paul Graham worked on developing Viaweb before Y Combinator, on various projects during his time at Y Combinator, and started painting after Y Combinator.\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "27fS3JcDyFSj" }, "execution_count": null, "outputs": [] } ] }