{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNzaxKiokXX5SPot1IBiMhR",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y_GAV7-zos0Y",
"outputId": "74d4a3c4-3576-455b-fbe2-1b8b67bf20d5"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for wikipedia (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.\n",
"\n",
"import nest_asyncio\n",
"\n",
"nest_asyncio.apply()"
],
"metadata": {
"id": "Ua0KNwgvyCaj"
},
"execution_count": 49,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"os.environ['OPENAI_API_KEY'] = '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'"
],
"metadata": {
"id": "--Q2zk06wElp"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import logging\n",
"import sys\n",
"\n",
"#You can set the logging level to DEBUG for more verbose output,\n",
"# or use level=logging.INFO for less detailed information.\n",
"logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
],
"metadata": {
"id": "tjwZjA8-wITr"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Wikipedia Example"
],
"metadata": {
"id": "HjI_gRaRutfj"
}
},
{
"cell_type": "markdown",
"source": [
"## LlamaHub Wikipedia Integration"
],
"metadata": {
"id": "PLUDcXpI41Q_"
}
},
{
"cell_type": "code",
"source": [
"from llama_index.readers.wikipedia import WikipediaReader\n",
"\n",
"# Initialize WikipediaReader\n",
"reader = WikipediaReader()"
],
"metadata": {
"id": "2gko9Q3hrlMh"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load data from Wikipedia\n",
"documents = reader.load_data(pages=['Natural Language Processing', 'Artificial Intelligence'])"
],
"metadata": {
"id": "Z35ot7P1wIO0"
},
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"source": [
"len( documents )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0i9Zp6BJwILk",
"outputId": "a6a1e0a7-98cf-4ba4-d48a-e4f5833b4967"
},
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"2"
]
},
"metadata": {},
"execution_count": 14
}
]
},
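{
"cell_type": "markdown",
"source": [
"As a quick optional check, we can preview one of the loaded documents. This is a minimal sketch; the 300-character preview length is arbitrary."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional sanity check: peek at the first loaded document.\n",
"# The 300-character slice is arbitrary, just enough to eyeball the content.\n",
"print(documents[0].doc_id)\n",
"print(documents[0].text[:300])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},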
{
"cell_type": "markdown",
"source": [
"## Save on DeepLake"
],
"metadata": {
"id": "03lff4VUTaN9"
}
},
{
"cell_type": "code",
"source": [
"import chromadb\n",
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
"\n",
"# Load the vector store from the local storage.\n",
"db = chromadb.PersistentClient(path=\"./wikipedia-articles\")\n",
"chroma_collection = db.get_or_create_collection(\"wikipedia-articles\")\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
],
"metadata": {
"id": "eo8CTHSFTcaR"
},
"execution_count": 15,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Create Nodes"
],
"metadata": {
"id": "qkKPAnIl44ss"
}
},
{
"cell_type": "code",
"source": [
"from llama_index.core.node_parser import SimpleNodeParser\n",
"\n",
"# Initialize the parser\n",
"parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
"\n",
"# Parse documents into nodes\n",
"nodes = parser.get_nodes_from_documents(documents)\n",
"print( len( nodes ) )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "eB6Rc0U0wII_",
"outputId": "ec338be1-deca-45a7-e6ba-9997e4b7e25a"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"45\n"
]
}
]
},
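{
"cell_type": "markdown",
"source": [
"Each node is one chunk (up to 512 tokens, with a 20-token overlap) of a source document. The sketch below is an optional look at a single node; the 200-character preview length is arbitrary."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Inspect one parsed node: its id and a short preview of its text.\n",
"print(nodes[0].node_id)\n",
"print(nodes[0].text[:200])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},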
{
"cell_type": "markdown",
"source": [
"## Storage Context"
],
"metadata": {
"id": "E8tHMS5ZucFE"
}
},
{
"cell_type": "code",
"source": [
"from llama_index.core import StorageContext\n",
"\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
],
"metadata": {
"id": "eWFtVpM_TcTQ"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Create index from Documents"
],
"metadata": {
"id": "kCgdd197CTDt"
}
},
{
"cell_type": "code",
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"index = VectorStoreIndex(\n",
" nodes=nodes, storage_context=storage_context\n",
")"
],
"metadata": {
"id": "g3GCf8LrULIW"
},
"execution_count": 24,
"outputs": []
},
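{
"cell_type": "markdown",
"source": [
"Building the index embeds every node and writes the vectors into the Chroma collection. As an optional check (a sketch using Chroma's collection API), the collection count should match the number of nodes."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# The collection should now hold one embedding per node.\n",
"print(chroma_collection.count())"
],
"metadata": {},
"execution_count": null,
"outputs": []
},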
{
"cell_type": "code",
"source": [
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What does NLP stands for?\")\n",
"response.response"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "G7BdNn-Q5AlG",
"outputId": "a311ec41-6cdc-4fe1-fb59-ad338d0b6149"
},
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'NLP stands for Natural Language Processing.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 25
}
]
},
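{
"cell_type": "markdown",
"source": [
"Beyond the answer string, the response object also exposes the retrieved chunks that grounded the answer. A minimal sketch for inspecting them (the 100-character preview is arbitrary):"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Inspect which chunks were retrieved to ground the answer.\n",
"for src in response.source_nodes:\n",
"    print(src.score, src.node.get_content()[:100].replace(\"\\n\", \" \"))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},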
{
"cell_type": "markdown",
"source": [
"## Store/Load Vector Store"
],
"metadata": {
"id": "r6cGiUtxu5ga"
}
},
{
"cell_type": "code",
"source": [
"# Index Storage Checks\n",
"import os.path\n",
"from llama_index.core import StorageContext, load_index_from_storage\n",
"\n",
"# Let's see if our index already exists in storage.\n",
"if not os.path.exists(\"./storage\"):\n",
" index.storage_context.persist()\n",
"\n",
"else:\n",
" # If the index already exists, we'll just load it:\n",
" storage_context = StorageContext.from_defaults(persist_dir=\"./storage\")\n",
" index = load_index_from_storage(storage_context)"
],
"metadata": {
"id": "GHtB0C0mu7f6"
},
"execution_count": 28,
"outputs": []
},
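{
"cell_type": "markdown",
"source": [
"Because the embeddings already live in the Chroma collection, the index can also be rebuilt straight from the vector store, without re-embedding anything. A sketch using `VectorStoreIndex.from_vector_store` (the `loaded_index` name is illustrative):"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"# Rebuild the index from the existing Chroma collection; no re-embedding.\n",
"loaded_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},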
{
"cell_type": "markdown",
"source": [
"# Paul Graham Essay"
],
"metadata": {
"id": "iF8hwfMKuzst"
}
},
{
"cell_type": "code",
"source": [
"!mkdir -p './paul_graham/'\n",
"!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O './paul_graham/paul_graham_essay.txt'"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DrzbBAglwUo0",
"outputId": "73f30202-a708-4112-8491-9152e228c6cb"
},
"execution_count": 35,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2024-07-24 17:20:40-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 75042 (73K) [text/plain]\n",
"Saving to: ‘./paul_graham/paul_graham_essay.txt’\n",
"\n",
"\r ./paul_gr 0%[ ] 0 --.-KB/s \r./paul_graham/paul_ 100%[===================>] 73.28K --.-KB/s in 0.02s \n",
"\n",
"2024-07-24 17:20:40 (3.33 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"# load documents\n",
"documents = SimpleDirectoryReader(\"./paul_graham\").load_data()"
],
"metadata": {
"id": "S8-QmnkCwIiU"
},
"execution_count": 37,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import chromadb\n",
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
"\n",
"# Load the vector store from the local storage.\n",
"db = chromadb.PersistentClient(path=\"./paul-graham\")\n",
"chroma_collection = db.get_or_create_collection(\"paul-graham\")\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
],
"metadata": {
"id": "DfWglp75xc5f"
},
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from llama_index.core import StorageContext\n",
"\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
"storage_context.docstore.add_documents(nodes)"
],
"metadata": {
"id": "-EVBlUC-xcj1"
},
"execution_count": 39,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"index = VectorStoreIndex.from_documents(\n",
" documents, storage_context=storage_context\n",
")"
],
"metadata": {
"id": "8lMa4h9Cwn8b"
},
"execution_count": 40,
"outputs": []
},
{
"cell_type": "code",
"source": [
"query_engine = index.as_query_engine(similarity_top_k=10)"
],
"metadata": {
"id": "tJsfskjHxj0e"
},
"execution_count": 44,
"outputs": []
},
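{
"cell_type": "markdown",
"source": [
"Before composing this engine into a sub-question engine, it can be queried directly. The question below is only an illustrative example."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: query the plain top-k engine directly.\n",
"direct_response = query_engine.query(\"What did Paul Graham work on before college?\")\n",
"print(direct_response.response)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},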
{
"cell_type": "code",
"source": [
"from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
"from llama_index.core.query_engine import SubQuestionQueryEngine\n",
"\n",
"query_engine_tools = [\n",
" QueryEngineTool(\n",
" query_engine=query_engine,\n",
" metadata=ToolMetadata(\n",
" name=\"pg_essay\",\n",
" description=\"Paul Graham essay on What I Worked On\",\n",
" ),\n",
" ),\n",
"]\n",
"\n",
"query_engine = SubQuestionQueryEngine.from_defaults(\n",
" query_engine_tools=query_engine_tools,\n",
" use_async=True,\n",
")"
],
"metadata": {
"id": "yL9TsFwxxuoA"
},
"execution_count": 50,
"outputs": []
},
{
"cell_type": "code",
"source": [
"response = query_engine.query(\n",
" \"How was Paul Grahams life different before, during, and after YC?\"\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JWc_n5Lhx2bq",
"outputId": "cd992d20-c701-4eb7-aaf2-30f790d1ca24"
},
"execution_count": 51,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generated 3 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?\n",
"\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?\n",
"\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
"\u001b[0mGenerated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
"\u001b[0mGenerated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
"\u001b[0mGenerated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
"\u001b[0mGenerated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?\n",
"\u001b[0mGenerated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
"\u001b[0m"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'run_async_tasks.._gather' was never awaited\n",
" return _abc_subclasscheck(cls, subclass)\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
"/usr/lib/python3.10/abc.py:123: RuntimeWarning: coroutine 'SubQuestionQueryEngine._aquery_subq' was never awaited\n",
" return _abc_subclasscheck(cls, subclass)\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generated 1 sub questions.\n",
"\u001b[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is \"What I Worked On\".\n",
"\u001b[0m\u001b[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.\n",
"\u001b[0m\u001b[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various projects during his time at Y Combinator.\n",
"\u001b[0m\u001b[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on developing Viaweb before Y Combinator.\n",
"\u001b[0m"
]
}
]
},
{
"cell_type": "code",
"source": [
"print( \">>> The final response:\\n\", response )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "T-ZC66Ltx5Za",
"outputId": "d7a1c85d-d73c-467d-e0df-9e06078622e2"
},
"execution_count": 52,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
">>> The final response:\n",
" Paul Graham worked on developing Viaweb before Y Combinator, on various projects during his time at Y Combinator, and started painting after Y Combinator.\n"
]
}
]
},
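{
"cell_type": "markdown",
"source": [
"The sub-question engine's response also carries source nodes that wrap the intermediate sub-question/answer pairs; a sketch for inspecting them (the 200-character preview is arbitrary):"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Inspect the sub-question/answer pairs behind the final response.\n",
"for i, src in enumerate(response.source_nodes):\n",
"    print(f\"--- source node {i} ---\")\n",
"    print(src.node.get_content()[:200])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},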
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "27fS3JcDyFSj"
},
"execution_count": null,
"outputs": []
}
]
}