"Open

In [1]:
!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0

 Preparing metadata (setup.py) ... [?25l[?25hdone
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h Installing build dependencies ... [?25l[?25hdone
 Getting requirements to build wheel ... [?25l[?25hdone
 Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m219.0 kB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.8/273.8 kB[

In [2]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
# Relevant here because the sub-question query engine built later in this notebook
# is created with use_async=True.

import nest_asyncio

nest_asyncio.apply()

In [3]:
import os
from getpass import getpass

# Never paste a real key into the notebook: reuse the key already exported in the
# environment, and only if it is missing prompt for it (the input is not echoed
# into the cell output). This also avoids clobbering a valid key with a placeholder.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [4]:
import logging
import sys

# Send log records to stdout so they appear in the notebook output.
# DEBUG is very verbose; use level=logging.INFO for less detailed information.
#
# basicConfig(stream=...) already attaches one stdout handler to the root logger,
# so no extra StreamHandler is added here (a second one prints every record twice).
# force=True replaces handlers left over from a previous run of this cell, so
# re-running it neither stacks handlers nor silently keeps the old level.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

# Wikipedia Example

## LlamaHub Wikipedia Integration

In [12]:
from llama_index.readers.wikipedia import WikipediaReader

# Initialize WikipediaReader (the loader used in the next cell to fetch pages by title)
reader = WikipediaReader()

In [13]:
# Fetch the two Wikipedia articles by title.
documents = reader.load_data(
    pages=[
        'Natural Language Processing',
        'Artificial Intelligence',
    ]
)

In [14]:
# Number of loaded documents (shown as the cell output).
len(documents)

2

## Save on Chroma

In [15]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Open (or create) the on-disk Chroma database, then the collection for the
# Wikipedia articles, and expose that collection to LlamaIndex as a vector store.
db = chromadb.PersistentClient(path="./wikipedia-articles")
chroma_collection = db.get_or_create_collection(name="wikipedia-articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

## Create Nodes

In [20]:
from llama_index.core.node_parser import SimpleNodeParser

# Chunking configuration: chunk_size=512 with chunk_overlap=20 between neighbours.
parser = SimpleNodeParser.from_defaults(
    chunk_size=512,
    chunk_overlap=20,
)

# Split the loaded documents into nodes and report how many were produced.
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

45


## Storage Context

In [18]:
from llama_index.core import StorageContext

# Wrap the Chroma-backed `vector_store` in a storage context; the index cell below
# receives this context so it is built on top of that store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

## Create index from Nodes

In [24]:
from llama_index.core import VectorStoreIndex

# The Chroma collection is persisted on disk, so building the index on every run
# would re-embed the nodes (slow, billed per token) and add the same chunks to the
# collection again. Embed only when the collection is still empty; otherwise
# reconnect to the vectors that are already stored.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        nodes=nodes, storage_context=storage_context
    )
else:
    index = VectorStoreIndex.from_vector_store(vector_store)

In [25]:
# Ask a question through the index's default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("What does NLP stands for?")
# Last expression of the cell: displays the answer text as the cell output.
response.response

'NLP stands for Natural Language Processing.'

## Store/Load Vector Store

In [28]:
# Index Storage Checks
import os.path
from llama_index.core import StorageContext, load_index_from_storage

# Directory holding the persisted index metadata; used for both saving and loading
# so the two branches cannot drift apart.
PERSIST_DIR = "./storage"

# Let's see if our index already exists in storage.
if not os.path.exists(PERSIST_DIR):
    # First run: write the index metadata to disk.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # If the index already exists, we'll just load it. The embeddings live in the
    # Chroma collection, not in PERSIST_DIR, so the same `vector_store` must be
    # passed back in; without it the reloaded index is not backed by Chroma.
    storage_context = StorageContext.from_defaults(
        persist_dir=PERSIST_DIR, vector_store=vector_store
    )
    index = load_index_from_storage(storage_context)

# Paul Graham Essay

In [6]:
# Create the data folder (no error if it already exists) and download the essay into it.
!mkdir -p './paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O './paul_graham/paul_graham_essay.txt'

--2024-07-24 18:48:21-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘./paul_graham/paul_graham_essay.txt’


2024-07-24 18:48:21 (2.95 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [7]:
from llama_index.core import SimpleDirectoryReader

# Read every file in the download folder into documents.
documents = SimpleDirectoryReader(input_dir="./paul_graham").load_data()

In [8]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Open (or create) the on-disk Chroma database and collection for the essay,
# and expose that collection to LlamaIndex as a vector store.
db = chromadb.PersistentClient(path="./paul-graham")
chroma_collection = db.get_or_create_collection(name="paul-graham")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [11]:
from llama_index.core import StorageContext

# Wrap the essay's Chroma-backed `vector_store` in a storage context; the index
# cell below receives this context so it is built on top of that store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
from llama_index.core import VectorStoreIndex

# The Chroma collection is persisted on disk, so indexing the documents on every
# run would re-embed the essay (slow, billed per token) and add the same chunks to
# the collection again. Embed only when the collection is still empty; otherwise
# reconnect to the vectors that are already stored.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
else:
    index = VectorStoreIndex.from_vector_store(vector_store)

In [13]:
# Query engine over the essay index, configured with similarity_top_k=10.
# The next cell wraps this engine as a tool and then rebinds the `query_engine` name.
query_engine = index.as_query_engine(similarity_top_k=10)

In [14]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

# Build the tool's engine from `index` rather than from the `query_engine` name:
# this cell rebinds `query_engine` at the bottom, so reading it here would make a
# second run of the cell wrap the sub-question engine inside itself.
pg_essay_engine = index.as_query_engine(similarity_top_k=10)

# The name/description tell the sub-question generator what this tool can answer.
query_engine_tools = [
    QueryEngineTool(
        query_engine=pg_essay_engine,
        metadata=ToolMetadata(
            name="pg_essay",
            description="Paul Graham essay on What I Worked On",
        ),
    ),
]

# After this cell `query_engine` is the sub-question engine used by the cells below.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
)

In [15]:
# Run a compound question through the sub-question engine; the output below shows
# the generated sub-questions and their individual answers.
response = query_engine.query(
 "How was Paul Grahams life different before, during, and after YC?"
)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?
[0m[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?
[0m[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?
[0m[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on building online stores through a web app called Viaweb before starting Y Combinator.
[0m[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.
[0m[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various aspects during Y Combinator, including being an angel firm, funding startups in batches, providing seed investments, and offering support to startups similar to what he had received when starting his own company.
[0m

In [16]:
# Show the synthesized answer that combines the sub-question answers.
print(">>> The final response:\n", response)

>>> The final response:
 Paul Graham's life involved building online stores through Viaweb before Y Combinator, working on various aspects within Y Combinator such as funding startups and providing support, and then transitioning to painting after Y Combinator.
