<a href="https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/LlamaIndex_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install -q llama-index==0.10.37 openai==1.30.1 tiktoken==0.7.0 chromadb==0.5.0 llama-index-vector-stores-chroma==0.1.7 llama-index-readers-wikipedia==0.1.4 wikipedia==1.4.0

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [49]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

nest_asyncio.apply()

In [2]:
import os

os.environ['OPENAI_API_KEY'] = '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'

In [3]:
import logging
import sys

#You can set the logging level to DEBUG for more verbose output,
# or use level=logging.INFO for less detailed information.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Wikipedia Example

## LlamaHub Wikipedia Integration

In [12]:
from llama_index.readers.wikipedia import WikipediaReader

# Initialize WikipediaReader
reader = WikipediaReader()

In [13]:
# Load data from Wikipedia
documents = reader.load_data(pages=['Natural Language Processing', 'Artificial Intelligence'])

In [14]:
len( documents )

2

## Save on DeepLake

In [15]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Load the vector store from the local storage.
db = chromadb.PersistentClient(path="./wikipedia-articles")
chroma_collection = db.get_or_create_collection("wikipedia-articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

## Create Nodes

In [20]:
from llama_index.core.node_parser import SimpleNodeParser

# Initialize the parser
parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)

# Parse documents into nodes
nodes = parser.get_nodes_from_documents(documents)
print( len( nodes ) )

45


## Storage Context

In [18]:
from llama_index.core import StorageContext

storage_context = StorageContext.from_defaults(vector_store=vector_store)

## Create index from Documents

In [24]:
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(
    nodes=nodes, storage_context=storage_context
)

In [25]:
query_engine = index.as_query_engine()
response = query_engine.query("What does NLP stands for?")
response.response

'NLP stands for Natural Language Processing.'

## Store/Load Vector Store

In [28]:
# Index Storage Checks
import os.path
from llama_index.core import StorageContext, load_index_from_storage

# Let's see if our index already exists in storage.
if not os.path.exists("./storage"):
    index.storage_context.persist()

else:
    # If the index already exists, we'll just load it:
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(storage_context)

# Paul Graham Essay

In [35]:
!mkdir -p './paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O './paul_graham/paul_graham_essay.txt'

--2024-07-24 17:20:40--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘./paul_graham/paul_graham_essay.txt’


2024-07-24 17:20:40 (3.33 MB/s) - ‘./paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [37]:
from llama_index.core import SimpleDirectoryReader

# load documents
documents = SimpleDirectoryReader("./paul_graham").load_data()

In [38]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Load the vector store from the local storage.
db = chromadb.PersistentClient(path="./paul-graham")
chroma_collection = db.get_or_create_collection("paul-graham")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [39]:
from llama_index.core import StorageContext

storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(nodes)

In [40]:
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

In [44]:
query_engine = index.as_query_engine(similarity_top_k=10)

In [50]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="pg_essay",
            description="Paul Graham essay on What I Worked On",
        ),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
)

In [51]:
response = query_engine.query(
    "How was Paul Grahams life different before, during, and after YC?"
)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?
[0m[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?
[0m[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?
[0mGenerated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?
[0mGenerated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?
[0mGenerated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?
[0mGenerated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on after Y Combinator?
[0mGenerated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?
[0m

  return _abc_subclasscheck(cls, subclass)
  return _abc_subclasscheck(cls, subclass)


Generated 1 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What is the title of Paul Graham's essay on What I Worked On?
[0m[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.
[0m[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is "What I Worked On".
[0m[1;3;38;2;237;90;200m[pg_essay] A: After Y Combinator, Paul Graham started painting.
[0m[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On
[0m[1;3;38;2;237;90;200m[pg_essay] A: What I Worked On
[0m[1;3;38;2;237;90;200m[pg_essay] A: The title of Paul Graham's essay on What I Worked On is "What I Worked On".
[0m[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham started painting.
[0m[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on various projects during his time at Y Combinator.
[0m[1;3;38;2;237;90;200m[pg_essay] A: Paul Graham worked on developing Viaweb before Y Combinator.
[0m

In [52]:
print( ">>> The final response:\n", response )

>>> The final response:
 Paul Graham worked on developing Viaweb before Y Combinator, on various projects during his time at Y Combinator, and started painting after Y Combinator.
