# Install Packages and Setup Variables

In [1]:
!pip install -q llama-index==0.10.11 openai==1.12.0 llama-index-finetuning llama-index-embeddings-huggingface llama-index-readers-web tiktoken==0.6.0 chromadb==0.4.22 pandas==2.2.0 html2text sentence_transformers pydantic

In [1]:
import os
from getpass import getpass

# The OpenAI client used later reads the key from the "OPENAI_API_KEY" environment variable.
# Keep a key that is already exported; otherwise prompt for it, so that neither a
# secret nor an empty placeholder is written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [2]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

nest_asyncio.apply()

# Load a Model

In [3]:
from llama_index.llms.openai import OpenAI

# LLM handed to the metadata extractors of the ingestion pipeline further down.
# NOTE(review): temperature=0.9 presumably makes the generated metadata vary
# between runs — confirm that this is intended.
llm = OpenAI(temperature=0.9, model="gpt-3.5-turbo", max_tokens=512)

 from .autonotebook import tqdm as notebook_tqdm


# Create a Vector Store

In [4]:
import chromadb

# Create a client that persists the database on disk under ./mini-llama-articles
# (chromadb.EphemeralClient would keep the data in memory only).
chroma_client = chromadb.PersistentClient(path="./mini-llama-articles")

# get_or_create_collection keeps this cell re-runnable: it reuses the collection
# when it already exists on disk instead of failing on the second run.
# NOTE(review): running the ingestion pipeline again on an existing collection
# presumably adds the nodes a second time — delete the directory for a clean rebuild.
chroma_collection = chroma_client.get_or_create_collection("mini-llama-articles")

In [6]:
from llama_index.vector_stores.chroma import ChromaVectorStore

# Wrap the Chroma collection created above in a LlamaIndex vector store object.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Load the Dataset (CSV)

## Download

The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Download the CSV file first; it is then read row by row, one article per row.

In [7]:
!curl -o ./mini-llama-articles.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv

 % Total % Received % Xferd Average Speed Time Time Time Current
 Dload Upload Total Spent Left Speed
100 169k 100 169k 0 0 864k 0 --:--:-- --:--:-- --:--:-- 865k


## Read File

In [8]:
import csv

# Load every data row of the CSV file into a list (the header row is skipped).
# newline="" is what the csv module documents for files handed to csv.reader,
# so that newlines embedded inside quoted fields are interpreted correctly.
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip header row (no-op when the file is empty)
    rows = list(csv_reader)

# The number of rows (articles) in the dataset.
len(rows)

14

# Convert to Document obj

In [9]:
from llama_index.core import Document

# Wrap every CSV row in a Document object so the LlamaIndex framework can process it.
# Column 1 supplies the text; columns 0, 2 and 3 become the title, url and
# source_name metadata entries.
documents = [
    Document(
        text=article[1],
        metadata={
            "title": article[0],
            "url": article[2],
            "source_name": article[3],
        },
    )
    for article in rows
]

# Transforming

In [10]:
from llama_index.core.text_splitter import TokenTextSplitter

# Splitter that cuts the text into segments of 512 tokens each,
# where consecutive segments overlap by 128 tokens.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128, separator=" ")

In [12]:
from llama_index.core.extractors import (
 SummaryExtractor,
 QuestionsAnsweredExtractor,
 KeywordExtractor,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# Transformations applied to the documents, in this order: split into chunks,
# attach LLM-generated questions, summaries and keywords, then embed.
transformations = [
    text_splitter,
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    OpenAIEmbedding(),
]

# The pipeline applies the transformations on each chunk and stores the
# transformed text in the chroma vector store.
pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store,
)

# Run the transformation pipeline and keep the resulting nodes.
nodes = pipeline.run(documents=documents, show_progress=True)

Parsing nodes: 0%| | 0/14 [00:00, ?it/s]

Parsing nodes: 100%|██████████| 14/14 [00:00<00:00, 27.40it/s]
100%|██████████| 108/108 [00:59<00:00, 1.81it/s]
100%|██████████| 108/108 [01:08<00:00, 1.58it/s]
100%|██████████| 108/108 [00:27<00:00, 3.88it/s]
Generating embeddings: 100%|██████████| 108/108 [00:01<00:00, 77.68it/s]


In [13]:
len( nodes )

108

In [14]:
# Compress the vector store directory to a zip file to be able to download and use later.
!zip -r vectorstore.zip mini-llama-articles

updating: mini-llama-articles/ (stored 0%)
updating: mini-llama-articles/chroma.sqlite3 (deflated 65%)
 adding: mini-llama-articles/6059cb71-7dfb-4096-aaab-f06eaf1d0ace/ (stored 0%)
 adding: mini-llama-articles/6059cb71-7dfb-4096-aaab-f06eaf1d0ace/data_level0.bin (deflated 97%)
 adding: mini-llama-articles/6059cb71-7dfb-4096-aaab-f06eaf1d0ace/length.bin (deflated 23%)
 adding: mini-llama-articles/6059cb71-7dfb-4096-aaab-f06eaf1d0ace/link_lists.bin (stored 0%)
 adding: mini-llama-articles/6059cb71-7dfb-4096-aaab-f06eaf1d0ace/header.bin (deflated 61%)


# Load Indexes

If you have already uploaded the zip file for the vector store checkpoint, please uncomment the code in the following cell block to extract its contents. After doing so, you will be able to load the dataset from local storage.

In [15]:
# !unzip vectorstore.zip

In [16]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Load the vector store from the local storage: re-open the on-disk Chroma
# database and wrap its collection so that LlamaIndex can use it.
# NOTE(review): get_or_create_collection presumably hands back an empty
# collection when none exists yet, so a missing checkpoint would not raise
# here — confirm.
db = chromadb.PersistentClient(path="./mini-llama-articles")
chroma_collection = db.get_or_create_collection("mini-llama-articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [17]:
from llama_index.core import VectorStoreIndex

# Create the index based on the vector store.
vector_index = VectorStoreIndex.from_vector_store(vector_store)

# Retrieving All the Nodes

To develop a custom retriever with a keyword index, we require access to all the nodes. By using the index as a retriever and requesting it to fetch a very large number of documents, we can ensure that the retriever returns every document stored in the vector store. (This method serves as a temporary solution because LlamaIndex currently lacks the capability to fetch all documents from a chromadb. However, this limitation may be addressed in future updates.)

In [18]:
# Set similarity_top_k to a number far larger than the collection size so that
# a single query returns all the nodes.
retriever = vector_index.as_retriever(similarity_top_k=100000000)

# Retrieve all nodes. The query text is only a placeholder here, because
# top_k exceeds the number of stored nodes.
# The result holds scored wrappers; the following cell unwraps them via `.node`.
all_nodes = retriever.retrieve('Hello!')

Number of requested results 100000000 is greater than number of elements in index 108, updating n_results = 108


In [19]:
all_nodes = [item.node for item in all_nodes]

In [20]:
len( all_nodes )

108

In [21]:
from llama_index.core import SimpleKeywordTableIndex

# Define the SimpleKeywordTableIndex using all the nodes.
keyword_index = SimpleKeywordTableIndex(nodes=all_nodes)

# Custom Retriever

In [22]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import (
 BaseRetriever,
 VectorIndexRetriever,
 KeywordTableSimpleRetriever,
)
from typing import List

# The custom retriever that can use both vector index and keyword index to retrieve documents.
# It has two modes: "AND" meaning it uses nodes that are retrieved in both indexes.
# "OR" meaning that it merges the retrieved nodes.
class CustomRetriever(BaseRetriever):
    """Custom retriever that combines semantic (vector) search with keyword search.

    Modes:
        "AND": return only the nodes found by both retrievers.
        "OR":  return the nodes found by either retriever.

    The returned list has a deterministic order: nodes follow the ranking of
    the vector retriever, and (in "OR" mode) nodes found only by the keyword
    retriever are appended afterwards in the order that retriever produced them.
    When a node is found by both retrievers, the entry from the vector
    retriever is the one that is returned.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: KeywordTableSimpleRetriever,
        mode: str = "AND",
    ) -> None:
        """Store the two retrievers and the combination mode.

        Args:
            vector_retriever: Retriever queried for the semantic matches.
            keyword_retriever: Retriever queried for the keyword matches.
            mode: Either "AND" (intersection) or "OR" (union).

        Raises:
            ValueError: If ``mode`` is neither "AND" nor "OR".
        """
        # Validate first so that an invalid instance is never half-initialised.
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query.

        Args:
            query_bundle: The query forwarded unchanged to both retrievers.

        Returns:
            The combined, de-duplicated results in a deterministic order
            (see the class docstring).
        """
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        if self._mode == "AND":
            # Keep the vector ranking, restricted to nodes the keyword search found too.
            keyword_ids = {n.node.node_id for n in keyword_nodes}
            candidates = [n for n in vector_nodes if n.node.node_id in keyword_ids]
        else:
            # Vector results first, keyword results afterwards.
            candidates = [*vector_nodes, *keyword_nodes]

        # De-duplicate by node id while preserving order. A dict (unlike a set)
        # iterates in insertion order, and setdefault keeps the first entry
        # seen for an id, i.e. the one coming from the vector retriever.
        unique_nodes = {}
        for candidate in candidates:
            unique_nodes.setdefault(candidate.node.node_id, candidate)

        return list(unique_nodes.values())

In [23]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Response synthesizer with the default settings.
response_synthesizer = get_response_synthesizer()

# The two base retrievers: semantic search returning the top 2 matches, and
# keyword search using at most 2 keywords per query.
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)
keyword_retriever = KeywordTableSimpleRetriever(
    index=keyword_index,
    max_keywords_per_query=2,
)

# The custom retriever merges the results of both retrievers ("OR" mode).
custom_retriever = CustomRetriever(
    vector_retriever=vector_retriever,
    keyword_retriever=keyword_retriever,
    mode="OR",
)

# Query Dataset

In [24]:
# Define a query engine that is responsible for retrieving related pieces of text
# through the custom retriever, and using an LLM to formulate the final answer.
custom_query_engine = RetrieverQueryEngine(
 retriever=custom_retriever,
 response_synthesizer=response_synthesizer,
)

# Ask a sample question; the answer text is read from res.response and the
# retrieved nodes from res.source_nodes in the following cells.
res = custom_query_engine.query("How many parameters LLaMA2 model has?")

In [25]:
res.response

'The LLaMA2 model has 52 billion parameters.'

In [26]:
# Print id, title, text and score of every node that was retrieved for the
# answer, followed by a separator line.
separator = "-_" * 20
for source_node in res.source_nodes:
    print("Node ID\t", source_node.node_id)
    print("Title\t", source_node.metadata['title'])
    print("Text\t", source_node.text)
    print("Score\t", source_node.score)
    print(separator)

Node ID	 322a5cb0-5b0c-413f-bc5e-e72747b385d1
Title	 Building Intuition on the Concepts behind LLMs like ChatGPT - Part 1- Neural Networks, Transformers, Pretraining, and Fine Tuning
Text	 backpropagation, the degree of the error of the model (the loss value) is propagated backward through the neural network. It computes the derivative to the output of each individual weight and bias i.e. how sensitive the output is to changes in each specific parameter. For my people who didn't take on differential calculus in school (such as myself), think of the model parameters (weights/biases) as adjustable knobs. These knobs are arbitrary - in the sense that you can't tell in what specific way it governs the prediction ability of the model. The knobs, which can be rotated clockwise or counterclockwise have different effects on the behavior of the output. Knob A might increase the loss 3x when turned clockwise, knob B reduces the loss by 1/8 when turned counterclockwise (and so on). All these knob

# Evaluate

In [27]:
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.llms.openai import OpenAI

# Create one question for each segment. These questions will be used to
# assess whether the retriever can accurately identify and return the
# corresponding segment when queried.
# NOTE(review): `nodes` comes from the ingestion pipeline cell; if the vector
# store was restored from the zip checkpoint instead, that variable may not
# exist in the kernel — confirm.
# Note that this rebinds `llm`, which was first created with different settings
# (temperature and max_tokens) in the "Load a Model" cell.
llm = OpenAI(model="gpt-3.5-turbo")
rag_eval_dataset = generate_question_context_pairs(
 nodes,
 llm=llm,
 num_questions_per_chunk=1
)

# We can save the evaluation dataset as a json file for later use.
rag_eval_dataset.save_json("./rag_eval_dataset.json")

100%|██████████| 108/108 [06:17<00:00, 3.49s/it]


If you have uploaded the generated question JSON file, please uncomment the code in the next cell block. This will avoid the need to generate the questions again, saving you time and effort.

In [None]:
# from llama_index.finetuning.embeddings.common import (
# EmbeddingQAFinetuneDataset,
# )
# rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(
# "./rag_eval_dataset.json"
# )

In [28]:
import pandas as pd

# A simple function to show the evaluation result.
def display_results_retriever(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label placed in the "Retriever Name" column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains "hit_rate" and "mrr" entries.

    Returns:
        A DataFrame with the columns "Retriever Name", "Hit Rate" and "MRR",
        where both metrics are averaged over all evaluation results.
    """
    # One row per evaluation result, one column per metric.
    per_query_metrics = pd.DataFrame(
        [result.metric_vals_dict for result in eval_results]
    )

    summary = {
        "Retriever Name": [name],
        "Hit Rate": [per_query_metrics["hit_rate"].mean()],
        "MRR": [per_query_metrics["mrr"].mean()],
    }
    return pd.DataFrame(summary)

In [29]:
from llama_index.core.evaluation import RetrieverEvaluator

# We can evaluate the retievers with different top_k values.
for i in [2, 4, 6, 8, 10]:
 vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=i)
 custom_retriever = CustomRetriever(vector_retriever, keyword_retriever, "OR")
 custom_query_engine = RetrieverQueryEngine(
 retriever=custom_retriever,
 response_synthesizer=response_synthesizer,
 )
 retriever_evaluator = RetrieverEvaluator.from_metric_names(
 ["mrr", "hit_rate"], retriever=custom_query_engine
 )
 eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
 print(display_results_retriever(f"Retriever top_{i}", eval_results))

ValidationError: 1 validation error for RetrieverEvaluator
retriever
 instance of BaseRetriever expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseRetriever)