# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.57 openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 llama-index-llms-gemini==0.1.11

[?25l [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h Installing build dependencies ... [?25l[?25hdone
 Getting requirements to build wheel ... [?25l[?25hdone
 Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.3/584.3 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K [90m━━━━━━

In [2]:
import os

# Set the following API Keys in the Python environment. Will be used later.
os.environ["OPENAI_API_KEY"] = ""
os.environ["GOOGLE_API_KEY"] = ""

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

nest_asyncio.apply()

# Create a VectoreStore


In [4]:
import chromadb

# create client and a new collection
# chromadb.EphemeralClient saves data in-memory.
chroma_client = chromadb.PersistentClient(path="./mini-llama-articles")
chroma_collection = chroma_client.create_collection("mini-llama-articles")

In [5]:
from llama_index.vector_stores.chroma import ChromaVectorStore

# Define a storage context object using the created vector database.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Load the Dataset (CSV)


## Download


The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model.


In [6]:
!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv

 % Total % Received % Xferd Average Speed Time Time Time Current
 Dload Upload Total Spent Left Speed
100 169k 100 169k 0 0 273k 0 --:--:-- --:--:-- --:--:-- 274k


## Load the Articles


In [7]:
import csv

rows = []

# Load the file as a JSON
with open("./mini-dataset.csv", mode="r", encoding="utf-8") as file:
 csv_reader = csv.reader(file)

 for idx, row in enumerate(csv_reader):
 if idx == 0:
 continue
 # Skip header row
 rows.append(row)

# The number of characters in the dataset.
len(rows)

14

# Convert to Document obj


In [8]:
from llama_index.core import Document
from llama_index.core.schema import TextNode

# Convert the chunks to Document objects so the LlamaIndex framework can process them.
documents = [
 Document(
 text=row[1],
 metadata={"title": row[0], "url": row[2], "source_name": row[3]},
 )
 for row in rows
]
# By default, the node/chunks ids are set to random uuids. To ensure same id's per run, we manually set them.
for idx, doc in enumerate(documents):
 doc.id_ = f"doc_{idx}"

In [9]:
documents[0]

Document(id_='doc_0', embedding=None, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='LLM Variants and Meta\'s Open Source Before shedding light on four major trends, I\'d share the latest Meta\'s Llama 2 and Code Llama. Meta\'s Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2\'s superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized 

# Transforming


In [10]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import BaseNode
import hashlib


def deterministic_id_func(i: int, doc: BaseNode) -> str:
 """Deterministic ID function for the text splitter.
 This will be used to generate a unique repeatable identifier for each node."""
 unique_identifier = doc.id_ + str(i)
 hasher = hashlib.sha256()
 hasher.update(unique_identifier.encode("utf-8"))
 return hasher.hexdigest()


text_splitter = TokenTextSplitter(
 separator=" ", chunk_size=512, chunk_overlap=128, id_func=deterministic_id_func
)

In [11]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
 transformations=[
 text_splitter,
 OpenAIEmbedding(model = 'text-embedding-3-small'),
 ],
 vector_store=vector_store,
)

nodes = pipeline.run(documents=documents, show_progress=True)

Parsing nodes: 0%| | 0/14 [00:00, ?it/s]

Generating embeddings: 0%| | 0/108 [00:00, ?it/s]

In [12]:
nodes[0]

TextNode(id_='4ab5bd897f01474fc9b0049f95e31edae3ccd9e74d0f0acd3932b50a74d608b6', embedding=[0.004633472301065922, 0.016692597419023514, 0.06155563145875931, -0.016222193837165833, 0.020455822348594666, -0.0224449560046196, 0.00625972356647253, 0.014663142152130604, -0.00014427100541070104, 0.005826280917972326, 0.02755219303071499, -0.045642558485269547, -0.03534744679927826, 0.004250429570674896, -0.035132404416799545, -0.02787475474178791, -0.034218478947877884, -0.04634144529700279, -0.015294826589524746, 0.03763226419687271, 0.013137691654264927, 0.0072442106902599335, -0.034541040658950806, 0.025952821597456932, -0.005110595840960741, -0.026893628761172295, -0.0479004941880703, 0.01755276322364807, -0.01737804152071476, -0.02486417442560196, 0.05268516764044762, -0.025348016992211342, -0.02216271497309208, -0.01169288158416748, -0.024837292730808258, 0.018386049196124077, -0.005261796526610851, -0.010080070234835148, 0.020294541493058205, -0.004458751063793898, -0.0322831049561500

# Load Indexes


In [13]:
# Create your index
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_vector_store(vector_store)

In [14]:
from llama_index.llms.gemini import Gemini

# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.

llm = Gemini(model="models/gemini-1.5-flash", temperature=0.3, max_tokens=512)
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

In [15]:
res = query_engine.query("How many parameters LLaMA 2 model has?")

In [16]:
res.response

'The Llama2 model has 7 billion parameters. \n'

In [17]:
for src in res.source_nodes:
 print("Node ID\t", src.node_id)
 print("Title\t", src.metadata["title"])
 print("Text\t", src.text)
 print("Score\t", src.score)
 print("-_" * 20)

Node ID	 de49ab9024a434ca1cd1efba258fbaa9a3e2d9a1bca3ab4a0349220cc1e2754f
Title	 Building a Q&A Bot over Private Documents with OpenAI and LangChain
Text	 Private data to be used The example provided can be used with any dataset. I am using a data set that has Analyst recommendations from various stocks. For the purpose of demonstration, I have gathered publicly available analyst recommendations to showcase its capabilities. You can replace this with your own information to try this. Below is a partial extract of the information commonly found in these documents. If you wish to try it yourself, you can download analyst recommendations for your preferred stocks from online sources or access them through subscription platforms like Barron's. Although the example provided focuses on analyst recommendations, the underlying structure can be utilized to query various other types of documents in any industry as well. I have assembled such data for a few stocks for demonstration purposes. This

# Evaluate the retrieval process and quality of answers

We can evaluate our RAG system with a dataset of questions and associated chunks. Given a question, we can see if the RAG system retrieves the correct chunks of text that can answer the question.

You can generate a synthetic dataset with an LLM such as `gemini-1.5-flash` or create an authentic and manually curated dataset.

Note that a **well curated dataset will always be a better option**, especially for a specific domain or use case.


In our example, we will generate a synthetic dataset using `gemini-1.5-flash` to make it simple.

This is the default prompt that the `generate_question_context_pairs` function will uses:

```python
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge,
generate only questions based on the below query.

You are a Teacher/Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
```


In [18]:
# Free Tier-Gemini API key
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from tqdm import tqdm
import json
import re
import uuid
import warnings
import time
from typing import Dict, List, Tuple
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""

def generate_question_context_pairs(
 nodes: List[TextNode],
 llm: LLM,
 qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
 num_questions_per_chunk: int = 2,
 request_delay: float = 2.0
) -> EmbeddingQAFinetuneDataset:
 """Generate examples given a set of nodes with delays between requests."""
 node_dict = {
 node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
 for node in nodes
 }

 queries = {}
 relevant_docs = {}

 for node_id, text in tqdm(node_dict.items()):
 query = qa_generate_prompt_tmpl.format(
 context_str=text, num_questions_per_chunk=num_questions_per_chunk
 )
 response = llm.complete(query)

 result = str(response).strip().split("\n")
 questions = [
 re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
 ]
 questions = [question for question in questions if len(question) > 0][
 :num_questions_per_chunk
 ]

 num_questions_generated = len(questions)
 if num_questions_generated < num_questions_per_chunk:
 warnings.warn(
 f"Fewer questions generated ({num_questions_generated}) "
 f"than requested ({num_questions_per_chunk})."
 )

 for question in questions:
 question_id = str(uuid.uuid4())
 queries[question_id] = question
 relevant_docs[question_id] = [node_id]

 time.sleep(request_delay)

 return EmbeddingQAFinetuneDataset(
 queries=queries, corpus=node_dict, relevant_docs=relevant_docs
 )

#from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.llms.gemini import Gemini

llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)

rag_eval_dataset = generate_question_context_pairs(
 nodes[:25],
 llm=llm,
 num_questions_per_chunk=1,
 request_delay=4
)

# Save the dataset as a json file for later use
rag_eval_dataset.save_json("./rag_eval_dataset.json")


100%|██████████| 25/25 [02:41<00:00, 6.46s/it]


In [19]:
# #Paid-Gemini API Key

# from llama_index.core.evaluation import generate_question_context_pairs
# from llama_index.llms.gemini import Gemini

# llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)
# rag_eval_dataset = generate_question_context_pairs(nodes, llm=llm, num_questions_per_chunk=1)

# # We can save the dataset as a json file for later use.
# rag_eval_dataset.save_json("./rag_eval_dataset.json")

In [20]:
# We can also load the dataset from a previously saved json file.
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")

### Evaluation for Hit Rate and Mean Reciprocal Rank (MRR)

We will make use of `RetrieverEvaluator` available in Llama-index. We will measure the Hit Rate and Mean Reciprocal Rank (MRR).

**Hit Rate:**

Think of the Hit Rate like playing a game of guessing. You're given a question and you need to guess the correct answer from a list of options. The Hit Rate measures how often you guess the correct answer by only looking at your top few guesses. If you often find the right answer in your first few guesses, you have a high Hit Rate. So, in the context of a retrieval system, it's about how frequently the system finds the correct document within its top 'k' picks (where 'k' is a number you decide, like top 5 or top 10).

**Mean Reciprocal Rank (MRR):**

MRR is a bit like measuring how quickly you can find a treasure in a list of boxes. Imagine you have a row of boxes and only one of them has a treasure. The MRR calculates how close to the start of the row the treasure box is, on average. If the treasure is always in the first box you open, you're doing great and have an MRR of 1. If it's in the second box, the score is 1/2, since you took two tries to find it. If it's in the third box, your score is 1/3, and so on. MRR averages these scores across all your searches. So, for a retrieval system, MRR looks at where the correct document ranks in the system's guesses. If it's usually near the top, the MRR will be high, indicating good performance.
In summary, Hit Rate tells you how often the system gets it right in its top guesses, and MRR tells you how close to the top the right answer usually is. Both metrics are useful for evaluating the effectiveness of a retrieval system, like how well a search engine or a recommendation system works.


In [21]:
import pandas as pd


def display_results_retriever(name, eval_results):
 """Display results from evaluate."""

 metric_dicts = []
 for eval_result in eval_results:
 metric_dict = eval_result.metric_vals_dict
 metric_dicts.append(metric_dict)

 full_df = pd.DataFrame(metric_dicts)

 hit_rate = full_df["hit_rate"].mean()
 mrr = full_df["mrr"].mean()

 metric_df = pd.DataFrame(
 {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
 )

 return metric_df

In [22]:
from llama_index.core.evaluation import RetrieverEvaluator

# We can evaluate the retievers with different top_k values.
for i in [2, 4, 6, 8, 10]:
 retriever = index.as_retriever(similarity_top_k=i)
 retriever_evaluator = RetrieverEvaluator.from_metric_names(
 ["mrr", "hit_rate"], retriever=retriever
 )
 eval_results = await retriever_evaluator.aevaluate_dataset(
 rag_eval_dataset, workers=32
 )
 print(display_results_retriever(f"Retriever top_{i}", eval_results))

time.sleep(60)

 Retriever Name Hit Rate MRR
0 Retriever top_4 0.12 0.043333
 Retriever Name Hit Rate MRR
0 Retriever top_6 0.16 0.05
 Retriever Name Hit Rate MRR
0 Retriever top_8 0.2 0.055714
 Retriever Name Hit Rate MRR
0 Retriever top_10 0.24 0.060159


### Evaluation using Relevance and Faithfulness metrics.

Here, we evaluate the answer generated by the LLM. Is the answer using the correct context? Is the answer faithful to the context? Is the answer relevant to the question?

An LLM will answer these questions, more specifically `gpt-4o`.

**`FaithfulnessEvaluator`**
Evaluates if the answer is faithful to the retrieved contexts (in other words, whether there's an hallucination).

**`RelevancyEvaluator`**
Evaluates whether the retrieved context and answer are relevant to the user question.

Now, let's see how the top_k value affects these two metrics.


In [30]:
from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
from llama_index.llms.openai import OpenAI

# Create your index
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex.from_vector_store(vector_store)

# Define an LLM as a judge
llm_gpt4o = OpenAI(temperature=0, model="gpt-4o")
llm_gpt4o_mini = OpenAI(temperature=0, model="gpt-4o-mini")

# Initiate the faithfulnes and relevancy evaluator objects
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4o)
relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4o)

# Extract the questions from the dataset
queries = list(rag_eval_dataset.queries.values())
# Limit to first 10 question to save time (!!remove this line in production!!)
batch_eval_queries = queries[:20]

# The batch evaluator runs the evaluation in batches
runner = BatchEvalRunner(
 {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
 workers=32,
)


# Define a for-loop to try different `similarity_top_k` values
for i in [2, 4, 6, 8, 10]:
 # Set query engine with different number of returned chunks
 query_engine = index.as_query_engine(similarity_top_k=i, llm = llm_gpt4o_mini)

 # Run the evaluation
 eval_results = await runner.aevaluate_queries(query_engine, queries=batch_eval_queries)

 # Printing the results
 faithfulness_score = sum(
 result.passing for result in eval_results["faithfulness"]
 ) / len(eval_results["faithfulness"])
 print(f"top_{i} faithfulness_score: {faithfulness_score}")

 relevancy_score = sum(result.passing for result in eval_results["relevancy"]) / len(
 eval_results["relevancy"]
 )
 print(f"top_{i} relevancy_score: {relevancy_score}")
 print("="*15)


top_2 faithfulness_score: 0.25
top_2 relevancy_score: 0.6
top_4 faithfulness_score: 0.1
top_4 relevancy_score: 0.95
top_6 faithfulness_score: 0.2
top_6 relevancy_score: 0.9
top_8 faithfulness_score: 0.1
top_8 relevancy_score: 0.6
top_10 faithfulness_score: 0.05
top_10 relevancy_score: 0.55


### Correctness


In [24]:
from llama_index.core.evaluation import CorrectnessEvaluator

query = (
 "Can you explain the theory of relativity proposed by Albert Einstein in" " detail?"
)

reference = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

General relativity, published in 1915, extended these ideas to include the effects of gravity. According to general relativity, gravity is not a force between masses, as described by Newton's theory of gravity, but rather the result of the warping of space and time by mass and energy. Massive objects, such as planets and stars, cause a curvature in spacetime, and smaller objects follow curved paths in response to this curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet, causing it to create a depression that other objects (representing smaller masses) naturally move towards.

In essence, general relativity provided a new understanding of gravity, explaining phenomena like the bending of light by gravity (gravitational lensing) and the precession of the orbit of Mercury. It has been confirmed through numerous experiments and observations and has become a fundamental theory in modern physics.
"""

response = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

However, general relativity, published in 1915, extended these ideas to include the effects of magnetism. According to general relativity, gravity is not a force between masses but rather the result of the warping of space and time by magnetic fields generated by massive objects. Massive objects, such as planets and stars, create magnetic fields that cause a curvature in spacetime, and smaller objects follow curved paths in response to this magnetic curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet with magnets underneath, causing it to create a depression that other objects (representing smaller masses) naturally move towards due to magnetic attraction.
"""

In [25]:
evaluator = CorrectnessEvaluator(llm=llm_gpt4o)

result = evaluator.evaluate(query=query,response=response,reference=reference,)

In [26]:
result.score

2.0

In [27]:
result.feedback

'The generated answer is mostly relevant but contains significant inaccuracies. It incorrectly states that general relativity involves the effects of magnetism and magnetic fields, which is not true. General relativity deals with the warping of space and time by mass and energy, not magnetic fields. This fundamental error reduces the correctness of the answer.'