# Install Packages and Setup Variables

In [11]:
!pip install -q llama-index==0.10.11 openai==1.12.0 chromadb==0.4.22 cohere==4.47 tiktoken==0.6.0 pandas==2.2.0

In [1]:
# Test with a few sample, processing dataset fully can be costly depanding on the size.
# NOTE: Checkpoints are provided in the lesson, so no need to run the code on full dataset.
testing = True

In [2]:
import os

# Set the "OPENAI_API_KEY" in the Python environment. Will be used by OpenAI client later.
os.environ["OPENAI_API_KEY"] = ""

# Load the Dataset (Webpages)

## Download

In [3]:
TRAIN_URLs = [
 "https://towardsai.net/p/machine-learning/metas-llama-2-revolutionizing-open-source-language-models-for-commercial-use",
 "https://towardsai.net/p/machine-learning/fine-tuning-a-llama-2-7b-model-for-python-code-generation",
 "https://towardsai.net/p/machine-learning/how-to-create-llama-2-chatbot-with-gradio-and-hugging-face-in-free-colab",
 "https://towardsai.net/p/machine-learning/meta-releases-llama-2-free-for-commercial-use",
 "https://towardsai.net/p/machine-learning/gpt-4-llama-2-claude-how-different-language-models-react-to-prompts",
 "https://towardsai.net/p/machine-learning/a-simple-hugging-face-guide-to-chatting-with-the-llama-2-7b-model-in-a-colab-notebook",
 "https://towardsai.net/p/machine-learning/fine-tuning-a-llama-2-7b-model-for-python-code-generation",
 "https://towardsai.net/p/machine-learning/llamaindex-last-version-from-basics-to-advanced-techniques-in-python-part-3",
 "https://towardsai.net/p/machine-learning/meta-releases-llama-will-it-fail-too",
 "https://towardsai.net/p/machine-learning/llama-by-meta-leaked-by-an-anonymous-forum-questions-arises-on-meta"
]
VALIDATION_URLs = [
 "https://towardsai.net/p/machine-learning/deep-diving-into-llama-2-meta-ai-new-open-source-foundation-model",
 "https://towardsai.net/p/machine-learning/gptq-quantization-on-a-llama-2-7b-fine-tuned-model-with-huggingface",
 "https://towardsai.net/p/machine-learning/powerinfer-11x-speed-up-llama-ii-inference-on-a-local-gpu",
 "https://towardsai.net/p/machine-learning/dense-x-retrieval-technique-in-langchain-and-llamaindex",
 "https://towardsai.net/p/machine-learning/exploring-large-language-models-part-2",
 "https://towardsai.net/p/machine-learning/inside-code-llama-meta-ais-entrance-in-the-code-llm-space",
 "https://towardsai.net/p/machine-learning/llamaindex-use-the-power-of-llms-on-your-data",
 "https://towardsai.net/p/l/inside-llama-meta-ai-new-large-language-model-that-outperforms-gpt-3-across-many-tasks"
]

## Read the Page

In [4]:
from llama_index.readers.web import SimpleWebPageReader

# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.
TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)
VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)
print( len(TRAIN_DOCs), len(VALIDATION_DOCs) )

10 8


# Chunking

In [5]:
from llama_index.core.node_parser import SimpleNodeParser

# Define a parser to perform the chunking process.
parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)

# Apply chunking on the training/validation sets.
TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)
VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)
print( len( TRAIN_NODEs ), len( VALIDATION_NODEs ) )

272 221


In [6]:
# Use a subset of the dataset (5 samples) if testing.
if testing:
 TRAIN_NODEs = TRAIN_NODEs [0:5]
 VALIDATION_NODEs = VALIDATION_NODEs[0:5]

# Generate Question

We use a Large Language Model (LLM) to produce questions for each chunk of the dataset. Then we can use these data to train the model to develop embeddings that more accurately represent the types of questions users may ask.

In [7]:
# Use this block of code if you don't want to generate the questions for the dataset. (Avoid API call charges!)
# Uncomment the following code, and keep in mind to comment all the contents in the next coding block.

# from llama_index.finetuning import EmbeddingQAFinetuneDataset

# # Load the pre-generated questions json files.
# TRAIN_DATASET = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# VALIDATION_DATASET = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [8]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI

# Load the OpenAI API with the "gpt-3.5-turbo" model
llm = OpenAI(model="gpt-3.5-turbo")

# Generate questions for each chunk.
TRAIN_DATASET = generate_qa_embedding_pairs(TRAIN_NODEs, llm=llm)
VALIDATION_DATASET = generate_qa_embedding_pairs(VALIDATION_NODEs, llm=llm)

TRAIN_DATASET.save_json("train_dataset.json")
VALIDATION_DATASET.save_json("val_dataset.json")

100%|██████████| 5/5 [00:06<00:00, 1.29s/it]
100%|██████████| 5/5 [00:07<00:00, 1.60s/it]


# Load an Embedding Model

In [12]:
from llama_index.core.embeddings import resolve_embed_model

# Load an existing embedding model with a linear layer adopter on top.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

 from .autonotebook import tqdm as notebook_tqdm
config.json: 100%|██████████| 743/743 [00:00<00:00, 4.08MB/s]
model.safetensors: 100%|██████████| 133M/133M [00:02<00:00, 53.6MB/s] 
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 2.01MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.38MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 16.4MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 781kB/s]


In [13]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
import torch

# Finetune the adapter
finetune_engine = EmbeddingAdapterFinetuneEngine(
 TRAIN_DATASET,
 base_embed_model,
 model_output_path="model_output_test",
 epochs=4,
 verbose=True,
)

In [14]:
# Initiate the Finetuning process
finetune_engine.finetune()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch: 0%| | 0/4 [00:00, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 1.579697608947754
[0m

Iteration: 100%|██████████| 1/1 [00:05<00:00, 5.16s/it]
Iteration: 100%|██████████| 1/1 [00:00<00:00, 1.04it/s]
Epoch: 50%|█████ | 2/4 [00:06<00:05, 2.69s/it]

[1;3;34m> [Epoch 1] Current loss: 1.5728983879089355
[0m

Iteration: 100%|██████████| 1/1 [00:00<00:00, 1.01it/s]
Epoch: 75%|███████▌ | 3/4 [00:07<00:01, 1.92s/it]

[1;3;34m> [Epoch 2] Current loss: 1.5678406953811646
[0m

Iteration: 100%|██████████| 1/1 [00:00<00:00, 1.10it/s]
Epoch: 100%|██████████| 4/4 [00:08<00:00, 2.01s/it]

[1;3;34m> [Epoch 3] Current loss: 1.5644880533218384
[0m[1;3;34m> Finished training, saving to model_output_test
[0m




In [15]:
embed_model = finetune_engine.get_finetuned_model()

# Or, import model from the directory whenever required.
# from llama_index.core.embeddings import LinearAdapterEmbeddingModel
# embed_model = LinearAdapterEmbeddingModel(base_embed_model, "model_output_test")

In [16]:
embed_model

AdapterEmbeddingModel(model_name='Adapter for BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=)

# Evaluate

## Define the Evaluation Functions

Hit-rate metric: For each (query, context) pair, we retrieve the top-k documents with the query. It’s a hit if the results contain the ground-truth context.

In [17]:
from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm import tqdm

def evaluate( dataset, embed_model, top_k=5, verbose=False):
 corpus = dataset.corpus
 queries = dataset.queries
 relevant_docs = dataset.relevant_docs

 # Chunking the documents and generating embeddings
 service_context = ServiceContext.from_defaults(embed_model=embed_model)
 nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
 index = VectorStoreIndex(
 nodes, service_context=service_context, show_progress=True
 )

 # Define a retriever to answer the questions
 retriever = index.as_retriever(similarity_top_k=top_k)

 eval_results = []

 # Look into each response sources to see if the chunk that contains the answer is retrieved.
 for query_id, query in tqdm(queries.items()):
 retrieved_nodes = retriever.retrieve(query)
 retrieved_ids = [node.node.node_id for node in retrieved_nodes]
 expected_id = relevant_docs[query_id][0]
 is_hit = expected_id in retrieved_ids # assume 1 relevant doc

 eval_result = {
 "is_hit": is_hit,
 "retrieved": retrieved_ids,
 "expected": expected_id,
 "query": query_id,
 }
 eval_results.append(eval_result)
 return eval_results

## OpenAI

In [18]:
from llama_index.embeddings.openai import OpenAIEmbedding

# Load the OpenAI Ada model and evaluate it.
ada = OpenAIEmbedding()
ada_val_results = evaluate(VALIDATION_DATASET, ada)

 service_context = ServiceContext.from_defaults(embed_model=embed_model)
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00, 11.71it/s]
100%|██████████| 10/10 [00:01<00:00, 5.41it/s]


In [19]:
import pandas as pd

df_ada = pd.DataFrame(ada_val_results)
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

1.0

## BAAI Model

In [20]:
# Load the Base model without fine-tuning
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
bge_val_results = evaluate(VALIDATION_DATASET, base_embed_model)

 service_context = ServiceContext.from_defaults(embed_model=embed_model)
Generating embeddings: 100%|██████████| 5/5 [00:01<00:00, 4.27it/s]
100%|██████████| 10/10 [00:01<00:00, 8.29it/s]


In [21]:
df_bge = pd.DataFrame(bge_val_results)
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

1.0

## FineTuned

In [25]:
from llama_index.embeddings.adapter import LinearAdapterEmbeddingModel

# Load the Fine-tuned model.
embed_model = LinearAdapterEmbeddingModel(base_embed_model, "model_output_test")

val_results_finetuned = evaluate(VALIDATION_DATASET, embed_model)

 service_context = ServiceContext.from_defaults(embed_model=embed_model)
Generating embeddings: 100%|██████████| 5/5 [00:01<00:00, 4.69it/s]
100%|██████████| 10/10 [00:00<00:00, 14.90it/s]


In [26]:
df_finetuned = pd.DataFrame(val_results_finetuned)
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

1.0