# Install Packages and Setup Variables


In [None]:
!pip install -q llama-index==0.12.42 llama-index-finetuning==0.3.2 llama-index-embeddings-adapter==0.3.0 openai==1.84.0 tiktoken==0.9.0 chromadb==1.0.12 llama-index-vector-stores-chroma==0.4.2 cohere==5.15.0 llama-index-llms-gemini==0.5.0 html2text==2025.4.15 llama-index-llms-openai==0.4.7 llama-index-embeddings-huggingface==0.5.4 llama-index-embeddings-openai==0.3.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

from google.colab import userdata

# Alternative: set the OpenAI key directly instead of reading it from the Colab secrets.
#os.environ["OPENAI_API_KEY"] = "<YOUR_API_KEY>"

# Export the OpenAI key (Colab secret 'openai_api_key') to the environment. Will be used later.
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

# The Hugging Face token is read from the Colab secret 'HF_TOKEN2'; it is both
# exported to the environment and kept in HF_TOKEN.
HF_TOKEN = userdata.get('HF_TOKEN2')
os.environ["HF_TOKEN"] = HF_TOKEN

# Download the Dataset


In [None]:
from huggingface_hub import snapshot_download

# Download only the pickle files (*.pkl) of the dataset repo into /content.
snapshot_download(
    repo_id="vicpada/AzureResources",
    repo_type="dataset",
    allow_patterns=["*.pkl"],
    local_dir="/content",
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'/content'

In [None]:
# prompt: join all pkl into one file using pickle.load

import pickle
import os

DATA_DIR = '/content'                    # directory the .pkl files were downloaded to
COMBINED_FILENAME = 'combined_data.pkl'  # output file, written to the working directory

# Collect the input pickles in a stable (sorted) order. The combined output file
# is skipped by name: when the working directory is DATA_DIR (as is typically the
# case on Colab), a previous run's combined_data.pkl would otherwise be read back
# in, duplicating every item and inflating the reported file count.
pkl_filenames = sorted(
    name for name in os.listdir(DATA_DIR)
    if name.endswith('.pkl') and name != COMBINED_FILENAME
)

all_data = []
for filename in pkl_filenames:
    with open(os.path.join(DATA_DIR, filename), 'rb') as f:
        # NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
        data = pickle.load(f)
    all_data.extend(data)  # Assuming each pickle file contains a list

# Optional: Save the combined data to a new pickle file
with open(COMBINED_FILENAME, 'wb') as f:
    pickle.dump(all_data, f)

# Report the number of input files actually loaded (the combined output is not counted).
print(f"Combined data from {len(pkl_filenames)} files into a single list with {len(all_data)} items.")


Combined data from 5 files into a single list with 391414 items.


### Splitting Dataset


In [None]:
import random

SPLIT_SEED = 42       # fixed seed: the same all_data always yields the same split
TRAIN_FRACTION = 0.9  # share of the items that goes to the training split

# Shuffle a copy with a dedicated, seeded generator. all_data itself is left
# untouched, so re-running this cell reproduces the same split instead of
# reshuffling an already-shuffled list.
shuffled_nodes = list(all_data)
random.Random(SPLIT_SEED).shuffle(shuffled_nodes)
split_index = int(len(shuffled_nodes) * TRAIN_FRACTION)

# First 90% -> training nodes, remaining 10% -> validation nodes.
TRAIN_NODEs = shuffled_nodes[:split_index]
VALIDATION_NODEs = shuffled_nodes[split_index:]

# Chunking


In [None]:
# Trim both splits to a subset before generating questions.
# Set `testing = True` to try the pipeline on a few samples; processing the
# dataset fully can be costly depending on its size.
# NOTE: Checkpoints are provided in the lesson, so no need to run the code on full dataset.

testing = False

# (train, validation) caps: a handful when testing, otherwise 9000 + 1000 = 10000 items, which is enough.
train_limit, validation_limit = (10, 5) if testing else (9000, 1000)

TRAIN_NODEs = TRAIN_NODEs[:train_limit]
VALIDATION_NODEs = VALIDATION_NODEs[:validation_limit]

# Generate Question


We use a Large Language Model (LLM) to generate a question for each chunk of the dataset. We can then use the resulting question–chunk pairs to fine-tune the embedding model so that its embeddings more accurately represent the types of questions users may ask.


In [None]:
# Generate one synthetic question per chunk with an LLM.
# NOTE: this cell calls the OpenAI API for every chunk (API charges apply). To avoid
# the charges, load pre-generated question files instead of running this cell.

from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4.1-nano", temperature=1, max_tokens=512)

# Questions for the training chunks; the result is written to output_path.
TRAIN_DATASET = generate_qa_embedding_pairs(
    TRAIN_NODEs,
    llm=llm,
    num_questions_per_chunk=1,
    output_path="./train_dataset_full.json",
)

# Questions for the validation chunks, generated with the same LLM settings.
VALIDATION_DATASET = generate_qa_embedding_pairs(
    VALIDATION_NODEs,
    llm=llm,
    num_questions_per_chunk=1,
    output_path="./val_dataset_full.json",
)

  6%|▌         | 500/8891 [06:12<1:16:36,  1.83it/s]

Saved progress at 500 entries.


 11%|█         | 1000/8891 [11:58<1:39:08,  1.33it/s]

Saved progress at 1000 entries.


 17%|█▋        | 1500/8891 [17:38<1:14:58,  1.64it/s]

Saved progress at 1500 entries.


 22%|██▏       | 2000/8891 [24:38<1:38:43,  1.16it/s]

Saved progress at 2000 entries.


 28%|██▊       | 2500/8891 [30:29<1:13:50,  1.44it/s]

Saved progress at 2500 entries.


 34%|███▎      | 3000/8891 [36:39<1:06:05,  1.49it/s]

Saved progress at 3000 entries.


 39%|███▉      | 3500/8891 [43:31<1:01:10,  1.47it/s]

Saved progress at 3500 entries.


 45%|████▍     | 4000/8891 [50:38<1:05:59,  1.24it/s]

Saved progress at 4000 entries.


 51%|█████     | 4500/8891 [56:45<48:04,  1.52it/s]

Saved progress at 4500 entries.


 56%|█████▌    | 5000/8891 [1:04:01<1:04:42,  1.00it/s]

Saved progress at 5000 entries.


 62%|██████▏   | 5500/8891 [1:09:18<40:14,  1.40it/s]

Saved progress at 5500 entries.


 67%|██████▋   | 6000/8891 [1:14:55<30:28,  1.58it/s]

Saved progress at 6000 entries.


 73%|███████▎  | 6500/8891 [1:20:44<26:17,  1.52it/s]

Saved progress at 6500 entries.


 79%|███████▊  | 7000/8891 [1:26:43<24:24,  1.29it/s]

Saved progress at 7000 entries.


 84%|████████▍ | 7500/8891 [1:33:10<20:52,  1.11it/s]

Saved progress at 7500 entries.


 90%|████████▉ | 8000/8891 [1:39:00<09:45,  1.52it/s]

Saved progress at 8000 entries.


 96%|█████████▌| 8500/8891 [1:45:14<17:20,  2.66s/it]

Saved progress at 8500 entries.


8901it [1:50:10,  1.34it/s]


Final dataset saved.


 50%|█████     | 500/995 [06:29<05:35,  1.48it/s]

Saved progress at 500 entries.


1000it [11:49,  1.40it/s]

Saved progress at 1000 entries.
Final dataset saved.





In [None]:
# Upload the generated question datasets (JSON files) to the Hugging Face dataset repo

from huggingface_hub import HfApi

api = HfApi(token=HF_TOKEN)

files = ['train_dataset_full.json','val_dataset_full.json']

# Each file is stored in the repo under the same name it has locally.
for dataset_file in files:
    api.upload_file(
        repo_id="vicpada/AzureResources",
        repo_type="dataset",
        path_or_fileobj=dataset_file,
        path_in_repo=dataset_file,
    )


No files have been modified since last commit. Skipping to prevent empty commit.


No files have been modified since last commit. Skipping to prevent empty commit.


No files have been modified since last commit. Skipping to prevent empty commit.


No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
#from huggingface_hub import snapshot_download
#snapshot_download(repo_id="jaiganesan/Embedding-model-fine-tuning-dataset", repo_type="dataset",local_dir="/content/")


#from llama_index.finetuning import EmbeddingQAFinetuneDataset

# Load the pre-generated question JSON files (alternative to generating the
# questions with the LLM; avoids API call charges).
#TRAIN_DATASET = EmbeddingQAFinetuneDataset.from_json("./train_dataset.json")
#VALIDATION_DATASET = EmbeddingQAFinetuneDataset.from_json("./val_dataset.json")

# Load an Embedding Model


In [None]:
from llama_index.core.embeddings import resolve_embed_model

# Load the existing base embedding model (BAAI/bge-small-en-v1.5). The adapter
# layer is not part of this object; it is trained on top of it by the
# fine-tuning engine.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
import torch

# Options for the adapter fine-tuning run; the output is written to
# "model_output_test".
adapter_engine_options = {
    "model_output_path": "model_output_test",
    "epochs": 2,
    "verbose": True,
    "bias": True,
}

# Engine that fine-tunes an adapter for the base embedding model on the
# generated training questions.
finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    base_embed_model,
    **adapter_engine_options,
)

In [None]:
# Initiate the fine-tuning process: runs training with the dataset, base model
# and settings that were passed to the engine when it was constructed.
finetune_engine.finetune()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 1.1686890125274658
[0m[1;3;34m> [Epoch 0] Current loss: 1.2109415531158447
[0m[1;3;34m> [Epoch 0] Current loss: 1.0172332525253296
[0m[1;3;34m> [Epoch 0] Current loss: 0.46401476860046387
[0m[1;3;34m> [Epoch 0] Current loss: 0.44793954491615295
[0m[1;3;34m> [Epoch 0] Current loss: 1.3964780569076538
[0m[1;3;34m> [Epoch 0] Current loss: 0.9287071228027344
[0m[1;3;34m> [Epoch 0] Current loss: 0.4293813705444336
[0m[1;3;34m> [Epoch 0] Current loss: 1.4944758415222168
[0m[1;3;34m> [Epoch 0] Current loss: 0.400224506855011
[0m[1;3;34m> [Epoch 0] Current loss: 0.6548662781715393
[0m[1;3;34m> [Epoch 0] Current loss: 1.4040919542312622
[0m[1;3;34m> [Epoch 0] Current loss: 1.1982499361038208
[0m[1;3;34m> [Epoch 0] Current loss: 0.764290452003479
[0m[1;3;34m> [Epoch 0] Current loss: 0.6448008418083191
[0m[1;3;34m> [Epoch 0] Current loss: 1.229644536972046
[0m[1;3;34m> [Epoch 0] Current loss: 0.5034639835357666
[0m[1;3;34m> [Epo

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 1.0316441059112549
[0m[1;3;34m> [Epoch 1] Current loss: 0.9674192667007446
[0m[1;3;34m> [Epoch 1] Current loss: 0.9151577949523926
[0m[1;3;34m> [Epoch 1] Current loss: 0.27198293805122375
[0m[1;3;34m> [Epoch 1] Current loss: 0.29684069752693176
[0m[1;3;34m> [Epoch 1] Current loss: 1.1955819129943848
[0m[1;3;34m> [Epoch 1] Current loss: 0.6276596188545227
[0m[1;3;34m> [Epoch 1] Current loss: 0.20209062099456787
[0m[1;3;34m> [Epoch 1] Current loss: 1.2147716283798218
[0m[1;3;34m> [Epoch 1] Current loss: 0.2806827127933502
[0m[1;3;34m> [Epoch 1] Current loss: 0.4517034888267517
[0m[1;3;34m> [Epoch 1] Current loss: 1.3200963735580444
[0m[1;3;34m> [Epoch 1] Current loss: 0.9926549792289734
[0m[1;3;34m> [Epoch 1] Current loss: 0.539482057094574
[0m[1;3;34m> [Epoch 1] Current loss: 0.5028634071350098
[0m[1;3;34m> [Epoch 1] Current loss: 1.2258899211883545
[0m[1;3;34m> [Epoch 1] Current loss: 0.36430853605270386
[0m[1;3;34m> 

In [None]:
# Fetch the fine-tuned embedding model from the engine.
embed_model = finetune_engine.get_finetuned_model()

# Or, load the model from the output directory whenever required.
# from llama_index.embeddings.adapter import AdapterEmbeddingModel
# embed_model = AdapterEmbeddingModel(base_embed_model, "model_output_test")

In [None]:
embed_model

AdapterEmbeddingModel(model_name='Adapter for BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x782e56075c90>, num_workers=None, embeddings_cache=None)

In [None]:
# prompt: upload embed_model to hugging face

repo_id = "vicpada/finetuned_embed_model_full"

# Make sure the target model repo exists (exist_ok=True avoids an error when it
# has already been created).
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

# Push the contents of the fine-tuning output folder to that repo.
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path="model_output_test",
)

pytorch_model.bin:   0%|          | 0.00/593k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vicpada/finetuned_embed_model_full/commit/735fcb7f7aeea0841454d92e38d7f307f25878e4', commit_message='Upload folder using huggingface_hub', commit_description='', oid='735fcb7f7aeea0841454d92e38d7f307f25878e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vicpada/finetuned_embed_model_full', endpoint='https://huggingface.co', repo_type='model', repo_id='vicpada/finetuned_embed_model_full'), pr_revision=None, pr_num=None)

## Fine tuning OpenAI Embedding Model using Adapter method

In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings.openai import OpenAIEmbedding

# Options for the adapter fine-tuning run; the output is written to
# "model_output_test_openai".
openai_adapter_engine_options = {
    "model_output_path": "model_output_test_openai",
    "bias": True,
    "epochs": 2,
    "verbose": True,
}

# Same adapter fine-tuning setup as for the open-source model, but with
# OpenAI's text-embedding-3-small as the base embedding model.
openai_finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    OpenAIEmbedding(model="text-embedding-3-small"),
    **openai_adapter_engine_options,
)

In [None]:
# Run the fine-tuning for the OpenAI-based engine, then fetch the resulting
# fine-tuned model from it.
openai_finetune_engine.finetune()

openai_embed_model = openai_finetune_engine.get_finetuned_model()

In [None]:
openai_embed_model

# Evaluate


## Define the Evaluation Functions


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.core import Settings
from tqdm import tqdm
import pandas as pd

def evaluate(dataset, embedding_model, top_k=5, verbose=False):
    """Check, per query, whether the expected chunk is among the retrieved nodes.

    Every corpus entry is turned into a ``TextNode`` and indexed in a
    ``VectorStoreIndex``. Each query is then sent through a retriever limited
    to ``top_k`` results, and the position of the query's expected document
    among the retrieved node ids is recorded.

    Side effect: ``Settings.embed_model`` is set to ``embedding_model`` and is
    not restored afterwards.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id ->
            sequence of document ids; only the first id is used).
        embedding_model: Model assigned to ``Settings.embed_model`` before the
            index is built.
        top_k: Number of nodes requested from the retriever per query.
        verbose: Currently unused.

    Returns:
        list[dict]: One record per query with the keys ``is_hit``,
        ``retrieved``, ``expected``, ``query`` (the query id), ``rank``
        (1-based position, ``None`` on a miss) and ``reciprocal_rank``
        (``1 / rank``, ``0`` on a miss).
    """
    corpus_texts = dataset.corpus
    query_texts = dataset.queries
    expected_by_query = dataset.relevant_docs

    # Presumably what makes the index and retriever below embed with this model.
    Settings.embed_model = embedding_model

    # Index the whole corpus, one node per entry, keeping the corpus ids as node ids.
    corpus_nodes = [
        TextNode(id_=doc_id, text=doc_text)
        for doc_id, doc_text in corpus_texts.items()
    ]
    retriever = VectorStoreIndex(corpus_nodes, show_progress=True).as_retriever(
        similarity_top_k=top_k
    )

    eval_results = []
    for query_id, query_text in tqdm(query_texts.items()):
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query_text)]
        expected_id = expected_by_query[query_id][0]

        # A hit means the expected chunk appears anywhere in the retrieved list;
        # its 1-based position gives the rank and the reciprocal rank.
        is_hit = expected_id in retrieved_ids
        if is_hit:
            rank = retrieved_ids.index(expected_id) + 1
            reciprocal_rank = 1 / rank
        else:
            rank = None
            reciprocal_rank = 0

        eval_results.append(
            {
                "is_hit": is_hit,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
                "rank": rank,
                "reciprocal_rank": reciprocal_rank,
            }
        )

    return eval_results

## OpenAI Embedding Model Evaluation


In [None]:
# Load OpenAI's text-embedding-3-small model (no adapter) and evaluate it on the validation dataset.
openai_text_embedding_small = OpenAIEmbedding(model="text-embedding-3-small")
openai_embedding_val_results = evaluate(VALIDATION_DATASET, embedding_model=openai_text_embedding_small)

In [None]:
# Keep only the per-query result records (dicts).
openai_embedding_val_results = list(
    filter(lambda record: isinstance(record, dict), openai_embedding_val_results)
)

# One row per query; the column means are the aggregate metrics.
df_openai = pd.DataFrame(openai_embedding_val_results)
hit_rate_openai = df_openai["is_hit"].mean()
mrr_openai = df_openai["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_openai}\nMRR: {mrr_openai}")

### OpenAI Embedding Model with Fine Tuned Adapter Model Evaluation

In [None]:
from llama_index.embeddings.adapter import AdapterEmbeddingModel

# Combine the base OpenAI embedding model with the adapter stored in
# "model_output_test_openai" (the output path given to the OpenAI fine-tuning
# engine). This rebinds openai_embed_model.
openai_embed_model = AdapterEmbeddingModel(openai_text_embedding_small, "model_output_test_openai")

# Evaluate the adapter-wrapped model on the validation dataset.
val_results_ft_openai = evaluate(VALIDATION_DATASET, embedding_model = openai_embed_model)

In [None]:
# Keep only the per-query result records (dicts).
val_results_ft_openai = list(
    filter(lambda record: isinstance(record, dict), val_results_ft_openai)
)

# One row per query; the column means are the aggregate metrics.
df_openai_ft = pd.DataFrame(val_results_ft_openai)
hit_rate_openai_ft = df_openai_ft["is_hit"].mean()
mrr_openai_ft = df_openai_ft["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_openai_ft}\nMRR: {mrr_openai_ft}")

## Open Source BAAI Model Evaluation


In [None]:
# Load the base BAAI/bge-small-en-v1.5 model without fine-tuning and evaluate
# it on the validation dataset (this rebinds base_embed_model).
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
bge_val_results = evaluate(VALIDATION_DATASET, embedding_model=base_embed_model)

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1005/1005 [00:56<00:00, 17.76it/s]


In [None]:
# Keep only the per-query result records (dicts).
bge_val_results = list(
    filter(lambda record: isinstance(record, dict), bge_val_results)
)

# One row per query; the column means are the aggregate metrics.
df_bge = pd.DataFrame(bge_val_results)
hit_rate_bge = df_bge["is_hit"].mean()
mrr_bge = df_bge["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_bge}\nMRR: {mrr_bge}")

Hit rate: 0.6567164179104478
MRR: 0.5616915422885572


## FineTuned BAAI Adapter Embedding Model Evaluation


In [None]:
from llama_index.embeddings.adapter import AdapterEmbeddingModel

# Load the fine-tuned model: the base BGE model combined with the adapter
# stored in "model_output_test" (the output path given to the fine-tuning engine).
embed_model = AdapterEmbeddingModel(base_embed_model, "model_output_test")

# Evaluate the adapter-wrapped model on the validation dataset.
val_results_finetuned = evaluate(VALIDATION_DATASET, embedding_model=embed_model)

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1005/1005 [00:57<00:00, 17.57it/s]


In [None]:
# Keep only the per-query result records (dicts).
val_results_finetuned = list(
    filter(lambda record: isinstance(record, dict), val_results_finetuned)
)

# One row per query; the column means are the aggregate metrics.
df_finetuned = pd.DataFrame(val_results_finetuned)
hit_rate_finetuned = df_finetuned["is_hit"].mean()
mrr_finetuned = df_finetuned["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_finetuned}\nMRR: {mrr_finetuned}")

Hit rate: 0.7074626865671642
MRR: 0.6007628524046434
