{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
"# Imports: standard library first, then third-party\n",
"import os\n",
"import random as rn\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Reproducibility settings: hash-seed env var, then the NumPy and `random` generators\n",
"os.environ['PYTHONHASHSEED'] = '0'\n",
"# NOTE(review): an empty CUDA_VISIBLE_DEVICES hides GPUs from libraries imported after this cell,\n",
"# yet saved outputs further down report device='cuda:0' - confirm which one is intended\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
"np.random.seed(37)\n",
"rn.seed(1254)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data, train, test, validation splits" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SentenceLabel
S.No.
1Introduction to Quantum Mechanics1.0
2In this chapter, we explore the foundational p...0.0
3The Rise and Fall of Civilizations1.0
4Historical records reveal the complex trajecto...0.0
5Part III: Advanced Mathematical Concepts1.0
\n", "
" ], "text/plain": [ " Sentence Label\n", "S.No. \n", "1 Introduction to Quantum Mechanics 1.0\n", "2 In this chapter, we explore the foundational p... 0.0\n", "3 The Rise and Fall of Civilizations 1.0\n", "4 Historical records reveal the complex trajecto... 0.0\n", "5 Part III: Advanced Mathematical Concepts 1.0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Label
count198.000000
mean0.555051
std0.313770
min0.000000
25%0.300000
50%0.650000
75%0.800000
max1.000000
\n", "
" ], "text/plain": [ " Label\n", "count 198.000000\n", "mean 0.555051\n", "std 0.313770\n", "min 0.000000\n", "25% 0.300000\n", "50% 0.650000\n", "75% 0.800000\n", "max 1.000000" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(198, 2)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# EDA\n",
"path_to_data = \"./data/Sentences_200.csv\"\n",
"new_data_5_cat = pd.read_csv(path_to_data, index_col='S.No.')\n",
"print(type(new_data_5_cat))\n",
"display(new_data_5_cat.head())\n",
"display(new_data_5_cat.describe())\n",
"display(new_data_5_cat.shape)"
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 160\n", "})" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 20\n", "})" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 18\n", "})" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Make test, train, cv splits\n",
"from datasets import Dataset\n",
"\n",
"# Explicit seed: re-running this cell always yields the same partition,\n",
"# independent of whatever state the global NumPy generator is in.\n",
"SPLIT_SEED = 37\n",
"\n",
"ds = Dataset.from_pandas(new_data_5_cat)\n",
"\n",
"# First cut: 160 rows for training, everything else is held out\n",
"ds_train_temp_dict = ds.train_test_split(train_size=160, seed=SPLIT_SEED)\n",
"ds_train = ds_train_temp_dict['train']\n",
"# Second cut on the held-out rows: 20 for test, the remainder for cv\n",
"ds_test_cv_dict = ds_train_temp_dict['test'].train_test_split(test_size=20, seed=SPLIT_SEED)\n",
"ds_cv = ds_test_cv_dict['train']\n",
"ds_test = ds_test_cv_dict['test']\n",
"display(ds_train)\n",
"display(ds_test)\n",
"display(ds_cv)"
]
}, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fine tune LLM" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\transformers\\convert_slow_tokenizer.py:561: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "['▁My', '▁name', '▁is', '▁Geeta', 'n', 'sh', '▁Bhardwaj', '.']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Get Tokenizer\n",
"from transformers import AutoTokenizer\n",
"model_nm = 'microsoft/deberta-v3-small'\n",
"tokz = AutoTokenizer.from_pretrained(model_nm)\n",
"tokz.tokenize('My name is Geetansh Bhardwaj.')"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 160/160 [00:00<00:00, 4079.69 examples/s]\n" ] } ], "source": [
"# Tokenize the 'Sentence' column\n",
"def tokenize_string(row):\n",
"    '''\n",
"    Run the tokenizer over the 'Sentence' field.\n",
"    row: a single example or, when mapped with batched=True, a batch whose\n",
"         'Sentence' entry is a list of strings\n",
"    '''\n",
"    return tokz(row['Sentence'])\n",
"\n",
"def tokenize_sentence_col(ds):\n",
"    '''\n",
"    Tokenize the 'Sentence' column so the dataset can be used for fine-tuning.\n",
"    The tokenizer's outputs (e.g. input_ids, attention_mask) are added as new columns.\n",
"    ds: a dataset with 'Sentence' column\n",
"    '''\n",
"\n",
"    # map() only honours batch_size together with batched=True; without it every\n",
"    # row is tokenized on its own, so process whole batches instead.\n",
"    tokenized_ds = ds.map(tokenize_string, batched=True)\n",
"    return tokenized_ds\n",
"\n",
"tokenized_ds_train = tokenize_sentence_col(ds_train)"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 18/18 [00:00<00:00, 2243.01 examples/s]\n" ] } ], "source": [
"# The model's forward() takes the target as `labels`, and Trainer passes dataset columns by name.\n",
"# Ours is named \"Label\", so rename it on both the train and the cv split.\n",
"tokenized_ds_train = tokenized_ds_train.rename_columns({'Label' : 'labels'})\n",
"\n",
"tokenized_ds_cv = tokenize_sentence_col(ds_cv)\n",
"tokenized_ds_cv = tokenized_ds_cv.rename_columns({'Label' : 'labels'})"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [
"# Get the model (We are actually using a pre-trained one)\n",
"from transformers import AutoModelForSequenceClassification, set_seed\n",
"\n",
"# The classifier/pooler weights are newly initialised (see the message printed by this cell),\n",
"# so seed the Python, NumPy and torch generators first to start from the same weights on every run.\n",
"set_seed(37)\n",
"# num_labels=1 -> a single regression output per sentence\n",
"my_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)"
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated.
Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\transformers\\training_args.py:1559: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n", "C:\\Users\\Geetansh\\AppData\\Local\\Temp\\ipykernel_4252\\1403743469.py:8: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n", " trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,\n" ] } ], "source": [
"import torch\n",
"from transformers import TrainingArguments, Trainer\n",
"\n",
"bs = 5\n",
"epochs = 4\n",
"lr = 8e-5\n",
"\n",
"# Request fp16 mixed precision only when a CUDA device is visible, so this cell also runs\n",
"# on CPU (the imports cell sets CUDA_VISIBLE_DEVICES to an empty string).\n",
"use_fp16 = torch.cuda.is_available()\n",
"\n",
"# `eval_strategy` and `processing_class` are the names the FutureWarnings emitted by the\n",
"# installed transformers version recommend instead of `evaluation_strategy` and `tokenizer`.\n",
"args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=use_fp16,\n",
"    eval_strategy=\"epoch\", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,\n",
"    num_train_epochs=epochs, weight_decay=0.01, report_to='none')\n",
"trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,\n",
"    processing_class=tokz)"
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \n", " 25%|██▌ | 32/128 [00:10<00:26, 3.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'eval_loss': 0.09050914645195007, 'eval_runtime': 0.3554, 'eval_samples_per_second': 50.653, 'eval_steps_per_second': 5.628, 'epoch': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \n", " 50%|█████ | 64/128 [00:19<00:17, 3.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'eval_loss': 0.04030601680278778, 'eval_runtime': 0.3239, 'eval_samples_per_second': 55.567, 'eval_steps_per_second': 6.174, 'epoch': 2.0}\n" ] }, { "name": "stderr", "output_type":
"stream", "text": [ " \n", " 76%|███████▌ | 97/128 [00:28<00:10, 2.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'eval_loss': 0.022483834996819496, 'eval_runtime': 0.3246, 'eval_samples_per_second': 55.448, 'eval_steps_per_second': 6.161, 'epoch': 3.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \n", "100%|██████████| 128/128 [00:41<00:00, 3.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'eval_loss': 0.0200485959649086, 'eval_runtime': 0.3606, 'eval_samples_per_second': 49.921, 'eval_steps_per_second': 5.547, 'epoch': 4.0}\n", "{'train_runtime': 41.7528, 'train_samples_per_second': 15.328, 'train_steps_per_second': 3.066, 'train_loss': 0.11997667700052261, 'epoch': 4.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=128, training_loss=0.11997667700052261, metrics={'train_runtime': 41.7528, 'train_samples_per_second': 15.328, 'train_steps_per_second': 3.066, 'total_flos': 1818871829700.0, 'train_loss': 0.11997667700052261, 'epoch': 4.0})" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Train (Here, fine tune) the model\n",
"trainer.train()"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 20/20 [00:00<00:00, 162.84 examples/s]\n", "100%|██████████| 2/2 [00:00<00:00, 13.26it/s]\n" ] }, { "data": { "text/plain": [ "array([0.86230469, 0.28979492, 0.91162109, 0.86816406, 0.87988281,\n", " 0.21826172, 0.91064453, 0.89013672, 0.41748047, 0.8984375 ,\n", " 0.89355469, 0.14257812, 0.89160156, 0.35131836, 0.34375 ,\n", " 0.23815918, 0.87841797, 0.20471191, 0.10784912, 0.02485657])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Predict on the held-out test set (the error is computed from `preds` in the next cell)\n",
"tokenized_ds_test = tokenize_sentence_col(ds_test)\n",
"tokenized_ds_test = tokenized_ds_test.rename_columns({'Label' : 'labels'})\n",
"\n",
"test_output = trainer.predict(tokenized_ds_test)\n",
"preds = test_output.predictions.astype(float)\n",
"preds"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAE: 0.09301467895507813\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
00.850.862305
10.400.289795
20.800.911621
30.850.868164
40.700.879883
50.300.218262
60.750.910645
70.850.890137
80.700.417480
90.900.898438
100.700.893555
110.200.142578
120.900.891602
130.200.351318
140.400.343750
150.200.238159
160.750.878418
170.300.204712
180.000.107849
190.000.024857
\n", "
" ], "text/plain": [ " a b\n", "0 0.85 0.862305\n", "1 0.40 0.289795\n", "2 0.80 0.911621\n", "3 0.85 0.868164\n", "4 0.70 0.879883\n", "5 0.30 0.218262\n", "6 0.75 0.910645\n", "7 0.85 0.890137\n", "8 0.70 0.417480\n", "9 0.90 0.898438\n", "10 0.70 0.893555\n", "11 0.20 0.142578\n", "12 0.90 0.891602\n", "13 0.20 0.351318\n", "14 0.40 0.343750\n", "15 0.20 0.238159\n", "16 0.75 0.878418\n", "17 0.30 0.204712\n", "18 0.00 0.107849\n", "19 0.00 0.024857" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Using MAE to calculate loss\n",
"def get_mae(preds, real):\n",
"    '''\n",
"    Mean absolute error between predictions and targets.\n",
"    preds, real: array-likes holding the same number of values. Both are flattened first,\n",
"                 so an (n, 1) prediction column and an (n,) label vector are compared\n",
"                 element by element instead of broadcasting into an (n, n) grid.\n",
"    Raises ValueError if the two inputs hold a different number of values.\n",
"    '''\n",
"\n",
"    preds = np.asarray(preds, dtype=float).ravel()\n",
"    real = np.asarray(real, dtype=float).ravel()\n",
"    if preds.size != real.size:\n",
"        raise ValueError(f\"preds has {preds.size} values but real has {real.size}\")\n",
"    mae = np.mean(np.abs(preds - real))\n",
"    return mae\n",
"\n",
"real = np.array(tokenized_ds_test['labels'])\n",
"\n",
"print(f\"MAE: {get_mae(preds, real)}\")\n",
"\n",
"# Print targets (a) and predictions (b) on test side-by-side;\n",
"# ravel() works for any test-set size instead of assuming exactly 20 rows\n",
"m = pd.DataFrame({'a': real.ravel(), 'b': preds.ravel()})\n",
"m"
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# MAE of my model: 0.1 (Based on test set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Check if your GPU is available" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "torch.cuda.is_available()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Try Exporting the model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### How to pass input to the model for inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SequenceClassifierOutput(loss={'logits': tensor([[0.6899]], device='cuda:0')}, logits=tensor([[0.6899]], device='cuda:0'), hidden_states=None, attentions=None)\n" ] } ], "source": [ "import torch\n", "\n", "# Use
GPU if available, otherwise fall back to CPU\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"# Move the model to the same device\n",
"my_model.to(device)\n",
"\n",
"# Tokenize input and ensure tensors are returned\n",
"sentence = \"Hey, it's Geetansh\"\n",
"model_inputs = tokz(sentence, return_tensors='pt')\n",
"\n",
"# Move input tensors to the same device as the model\n",
"model_inputs = {key: val.to(device) for key, val in model_inputs.items()}\n",
"\n",
"# Set model to evaluation mode\n",
"my_model.eval()\n",
"\n",
"# Perform inference without tracking gradients\n",
"with torch.no_grad():\n",
"    # Pass tokenized input to the model\n",
"    predictions = my_model(**model_inputs)\n",
"\n",
"# Print predictions\n",
"print(predictions)\n"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Method 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SequenceClassifierOutput(loss=None, logits=tensor([[0.3520]], device='cuda:0'), hidden_states=None, attentions=None)\n" ] } ], "source": [
"# Save the model and tokeniser to disk, load them back and run a test sentence\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
"\n",
"def load_saved_model(save_dir, device):\n",
"    '''\n",
"    Load the tokeniser and the model stored in save_dir and move the model to device.\n",
"    Returns (tokeniser, model).\n",
"    '''\n",
"    tokeniser = AutoTokenizer.from_pretrained(save_dir)\n",
"    model = AutoModelForSequenceClassification.from_pretrained(save_dir)\n",
"    model.to(device)\n",
"    return tokeniser, model\n",
"\n",
"def predict_sentence(tokeniser, model, sentence, device):\n",
"    '''\n",
"    Tokenise one sentence, move its tensors to device and return the model output,\n",
"    computed without tracking gradients.\n",
"    '''\n",
"    model_inputs = tokeniser(sentence, return_tensors='pt')\n",
"    model_inputs = {key: val.to(device) for key, val in model_inputs.items()}\n",
"    with torch.no_grad():\n",
"        return model(**model_inputs)\n",
"\n",
"save_dir = \"./saved_model\"\n",
"# Export only when nothing has been saved yet: a fresh checkout can run this cell,\n",
"# and an export that already exists is not overwritten.\n",
"if not os.path.isdir(save_dir):\n",
"    tokz.save_pretrained(save_directory=save_dir)\n",
"    my_model.save_pretrained(save_directory=save_dir)\n",
"\n",
"# Use GPU if available, otherwise fall back to CPU\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"# Load the saved model and tokeniser from the disk\n",
"loaded_tokeniser, loaded_model = load_saved_model(save_dir, device)\n",
"\n",
"# Test with the dummy input\n",
"output = predict_sentence(loaded_tokeniser, loaded_model, \"This is a test sentence.\", device)\n",
"print(output)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Method 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SequenceClassifierOutput(loss=None, logits=tensor([[0.3520]], device='cuda:0'), hidden_states=None, attentions=None)\n" ] } ], "source": [
"# Same round trip, exporting through the Trainer this time\n",
"# (uses load_saved_model / predict_sentence defined in the Method 1 cell)\n",
"save_dir = \"./saved_model2\"\n",
"# Export only when nothing has been saved yet\n",
"if not os.path.isdir(save_dir):\n",
"    trainer.save_model(save_dir)\n",
"\n",
"# Use GPU if available, otherwise fall back to CPU\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"# Load the saved model and tokeniser from the disk\n",
"loaded_tokeniser, loaded_model = load_saved_model(save_dir, device)\n",
"\n",
"# Test with the same dummy input as before\n",
"output = predict_sentence(loaded_tokeniser, loaded_model, \"This is a test sentence.\", device)\n",
"print(output)"
] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }