{ "cells": [ { "cell_type": "code", "execution_count": 12, "metadata": { "id": "5f93b7d1" }, "outputs": [], "source": [ "from transformers import AutoModelForSeq2SeqLM\n", "from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, IA3Config, TaskType\n", "import torch\n", "from datasets import load_dataset\n", "import os\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "from transformers import AutoTokenizer\n", "from torch.utils.data import DataLoader\n", "from transformers import default_data_collator, get_linear_schedule_with_warmup\n", "from tqdm import tqdm\n", "\n", "device = \"cuda\"\n", "model_name_or_path = \"bigscience/mt0-large\"\n", "tokenizer_name_or_path = \"bigscience/mt0-large\"\n", "\n", "checkpoint_name = \"financial_sentiment_analysis_ia3_v1.pt\"\n", "text_column = \"sentence\"\n", "label_column = \"text_label\"\n", "max_length = 128\n", "lr = 8e-3\n", "num_epochs = 3\n", "batch_size = 8" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "8d0850ac" }, "outputs": [], "source": [ "# creating model: build the (IA)^3 PEFT config and load the base seq2seq model\n", "peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, feedforward_modules=[])\n", "\n", "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e10c3831", "outputId": "e69c5e07-ae58-446c-8301-e99ac6b85d62" }, "outputs": [ { "data": { "text/plain": [ "MT5ForConditionalGeneration(\n", " (shared): 
Embedding(250112, 1024)\n", " (encoder): MT5Stack(\n", " (embed_tokens): Embedding(250112, 1024)\n", " (block): ModuleList(\n", " (0): MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " (relative_attention_bias): Embedding(32, 16)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1-23): 23 x MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " 
(layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (decoder): MT5Stack(\n", " (embed_tokens): Embedding(250112, 1024)\n", " (block): ModuleList(\n", " (0): MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " (relative_attention_bias): Embedding(32, 16)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerCrossAttention(\n", " (EncDecAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1-23): 23 x MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, 
out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerCrossAttention(\n", " (EncDecAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (lm_head): Linear(in_features=1024, out_features=250112, bias=False)\n", ")" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "05978e96", "outputId": "ea9b7d40-010f-4df0-ec64-a7146a5f8b08" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.022980103060766553\n" ] }, { "data": { "text/plain": [ "PeftModelForSeq2SeqLM(\n", " (base_model): IA3Model(\n", " (model): MT5ForConditionalGeneration(\n", " (shared): Embedding(250112, 1024)\n", " 
(encoder): MT5Stack(\n", " (embed_tokens): Embedding(250112, 1024)\n", " (block): ModuleList(\n", " (0): MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " (relative_attention_bias): Embedding(32, 16)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(\n", " in_features=1024, out_features=2816, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", " )\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1-23): 23 x MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", 
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(\n", " in_features=1024, out_features=2816, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", " )\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (decoder): MT5Stack(\n", " (embed_tokens): Embedding(250112, 1024)\n", " (block): ModuleList(\n", " (0): MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " (relative_attention_bias): Embedding(32, 16)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerCrossAttention(\n", " (EncDecAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: 
[torch.FloatTensor of size 1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(\n", " in_features=1024, out_features=2816, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", " )\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1-23): 23 x MT5Block(\n", " (layer): ModuleList(\n", " (0): MT5LayerSelfAttention(\n", " (SelfAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): MT5LayerCrossAttention(\n", " (EncDecAttention): MT5Attention(\n", " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", " (k): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 
1024x1])\n", " )\n", " (v): Linear(\n", " in_features=1024, out_features=1024, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", " )\n", " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): MT5LayerFF(\n", " (DenseReluDense): MT5DenseGatedActDense(\n", " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", " (wi_1): Linear(\n", " in_features=1024, out_features=2816, bias=False\n", " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", " )\n", " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): NewGELUActivation()\n", " )\n", " (layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): MT5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (lm_head): Linear(in_features=1024, out_features=250112, bias=False)\n", " )\n", " )\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = get_peft_model(model, peft_config)\n", "model.print_trainable_parameters()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 140, "referenced_widgets": [ "bbfb7533b5ca459194e171df56b79566", "c894e8237aa34c56bb250acab1466005", "a5a126b229064812bf3dcb228118be50", "661e1b29c59a4295b594edfa4f50ff87", "1bcba805972b484d8b6aa6542c81841c", "e71f5c7f1d5d4f83b58c68d2fa310d9c", "6a567e0a1a5447519c5df10e777520cf", "7aeca19b84904906a04c12659f84ff9e", "dd4b895874ce46ceb1ad0d9bc973f98f", "b138f91be7f94008806eaf0a6988bc3f", "da14180f51ab44b48470cb9ea74d3864", "9e12d97af6124a5a8c6627708b300c1e", "faa18df899c14e9cac6721253e6c9128", "79d0ede7a5b24756aa6d34fda8c29159", 
"3b175b452f4347558aa3c4501cc90030", "fc4637a1b37e4e90874c71aa4271ac74", "1b8aada826a0451bb60c418b19178c8c", "a91916e02e9c424e881e45b3aa978574", "ca509bd409624c998e555c9a779b8aae", "9c890fc422954347b86d3bde7a421caf", "6f9453484ea94587a64d70f1b3a1f6e4", "48770ef159f44c01be2a75c75aecd80f", "0c561dab67914ea9b6e1aab803600551", "1e021a1954b44d69a90101a96c360661", "013e3343285f437a893bdd673fb90e22", "28802da68fb04d70b1c6bc511a04676f", "94174da0d6554be087d4527bea5b511a", "dc8ab16a1e6c4e6893c95ccd16568f9a", "72383136663448d89cf3b82b87cbb392", "5b1bdaf16cbc473081e4237f839167b9", "51f8fb45485540bb985b606d43ae04ea", "f760cd4758334ca9a43fd15612fd808b", "f60e9915d2a74ca7bc010d7684f5acf6" ] }, "id": "4ee2babf", "outputId": "3c413083-247d-47da-f25c-032764be0beb" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:datasets.builder:Found cached dataset financial_phrasebank (/root/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bbfb7533b5ca459194e171df56b79566", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00