{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Ноутбук с обучением модели"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "from transformers import DataCollatorWithPadding, Trainer, TrainingArguments, AutoModelForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('roberta-base')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-bd943fc5bb724360\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset csv/default to /Users/seal/.cache/huggingface/datasets/csv/default-bd943fc5bb724360/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "548860682d5749cd979a64b49275ea32",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "97bf74e33b0d4607ba6dbed03fc625ae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0 tables [00:00, ? tables/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0 tables [00:00, ? tables/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset csv downloaded and prepared to /Users/seal/.cache/huggingface/datasets/csv/default-bd943fc5bb724360/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7ffe361486c64883b25e6ab953c1cd16",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Parameter 'function'=<function <lambda> at 0x1641221f0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "484e702a879448a3b09e2c7a2d672407",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/11883 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2cb11d60bff946fa86c386e1d03ba9fb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2097 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "data_hf = load_dataset('csv', data_files={'train': 'data_train.csv',\n",
    "                                          'test': 'data_test.csv'})\n",
    "data_hf_tokenized = data_hf.map(lambda x: tokenizer(x['full_info'], max_length=512, truncation=True))\n",
    "data_hf_tokenized = data_hf_tokenized.rename_column('tags', 'label')\n",
    "data_hf_tokenized = data_hf_tokenized.remove_columns(['full_info'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "args_dict = {'output_dir': 'logs', 'per_device_train_batch_size': 8, 'do_train': True}\n",
    "args = TrainingArguments(**args_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=19)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "***** Running training *****\n",
      "  Num examples = 11883\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 8\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 4458\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='4458' max='4458' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [4458/4458 20:39:50, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>500</td>\n",
       "      <td>1.356300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1000</td>\n",
       "      <td>1.088700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1500</td>\n",
       "      <td>1.026400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2000</td>\n",
       "      <td>0.897900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2500</td>\n",
       "      <td>0.835500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3000</td>\n",
       "      <td>0.795100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3500</td>\n",
       "      <td>0.684000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4000</td>\n",
       "      <td>0.641000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to logs/checkpoint-500\n",
      "Configuration saved in logs/checkpoint-500/config.json\n",
      "Model weights saved in logs/checkpoint-500/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-1000\n",
      "Configuration saved in logs/checkpoint-1000/config.json\n",
      "Model weights saved in logs/checkpoint-1000/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-1500\n",
      "Configuration saved in logs/checkpoint-1500/config.json\n",
      "Model weights saved in logs/checkpoint-1500/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-2000\n",
      "Configuration saved in logs/checkpoint-2000/config.json\n",
      "Model weights saved in logs/checkpoint-2000/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-2500\n",
      "Configuration saved in logs/checkpoint-2500/config.json\n",
      "Model weights saved in logs/checkpoint-2500/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-3000\n",
      "Configuration saved in logs/checkpoint-3000/config.json\n",
      "Model weights saved in logs/checkpoint-3000/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-3500\n",
      "Configuration saved in logs/checkpoint-3500/config.json\n",
      "Model weights saved in logs/checkpoint-3500/pytorch_model.bin\n",
      "Saving model checkpoint to logs/checkpoint-4000\n",
      "Configuration saved in logs/checkpoint-4000/config.json\n",
      "Model weights saved in logs/checkpoint-4000/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=4458, training_loss=0.8849190480509057, metrics={'train_runtime': 74409.2364, 'train_samples_per_second': 0.479, 'train_steps_per_second': 0.06, 'total_flos': 4731926386857384.0, 'train_loss': 0.8849190480509057, 'epoch': 3.0})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer = Trainer(\n",
    "    model, \n",
    "    args,\n",
    "    train_dataset=data_hf_tokenized[\"train\"],\n",
    "    eval_dataset=data_hf_tokenized[\"test\"],\n",
    "    data_collator=DataCollatorWithPadding(tokenizer),\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Prediction *****\n",
      "  Num examples = 2097\n",
      "  Batch size = 8\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='263' max='263' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [263/263 21:44]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "predictions = trainer.predict(data_hf_tokenized[\"test\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to model_roberta_trained\n",
      "Configuration saved in model_roberta_trained/config.json\n",
      "Model weights saved in model_roberta_trained/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "trainer.save_model('model_roberta_trained')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Обучила ещё distilbert в коллабе, но там результат немного хуже.\n",
    "Пользовалась этим:\n",
    "https://github.com/ThilinaRajapakse/pytorch-transformers-classification\n",
    "https://github.com/huggingface/transformers/tree/main/src/transformers"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}