{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "215a1aae", "metadata": { "id": "215a1aae" }, "outputs": [], "source": [ "import torch\n", "from torch.utils.data import Dataset, DataLoader\n", "\n", "# import torch_xla\n", "# import torch_xla.core.xla_model as xm\n", "\n", "import pandas as pd\n", "\n", "from transformers import BertTokenizerFast, BertForSequenceClassification\n", "from transformers import Trainer, TrainingArguments" ] }, { "cell_type": "code", "source": [ "device = \"cuda:0\"\n", "\n", "model_name = \"bert-base-uncased\"\n", "tokenizer = BertTokenizerFast.from_pretrained(model_name)\n", "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)\n", "max_len = 200\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"results\",\n", " num_train_epochs=1,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=64,\n", " warmup_steps=500,\n", " learning_rate=5e-5,\n", " weight_decay=0.01,\n", " logging_dir=\"./logs\",\n", " logging_steps=10\n", " )\n", "\n", "# dataset class that inherits from torch.utils.data.Dataset\n", "\n", " \n", "class TokenizerDataset(Dataset):\n", " def __init__(self, strings):\n", " self.strings = strings\n", " \n", " def __getitem__(self, idx):\n", " return self.strings[idx]\n", " \n", " def __len__(self):\n", " return len(self.strings)\n", " " ], "metadata": { "id": "J5Tlgp4tNd0U", "outputId": "5d45330f-ec42-4766-8bf6-85ba08af7c3b", "colab": { "base_uri": "https://localhost:8080/" } }, "id": "J5Tlgp4tNd0U", "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ] }, { "cell_type": "code", "execution_count": null, "id": "9969c58c", "metadata": { "scrolled": false, "id": "9969c58c", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cc7363d4-0ad4-4b58-baae-72efe63c7aad" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " id comment_text \\\n", "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n", "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n", "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n", "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n", "4 0001d958c54c6e35 You, sir, are my hero. 
Any chance you remember... \n", "... ... ... \n", "159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n", "159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n", "159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n", "159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n", "159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n", "\n", " toxic severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", "... ... ... ... ... ... ... \n", "159566 0 0 0 0 0 0 \n", "159567 0 0 0 0 0 0 \n", "159568 0 0 0 0 0 0 \n", "159569 0 0 0 0 0 0 \n", "159570 0 0 0 0 0 0 \n", "\n", "[159571 rows x 8 columns]\n" ] } ], "source": [ "train_data = pd.read_csv(\"data/train.csv\")\n", "print(train_data)\n", "train_text = train_data[\"comment_text\"]\n", "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n", " \"obscene\", \"threat\", \n", " \"insult\", \"identity_hate\"]]\n", "\n", "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n", "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n", " \"toxic\", \"severe_toxic\", \n", " \"obscene\", \"threat\", \n", " \"insult\", \"identity_hate\"]]\n", "\n", "# data preprocessing\n", "\n", "\n", "\n", "train_text = train_text.values.tolist()\n", "train_labels = train_labels.values.tolist()\n", "test_text = test_text.values.tolist()\n", "test_labels = test_labels.values.tolist()\n" ] }, { "cell_type": "code", "source": [ "# prepare tokenizer and dataset\n", "\n", "class TweetDataset(Dataset):\n", " def __init__(self, encodings, labels):\n", " self.encodings = encodings\n", " self.labels = labels\n", " self.tok = tokenizer\n", " \n", " def __getitem__(self, idx):\n", " # print(idx)\n", " # print(len(self.labels))\n", " encoding = self.tok(self.encodings.strings[idx], truncation=True, \n", " padding=\"max_length\", max_length=max_len)\n", " # print(encoding.items())\n", " item = { key: torch.tensor(val) for key, val in encoding.items() }\n", " item['labels'] = torch.tensor(self.labels[idx])\n", " # print(item)\n", " return item\n", " \n", " def __len__(self):\n", " return len(self.labels)\n", "\n", "\n", "\n", "\n", "\n", "train_strings = TokenizerDataset(train_text)\n", "test_strings = TokenizerDataset(test_text)\n", "\n", "train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n", "test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n", "\n", "\n", "\n", "\n", "# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n", "# max_length=200, pad_to_max_length=True, \\\n", "# truncation=True, return_token_type_ids=False)\n", "# # return_tensors='pt')\n", "# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n", "# max_length=200, pad_to_max_length=True, \\\n", "# truncation=True, return_token_type_ids=False)\n", "# # return_tensors='pt')\n", "\n", "# train_encodings = tokenizer(train_text, truncation=True, padding=True)\n", "# test_encodings = tokenizer(test_text, truncation=True, padding=True)" ], "metadata": { "id": "1n56TME9Njde" }, "id": "1n56TME9Njde", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "train_dataset = TweetDataset(train_strings, train_labels)\n", "test_dataset = TweetDataset(test_strings, test_labels)\n", "\n", "print(len(train_dataset.labels))\n", "print(len(train_strings))\n", "\n", "\n", "class MultilabelTrainer(Trainer):\n", " def 
compute_loss(self, model, inputs, return_outputs=False):\n", " labels = inputs.pop(\"labels\")\n", " outputs = model(**inputs)\n", " logits = outputs.logits\n", " loss_fct = torch.nn.BCEWithLogitsLoss()\n", " loss = loss_fct(logits.view(-1, self.model.config.num_labels), \n", " labels.float().view(-1, self.model.config.num_labels))\n", " return (loss, outputs) if return_outputs else loss\n", "\n", "\n", "# training\n", "trainer = MultilabelTrainer(\n", " model=model, \n", " args=training_args, \n", " train_dataset=train_dataset, \n", " eval_dataset=test_dataset\n", " )" ], "metadata": { "id": "4kwydz67qjW9", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8405ba5b-6ef8-4bb1-87c0-637510e11cdc" }, "id": "4kwydz67qjW9", "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "159571\n", "159571\n" ] } ] }, { "cell_type": "code", "source": [ "trainer.train()" ], "metadata": { "id": "VwsyMZg_tgTg", "outputId": "2153bf25-56d5-4b1f-a24a-8e2f4731638e", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 } }, "id": "VwsyMZg_tgTg", "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.9/dist-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n",
"    <div>\n",
"      <progress value='582' max='9974' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
"      [ 582/9974 05:37 < 1:30:57, 1.72 it/s, Epoch 0.06/1]\n",
"    </div>\n",
"    <table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: left;\">\n",
"      <th>Step</th>\n",
"      <th>Training Loss</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><td>10</td><td>0.695800</td></tr>\n",
"    <tr><td>20</td><td>0.674200</td></tr>\n",
"    <tr><td>30</td><td>0.631900</td></tr>\n",
"    <tr><td>40</td><td>0.570600</td></tr>\n",
"    <tr><td>50</td><td>0.541100</td></tr>\n",
"    <tr><td>60</td><td>0.500300</td></tr>\n",
"    <tr><td>70</td><td>0.440800</td></tr>\n",
"    <tr><td>80</td><td>0.405400</td></tr>\n",
"    <tr><td>90</td><td>0.336200</td></tr>\n",
"    <tr><td>100</td><td>0.285000</td></tr>\n",
"    <tr><td>110</td><td>0.232400</td></tr>\n",
"    <tr><td>120</td><td>0.239500</td></tr>\n",
"    <tr><td>130</td><td>0.197300</td></tr>\n",
"    <tr><td>140</td><td>0.196700</td></tr>\n",
"    <tr><td>150</td><td>0.143900</td></tr>\n",
"    <tr><td>160</td><td>0.153700</td></tr>\n",
"    <tr><td>170</td><td>0.098200</td></tr>\n",
"    <tr><td>180</td><td>0.129700</td></tr>\n",
"    <tr><td>190</td><td>0.094500</td></tr>\n",
"    <tr><td>200</td><td>0.104400</td></tr>\n",
"    <tr><td>210</td><td>0.119000</td></tr>\n",
"    <tr><td>220</td><td>0.081700</td></tr>\n",
"    <tr><td>230</td><td>0.081800</td></tr>\n",
"    <tr><td>240</td><td>0.079700</td></tr>\n",
"    <tr><td>250</td><td>0.077800</td></tr>\n",
"    <tr><td>260</td><td>0.093200</td></tr>\n",
"    <tr><td>270</td><td>0.066400</td></tr>\n",
"    <tr><td>280</td><td>0.064000</td></tr>\n",
"    <tr><td>290</td><td>0.074000</td></tr>\n",
"    <tr><td>300</td><td>0.084200</td></tr>\n",
"    <tr><td>310</td><td>0.064300</td></tr>\n",
"    <tr><td>320</td><td>0.082100</td></tr>\n",
"    <tr><td>330</td><td>0.057900</td></tr>\n",
"    <tr><td>340</td><td>0.065000</td></tr>\n",
"    <tr><td>350</td><td>0.072900</td></tr>\n",
"    <tr><td>360</td><td>0.064500</td></tr>\n",
"    <tr><td>370</td><td>0.064300</td></tr>\n",
"    <tr><td>380</td><td>0.071900</td></tr>\n",
"    <tr><td>390</td><td>0.044600</td></tr>\n",
"    <tr><td>400</td><td>0.059300</td></tr>\n",
"    <tr><td>410</td><td>0.063000</td></tr>\n",
"    <tr><td>420</td><td>0.082400</td></tr>\n",
"    <tr><td>430</td><td>0.070100</td></tr>\n",
"    <tr><td>440</td><td>0.042700</td></tr>\n",
"    <tr><td>450</td><td>0.089500</td></tr>\n",
"    <tr><td>460</td><td>0.061400</td></tr>\n",
"    <tr><td>470</td><td>0.097300</td></tr>\n",
"    <tr><td>480</td><td>0.062700</td></tr>\n",
"    <tr><td>490</td><td>0.067800</td></tr>\n",
"    <tr><td>500</td><td>0.083300</td></tr>\n",
"    <tr><td>510</td><td>0.083500</td></tr>\n",
"    <tr><td>520</td><td>0.053300</td></tr>\n",
"    <tr><td>530</td><td>0.045400</td></tr>\n",
"    <tr><td>540</td><td>0.052300</td></tr>\n",
"    <tr><td>550</td><td>0.075300</td></tr>\n",
"    <tr><td>560</td><td>0.069000</td></tr>\n",
"    <tr><td>570</td><td>0.084800</td></tr>\n",
"    <tr><td>580</td><td>0.028800</td></tr>\n",
"  </tbody>\n",
"</table><p>
" ] }, "metadata": {} }, { "output_type": "error", "ename": "KeyboardInterrupt", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1660\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_inner_training_loop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_batch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_find_batch_size\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1661\u001b[0m )\n\u001b[0;32m-> 1662\u001b[0;31m return inner_training_loop(\n\u001b[0m\u001b[1;32m 1663\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1664\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1927\u001b[0m \u001b[0mtr_loss_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1928\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1929\u001b[0;31m \u001b[0mtr_loss_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1930\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1931\u001b[0m if (\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtraining_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2715\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdeepspeed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2716\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2717\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2718\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2719\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m )\n\u001b[0;32m--> 487\u001b[0;31m torch.autograd.backward(\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m )\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0;31m# some Python versions print out the first line of a multi-line function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[0;31m# calls in the traceback and some print out the last line\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n\u001b[0m\u001b[1;32m 201\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 202\u001b[0m allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "id": "EJPePRRQG1QK" }, "id": "EJPePRRQG1QK", "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "colab": { "provenance": [], "gpuType": "T4" }, "accelerator": "GPU", "gpuClass": "standard" }, "nbformat": 4, "nbformat_minor": 5 }