{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 273, "referenced_widgets": [ "4d09491ebe2841b982bf64786645634f", "1f73436d60ca49dd914e554c7997eeb2", "6c4bff5535ce47b5b87057ef91878fa3", "c87bcd7571d74b268232c0ec57b291e5", "f0d4737c95344c1fa9872ac4870f16a7", "448d6f9135994f11904a3bae0d15eb70", "bcc4a1d83f7643d69d3750d6e08baf92", "039685cbef934e63b220a67530c17734", "11098a25620e45cea2abd48be7bf8fdb", "6ad3b0a7c4b54f9b87d4e98190faa771", "43b398cff94341cd9badde1aab571e2b", "abac05717f58473a98d7adef21a08d3c", "62861e7d14264254831d46c0eca1da8b", "120413f008ee4a30826efc4278b5c93a", "3f434eb05362439ebcc846f976719127", "eaafd7991e9d4a1684238b30c9721932", "1a19000214cd4324a56dbd71841dae2b", "0883d430f2cc4bfc8152ca83550bc8d0", "c31ba7458f3d490482208dde6a807a41", "bc2e3a0d3a81423fa44c8d76e170e582", "19c7764e2f874004a97e732326b26818", "90fd365f03c04f3e81924555dc0836fc", "395dfa83a3dc49a4be75a0703465ac0c", "06743f21137447a495c0fb6e03652736", "37dd778ef2d449bc8b0f4705094e6831", "be8f443309964e27a877634223210cf0", "9b2b7dcb21344d8c83ab33408553a6f6", "fbe16655d7454557a0bf79b3722e7481", "37ab8cbcff8b49958bee27a2d190d7b3", "22024f8402ac43b0a65738e944a6dd7b", "39e7dca43dde425b95510fa02d3120ea", "5804e23033834606862ab0b9027da11a", "064054f546dc42aca0b2c93d57a07d64", "c6ca37bb8d4d48a2aefd1a0143e9dc59", "1c41dc2b241249aeb595136168f24805", "04b0b3015b1b4df1bf0ab1bdbfe8ec50", "4a40c4ee96004463872e2d4befd0f063", "2f5489f855a74c509be1a73832e2e695", "33ee49e24a034b53985ceeb4e132261c", "f2ac2164347843bea69a1959f485c634", "daea55f718f94dc2bbb730974f2664e9", "38c64573b70d4904b9b40ad0eaec7bd2", "90fb2cc106eb4e19892f2c82144b5f60", "11ad22ad87d14180aee3a1564a21680c", "97e8ad2b1dca4080b71b1ad14692ecd8", "280a1e52702a43dfbeac3f92b541a5c3", "f738ff6690a84f28a6144f42405c82da", "2ecd7cdf1a9e4728b368b0541b12bc99", "26bcd4611199452596cb9a7061ae98ba", "41e7544ed30a4d3da6f2f99699a48bd0", "ce5372feee8e4617a513590de0b7481a", "60ce09959efc43ae98893fde33859b1d", "9e8db0555941493b857da8109cb58109", "874e6ebce6dd4e539b96236cc80316af", "52f10fd6667346fb8c6a8043785c14f0", "404cfe0788514c649016cb59dc8afc95", "426fdecdffe940e5807d92643c40a155", "af93b2665d81438f95b8b69ef739a73d", "2005f703fc8e438bb6f64e6f03f30653", "326acc1ec09c499ba112a7363af6f57f", "0836813c2def402884886bf1953a6ef2", "cd2c319c4a324a88ac1a40da9338d814", "10b5252bb7334d518630990bf3a29922", "f57cf7e3f5ec4f9784cd90141d5c3a25", "4c87154da008476091b97981c0dbb02d", "e880543b0c354c50b2e5f527b17f9776", "96b63a006fd349a5a979d4b4996de3db", "c77f9152c7d64d178a88128dc1e6a3ce", "33d713a10d93449594d4935978b34f08", "0cacba3260364cce876b5735bd268cc1", "ab15cc65737e460ca96b8c13da724bb0", "7fc5b46c09444730b19de0fe1255e8ed", "3b52015c85304a6db337d5d0592982cc", "495c23ede6174c28b91faf3fbc71c164", "dda476f556c84c9f84ddc5a386ee435f", "8c4dabfd677246209805176b64ec7653", "d8ac8215eda945bea58101010f07d993", "892b4306cec1483ba70234a92edb74a3", "e8c235eac34841c889f5dea485eb96be", "b828b7ddc43240569f91e786bb64b2ca", "b177d33d788840749b82cbfcd1fef735", "a28f3eb8523a40ee89ac0684b3ed8c0e", "eaf387d1161d49eeaf810b84415cb54d", "96f5478c161543169528f1edaf39e5cf", "0ab3ca2e2320409db06919399137a6ad", "c459af3c51464a93a821dbda4ad70b56", "651212c815884b87a476510235e65720", "cbd0fb78ddb14d06b1e9b15dc4cbf69a" ] }, "id": "w6qrl7_SvPKg", "outputId": "eeee9b67-b27a-418c-d8c3-1113e295d860" }, "outputs": [], "source": [ "from transformers import AutoTokenizer ,AutoModelForCausalLM\n", "import torch\n", "import re\n", "from sklearn.utils 
import shuffle" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "model_checkpoint = \"Kirili4ik/ruDialoGpt3-medium-finetuned-telegram\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "model = AutoModelForCausalLM.from_pretrained(\"BlackSamorez/rudialogpt3_medium_based_on_gpt2_2ch\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "esy3uaT5d8Qm" }, "outputs": [], "source": [ "# util function to get expected len after tokenizing\n", "def get_length_param(text: str, tokenizer) -> str:\n", " tokens_count = len(tokenizer.encode(text))\n", " if tokens_count <= 15:\n", " len_param = '1'\n", " elif tokens_count <= 50:\n", " len_param = '2'\n", " elif tokens_count <= 256:\n", " len_param = '3'\n", " else:\n", " len_param = '-'\n", " return len_param" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "JXzruCdIERD7", "outputId": "27c2fdc7-c72e-401e-961b-2224d3c0c4c8" }, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.read_csv(\"Data/combined.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(226312, 2)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df = shuffle(df)\n", "df.reset_index(drop=True, inplace=True)\n", "df = df.dropna()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QuoteResponse
0все проще. просто у них еще не было случая пер...ну да, разумеется. они только что прилетели с ...
1если вы про табличку сверху то там нет слов га...вы спросили какая - я ответил. нормативные и с...
2конечно, и беспощадно укрывались от налётов па...именно вы - умный человек.
3а вообще удивительно , чтобы убедить людей в п...- в этом никого не надо убеждать, это следует ...
4когда боятся - не стреляютя ничего не могу сказать хорошего или плохого ...
\n", "
" ], "text/plain": [ " Quote \\\n", "0 все проще. просто у них еще не было случая пер... \n", "1 если вы про табличку сверху то там нет слов га... \n", "2 конечно, и беспощадно укрывались от налётов па... \n", "3 а вообще удивительно , чтобы убедить людей в п... \n", "4 когда боятся - не стреляют \n", "\n", " Response \n", "0 ну да, разумеется. они только что прилетели с ... \n", "1 вы спросили какая - я ответил. нормативные и с... \n", "2 именно вы - умный человек. \n", "3 - в этом никого не надо убеждать, это следует ... \n", "4 я ничего не могу сказать хорошего или плохого ... " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "wHgkJFvuop9E" }, "outputs": [], "source": [ "def remove_duplicates(S):\n", " S = re.sub(r'[a-zA-Z]+', '', S) #Remove english\n", " S = S.split()\n", " result = \"\"\n", " for subst in S:\n", " if subst not in result:\n", " result += subst+\" \"\n", " return result.rstrip()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ik0QAWondzuW", "outputId": "5347a398-f52b-4ee0-d54e-94bddf02dc6e" }, "outputs": [], "source": [ "#model.eval()" ] }, { "cell_type": "markdown", "metadata": { "id": "vyloqgoalE4x" }, "source": [ "https://huggingface.co/Kirili4ik/ruDialoGpt3-medium-finetuned-telegram" ] }, { "cell_type": "markdown", "metadata": { "id": "IuzSROqxjUKM" }, "source": [ "## Model initial configuration" ] }, { "cell_type": "markdown", "metadata": { "id": "TC3qNlfp30aU" }, "source": [ "Let's train our chatbot. For start, we will need basic configuration and a dataset.\n", "Configuration and training scripts are mostly based on this [script](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) from Huggingface and great [tutorial](https://nathancooper.io/i-am-a-nerd/chatbot/deep-learning/gpt2/2020/05/12/chatbot-part-1.html) from Nathan Cooper." 
] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "VxR1uj0FN16V" }, "outputs": [], "source": [ "\"\"\"\n", "Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).\n", "GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned\n", "using a masked language modeling (MLM) loss.\n", "\"\"\"\n", "\n", "import glob\n", "import logging\n", "import os\n", "import pickle\n", "import random\n", "import re\n", "import shutil\n", "from typing import Dict, List, Tuple\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import torch\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "from torch.nn.utils.rnn import pad_sequence\n", "from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler\n", "from torch.utils.data.distributed import DistributedSampler\n", "from tqdm.notebook import tqdm, trange\n", "\n", "from pathlib import Path\n", "\n", "from transformers import (\n", " MODEL_WITH_LM_HEAD_MAPPING,\n", " WEIGHTS_NAME,\n", " AdamW,\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " PreTrainedModel,\n", " PreTrainedTokenizer,\n", " get_linear_schedule_with_warmup,\n", ")\n", "\n", "\n", "try:\n", " from torch.utils.tensorboard import SummaryWriter\n", "except ImportError:\n", " from tensorboardX import SummaryWriter\n", "\n", "# Configs\n", "logger = logging.getLogger(__name__)\n", "\n", "MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())\n", "MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "MWuPME79OH6e" }, "outputs": [], "source": [ "# Args to allow for easy convertion of python script to notebook\n", "class Args():\n", " def __init__(self):\n", " self.output_dir = 'WarBot'\n", " self.model_type = 'gpt2'\n", " self.model_name_or_path = model_checkpoint\n", " self.config_name = model_checkpoint\n", " self.tokenizer_name = model_checkpoint\n", " self.cache_dir = 'cached'\n", " self.block_size = 512\n", " self.do_train = True\n", " self.do_eval = True\n", " self.evaluate_during_training = False\n", " self.per_gpu_train_batch_size = 1 #4\n", " self.per_gpu_eval_batch_size = 1 #4\n", " self.gradient_accumulation_steps = 1\n", " self.learning_rate = 5e-5\n", " self.weight_decay = 0.0\n", " self.adam_epsilon = 1e-8\n", " self.max_grad_norm = 1.0\n", " self.num_train_epochs = 3\n", " self.max_steps = -1\n", " self.warmup_steps = 0\n", " self.logging_steps = 1000\n", " self.save_steps = 3500\n", " self.save_total_limit = None\n", " self.eval_all_checkpoints = False\n", " self.no_cuda = False\n", " self.overwrite_output_dir = True\n", " self.overwrite_cache = True\n", " self.should_continue = False\n", " self.seed = 42\n", " self.local_rank = -1\n", " self.fp16 = False\n", " self.fp16_opt_level = 'O1'\n", "\n", "args = Args()" ] }, { "cell_type": "markdown", "metadata": { "id": "X_qYqlTe9yx2" }, "source": [ "## Prepare Dataset" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "MrJWywM0Osmx" }, "outputs": [], "source": [ "# Our script is in df instead of all_rick" ] }, { "cell_type": "markdown", "metadata": { "id": "n51hsl2mAG8v" }, "source": [ "Original dataset was converted in a way that every responce row will contain **n** previous responces as a context. For our purposes seven previous responces will be enough." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "3Rz_hLgGUzbl", "outputId": "1013aad0-6b8b-4611-b55c-e757db83e848" }, "outputs": [ { "ename": "NameError", "evalue": "name 'df' is not defined", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[1], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m\u001B[38;5;241m.\u001B[39mhead()\n", "\u001B[1;31mNameError\u001B[0m: name 'df' is not defined" ] } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "aBeM8pvEjigq" }, "source": [ "Split our dataset into a training and test parts." ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "g1CeutVVlL85", "outputId": "00cb0ef2-9207-4f92-9269-5eb386222ea2" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QuoteResponse
129348так поставь вместо россии китай, что кординаль...с пак фа все давно ясно. халтура для распила б...
23948из дубая запустят самый длительный беспересадо...рейс qf7879 приземлился в сиднея после перелет...
99732что значит \"пытались\"? он из этого стрелял?конечно он из этого не стрелял. ну мы же не са...
6869311-й раз: 161710 161711 161712 подмосковный т...ты для начала сделай таких же размеров и массы...
86282- вы очень, очень серьёзно заблуждаетесь. это ...ну так количественную разницу никто не отменял...
\n", "
" ], "text/plain": [ " Quote \\\n", "129348 так поставь вместо россии китай, что кординаль... \n", "23948 из дубая запустят самый длительный беспересадо... \n", "99732 что значит \"пытались\"? он из этого стрелял? \n", "68693 11-й раз: 161710 161711 161712 подмосковный т... \n", "86282 - вы очень, очень серьёзно заблуждаетесь. это ... \n", "\n", " Response \n", "129348 с пак фа все давно ясно. халтура для распила б... \n", "23948 рейс qf7879 приземлился в сиднея после перелет... \n", "99732 конечно он из этого не стрелял. ну мы же не са... \n", "68693 ты для начала сделай таких же размеров и массы... \n", "86282 ну так количественную разницу никто не отменял... " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trn_df, val_df = train_test_split(df, test_size = 0.2)\n", "trn_df.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "86F3WhnFO4H8" }, "source": [ "Now will convert our dataset in a format suitable for our model. In the original dataset they have concatenate responses in one string for each row (additionally adding special 'end of string' token between responses, so the model will understand end of each response in a string)." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "id": "PX7jeWpYmOe_" }, "outputs": [], "source": [ "def construct_conv(row, tokenizer, eos = True):\n", " flatten = lambda l: [item for sublist in l for item in sublist]\n", " conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))\n", " conv = flatten(conv)\n", " return conv\n", "\n", "class ConversationDataset(Dataset):\n", " def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):\n", "\n", " block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)\n", "\n", " directory = args.cache_dir\n", " cached_features_file = os.path.join(\n", " directory, args.model_type + \"_cached_lm_\" + str(block_size)\n", " )\n", "\n", " if os.path.exists(cached_features_file) and not args.overwrite_cache:\n", " logger.info(\"Loading features from cached file %s\", cached_features_file)\n", " with open(cached_features_file, \"rb\") as handle:\n", " self.examples = pickle.load(handle)\n", " else:\n", " logger.info(\"Creating features from dataset file at %s\", directory)\n", "\n", " self.examples = []\n", " for _, row in df.iterrows():\n", " conv = construct_conv(row, tokenizer)\n", " self.examples.append(conv)\n", "\n", " logger.info(\"Saving features into cached file %s\", cached_features_file)\n", " with open(cached_features_file, \"wb\") as handle:\n", " pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", " def __len__(self):\n", " return len(self.examples)\n", "\n", " def __getitem__(self, item):\n", " return torch.tensor(self.examples[item], dtype=torch.long)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "id": "naaRHoXgnStq" }, "outputs": [], "source": [ "# Cacheing and storing of data/checkpoints\n", "\n", "def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):\n", " return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)\n", "\n", "\n", "def set_seed(args):\n", " random.seed(args.seed)\n", " np.random.seed(args.seed)\n", " torch.manual_seed(args.seed)\n", " if args.n_gpu > 0:\n", " torch.cuda.manual_seed_all(args.seed)\n", "\n", "\n", "def _sorted_checkpoints(args, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> List[str]:\n", " ordering_and_checkpoint_path = []\n", "\n", " 
glob_checkpoints = glob.glob(os.path.join(args.output_dir, \"{}-*\".format(checkpoint_prefix)))\n", "\n", "    for path in glob_checkpoints:\n", "        if use_mtime:\n", "            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))\n", "        else:\n", "            regex_match = re.match(\".*{}-([0-9]+)\".format(checkpoint_prefix), path)\n", "            if regex_match and regex_match.groups():\n", "                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))\n", "\n", "    checkpoints_sorted = sorted(ordering_and_checkpoint_path)\n", "    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]\n", "    return checkpoints_sorted\n", "\n", "\n", "def _rotate_checkpoints(args, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> None:\n", "    if not args.save_total_limit:\n", "        return\n", "    if args.save_total_limit <= 0:\n", "        return\n", "\n", "    # Check if we should delete older checkpoint(s)\n", "    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)\n", "    if len(checkpoints_sorted) <= args.save_total_limit:\n", "        return\n", "\n", "    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)\n", "    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]\n", "    for checkpoint in checkpoints_to_be_deleted:\n", "        logger.info(\"Deleting older checkpoint [{}] due to args.save_total_limit\".format(checkpoint))\n", "        shutil.rmtree(checkpoint)" ] }, { "cell_type": "markdown", "metadata": { "id": "pkvMNnrnVHQw" }, "source": [ "## Training and Evaluating\n", "\n", "Training the model requires quite a lot of code, but everything should work as is; the main thing is to give the model the dataset in the right format.\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "id": "tXzKlXHeu0Mb" }, "outputs": [], "source": [ "def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:\n", "    \"\"\"Train the model.\"\"\"\n", "    if args.local_rank in [-1, 0]:\n", "        tb_writer = SummaryWriter()\n", "\n", "    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)\n", "\n", "    def collate(examples: List[torch.Tensor]):\n", "        # Pad variable-length conversations to the longest sequence in the batch\n", "        if tokenizer._pad_token is None:\n", "            return pad_sequence(examples, batch_first=True)\n", "        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n", "\n", "    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)\n", "    train_dataloader = DataLoader(\n", "        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last=True\n", "    )\n", "\n", "    if args.max_steps > 0:\n", "        t_total = args.max_steps\n", "        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1\n", "    else:\n", "        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs\n", "\n", "    model = model.module if hasattr(model, \"module\") else model  # Take care of distributed/parallel training\n", "    model.resize_token_embeddings(len(tokenizer))\n", "    # add_special_tokens_(model, tokenizer)\n", "\n", "\n", "    # Prepare optimizer and schedule (linear warmup and decay)\n", "    no_decay = [\"bias\", \"LayerNorm.weight\"]\n", "    optimizer_grouped_parameters = [\n", "        {\n", "            \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", "            \"weight_decay\": args.weight_decay,\n", "        },\n", "        {\"params\": [p for n, p in model.named_parameters() if 
any(nd in n for nd in no_decay)], \"weight_decay\": 0.0},\n", "    ]\n", "    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)\n", "    scheduler = get_linear_schedule_with_warmup(\n", "        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total\n", "    )\n", "\n", "    # Check if saved optimizer or scheduler states exist\n", "    if (\n", "        args.model_name_or_path\n", "        and os.path.isfile(os.path.join(args.model_name_or_path, \"optimizer.pt\"))\n", "        and os.path.isfile(os.path.join(args.model_name_or_path, \"scheduler.pt\"))\n", "    ):\n", "        # Load in optimizer and scheduler states\n", "        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, \"optimizer.pt\")))\n", "        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, \"scheduler.pt\")))\n", "\n", "    if args.fp16:\n", "        try:\n", "            from apex import amp\n", "        except ImportError:\n", "            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use fp16 training.\")\n", "        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)\n", "\n", "    # multi-gpu training (should be after apex fp16 initialization)\n", "    if args.n_gpu > 1:\n", "        model = torch.nn.DataParallel(model)\n", "\n", "    # Distributed training (should be after apex fp16 initialization)\n", "    if args.local_rank != -1:\n", "        model = torch.nn.parallel.DistributedDataParallel(\n", "            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True\n", "        )\n", "\n", "    # Train!\n", "    logger.info(\"***** Running training *****\")\n", "    logger.info(\"  Num examples = %d\", len(train_dataset))\n", "    logger.info(\"  Num Epochs = %d\", args.num_train_epochs)\n", "    logger.info(\"  Instantaneous batch size per GPU = %d\", args.per_gpu_train_batch_size)\n", "    logger.info(\n", "        \"  Total train batch size (w. 
parallel, distributed & accumulation) = %d\",\n", "        args.train_batch_size\n", "        * args.gradient_accumulation_steps\n", "        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),\n", "    )\n", "    logger.info(\"  Gradient Accumulation steps = %d\", args.gradient_accumulation_steps)\n", "    logger.info(\"  Total optimization steps = %d\", t_total)\n", "\n", "    global_step = 0\n", "    epochs_trained = 0\n", "    steps_trained_in_current_epoch = 0\n", "    # Check if continuing training from a checkpoint\n", "    if args.model_name_or_path and os.path.exists(args.model_name_or_path):\n", "        try:\n", "            # set global_step to the global_step of the last saved checkpoint from the model path\n", "            checkpoint_suffix = args.model_name_or_path.split(\"-\")[-1].split(\"/\")[0]\n", "            global_step = int(checkpoint_suffix)\n", "            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)\n", "            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)\n", "\n", "            logger.info(\"  Continuing training from checkpoint, will skip to saved global_step\")\n", "            logger.info(\"  Continuing training from epoch %d\", epochs_trained)\n", "            logger.info(\"  Continuing training from global step %d\", global_step)\n", "            logger.info(\"  Will skip the first %d steps in the first epoch\", steps_trained_in_current_epoch)\n", "        except ValueError:\n", "            logger.info(\"  Starting fine-tuning.\")\n", "\n", "    tr_loss, logging_loss = 0.0, 0.0\n", "\n", "    model.zero_grad()\n", "    train_iterator = trange(\n", "        epochs_trained, int(args.num_train_epochs), desc=\"Epoch\", disable=args.local_rank not in [-1, 0]\n", "    )\n", "    set_seed(args)  # Added here for reproducibility\n", "    for _ in train_iterator:\n", "        epoch_iterator = tqdm(train_dataloader, desc=\"Iteration\", disable=args.local_rank not in [-1, 0])\n", "        for step, batch in enumerate(epoch_iterator):\n", "\n", "            # Skip past any already trained steps if resuming training\n", "            if steps_trained_in_current_epoch > 0:\n", "                steps_trained_in_current_epoch -= 1\n", "                continue\n", "\n", "            inputs, labels = (batch, batch)\n", "            if inputs.shape[1] > 1024: continue  # skip batches longer than the GPT-2 context window\n", "            inputs = inputs.to(args.device)\n", "            labels = labels.to(args.device)\n", "            model.train()\n", "            outputs = model(inputs, labels=labels)\n", "            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)\n", "\n", "            if args.n_gpu > 1:\n", "                loss = loss.mean()  # mean() to average on multi-gpu parallel training\n", "            if args.gradient_accumulation_steps > 1:\n", "                loss = loss / args.gradient_accumulation_steps\n", "\n", "            if args.fp16:\n", "                with amp.scale_loss(loss, optimizer) as scaled_loss:\n", "                    scaled_loss.backward()\n", "            else:\n", "                loss.backward()\n", "\n", "            tr_loss += loss.item()\n", "            if (step + 1) % args.gradient_accumulation_steps == 0:\n", "                if args.fp16:\n", "                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)\n", "                else:\n", "                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)\n", "                optimizer.step()\n", "                scheduler.step()  # Update learning rate schedule\n", "                model.zero_grad()\n", "                global_step += 1\n", "\n", "                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:\n", "                    # Log metrics\n", "                    if (\n", "                        args.local_rank == -1 and args.evaluate_during_training\n", "                    ):  # Only evaluate when single GPU otherwise metrics may not average well\n", "                        results = evaluate(args, model, tokenizer)  # NB: evaluate() as defined below also expects df_trn/df_val\n", "                        for key, value in results.items():\n", "                            
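# write each evaluation metric to TensorBoard under an 'eval_' prefix\n", "                            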
tb_writer.add_scalar(\"eval_{}\".format(key), value, global_step)\n", "                    tb_writer.add_scalar(\"lr\", scheduler.get_lr()[0], global_step)\n", "                    tb_writer.add_scalar(\"loss\", (tr_loss - logging_loss) / args.logging_steps, global_step)\n", "                    logging_loss = tr_loss\n", "\n", "                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:\n", "                    checkpoint_prefix = \"checkpoint\"\n", "                    # Save model checkpoint\n", "                    output_dir = os.path.join(args.output_dir, \"{}-{}\".format(checkpoint_prefix, global_step))\n", "                    os.makedirs(output_dir, exist_ok=True)\n", "                    model_to_save = (\n", "                        model.module if hasattr(model, \"module\") else model\n", "                    )  # Take care of distributed/parallel training\n", "                    model_to_save.save_pretrained(output_dir)\n", "                    tokenizer.save_pretrained(output_dir)\n", "\n", "                    torch.save(args, os.path.join(output_dir, \"training_args.bin\"))\n", "                    logger.info(\"Saving model checkpoint to %s\", output_dir)\n", "\n", "                    _rotate_checkpoints(args, checkpoint_prefix)\n", "\n", "                    torch.save(optimizer.state_dict(), os.path.join(output_dir, \"optimizer.pt\"))\n", "                    torch.save(scheduler.state_dict(), os.path.join(output_dir, \"scheduler.pt\"))\n", "                    logger.info(\"Saving optimizer and scheduler states to %s\", output_dir)\n", "\n", "            if args.max_steps > 0 and global_step > args.max_steps:\n", "                epoch_iterator.close()\n", "                break\n", "        if args.max_steps > 0 and global_step > args.max_steps:\n", "            train_iterator.close()\n", "            break\n", "\n", "    if args.local_rank in [-1, 0]:\n", "        tb_writer.close()\n", "\n", "    return global_step, tr_loss / global_step\n", "\n", "# Evaluation of the model\n", "\n", "def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix=\"\") -> Dict:\n", "    eval_output_dir = args.output_dir\n", "\n", "    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)\n", "    os.makedirs(eval_output_dir, exist_ok=True)\n", "    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)\n", "\n", "    def collate(examples: List[torch.Tensor]):\n", "        if tokenizer._pad_token is None:\n", "            return pad_sequence(examples, batch_first=True)\n", "        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n", "\n", "    eval_sampler = SequentialSampler(eval_dataset)\n", "    eval_dataloader = DataLoader(\n", "        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last=True\n", "    )\n", "\n", "    # multi-gpu evaluate\n", "    if args.n_gpu > 1:\n", "        model = torch.nn.DataParallel(model)\n", "\n", "    # Eval!\n", "    logger.info(\"***** Running evaluation {} *****\".format(prefix))\n", "    logger.info(\"  Num examples = %d\", len(eval_dataset))\n", "    logger.info(\"  Batch size = %d\", args.eval_batch_size)\n", "    eval_loss = 0.0\n", "    nb_eval_steps = 0\n", "    model.eval()\n", "\n", "    for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n", "        inputs, labels = (batch, batch)\n", "        inputs = inputs.to(args.device)\n", "        labels = labels.to(args.device)\n", "\n", "        with torch.no_grad():\n", "            outputs = model(inputs, labels=labels)\n", "            lm_loss = outputs[0]\n", "            eval_loss += lm_loss.mean().item()\n", "        nb_eval_steps += 1\n", "\n", "    eval_loss = eval_loss / nb_eval_steps\n", "    # Perplexity is the exponential of the mean language-modeling loss\n", "    perplexity = torch.exp(torch.tensor(eval_loss))\n", "\n", "    result = {\"perplexity\": perplexity}\n", "\n", "    output_eval_file = 
os.path.join(eval_output_dir, prefix, \"eval_results.txt\")\n", "    with open(output_eval_file, \"w\") as writer:\n", "        logger.info(\"***** Eval results {} *****\".format(prefix))\n", "        for key in sorted(result.keys()):\n", "            logger.info(\"  %s = %s\", key, str(result[key]))\n", "            writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n", "\n", "    return result" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "id": "-MGD6bFXV4Z-" }, "outputs": [], "source": [ "# Main runner\n", "\n", "def main(df_trn, df_val):\n", "    args = Args()\n", "\n", "    if args.should_continue:\n", "        sorted_checkpoints = _sorted_checkpoints(args)\n", "        if len(sorted_checkpoints) == 0:\n", "            raise ValueError(\"Used --should_continue but no checkpoint was found in --output_dir.\")\n", "        else:\n", "            args.model_name_or_path = sorted_checkpoints[-1]\n", "\n", "    if (\n", "        os.path.exists(args.output_dir)\n", "        and os.listdir(args.output_dir)\n", "        and args.do_train\n", "        and not args.overwrite_output_dir\n", "        and not args.should_continue\n", "    ):\n", "        raise ValueError(\n", "            \"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overwrite it.\".format(\n", "                args.output_dir\n", "            )\n", "        )\n", "\n", "    # Setup CUDA, GPU & distributed training\n", "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")  # fall back to CPU when no GPU is available\n", "    args.n_gpu = torch.cuda.device_count()\n", "    args.device = device\n", "\n", "    # Setup logging\n", "    logging.basicConfig(\n", "        format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", "        datefmt=\"%m/%d/%Y %H:%M:%S\",\n", "        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,\n", "    )\n", "    logger.warning(\n", "        \"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s\",\n", "        args.local_rank,\n", "        device,\n", "        args.n_gpu,\n", "        bool(args.local_rank != -1),\n", "        args.fp16,\n", "    )\n", "\n", "    # Set seed\n", "    set_seed(args)\n", "\n", "    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)\n", "    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)\n", "    model = AutoModelForCausalLM.from_pretrained(\n", "        args.model_name_or_path,\n", "        from_tf=False,\n", "        config=config,\n", "        cache_dir=args.cache_dir,\n", "    )\n", "    model.to(args.device)\n", "\n", "    logger.info(\"Training/evaluation parameters %s\", args)\n", "\n", "    # Training\n", "    if args.do_train:\n", "        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)\n", "\n", "        global_step, tr_loss = train(args, train_dataset, model, tokenizer)\n", "        logger.info(\" global_step = %s, average loss = %s\", global_step, tr_loss)\n", "\n", "    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()\n", "    if args.do_train:\n", "        # Create output directory if needed\n", "        os.makedirs(args.output_dir, exist_ok=True)\n", "\n", "        logger.info(\"Saving model checkpoint to %s\", args.output_dir)\n", "        # Save a trained model, configuration and tokenizer using `save_pretrained()`.\n", "        # They can then be reloaded using `from_pretrained()`\n", "        model_to_save = (\n", "            model.module if hasattr(model, \"module\") else model\n", "        )  # Take care of distributed/parallel training\n", "        model_to_save.save_pretrained(args.output_dir)\n", "        tokenizer.save_pretrained(args.output_dir)\n", "\n", "        # Good practice: save your training arguments together with the trained model\n", "        torch.save(args, os.path.join(args.output_dir, \"training_args.bin\"))\n", "\n", "        # 
Load a trained model and vocabulary that you have fine-tuned\n", "        model = AutoModelForCausalLM.from_pretrained(args.output_dir)\n", "        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)\n", "        model.to(args.device)\n", "\n", "    # Evaluation\n", "    results = {}\n", "    if args.do_eval and args.local_rank in [-1, 0]:\n", "        checkpoints = [args.output_dir]\n", "        if args.eval_all_checkpoints:\n", "            checkpoints = list(\n", "                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + \"/**/\" + WEIGHTS_NAME, recursive=True))\n", "            )\n", "            logging.getLogger(\"transformers.modeling_utils\").setLevel(logging.WARN)  # Reduce logging\n", "        logger.info(\"Evaluate the following checkpoints: %s\", checkpoints)\n", "        for checkpoint in checkpoints:\n", "            global_step = checkpoint.split(\"-\")[-1] if len(checkpoints) > 1 else \"\"\n", "            prefix = checkpoint.split(\"/\")[-1] if checkpoint.find(\"checkpoint\") != -1 else \"\"\n", "\n", "            model = AutoModelForCausalLM.from_pretrained(checkpoint)\n", "            model.to(args.device)\n", "            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)\n", "            result = dict((k + \"_{}\".format(global_step), v) for k, v in result.items())\n", "            results.update(result)\n", "\n", "    return results" ] }, { "cell_type": "markdown", "metadata": { "id": "UZEHDzR0Vjs7" }, "source": [ "It is time to train the model!" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "id": "__iqR8YFV-Ex" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02/09/2023 22:57:28 - WARNING - __main__ - Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False\n", "02/09/2023 22:57:37 - INFO - __main__ - Training/evaluation parameters <__main__.Args object at 0x00000140C4DFD910>\n", "02/09/2023 22:57:37 - INFO - __main__ - Creating features from dataset file at cached\n", "02/09/2023 23:00:07 - INFO - __main__ - Saving features into cached file cached\\gpt2_cached_lm_512\n", "F:\\Projects\\WarBot\\venv\\lib\\site-packages\\transformers\\optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", "  warnings.warn(\n", "02/09/2023 23:00:10 - INFO - __main__ - ***** Running training *****\n", "02/09/2023 23:00:10 - INFO - __main__ - Num examples = 180598\n", "02/09/2023 23:00:10 - INFO - __main__ - Num Epochs = 3\n", "02/09/2023 23:00:10 - INFO - __main__ - Instantaneous batch size per GPU = 1\n", "02/09/2023 23:00:10 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 1\n", "02/09/2023 23:00:10 - INFO - __main__ - Gradient Accumulation steps = 1\n", "02/09/2023 23:00:10 - INFO - __main__ - Total optimization steps = 541794\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5a86882285eb41a59884b68b2f58278e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Epoch: 0%| | 0/3 [00:00