Random0-0 committed on
Commit
aaf8763
1 Parent(s): 6f8a836

Upload Untitled0.ipynb

Files changed (1)
  1. Untitled0.ipynb +872 -0
Untitled0.ipynb ADDED
@@ -0,0 +1,872 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "Untitled0.ipynb",
+ "private_outputs": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "-ExZYuS4whSi"
+ },
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive/')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "iU-h1ApL0bue"
+ },
+ "source": [
+ "!pip -q install transformers"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "a0P1MPcH1IaF"
+ },
+ "source": [
+ "import os\n",
+ "os.chdir(\"/content/drive/My Drive/Colab Notebooks\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "FY23HIap1K3w"
+ },
+ "source": [
+ "import glob\n",
+ "import logging\n",
+ "import os\n",
+ "import pickle\n",
+ "import random\n",
+ "import re\n",
+ "import shutil\n",
+ "from typing import Dict, List, Tuple\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "from torch.nn.utils.rnn import pad_sequence\n",
+ "from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler\n",
+ "from torch.utils.data.distributed import DistributedSampler\n",
+ "from tqdm.notebook import tqdm, trange\n",
+ "\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from transformers import (\n",
+ "    MODEL_WITH_LM_HEAD_MAPPING,\n",
+ "    WEIGHTS_NAME,\n",
+ "    AdamW,\n",
+ "    AutoConfig,\n",
+ "    PreTrainedModel,\n",
+ "    PreTrainedTokenizer,\n",
+ "    get_linear_schedule_with_warmup,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "try:\n",
+ "    from torch.utils.tensorboard import SummaryWriter\n",
+ "except ImportError:\n",
+ "    from tensorboardX import SummaryWriter"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "4pldsjBT1QqG"
+ },
+ "source": [
+ "data = pd.read_csv('/sukuna.csv')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hESxj45g2PKd"
+ },
+ "source": [
+ "data.sample(6)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UaHgCPjf2Ryg"
+ },
+ "source": [
+ "CHARACTER_NAME = 'Sukuna'"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "7MU-Ocxw2W9X"
+ },
+ "source": [
+ "contexted = []\n",
+ "\n",
+ "# context window of size 7\n",
+ "n = 7\n",
+ "\n",
+ "for i in data[data.name == CHARACTER_NAME].index:\n",
+ "    if i < n:\n",
+ "        continue\n",
+ "    row = []\n",
+ "    prev = i - 1 - n # we additionally subtract 1, so the row will contain the current response and the 7 previous responses\n",
+ "    for j in range(i, prev, -1):\n",
+ "        row.append(data.line[j])\n",
+ "    contexted.append(row)\n",
+ "\n",
+ "columns = ['response', 'context']\n",
+ "columns = columns + ['context/' + str(i) for i in range(n - 1)]\n",
+ "\n",
+ "df = pd.DataFrame.from_records(contexted, columns=columns)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
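
The loop above builds, for every line spoken by CHARACTER_NAME, a record holding that response followed by the n previous lines of dialogue, newest first. A minimal sketch of the same sliding window on a hypothetical toy dataframe (n shrunk to 2; the names and lines are illustrative, not from sukuna.csv):

import pandas as pd

# toy stand-in for the real script dataframe
toy = pd.DataFrame({
    "name": ["A", "B", "Sukuna", "B", "Sukuna"],
    "line": ["hi", "hello", "begone", "why", "silence"],
})

n = 2
rows = []
for i in toy[toy.name == "Sukuna"].index:
    if i < n:
        continue
    # current response, then the n previous lines, newest first
    rows.append([toy.line[j] for j in range(i, i - 1 - n, -1)])

cols = ["response", "context"] + ["context/" + str(k) for k in range(n - 1)]
print(pd.DataFrame.from_records(rows, columns=cols))
# rows: ['begone', 'hello', 'hi'] and ['silence', 'why', 'begone']
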
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HVl-w-TE2ZQl"
+ },
+ "source": [
+ "df.sample(6)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "zzPDyi9p2bRf"
+ },
+ "source": [
+ "trn_df, val_df = train_test_split(df, test_size=0.1)\n",
+ "trn_df.head()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "7Mj6dxsk2dlU"
+ },
+ "source": [
+ "def construct_conv(row, tokenizer, eos = True):\n",
+ "    flatten = lambda l: [item for sublist in l for item in sublist]\n",
+ "    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))\n",
+ "    conv = flatten(conv)\n",
+ "    return conv\n",
+ "\n",
+ "class ConversationDataset(Dataset):\n",
+ "    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):\n",
+ "\n",
+ "        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)\n",
+ "\n",
+ "        directory = args.cache_dir\n",
+ "        cached_features_file = os.path.join(\n",
+ "            directory, args.model_type + \"_cached_lm_\" + str(block_size)\n",
+ "        )\n",
+ "\n",
+ "        if os.path.exists(cached_features_file) and not args.overwrite_cache:\n",
+ "            logger.info(\"Loading features from cached file %s\", cached_features_file)\n",
+ "            with open(cached_features_file, \"rb\") as handle:\n",
+ "                self.examples = pickle.load(handle)\n",
+ "        else:\n",
+ "            logger.info(\"Creating features from dataset file at %s\", directory)\n",
+ "\n",
+ "            self.examples = []\n",
+ "            for _, row in df.iterrows():\n",
+ "                conv = construct_conv(row, tokenizer)\n",
+ "                self.examples.append(conv)\n",
+ "\n",
+ "            logger.info(\"Saving features into cached file %s\", cached_features_file)\n",
+ "            with open(cached_features_file, \"wb\") as handle:\n",
+ "                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        return len(self.examples)\n",
+ "\n",
+ "    def __getitem__(self, item):\n",
+ "        return torch.tensor(self.examples[item], dtype=torch.long)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
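
For reference, construct_conv turns one dataframe row (response first, then the context columns) into a single flat list of token ids: the order is reversed so the oldest context comes first, and every utterance is terminated with the tokenizer's EOS token. A small sketch with made-up utterances:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
row = ["the response", "previous line", "older line"]  # same ordering as a df row

ids = [tok.encode(x) + [tok.eos_token_id] for x in row]
ids = [t for chunk in reversed(ids) for t in chunk]  # equivalent to construct_conv(row, tok)

print(tok.decode(ids))
# roughly: "older line<|endoftext|>previous line<|endoftext|>the response<|endoftext|>"
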
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HDSfFNhZ2j5U"
+ },
+ "source": [
+ "def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):\n",
+ "    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)\n",
+ "\n",
+ "\n",
+ "def set_seed(args):\n",
+ "    random.seed(args.seed)\n",
+ "    np.random.seed(args.seed)\n",
+ "    torch.manual_seed(args.seed)\n",
+ "    if args.n_gpu > 0:\n",
+ "        torch.cuda.manual_seed_all(args.seed)\n",
+ "\n",
+ "\n",
+ "def _sorted_checkpoints(args, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> List[str]:\n",
+ "    ordering_and_checkpoint_path = []\n",
+ "\n",
+ "    glob_checkpoints = glob.glob(os.path.join(args.output_dir, \"{}-*\".format(checkpoint_prefix)))\n",
+ "\n",
+ "    for path in glob_checkpoints:\n",
+ "        if use_mtime:\n",
+ "            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))\n",
+ "        else:\n",
+ "            regex_match = re.match(\".*{}-([0-9]+)\".format(checkpoint_prefix), path)\n",
+ "            if regex_match and regex_match.groups():\n",
+ "                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))\n",
+ "\n",
+ "    checkpoints_sorted = sorted(ordering_and_checkpoint_path)\n",
+ "    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]\n",
+ "    return checkpoints_sorted\n",
+ "\n",
+ "\n",
+ "def _rotate_checkpoints(args, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> None:\n",
+ "    if not args.save_total_limit:\n",
+ "        return\n",
+ "    if args.save_total_limit <= 0:\n",
+ "        return\n",
+ "\n",
+ "    # Check if we should delete older checkpoint(s)\n",
+ "    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)\n",
+ "    if len(checkpoints_sorted) <= args.save_total_limit:\n",
+ "        return\n",
+ "\n",
+ "    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)\n",
+ "    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]\n",
+ "    for checkpoint in checkpoints_to_be_deleted:\n",
+ "        logger.info(\"Deleting older checkpoint [{}] due to args.save_total_limit\".format(checkpoint))\n",
+ "        shutil.rmtree(checkpoint)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5VII0lGJ2nmz"
+ },
+ "source": [
+ "from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer\n",
+ "import torch\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-small\")\n",
+ "model = AutoModelWithLMHead.from_pretrained(\"microsoft/DialoGPT-small\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
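
AutoModelWithLMHead works here but is deprecated in newer transformers releases; AutoModelForCausalLM (already imported in the cell above) is the equivalent class for DialoGPT. A drop-in alternative, assuming a recent transformers version:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")  # same checkpoint, same usage
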
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "2U7RIR4L2yNa"
+ },
+ "source": [
+ "\"\"\"\n",
+ "Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).\n",
+ "GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned\n",
+ "using a masked language modeling (MLM) loss.\n",
+ "\"\"\"\n",
+ "\n",
+ "# Configs\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())\n",
+ "MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "r3VZpWtb3hFU"
+ },
+ "source": [
+ "class Args():\n",
+ "    def __init__(self):\n",
+ "        self.output_dir = 'output-small'\n",
+ "        self.model_type = 'gpt2'\n",
+ "        self.model_name_or_path = 'microsoft/DialoGPT-small'\n",
+ "        self.config_name = 'microsoft/DialoGPT-small'\n",
+ "        self.tokenizer_name = 'microsoft/DialoGPT-small'\n",
+ "        self.cache_dir = 'cached'\n",
+ "        self.block_size = 512\n",
+ "        self.do_train = True\n",
+ "        self.do_eval = True\n",
+ "        self.evaluate_during_training = False\n",
+ "        self.per_gpu_train_batch_size = 4\n",
+ "        self.per_gpu_eval_batch_size = 4\n",
+ "        self.gradient_accumulation_steps = 1\n",
+ "        self.learning_rate = 5e-5\n",
+ "        self.weight_decay = 0.0\n",
+ "        self.adam_epsilon = 1e-8\n",
+ "        self.max_grad_norm = 1.0\n",
+ "        self.num_train_epochs = 4\n",
+ "        self.max_steps = -1\n",
+ "        self.warmup_steps = 0\n",
+ "        self.logging_steps = 1000\n",
+ "        self.save_steps = 3500\n",
+ "        self.save_total_limit = None\n",
+ "        self.eval_all_checkpoints = False\n",
+ "        self.no_cuda = False\n",
+ "        self.overwrite_output_dir = True\n",
+ "        self.overwrite_cache = True\n",
+ "        self.should_continue = False\n",
+ "        self.seed = 42\n",
+ "        self.local_rank = -1\n",
+ "        self.fp16 = False\n",
+ "        self.fp16_opt_level = 'O1'\n",
+ "\n",
+ "args = Args()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "8m7XgQqW3l_N"
+ },
+ "source": [
+ "def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:\n",
+ "    \"\"\" Train the model \"\"\"\n",
+ "    if args.local_rank in [-1, 0]:\n",
+ "        tb_writer = SummaryWriter()\n",
+ "\n",
+ "    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)\n",
+ "\n",
+ "    def collate(examples: List[torch.Tensor]):\n",
+ "        if tokenizer._pad_token is None:\n",
+ "            return pad_sequence(examples, batch_first=True)\n",
+ "        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n",
+ "\n",
+ "    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)\n",
+ "    train_dataloader = DataLoader(\n",
+ "        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True\n",
+ "    )\n",
+ "\n",
+ "    if args.max_steps > 0:\n",
+ "        t_total = args.max_steps\n",
+ "        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1\n",
+ "    else:\n",
+ "        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs\n",
+ "\n",
+ "    model = model.module if hasattr(model, \"module\") else model # Take care of distributed/parallel training\n",
+ "    model.resize_token_embeddings(len(tokenizer))\n",
+ "    # add_special_tokens_(model, tokenizer)\n",
+ "\n",
+ "\n",
+ "    # Prepare optimizer and schedule (linear warmup and decay)\n",
+ "    no_decay = [\"bias\", \"LayerNorm.weight\"]\n",
+ "    optimizer_grouped_parameters = [\n",
+ "        {\n",
+ "            \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n",
+ "            \"weight_decay\": args.weight_decay,\n",
+ "        },\n",
+ "        {\"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], \"weight_decay\": 0.0},\n",
+ "    ]\n",
+ "    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)\n",
+ "    scheduler = get_linear_schedule_with_warmup(\n",
+ "        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total\n",
+ "    )\n",
+ "\n",
+ "    # Check if saved optimizer or scheduler states exist\n",
+ "    if (\n",
+ "        args.model_name_or_path\n",
+ "        and os.path.isfile(os.path.join(args.model_name_or_path, \"optimizer.pt\"))\n",
+ "        and os.path.isfile(os.path.join(args.model_name_or_path, \"scheduler.pt\"))\n",
+ "    ):\n",
+ "        # Load in optimizer and scheduler states\n",
+ "        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, \"optimizer.pt\")))\n",
+ "        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, \"scheduler.pt\")))\n",
+ "\n",
+ "    if args.fp16:\n",
+ "        try:\n",
+ "            from apex import amp\n",
+ "        except ImportError:\n",
+ "            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use fp16 training.\")\n",
+ "        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)\n",
+ "\n",
+ "    # multi-gpu training (should be after apex fp16 initialization)\n",
+ "    if args.n_gpu > 1:\n",
+ "        model = torch.nn.DataParallel(model)\n",
+ "\n",
+ "    # Distributed training (should be after apex fp16 initialization)\n",
+ "    if args.local_rank != -1:\n",
+ "        model = torch.nn.parallel.DistributedDataParallel(\n",
+ "            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True\n",
+ "        )\n",
+ "\n",
+ "    # Train!\n",
+ "    logger.info(\"***** Running training *****\")\n",
+ "    logger.info(\" Num examples = %d\", len(train_dataset))\n",
+ "    logger.info(\" Num Epochs = %d\", args.num_train_epochs)\n",
+ "    logger.info(\" Instantaneous batch size per GPU = %d\", args.per_gpu_train_batch_size)\n",
+ "    logger.info(\n",
+ "        \" Total train batch size (w. parallel, distributed & accumulation) = %d\",\n",
+ "        args.train_batch_size\n",
+ "        * args.gradient_accumulation_steps\n",
+ "        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),\n",
+ "    )\n",
+ "    logger.info(\" Gradient Accumulation steps = %d\", args.gradient_accumulation_steps)\n",
+ "    logger.info(\" Total optimization steps = %d\", t_total)\n",
+ "\n",
+ "    global_step = 0\n",
+ "    epochs_trained = 0\n",
+ "    steps_trained_in_current_epoch = 0\n",
+ "    # Check if continuing training from a checkpoint\n",
+ "    if args.model_name_or_path and os.path.exists(args.model_name_or_path):\n",
+ "        try:\n",
+ "            # set global_step to the global_step of the last saved checkpoint from the model path\n",
+ "            checkpoint_suffix = args.model_name_or_path.split(\"-\")[-1].split(\"/\")[0]\n",
+ "            global_step = int(checkpoint_suffix)\n",
+ "            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)\n",
+ "            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)\n",
+ "\n",
+ "            logger.info(\" Continuing training from checkpoint, will skip to saved global_step\")\n",
+ "            logger.info(\" Continuing training from epoch %d\", epochs_trained)\n",
+ "            logger.info(\" Continuing training from global step %d\", global_step)\n",
+ "            logger.info(\" Will skip the first %d steps in the first epoch\", steps_trained_in_current_epoch)\n",
+ "        except ValueError:\n",
+ "            logger.info(\" Starting fine-tuning.\")\n",
+ "\n",
+ "    tr_loss, logging_loss = 0.0, 0.0\n",
+ "\n",
+ "    model.zero_grad()\n",
+ "    train_iterator = trange(\n",
+ "        epochs_trained, int(args.num_train_epochs), desc=\"Epoch\", disable=args.local_rank not in [-1, 0]\n",
+ "    )\n",
+ "    set_seed(args) # Added here for reproducibility\n",
+ "    for _ in train_iterator:\n",
+ "        epoch_iterator = tqdm(train_dataloader, desc=\"Iteration\", disable=args.local_rank not in [-1, 0])\n",
+ "        for step, batch in enumerate(epoch_iterator):\n",
+ "\n",
+ "            # Skip past any already trained steps if resuming training\n",
+ "            if steps_trained_in_current_epoch > 0:\n",
+ "                steps_trained_in_current_epoch -= 1\n",
+ "                continue\n",
+ "\n",
+ "            inputs, labels = (batch, batch)\n",
+ "            if inputs.shape[1] > 1024: continue\n",
+ "            inputs = inputs.to(args.device)\n",
+ "            labels = labels.to(args.device)\n",
+ "            model.train()\n",
+ "            outputs = model(inputs, labels=labels)\n",
+ "            loss = outputs[0] # model outputs are always tuple in transformers (see doc)\n",
+ "\n",
+ "            if args.n_gpu > 1:\n",
+ "                loss = loss.mean() # mean() to average on multi-gpu parallel training\n",
+ "            if args.gradient_accumulation_steps > 1:\n",
+ "                loss = loss / args.gradient_accumulation_steps\n",
+ "\n",
+ "            if args.fp16:\n",
+ "                with amp.scale_loss(loss, optimizer) as scaled_loss:\n",
+ "                    scaled_loss.backward()\n",
+ "            else:\n",
+ "                loss.backward()\n",
+ "\n",
+ "            tr_loss += loss.item()\n",
+ "            if (step + 1) % args.gradient_accumulation_steps == 0:\n",
+ "                if args.fp16:\n",
+ "                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)\n",
+ "                else:\n",
+ "                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)\n",
+ "                optimizer.step()\n",
+ "                scheduler.step() # Update learning rate schedule\n",
+ "                model.zero_grad()\n",
+ "                global_step += 1\n",
+ "\n",
+ "                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:\n",
+ "                    # Log metrics\n",
+ "                    if (\n",
+ "                        args.local_rank == -1 and args.evaluate_during_training\n",
+ "                    ): # Only evaluate when single GPU otherwise metrics may not average well\n",
+ "                        results = evaluate(args, model, tokenizer)\n",
+ "                        for key, value in results.items():\n",
+ "                            tb_writer.add_scalar(\"eval_{}\".format(key), value, global_step)\n",
+ "                    tb_writer.add_scalar(\"lr\", scheduler.get_lr()[0], global_step)\n",
+ "                    tb_writer.add_scalar(\"loss\", (tr_loss - logging_loss) / args.logging_steps, global_step)\n",
+ "                    logging_loss = tr_loss\n",
+ "\n",
+ "                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:\n",
+ "                    checkpoint_prefix = \"checkpoint\"\n",
+ "                    # Save model checkpoint\n",
+ "                    output_dir = os.path.join(args.output_dir, \"{}-{}\".format(checkpoint_prefix, global_step))\n",
+ "                    os.makedirs(output_dir, exist_ok=True)\n",
+ "                    model_to_save = (\n",
+ "                        model.module if hasattr(model, \"module\") else model\n",
+ "                    ) # Take care of distributed/parallel training\n",
+ "                    model_to_save.save_pretrained(output_dir)\n",
+ "                    tokenizer.save_pretrained(output_dir)\n",
+ "\n",
+ "                    torch.save(args, os.path.join(output_dir, \"training_args.bin\"))\n",
+ "                    logger.info(\"Saving model checkpoint to %s\", output_dir)\n",
+ "\n",
+ "                    _rotate_checkpoints(args, checkpoint_prefix)\n",
+ "\n",
+ "                    torch.save(optimizer.state_dict(), os.path.join(output_dir, \"optimizer.pt\"))\n",
+ "                    torch.save(scheduler.state_dict(), os.path.join(output_dir, \"scheduler.pt\"))\n",
+ "                    logger.info(\"Saving optimizer and scheduler states to %s\", output_dir)\n",
+ "\n",
+ "            if args.max_steps > 0 and global_step > args.max_steps:\n",
+ "                epoch_iterator.close()\n",
+ "                break\n",
+ "        if args.max_steps > 0 and global_step > args.max_steps:\n",
+ "            train_iterator.close()\n",
+ "            break\n",
+ "\n",
+ "    if args.local_rank in [-1, 0]:\n",
+ "        tb_writer.close()\n",
+ "\n",
+ "    return global_step, tr_loss / global_step\n",
+ "\n",
+ "# Evaluation of the model\n",
+ "\n",
+ "def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix=\"\") -> Dict:\n",
+ "    # Loop to handle MNLI double evaluation (matched, mis-matched)\n",
+ "    eval_output_dir = args.output_dir\n",
+ "\n",
+ "    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)\n",
+ "    os.makedirs(eval_output_dir, exist_ok=True)\n",
+ "    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)\n",
+ "    # Note that DistributedSampler samples randomly\n",
+ "\n",
+ "    def collate(examples: List[torch.Tensor]):\n",
+ "        if tokenizer._pad_token is None:\n",
+ "            return pad_sequence(examples, batch_first=True)\n",
+ "        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n",
+ "\n",
+ "    eval_sampler = SequentialSampler(eval_dataset)\n",
+ "    eval_dataloader = DataLoader(\n",
+ "        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True\n",
+ "    )\n",
+ "\n",
+ "    # multi-gpu evaluate\n",
+ "    if args.n_gpu > 1:\n",
+ "        model = torch.nn.DataParallel(model)\n",
+ "\n",
+ "    # Eval!\n",
+ "    logger.info(\"***** Running evaluation {} *****\".format(prefix))\n",
+ "    logger.info(\" Num examples = %d\", len(eval_dataset))\n",
+ "    logger.info(\" Batch size = %d\", args.eval_batch_size)\n",
+ "    eval_loss = 0.0\n",
+ "    nb_eval_steps = 0\n",
+ "    model.eval()\n",
+ "\n",
+ "    for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n",
+ "        inputs, labels = (batch, batch)\n",
+ "        inputs = inputs.to(args.device)\n",
+ "        labels = labels.to(args.device)\n",
+ "\n",
+ "        with torch.no_grad():\n",
+ "            outputs = model(inputs, labels=labels)\n",
+ "            lm_loss = outputs[0]\n",
+ "            eval_loss += lm_loss.mean().item()\n",
+ "        nb_eval_steps += 1\n",
+ "\n",
+ "    eval_loss = eval_loss / nb_eval_steps\n",
+ "    perplexity = torch.exp(torch.tensor(eval_loss))\n",
+ "\n",
+ "    result = {\"perplexity\": perplexity}\n",
+ "\n",
+ "    output_eval_file = os.path.join(eval_output_dir, prefix, \"eval_results.txt\")\n",
+ "    with open(output_eval_file, \"w\") as writer:\n",
+ "        logger.info(\"***** Eval results {} *****\".format(prefix))\n",
+ "        for key in sorted(result.keys()):\n",
+ "            logger.info(\" %s = %s\", key, str(result[key]))\n",
+ "            writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n",
+ "\n",
+ "    return result"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
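
The perplexity that evaluate() reports is simply the exponential of the mean per-batch causal-LM loss; a minimal illustration with made-up loss values:

import torch

batch_losses = [3.2, 3.0, 3.4]  # hypothetical lm_loss values, one per eval batch
eval_loss = sum(batch_losses) / len(batch_losses)
perplexity = torch.exp(torch.tensor(eval_loss))  # about 24.53, i.e. exp(3.2)
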
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "nL2b4rsF3uAn"
+ },
+ "source": [
+ "def main(df_trn, df_val):\n",
+ "    args = Args()\n",
+ "\n",
+ "    if args.should_continue:\n",
+ "        sorted_checkpoints = _sorted_checkpoints(args)\n",
+ "        if len(sorted_checkpoints) == 0:\n",
+ "            raise ValueError(\"Used --should_continue but no checkpoint was found in --output_dir.\")\n",
+ "        else:\n",
+ "            args.model_name_or_path = sorted_checkpoints[-1]\n",
+ "\n",
+ "    if (\n",
+ "        os.path.exists(args.output_dir)\n",
+ "        and os.listdir(args.output_dir)\n",
+ "        and args.do_train\n",
+ "        and not args.overwrite_output_dir\n",
+ "        and not args.should_continue\n",
+ "    ):\n",
+ "        raise ValueError(\n",
+ "            \"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.\".format(\n",
+ "                args.output_dir\n",
+ "            )\n",
+ "        )\n",
+ "\n",
+ "    # Setup CUDA, GPU & distributed training\n",
+ "    device = torch.device(\"cuda\")\n",
+ "    args.n_gpu = torch.cuda.device_count()\n",
+ "    args.device = device\n",
+ "\n",
+ "    # Setup logging\n",
+ "    logging.basicConfig(\n",
+ "        format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n",
+ "        datefmt=\"%m/%d/%Y %H:%M:%S\",\n",
+ "        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,\n",
+ "    )\n",
+ "    logger.warning(\n",
+ "        \"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s\",\n",
+ "        args.local_rank,\n",
+ "        device,\n",
+ "        args.n_gpu,\n",
+ "        bool(args.local_rank != -1),\n",
+ "        args.fp16,\n",
+ "    )\n",
+ "\n",
+ "    # Set seed\n",
+ "    set_seed(args)\n",
+ "\n",
+ "    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)\n",
+ "    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)\n",
+ "    model = AutoModelWithLMHead.from_pretrained(\n",
+ "        args.model_name_or_path,\n",
+ "        from_tf=False,\n",
+ "        config=config,\n",
+ "        cache_dir=args.cache_dir,\n",
+ "    )\n",
+ "    model.to(args.device)\n",
+ "\n",
+ "    logger.info(\"Training/evaluation parameters %s\", args)\n",
+ "\n",
+ "    # Training\n",
+ "    if args.do_train:\n",
+ "        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)\n",
+ "\n",
+ "        global_step, tr_loss = train(args, train_dataset, model, tokenizer)\n",
+ "        logger.info(\" global_step = %s, average loss = %s\", global_step, tr_loss)\n",
+ "\n",
+ "    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()\n",
+ "    if args.do_train:\n",
+ "        # Create output directory if needed\n",
+ "        os.makedirs(args.output_dir, exist_ok=True)\n",
+ "\n",
+ "        logger.info(\"Saving model checkpoint to %s\", args.output_dir)\n",
+ "        # Save a trained model, configuration and tokenizer using `save_pretrained()`.\n",
+ "        # They can then be reloaded using `from_pretrained()`\n",
+ "        model_to_save = (\n",
+ "            model.module if hasattr(model, \"module\") else model\n",
+ "        ) # Take care of distributed/parallel training\n",
+ "        model_to_save.save_pretrained(args.output_dir)\n",
+ "        tokenizer.save_pretrained(args.output_dir)\n",
+ "\n",
+ "        # Good practice: save your training arguments together with the trained model\n",
+ "        torch.save(args, os.path.join(args.output_dir, \"training_args.bin\"))\n",
+ "\n",
+ "        # Load a trained model and vocabulary that you have fine-tuned\n",
+ "        model = AutoModelWithLMHead.from_pretrained(args.output_dir)\n",
+ "        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)\n",
+ "        model.to(args.device)\n",
+ "\n",
+ "    # Evaluation\n",
+ "    results = {}\n",
+ "    if args.do_eval and args.local_rank in [-1, 0]:\n",
+ "        checkpoints = [args.output_dir]\n",
+ "        if args.eval_all_checkpoints:\n",
+ "            checkpoints = list(\n",
+ "                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + \"/**/\" + WEIGHTS_NAME, recursive=True))\n",
+ "            )\n",
+ "            logging.getLogger(\"transformers.modeling_utils\").setLevel(logging.WARN) # Reduce logging\n",
+ "        logger.info(\"Evaluate the following checkpoints: %s\", checkpoints)\n",
+ "        for checkpoint in checkpoints:\n",
+ "            global_step = checkpoint.split(\"-\")[-1] if len(checkpoints) > 1 else \"\"\n",
+ "            prefix = checkpoint.split(\"/\")[-1] if checkpoint.find(\"checkpoint\") != -1 else \"\"\n",
+ "\n",
+ "            model = AutoModelWithLMHead.from_pretrained(checkpoint)\n",
+ "            model.to(args.device)\n",
+ "            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)\n",
+ "            result = dict((k + \"_{}\".format(global_step), v) for k, v in result.items())\n",
+ "            results.update(result)\n",
+ "\n",
+ "    return results"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bx4BrSPQ3yW8"
+ },
+ "source": [
+ "main(trn_df, val_df)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "vXESR3FP4Poe"
+ },
+ "source": [
+ "tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')\n",
+ "model = AutoModelWithLMHead.from_pretrained('output-small')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "RXL71M_W4UOp"
+ },
+ "source": [
+ "for step in range(4):\n",
+ "    # encode the new user input, add the eos_token and return a tensor in PyTorch\n",
+ "    new_user_input_ids = tokenizer.encode(input(\">> User:\") + tokenizer.eos_token, return_tensors='pt')\n",
+ "    # print(new_user_input_ids)\n",
+ "\n",
+ "    # append the new user input tokens to the chat history\n",
+ "    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids\n",
+ "\n",
+ "    # generate a response while limiting the total chat history to 200 tokens\n",
+ "    chat_history_ids = model.generate(\n",
+ "        bot_input_ids, max_length=200,\n",
+ "        pad_token_id=tokenizer.eos_token_id,\n",
+ "        no_repeat_ngram_size=3,\n",
+ "        do_sample=True,\n",
+ "        top_k=100,\n",
+ "        top_p=0.7,\n",
+ "        temperature=0.8\n",
+ "    )\n",
+ "\n",
+ "    # pretty print the last output tokens from the bot\n",
+ "    print(\"Azomekern: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
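
The interactive loop above can also be wrapped in a small helper so the chat history is passed around explicitly; a sketch reusing the exact generate() settings from that cell (the function name chat is illustrative, not part of the notebook):

def chat(model, tokenizer, history_ids=None, max_length=200):
    # read one user turn, append EOS, and extend the running history
    new_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    bot_input_ids = new_ids if history_ids is None else torch.cat([history_ids, new_ids], dim=-1)
    history_ids = model.generate(
        bot_input_ids, max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True, top_k=100, top_p=0.7, temperature=0.8,
    )
    print("Azomekern: {}".format(tokenizer.decode(history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))
    return history_ids

history = None
for _ in range(4):
    history = chat(model, tokenizer, history)
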
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "LbRL51QLD7Lf"
+ },
+ "source": [
+ "!sudo apt-get install git-lfs"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "GaFNUk5QFn6H"
+ },
+ "source": [
+ "!pip install huggingface_hub"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "EHv1wuvCGMeJ"
+ },
+ "source": [
+ "!huggingface-cli login\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_Od3UgHDG72I"
+ },
+ "source": [
+ "!sudo apt-get install git-lfs"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HeRBcWdoJYYM"
+ },
+ "source": [
+ "!git config --global user.email \"emanuelmaximum40@gmail.com\"\n",
+ "# Tip: using the same email as your huggingface.co account will link your commits to your profile\n",
+ "!git config --global user.name \"Random0-0\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pBZI6c2HLXZl"
+ },
+ "source": [
+ "MY_MODEL_NAME = 'Random0-0/DialoGPT-small-Azomekern'\n",
+ "with open('HuggingFace-API-key.txt', 'rt') as f:\n",
+ "    HUGGINGFACE_API_KEY = f.read().strip()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
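
The last cell reads the Hub token and the target repo name but the notebook stops before the actual upload; one plausible final step, assuming the fine-tuned model and tokenizer loaded from output-small are still in memory, is push_to_hub (on older transformers releases the token argument is use_auth_token, on newer ones it is token):

# Hypothetical publishing step, not part of the uploaded notebook.
model.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)
tokenizer.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)
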