{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring the GLUE - MRPC dataset" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "from pprint import pprint\n", "\n", "raw_dataset = load_dataset(path = \"glue\", name = \"mrpc\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_home_.cache_huggingface_datasets_glue_mrpc_0.0.0_bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c.lock\n", "downloads\n", "glue\n" ] } ], "source": [ "!ls ~/.cache/huggingface/datasets/" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " 'label': 1,\n", " 'idx': 0}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': Value(dtype='string', id=None),\n", " 'sentence2': Value(dtype='string', id=None),\n", " 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),\n", " 'idx': Value(dtype='int32', id=None)}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"].features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'idx': 16,\n", " 'label': 0,\n", " 'sentence1': 'Rudder was most recently senior vice president for the '\n", " 'Developer & Platform Evangelism Business .',\n", " 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the '\n", " 'Developer and Platform Evangelism unit , will lead the new '\n", " 'entity .'}\n", "{'idx': 812,\n", " 'label': 0,\n", " 'sentence1': 'However , EPA officials would not confirm the 20 percent figure '\n", " '.',\n", " 'sentence2': 'Only in the past few weeks have officials settled on the 20 '\n", " 'percent figure .'}\n" ] } ], "source": [ "# Look at the 15th and 87th element of the train and validation datasets respectively\n", "pprint(raw_dataset[\"train\"][15])\n", "pprint(raw_dataset[\"validation\"][87])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tokenizer for pair processing" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"bert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " 'input_ids': [101,\n", " 2023,\n", " 2003,\n", " 1996,\n", " 2034,\n", " 6251,\n", " 1012,\n", " 102,\n", " 2023,\n", " 2003,\n", " 1996,\n", " 2117,\n", " 2028,\n", " 1012,\n", " 102],\n", " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "inputs = tokenizer(\"This is the first sentence.\", \"This is the second one.\")\n", "pprint(inputs)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[CLS] this is the first sentence. [SEP] this is the second one. [SEP]'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(inputs['input_ids'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we can see that the tokenizer has concatenated the two sentences and inserted the special `[CLS]` and `[SEP]` tokens, because that is how BERT was trained for the next sentence prediction task." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence1\"]))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence2\"]))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence1\"], raw_dataset[\"train\"][15][\"sentence2\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we need to observe the `token_type_ids` field. It differs when we encode the two sentences together versus encoding each one independently, and the `[CLS]` and `[SEP]` tokens are placed differently in the two cases." ] },
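{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (a minimal sketch reusing the `inputs` pair encoded above), we can line up every token with its `token_type_id`: the `[CLS]` token, the first sentence and the first `[SEP]` get segment id 0, while the second sentence and the final `[SEP]` get segment id 1." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pair each token with its segment id (0 = first sentence, 1 = second sentence)\n", "tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"])\n", "pprint(list(zip(tokens, inputs[\"token_type_ids\"])))" ] },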
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset Map to create new datasets" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(example):\n", " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 3668/3668 [00:00<00:00, 9953.91 examples/s] \n", "Map: 100%|██████████| 408/408 [00:00<00:00, 9044.46 examples/s]\n", "Map: 100%|██████████| 1725/1725 [00:00<00:00, 9891.51 examples/s] \n" ] }, { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)\n", "tokenized_datasets" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we see that, because our tokenize function returns the new keys `'input_ids', 'token_type_ids', 'attention_mask'`, those columns simply get added in `tokenized_datasets` while the existing columns are kept, and the original `raw_dataset` is left untouched." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorWithPadding\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This `DataCollatorWithPadding` performs dynamic padding: it pads every sequence in a batch to the length of the longest sequence in that batch, rather than to one fixed length for the whole dataset."
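] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In practice the collator is usually passed as the `collate_fn` of a PyTorch `DataLoader`. The following is only a minimal sketch of that wiring, assuming we drop the raw text columns and rename `label` to `labels` so that only model inputs reach the collator:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "# Keep only the columns the model consumes and rename the label column\n", "train_ds = tokenized_datasets[\"train\"].remove_columns([\"sentence1\", \"sentence2\", \"idx\"])\n", "train_ds = train_ds.rename_column(\"label\", \"labels\")\n", "train_ds.set_format(\"torch\")\n", "\n", "# Each batch is padded independently to the longest sequence it contains\n", "train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=data_collator)\n", "batch = next(iter(train_dataloader))\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next cells instead collate a small batch by hand, which makes the dynamic padding easier to see."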
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[50, 59, 47]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "samples = tokenized_datasets[\"train\"][:3]\n", "samples = {k: v for k, v in samples.items()}\n", "[len(x) for x in samples[\"input_ids\"]]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': ['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n", " 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],\n", " 'sentence2': ['Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " \"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\",\n", " \"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\"],\n", " 'label': [1, 0, 1],\n", " 'idx': [0, 1, 2],\n", " 'input_ids': [[101,\n", " 2572,\n", " 3217,\n", " 5831,\n", " 5496,\n", " 2010,\n", " 2567,\n", " 1010,\n", " 3183,\n", " 2002,\n", " 2170,\n", " 1000,\n", " 1996,\n", " 7409,\n", " 1000,\n", " 1010,\n", " 1997,\n", " 9969,\n", " 4487,\n", " 23809,\n", " 3436,\n", " 2010,\n", " 3350,\n", " 1012,\n", " 102,\n", " 7727,\n", " 2000,\n", " 2032,\n", " 2004,\n", " 2069,\n", " 1000,\n", " 1996,\n", " 7409,\n", " 1000,\n", " 1010,\n", " 2572,\n", " 3217,\n", " 5831,\n", " 5496,\n", " 2010,\n", " 2567,\n", " 1997,\n", " 9969,\n", " 4487,\n", " 23809,\n", " 3436,\n", " 2010,\n", " 3350,\n", " 1012,\n", " 102],\n", " [101,\n", " 9805,\n", " 3540,\n", " 11514,\n", " 2050,\n", " 3079,\n", " 11282,\n", " 2243,\n", " 1005,\n", " 1055,\n", " 2077,\n", " 4855,\n", " 1996,\n", " 4677,\n", " 2000,\n", " 3647,\n", " 4576,\n", " 1999,\n", " 2687,\n", " 2005,\n", " 1002,\n", " 1016,\n", " 1012,\n", " 1019,\n", " 4551,\n", " 1012,\n", " 102,\n", " 9805,\n", " 3540,\n", " 11514,\n", " 2050,\n", " 4149,\n", " 11282,\n", " 2243,\n", " 1005,\n", " 1055,\n", " 1999,\n", " 2786,\n", " 2005,\n", " 1002,\n", " 6353,\n", " 2509,\n", " 2454,\n", " 1998,\n", " 2853,\n", " 2009,\n", " 2000,\n", " 3647,\n", " 4576,\n", " 2005,\n", " 1002,\n", " 1015,\n", " 1012,\n", " 1022,\n", " 4551,\n", " 1999,\n", " 2687,\n", " 1012,\n", " 102],\n", " [101,\n", " 2027,\n", " 2018,\n", " 2405,\n", " 2019,\n", " 15147,\n", " 2006,\n", " 1996,\n", " 4274,\n", " 2006,\n", " 2238,\n", " 2184,\n", " 1010,\n", " 5378,\n", " 1996,\n", " 6636,\n", " 2005,\n", " 5096,\n", " 1010,\n", " 2002,\n", " 2794,\n", " 1012,\n", " 102,\n", " 2006,\n", " 2238,\n", " 2184,\n", " 1010,\n", " 1996,\n", " 2911,\n", " 1005,\n", " 1055,\n", " 5608,\n", " 2018,\n", " 2405,\n", " 2019,\n", " 15147,\n", " 2006,\n", " 1996,\n", " 4274,\n", " 1010,\n", " 5378,\n", " 1996,\n", " 14792,\n", " 2005,\n", " 5096,\n", " 1012,\n", " 102]],\n", " 'token_type_ids': [[0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 
1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1]],\n", " 'attention_mask': [[1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1]]}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "samples" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "samples_to_collate = tokenized_datasets[\"train\"][:3]\n", "samples_to_collate.pop(\"sentence1\"); samples_to_collate.pop(\"sentence2\"); samples_to_collate.pop(\"idx\");" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/plain": [ "{'input_ids': torch.Size([3, 59]),\n", " 'token_type_ids': torch.Size([3, 59]),\n", " 'attention_mask': torch.Size([3, 59]),\n", " 'labels': torch.Size([3])}" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator(samples_to_collate)\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Replication of the above preprocessing on GLUE-SST2 dataset" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 4.89MB/s]\n", "Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 128kB/s]\n", "Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 260kB/s]\n", "Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 467302.76 examples/s]\n", "Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 137580.24 examples/s]\n", "Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 205588.75 examples/s]\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "raw_dataset = load_dataset(\"glue\", \"sst2\")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence': 'hide new secretions from the parental units ',\n", " 'label': 0,\n", " 'idx': 0}" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "from transformers import AutoModelForSequenceClassification\n", "\n", "checkpoint = \"bert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(sequence):\n", " return tokenizer(sequence[\"sentence\"], padding = True, truncation = True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 0%| | 0/67349 [00:00