{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring the GLUE - MRPC dataset" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "from pprint import pprint\n", "\n", "raw_dataset = load_dataset(path = \"glue\", name = \"mrpc\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_home_.cache_huggingface_datasets_glue_mrpc_0.0.0_bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c.lock\n", "downloads\n", "glue\n" ] } ], "source": [ "!ls ~/.cache/huggingface/datasets/" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " 'label': 1,\n", " 'idx': 0}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': Value(dtype='string', id=None),\n", " 'sentence2': Value(dtype='string', id=None),\n", " 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),\n", " 'idx': Value(dtype='int32', id=None)}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"].features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'idx': 16,\n", " 'label': 0,\n", " 'sentence1': 'Rudder was most recently senior vice president for the '\n", " 'Developer & Platform Evangelism Business .',\n", " 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the '\n", " 'Developer and Platform Evangelism unit , will lead the new '\n", " 'entity .'}\n", "{'idx': 812,\n", " 'label': 0,\n", " 'sentence1': 'However , EPA officials would not confirm the 20 percent figure '\n", " '.',\n", " 'sentence2': 'Only in the past few weeks have officials settled on the 20 '\n", " 'percent figure .'}\n" ] } ], "source": [ "# Look at the 15th and 87th element of the train and validation datasets respectively\n", "pprint(raw_dataset[\"train\"][15])\n", "pprint(raw_dataset[\"validation\"][87])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tokenizer for pair processing" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"bert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " 'input_ids': [101,\n", " 2023,\n", " 2003,\n", " 1996,\n", " 2034,\n", " 6251,\n", " 1012,\n", " 102,\n", " 2023,\n", " 2003,\n", " 1996,\n", " 2117,\n", " 2028,\n", " 1012,\n", " 102],\n", " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "inputs = tokenizer(\"This is the first sentence.\", \"This is the second one.\")\n", "pprint(inputs)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[CLS] this is the first sentence. [SEP] this is the second one. [SEP]'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(inputs['input_ids'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we can see that the tokenizer has concatenated the two sentences and inserted the special `[CLS]` and `[SEP]` tokens, because that is how BERT was trained for the next sentence prediction task." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence1\"]))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence2\"]))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "print(tokenizer(raw_dataset[\"train\"][15][\"sentence1\"], raw_dataset[\"train\"][15][\"sentence2\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we need to observe the `token_type_ids` field. It differs when we encode the two sentences together versus encoding each one independently, and the `[CLS]` and `[SEP]` tokens are placed differently in the two cases." ] },
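{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (a minimal sketch reusing the `inputs` pair encoded above), we can line up every token with its `token_type_id`: the `[CLS]` token, the first sentence and the first `[SEP]` get segment id 0, while the second sentence and the final `[SEP]` get segment id 1." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pair each token with its segment id (0 = first sentence, 1 = second sentence)\n", "tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"])\n", "pprint(list(zip(tokens, inputs[\"token_type_ids\"])))" ] },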
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset Map to create new datasets" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(example):\n", " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 3668/3668 [00:00<00:00, 9953.91 examples/s] \n", "Map: 100%|██████████| 408/408 [00:00<00:00, 9044.46 examples/s]\n", "Map: 100%|██████████| 1725/1725 [00:00<00:00, 9891.51 examples/s] \n" ] }, { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)\n", "tokenized_datasets" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we see that, because our tokenize function returns the new keys `'input_ids', 'token_type_ids', 'attention_mask'`, those columns simply get added in `tokenized_datasets` while the existing columns are kept, and the original `raw_dataset` is left untouched." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorWithPadding\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This `DataCollatorWithPadding` performs dynamic padding: it pads every sequence in a batch to the length of the longest sequence in that batch, rather than to one fixed length for the whole dataset."
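] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In practice the collator is usually passed as the `collate_fn` of a PyTorch `DataLoader`. The following is only a minimal sketch of that wiring, assuming we drop the raw text columns and rename `label` to `labels` so that only model inputs reach the collator:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "# Keep only the columns the model consumes and rename the label column\n", "train_ds = tokenized_datasets[\"train\"].remove_columns([\"sentence1\", \"sentence2\", \"idx\"])\n", "train_ds = train_ds.rename_column(\"label\", \"labels\")\n", "train_ds.set_format(\"torch\")\n", "\n", "# Each batch is padded independently to the longest sequence it contains\n", "train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=data_collator)\n", "batch = next(iter(train_dataloader))\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next cells instead collate a small batch by hand, which makes the dynamic padding easier to see."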
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[50, 59, 47]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "samples = tokenized_datasets[\"train\"][:3]\n", "samples = {k: v for k, v in samples.items()}\n", "[len(x) for x in samples[\"input_ids\"]]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': ['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n", " 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],\n", " 'sentence2': ['Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " \"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\",\n", " \"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\"],\n", " 'label': [1, 0, 1],\n", " 'idx': [0, 1, 2],\n", " 'input_ids': [[101,\n", " 2572,\n", " 3217,\n", " 5831,\n", " 5496,\n", " 2010,\n", " 2567,\n", " 1010,\n", " 3183,\n", " 2002,\n", " 2170,\n", " 1000,\n", " 1996,\n", " 7409,\n", " 1000,\n", " 1010,\n", " 1997,\n", " 9969,\n", " 4487,\n", " 23809,\n", " 3436,\n", " 2010,\n", " 3350,\n", " 1012,\n", " 102,\n", " 7727,\n", " 2000,\n", " 2032,\n", " 2004,\n", " 2069,\n", " 1000,\n", " 1996,\n", " 7409,\n", " 1000,\n", " 1010,\n", " 2572,\n", " 3217,\n", " 5831,\n", " 5496,\n", " 2010,\n", " 2567,\n", " 1997,\n", " 9969,\n", " 4487,\n", " 23809,\n", " 3436,\n", " 2010,\n", " 3350,\n", " 1012,\n", " 102],\n", " [101,\n", " 9805,\n", " 3540,\n", " 11514,\n", " 2050,\n", " 3079,\n", " 11282,\n", " 2243,\n", " 1005,\n", " 1055,\n", " 2077,\n", " 4855,\n", " 1996,\n", " 4677,\n", " 2000,\n", " 3647,\n", " 4576,\n", " 1999,\n", " 2687,\n", " 2005,\n", " 1002,\n", " 1016,\n", " 1012,\n", " 1019,\n", " 4551,\n", " 1012,\n", " 102,\n", " 9805,\n", " 3540,\n", " 11514,\n", " 2050,\n", " 4149,\n", " 11282,\n", " 2243,\n", " 1005,\n", " 1055,\n", " 1999,\n", " 2786,\n", " 2005,\n", " 1002,\n", " 6353,\n", " 2509,\n", " 2454,\n", " 1998,\n", " 2853,\n", " 2009,\n", " 2000,\n", " 3647,\n", " 4576,\n", " 2005,\n", " 1002,\n", " 1015,\n", " 1012,\n", " 1022,\n", " 4551,\n", " 1999,\n", " 2687,\n", " 1012,\n", " 102],\n", " [101,\n", " 2027,\n", " 2018,\n", " 2405,\n", " 2019,\n", " 15147,\n", " 2006,\n", " 1996,\n", " 4274,\n", " 2006,\n", " 2238,\n", " 2184,\n", " 1010,\n", " 5378,\n", " 1996,\n", " 6636,\n", " 2005,\n", " 5096,\n", " 1010,\n", " 2002,\n", " 2794,\n", " 1012,\n", " 102,\n", " 2006,\n", " 2238,\n", " 2184,\n", " 1010,\n", " 1996,\n", " 2911,\n", " 1005,\n", " 1055,\n", " 5608,\n", " 2018,\n", " 2405,\n", " 2019,\n", " 15147,\n", " 2006,\n", " 1996,\n", " 4274,\n", " 1010,\n", " 5378,\n", " 1996,\n", " 14792,\n", " 2005,\n", " 5096,\n", " 1012,\n", " 102]],\n", " 'token_type_ids': [[0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 
1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1]],\n", " 'attention_mask': [[1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1],\n", " [1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1]]}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "samples" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "samples_to_collate = tokenized_datasets[\"train\"][:3]\n", "samples_to_collate.pop(\"sentence1\"); samples_to_collate.pop(\"sentence2\"); samples_to_collate.pop(\"idx\");" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/plain": [ "{'input_ids': torch.Size([3, 59]),\n", " 'token_type_ids': torch.Size([3, 59]),\n", " 'attention_mask': torch.Size([3, 59]),\n", " 'labels': torch.Size([3])}" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator(samples_to_collate)\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Replication of the above preprocessing on GLUE-SST2 dataset" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 4.89MB/s]\n", "Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 128kB/s]\n", "Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 260kB/s]\n", "Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 467302.76 examples/s]\n", "Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 137580.24 examples/s]\n", "Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 205588.75 examples/s]\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "raw_dataset = load_dataset(\"glue\", \"sst2\")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence': 'hide new secretions from the parental units ',\n", " 'label': 0,\n", " 'idx': 0}" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "from transformers import AutoModelForSequenceClassification\n", "\n", "checkpoint = \"bert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(sequence):\n", " return tokenizer(sequence[\"sentence\"], padding = True, truncation = True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 0%| | 0/67349 [00:00