Alexandru Gherghescu committed

Commit 7e53000
Parent(s): b11cd65

Add tokenized dataset, pre-training script
Browse files

- README.md +17 -2
- data/dataset_dict.json +1 -0
- data/test/data-00000-of-00001.arrow +3 -0
- data/test/dataset_info.json +121 -0
- data/test/state.json +13 -0
- data/train/data-00000-of-00019.arrow +3 -0
- data/train/data-00001-of-00019.arrow +3 -0
- data/train/data-00002-of-00019.arrow +3 -0
- data/train/data-00003-of-00019.arrow +3 -0
- data/train/data-00004-of-00019.arrow +3 -0
- data/train/data-00005-of-00019.arrow +3 -0
- data/train/data-00006-of-00019.arrow +3 -0
- data/train/data-00007-of-00019.arrow +3 -0
- data/train/data-00008-of-00019.arrow +3 -0
- data/train/data-00009-of-00019.arrow +3 -0
- data/train/data-00010-of-00019.arrow +3 -0
- data/train/data-00011-of-00019.arrow +3 -0
- data/train/data-00012-of-00019.arrow +3 -0
- data/train/data-00013-of-00019.arrow +3 -0
- data/train/data-00014-of-00019.arrow +3 -0
- data/train/data-00015-of-00019.arrow +3 -0
- data/train/data-00016-of-00019.arrow +3 -0
- data/train/data-00017-of-00019.arrow +3 -0
- data/train/data-00018-of-00019.arrow +3 -0
- data/train/dataset_info.json +121 -0
- data/train/state.json +67 -0
- pre_training.py +16 -5
README.md
CHANGED
@@ -9,15 +9,30 @@ This repository aims to re-create the GPT 1 architecture, using HuggingFace's
 `transformers`.
 
 The original paper of the model can be found [here][gpt1-paper]. The blog post
-accompanying this paper is [here][gpt1-blog].
+accompanying this paper is [here][gpt1-blog]. The code and weights can be found
+[here][gpt1-code].
 
 The original model was trained, as noted in OpenAI's blogpost, 1 month on 8
-GPUs, on the BookCorpus dataset.
+GPUs (P600s), on the original BookCorpus dataset (containing around ~7,000
+books).
+
+This model is instead trained on the [BookCorpusOpen][bco-dataset] dataset,
+which contains ~17,000 books (around ~6 GB). The tokenized dataset (~9 GB) can
+be found in `data/` in this repository. The tokenizer is a BPE tokenizer with
+40,000 vocabulary merges, as in the original paper. It is re-implemented using
+HuggingFace's `tokenizers` library and trained on the
+[BookCorpusOpen][bco-dataset] dataset.
 
 [gpt1-paper]:
 https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
 [gpt1-blog]: https://openai.com/research/language-unsupervised
+[gpt1-code]: https://github.com/openai/finetune-transformer-lm/
+[bco-dataset]: https://huggingface.co/datasets/lucadiliello/bookcorpusopen
 
 # How to use
 
+See `preprocessing.py` for how the data was preprocessed and tokenized.
+
+See `pre_training.py` for how the model was pre-trained.
+
 See `inference.py` for an example.
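The README above describes the tokenizer as BPE with 40,000 merges,
re-implemented with HuggingFace's `tokenizers` library and trained on
BookCorpusOpen. Below is a minimal sketch of how such a tokenizer could be
trained; the special tokens, pre-tokenizer, and column name are assumptions,
not necessarily what the repository's `preprocessing.py` does:

```python
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# assumed source corpus and text column name
books = load_dataset('lucadiliello/bookcorpusopen', split='train')

tokenizer = Tokenizer(models.BPE(unk_token='<unk>'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# ~40,000 merges on top of the base alphabet, as in the GPT-1 paper
trainer = trainers.BpeTrainer(vocab_size=40000,
                              special_tokens=['<unk>', '<eos>'])

def text_batches(dataset, batch_size=1000):
    # stream the corpus in chunks so the whole dataset never sits in memory
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]['text']

tokenizer.train_from_iterator(text_batches(books), trainer=trainer)
tokenizer.save('tokenizer.json')
```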
data/dataset_dict.json
ADDED
@@ -0,0 +1 @@
+{"splits": ["train", "test"]}
data/test/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:092fbd8038f5797037574c4002cd5a6e1ba33ebcc067f3d9ae486cedae35c6ea
+size 477939672
data/test/dataset_info.json
ADDED
@@ -0,0 +1,121 @@
+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "bookcorpusopen",
+  "dataset_size": 6643434832,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00000-of-00014-e40347a4a9a752dd.parquet": {
+      "num_bytes": 312662419,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00001-of-00014-4f769efe80e66fc3.parquet": {
+      "num_bytes": 276422009,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00002-of-00014-fb4feb3c719446aa.parquet": {
+      "num_bytes": 285252367,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00003-of-00014-02d44dfa3e71d7db.parquet": {
+      "num_bytes": 278564550,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00004-of-00014-f13bdb35926815eb.parquet": {
+      "num_bytes": 283741488,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00005-of-00014-4306e38807aec8ea.parquet": {
+      "num_bytes": 268541858,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00006-of-00014-d6a12d722a46f987.parquet": {
+      "num_bytes": 271706112,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00007-of-00014-9ca5d4d4fb468d96.parquet": {
+      "num_bytes": 286651952,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00008-of-00014-e6e5e9598b224507.parquet": {
+      "num_bytes": 282522639,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00009-of-00014-a8ecf1b869b5fa7c.parquet": {
+      "num_bytes": 267958223,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00010-of-00014-7dfefa88147121dc.parquet": {
+      "num_bytes": 298926548,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00011-of-00014-19c764033d9839eb.parquet": {
+      "num_bytes": 287813481,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00012-of-00014-8d37c841e68119d0.parquet": {
+      "num_bytes": 271499277,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00013-of-00014-523aa82c71ab4ceb.parquet": {
+      "num_bytes": 268326367,
+      "checksum": null
+    }
+  },
+  "download_size": 3940589290,
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "overflow_to_sample_mapping": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 10584024122,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 6643434832,
+      "num_examples": 17868,
+      "shard_lengths": [
+        1277,
+        2554,
+        2553,
+        2552,
+        2552,
+        2552,
+        2552,
+        1276
+      ],
+      "dataset_name": "bookcorpusopen"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
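The `features` block above (int32 `input_ids` plus an
`overflow_to_sample_mapping` column) is the shape of output a fast tokenizer
produces when called with `return_overflowing_tokens=True`, which splits each
book into fixed-length windows. A hedged sketch of preprocessing that would
yield these columns follows; the window length (512, GPT-1's context size) and
the exact mapping are assumptions about what `preprocessing.py` does:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

books = load_dataset('lucadiliello/bookcorpusopen')
tokenizer = AutoTokenizer.from_pretrained('.')

def tokenize(examples):
    # each book longer than max_length overflows into extra rows;
    # overflow_to_sample_mapping records which book each row came from
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512,
                     return_overflowing_tokens=True)

# remove the original columns so the row count may change during the map
tokenized = books.map(tokenize, batched=True,
                      remove_columns=books['train'].column_names)
tokenized.save_to_disk('tokenized_bookcorpusopen')
```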
data/test/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "822b2c728621ccb1",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
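This on-disk layout (a top-level `dataset_dict.json` plus per-split directories
holding `.arrow` shards, a `dataset_info.json`, and a `state.json`) is what
`DatasetDict.save_to_disk` writes, and it is read back in one call. A minimal
sketch, assuming the repository is the working directory:

```python
from datasets import load_from_disk

# reads data/dataset_dict.json, then each split's state.json and .arrow shards
tokenized_datasets = load_from_disk('data')
print(tokenized_datasets)  # DatasetDict with 'train' and 'test' splits
```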
data/train/data-00000-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a1ca57e659a2491da6e6dac287ce6f129138102ab93fab529d6b10cbd8a1c75
+size 479554856
data/train/data-00001-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b2969f7d7fff2ddc0ee28fdea1c9301cbb93ec5f5d077e5e9754efd705a1c8
+size 479608656
data/train/data-00002-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65e96bc5cd59207de21ebe39971498c6a7e2991c468c93f5254bbd4b6d2e77bd
+size 479668592
data/train/data-00003-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fcc1e7829afff3d185dd02ecfba1bb8b76fb3a8a67848f28d1e9222d767f9bf
+size 479494392
data/train/data-00004-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbca1a272f9675b1e258e02dfeb5987a6ce7b1f7122416cc86edfe93b86287b3
+size 479536600
data/train/data-00005-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aaf0873a26fe076ac132d5951433d72c52e0c22547bb355f41aae567eda51eb
+size 479597920
data/train/data-00006-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cec720fcff434842dfb28e25f494362b83a769c7312658def71ee1ee64eda4fe
+size 479543768
data/train/data-00007-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7f5b678f104e6a64ec1fff83ae0ed910235f5250be61da95ace3ace608bce2
+size 479485360
data/train/data-00008-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3b65e6ccf9fa2b0ca906cd425ecc018b6f04f67e34b7b73486d5725aef41c6
+size 479555792
data/train/data-00009-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3a46b3aacc6df1cc573f7ccb8a62f1db853bcedc83c7059eab86d2ef622c88f
+size 479590648
data/train/data-00010-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad9211c4f883c731d0298874c9c9be8c43fe6f57c7df88022138bce4078b030e
+size 479589760
data/train/data-00011-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7676275416b7250243267e095f85fcf30a270e47d0d6c2559b7e5629992577e
+size 479551864
data/train/data-00012-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0339c8623e411922caae471aa11397d79f36cc82bb4c41855f97f01754646806
+size 479467024
data/train/data-00013-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:525fd21f723d96003a236f81f4bd0968cda0cb62549d95fc45f86d636e309be3
+size 479595808
data/train/data-00014-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03bb40439490de9c683493099d4d80574b1cb825312f7013f38a4d058cb2c930
+size 479580776
data/train/data-00015-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:147e12d12af77db481fdfddb4f882723a87ea08bdd04e789bc66e76242761974
+size 479549728
data/train/data-00016-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78f8b6e2823f47337012f26a90d185b7171379e2a74fbede6d5f1b19d79c25a5
+size 479501896
data/train/data-00017-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03db569df9cebdd8dd7acf029853f97bff11970384a287e474a673b93a91bc59
+size 479508896
data/train/data-00018-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e176cb57117b9233413b1a1c0e94cd170e1ee77e6ee3bb839096582fee05cc3
+size 479574000
data/train/dataset_info.json
ADDED
@@ -0,0 +1,121 @@
+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "bookcorpusopen",
+  "dataset_size": 6643434832,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00000-of-00014-e40347a4a9a752dd.parquet": {
+      "num_bytes": 312662419,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00001-of-00014-4f769efe80e66fc3.parquet": {
+      "num_bytes": 276422009,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00002-of-00014-fb4feb3c719446aa.parquet": {
+      "num_bytes": 285252367,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00003-of-00014-02d44dfa3e71d7db.parquet": {
+      "num_bytes": 278564550,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00004-of-00014-f13bdb35926815eb.parquet": {
+      "num_bytes": 283741488,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00005-of-00014-4306e38807aec8ea.parquet": {
+      "num_bytes": 268541858,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00006-of-00014-d6a12d722a46f987.parquet": {
+      "num_bytes": 271706112,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00007-of-00014-9ca5d4d4fb468d96.parquet": {
+      "num_bytes": 286651952,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00008-of-00014-e6e5e9598b224507.parquet": {
+      "num_bytes": 282522639,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00009-of-00014-a8ecf1b869b5fa7c.parquet": {
+      "num_bytes": 267958223,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00010-of-00014-7dfefa88147121dc.parquet": {
+      "num_bytes": 298926548,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00011-of-00014-19c764033d9839eb.parquet": {
+      "num_bytes": 287813481,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00012-of-00014-8d37c841e68119d0.parquet": {
+      "num_bytes": 271499277,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00013-of-00014-523aa82c71ab4ceb.parquet": {
+      "num_bytes": 268326367,
+      "checksum": null
+    }
+  },
+  "download_size": 3940589290,
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "overflow_to_sample_mapping": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 10584024122,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 6643434832,
+      "num_examples": 17868,
+      "shard_lengths": [
+        1277,
+        2554,
+        2553,
+        2552,
+        2552,
+        2552,
+        2552,
+        1276
+      ],
+      "dataset_name": "bookcorpusopen"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
data/train/state.json
ADDED
@@ -0,0 +1,67 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00019.arrow"
+    },
+    {
+      "filename": "data-00001-of-00019.arrow"
+    },
+    {
+      "filename": "data-00002-of-00019.arrow"
+    },
+    {
+      "filename": "data-00003-of-00019.arrow"
+    },
+    {
+      "filename": "data-00004-of-00019.arrow"
+    },
+    {
+      "filename": "data-00005-of-00019.arrow"
+    },
+    {
+      "filename": "data-00006-of-00019.arrow"
+    },
+    {
+      "filename": "data-00007-of-00019.arrow"
+    },
+    {
+      "filename": "data-00008-of-00019.arrow"
+    },
+    {
+      "filename": "data-00009-of-00019.arrow"
+    },
+    {
+      "filename": "data-00010-of-00019.arrow"
+    },
+    {
+      "filename": "data-00011-of-00019.arrow"
+    },
+    {
+      "filename": "data-00012-of-00019.arrow"
+    },
+    {
+      "filename": "data-00013-of-00019.arrow"
+    },
+    {
+      "filename": "data-00014-of-00019.arrow"
+    },
+    {
+      "filename": "data-00015-of-00019.arrow"
+    },
+    {
+      "filename": "data-00016-of-00019.arrow"
+    },
+    {
+      "filename": "data-00017-of-00019.arrow"
+    },
+    {
+      "filename": "data-00018-of-00019.arrow"
+    }
+  ],
+  "_fingerprint": "214c03786ad46a46",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
pre_training.py
CHANGED
@@ -11,6 +11,10 @@ from datasets import load_from_disk
 from configuration_gpt1 import GPT1Config
 from modeling_gpt1 import GPT1Model, GPT1ForCausalLM
 
+# a few more things to try to get the model to train (in this order):
+# actually manually check the input (the books) and the tokenizer output (I
+# don't know if it tokenizes correctly, if it adds eos_token etc.)
+
 
 GPT1Config.register_for_auto_class()
 GPT1Model.register_for_auto_class('AutoModel')
@@ -19,6 +23,9 @@ GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
 # load the already tokenized dataset (see training_preprocessing.py)
 tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
 
+# shuffle for good measure
+tokenized_datasets = tokenized_datasets.shuffle(seed=42)
+
 print(tokenized_datasets)
 
 tokenizer = AutoTokenizer.from_pretrained('.')
@@ -30,7 +37,7 @@ print(model)
 _total_params = sum(p.numel() for p in model.parameters())
 print(f"Model parameters: {_total_params}")
 
-batch_size =
+batch_size = 16
 epochs = 100
 
 tokenizer.pad_token = tokenizer.eos_token
@@ -39,18 +46,22 @@ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)
 scheduler = get_scheduler('cosine',
                           optimizer=optimizer,
-                          num_warmup_steps=
+                          num_warmup_steps=2000,
                           num_training_steps=epochs * len(tokenized_datasets['train']))
 
 args = TrainingArguments(
     output_dir='checkpoints',
-    per_device_train_batch_size=batch_size,
-    per_device_eval_batch_size=batch_size,
+    per_device_train_batch_size=batch_size,  # divide by number of GPUs
+    per_device_eval_batch_size=batch_size,  # divide by number of GPUs
     evaluation_strategy='epoch',
-    gradient_accumulation_steps=
+    gradient_accumulation_steps=4,
     num_train_epochs=epochs,
    save_total_limit=10,
     max_grad_norm=1.0,
+    logging_strategy='steps',
+    logging_steps=100,
+    logging_first_step=True,
+    logging_nan_inf_filter=False,
     fp16=False,
 )
 
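The diff above stops at the `TrainingArguments`; the wiring of the model, data
collator, and the custom optimizer/scheduler pair into a `Trainer` lies outside
the shown hunks. Below is a hedged sketch of how these pieces are typically
connected (the actual tail of `pre_training.py` may differ). Note that the
effective batch size is `per_device_train_batch_size` ×
`gradient_accumulation_steps` × number of GPUs, i.e. 64 per GPU with the values
above:

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    # pass the Adam optimizer and cosine schedule defined above
    optimizers=(optimizer, scheduler),
)

trainer.train()
```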