Alexandru Gherghescu committed

Commit 7e53000
Parent(s): b11cd65

Add tokenized dataset, pre-training script
Browse files

- README.md +17 -2
- data/dataset_dict.json +1 -0
- data/test/data-00000-of-00001.arrow +3 -0
- data/test/dataset_info.json +121 -0
- data/test/state.json +13 -0
- data/train/data-00000-of-00019.arrow +3 -0
- data/train/data-00001-of-00019.arrow +3 -0
- data/train/data-00002-of-00019.arrow +3 -0
- data/train/data-00003-of-00019.arrow +3 -0
- data/train/data-00004-of-00019.arrow +3 -0
- data/train/data-00005-of-00019.arrow +3 -0
- data/train/data-00006-of-00019.arrow +3 -0
- data/train/data-00007-of-00019.arrow +3 -0
- data/train/data-00008-of-00019.arrow +3 -0
- data/train/data-00009-of-00019.arrow +3 -0
- data/train/data-00010-of-00019.arrow +3 -0
- data/train/data-00011-of-00019.arrow +3 -0
- data/train/data-00012-of-00019.arrow +3 -0
- data/train/data-00013-of-00019.arrow +3 -0
- data/train/data-00014-of-00019.arrow +3 -0
- data/train/data-00015-of-00019.arrow +3 -0
- data/train/data-00016-of-00019.arrow +3 -0
- data/train/data-00017-of-00019.arrow +3 -0
- data/train/data-00018-of-00019.arrow +3 -0
- data/train/dataset_info.json +121 -0
- data/train/state.json +67 -0
- pre_training.py +16 -5
README.md
CHANGED
@@ -9,15 +9,30 @@ This repository aims to re-create the GPT 1 architecture, using HuggingFace's
 `transformers`.
 
 The original paper of the model can be found [here][gpt1-paper]. The blog post
-accompanying this paper is [here][gpt1-blog].
+accompanying this paper is [here][gpt1-blog]. The code and weights can be found
+[here][gpt1-code].
 
 The original model was trained, as noted in OpenAI's blogpost, 1 month on 8
-GPUs, on the BookCorpus dataset.
+GPUs (P600s), on the original BookCorpus dataset (containing around ~7,000
+books).
+
+This model is instead trained on the [BookCorpusOpen][bco-dataset] dataset,
+which contains ~17,000 books (around ~6 GB). The tokenized dataset (~9 GB) can
+be found in `data/` in this repository. The tokenizer is a BPE tokenizer with
+40,000 vocabulary merges, as in the original paper. It is re-implemented using
+HuggingFace's `tokenizers` library and trained on the
+[BookCorpusOpen][bco-dataset] dataset.
 
 [gpt1-paper]:
 https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
 [gpt1-blog]: https://openai.com/research/language-unsupervised
+[gpt1-code]: https://github.com/openai/finetune-transformer-lm/
+[bco-dataset]: https://huggingface.co/datasets/lucadiliello/bookcorpusopen
 
 # How to use
 
+See `preprocessing.py` for how the data was preprocessed and tokenized.
+
+See `pre_training.py` for how the model was pre-trained.
+
 See `inference.py` for an example.
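The README above describes the tokenizer as BPE with 40,000 merges,
re-implemented with HuggingFace's `tokenizers` library and trained on
BookCorpusOpen. Below is a minimal sketch of how such a tokenizer could be
trained; the special tokens, pre-tokenizer, and column name are assumptions,
not necessarily what the repository's `preprocessing.py` does:

```python
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# assumed source corpus and text column name
books = load_dataset('lucadiliello/bookcorpusopen', split='train')

tokenizer = Tokenizer(models.BPE(unk_token='<unk>'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# ~40,000 merges on top of the base alphabet, as in the GPT-1 paper
trainer = trainers.BpeTrainer(vocab_size=40000,
                              special_tokens=['<unk>', '<eos>'])

def text_batches(dataset, batch_size=1000):
    # stream the corpus in chunks so the whole dataset never sits in memory
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]['text']

tokenizer.train_from_iterator(text_batches(books), trainer=trainer)
tokenizer.save('tokenizer.json')
```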
data/dataset_dict.json
ADDED
@@ -0,0 +1 @@
+{"splits": ["train", "test"]}
data/test/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:092fbd8038f5797037574c4002cd5a6e1ba33ebcc067f3d9ae486cedae35c6ea
+size 477939672
data/test/dataset_info.json
ADDED
@@ -0,0 +1,121 @@
+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "bookcorpusopen",
+  "dataset_size": 6643434832,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00000-of-00014-e40347a4a9a752dd.parquet": {
+      "num_bytes": 312662419,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00001-of-00014-4f769efe80e66fc3.parquet": {
+      "num_bytes": 276422009,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00002-of-00014-fb4feb3c719446aa.parquet": {
+      "num_bytes": 285252367,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00003-of-00014-02d44dfa3e71d7db.parquet": {
+      "num_bytes": 278564550,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00004-of-00014-f13bdb35926815eb.parquet": {
+      "num_bytes": 283741488,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00005-of-00014-4306e38807aec8ea.parquet": {
+      "num_bytes": 268541858,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00006-of-00014-d6a12d722a46f987.parquet": {
+      "num_bytes": 271706112,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00007-of-00014-9ca5d4d4fb468d96.parquet": {
+      "num_bytes": 286651952,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00008-of-00014-e6e5e9598b224507.parquet": {
+      "num_bytes": 282522639,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00009-of-00014-a8ecf1b869b5fa7c.parquet": {
+      "num_bytes": 267958223,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00010-of-00014-7dfefa88147121dc.parquet": {
+      "num_bytes": 298926548,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00011-of-00014-19c764033d9839eb.parquet": {
+      "num_bytes": 287813481,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00012-of-00014-8d37c841e68119d0.parquet": {
+      "num_bytes": 271499277,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00013-of-00014-523aa82c71ab4ceb.parquet": {
+      "num_bytes": 268326367,
+      "checksum": null
+    }
+  },
+  "download_size": 3940589290,
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "overflow_to_sample_mapping": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 10584024122,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 6643434832,
+      "num_examples": 17868,
+      "shard_lengths": [
+        1277,
+        2554,
+        2553,
+        2552,
+        2552,
+        2552,
+        2552,
+        1276
+      ],
+      "dataset_name": "bookcorpusopen"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
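The `features` block above (int32 `input_ids` plus an
`overflow_to_sample_mapping` column) is the shape of output a fast tokenizer
produces when called with `return_overflowing_tokens=True`, which splits each
book into fixed-length windows. A hedged sketch of preprocessing that would
yield these columns follows; the window length (512, GPT-1's context size) and
the exact mapping are assumptions about what `preprocessing.py` does:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

books = load_dataset('lucadiliello/bookcorpusopen')
tokenizer = AutoTokenizer.from_pretrained('.')

def tokenize(examples):
    # each book longer than max_length overflows into extra rows;
    # overflow_to_sample_mapping records which book each row came from
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512,
                     return_overflowing_tokens=True)

# remove the original columns so the row count may change during the map
tokenized = books.map(tokenize, batched=True,
                      remove_columns=books['train'].column_names)
tokenized.save_to_disk('tokenized_bookcorpusopen')
```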
data/test/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "822b2c728621ccb1",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
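This on-disk layout (a top-level `dataset_dict.json` plus per-split directories
holding `.arrow` shards, a `dataset_info.json`, and a `state.json`) is what
`DatasetDict.save_to_disk` writes, and it is read back in one call. A minimal
sketch, assuming the repository is the working directory:

```python
from datasets import load_from_disk

# reads data/dataset_dict.json, then each split's state.json and .arrow shards
tokenized_datasets = load_from_disk('data')
print(tokenized_datasets)  # DatasetDict with 'train' and 'test' splits
```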
data/train/data-00000-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a1ca57e659a2491da6e6dac287ce6f129138102ab93fab529d6b10cbd8a1c75
+size 479554856
data/train/data-00001-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b2969f7d7fff2ddc0ee28fdea1c9301cbb93ec5f5d077e5e9754efd705a1c8
+size 479608656
data/train/data-00002-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65e96bc5cd59207de21ebe39971498c6a7e2991c468c93f5254bbd4b6d2e77bd
+size 479668592
data/train/data-00003-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fcc1e7829afff3d185dd02ecfba1bb8b76fb3a8a67848f28d1e9222d767f9bf
+size 479494392
data/train/data-00004-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbca1a272f9675b1e258e02dfeb5987a6ce7b1f7122416cc86edfe93b86287b3
+size 479536600
data/train/data-00005-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aaf0873a26fe076ac132d5951433d72c52e0c22547bb355f41aae567eda51eb
+size 479597920
data/train/data-00006-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cec720fcff434842dfb28e25f494362b83a769c7312658def71ee1ee64eda4fe
+size 479543768
data/train/data-00007-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7f5b678f104e6a64ec1fff83ae0ed910235f5250be61da95ace3ace608bce2
+size 479485360
data/train/data-00008-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3b65e6ccf9fa2b0ca906cd425ecc018b6f04f67e34b7b73486d5725aef41c6
+size 479555792
data/train/data-00009-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3a46b3aacc6df1cc573f7ccb8a62f1db853bcedc83c7059eab86d2ef622c88f
+size 479590648
data/train/data-00010-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad9211c4f883c731d0298874c9c9be8c43fe6f57c7df88022138bce4078b030e
+size 479589760
data/train/data-00011-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7676275416b7250243267e095f85fcf30a270e47d0d6c2559b7e5629992577e
+size 479551864
data/train/data-00012-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0339c8623e411922caae471aa11397d79f36cc82bb4c41855f97f01754646806
+size 479467024
data/train/data-00013-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:525fd21f723d96003a236f81f4bd0968cda0cb62549d95fc45f86d636e309be3
+size 479595808
data/train/data-00014-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03bb40439490de9c683493099d4d80574b1cb825312f7013f38a4d058cb2c930
+size 479580776
data/train/data-00015-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:147e12d12af77db481fdfddb4f882723a87ea08bdd04e789bc66e76242761974
+size 479549728
data/train/data-00016-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78f8b6e2823f47337012f26a90d185b7171379e2a74fbede6d5f1b19d79c25a5
+size 479501896
data/train/data-00017-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03db569df9cebdd8dd7acf029853f97bff11970384a287e474a673b93a91bc59
+size 479508896
data/train/data-00018-of-00019.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e176cb57117b9233413b1a1c0e94cd170e1ee77e6ee3bb839096582fee05cc3
+size 479574000
data/train/dataset_info.json
ADDED
@@ -0,0 +1,121 @@
+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "bookcorpusopen",
+  "dataset_size": 6643434832,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00000-of-00014-e40347a4a9a752dd.parquet": {
+      "num_bytes": 312662419,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00001-of-00014-4f769efe80e66fc3.parquet": {
+      "num_bytes": 276422009,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00002-of-00014-fb4feb3c719446aa.parquet": {
+      "num_bytes": 285252367,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00003-of-00014-02d44dfa3e71d7db.parquet": {
+      "num_bytes": 278564550,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00004-of-00014-f13bdb35926815eb.parquet": {
+      "num_bytes": 283741488,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00005-of-00014-4306e38807aec8ea.parquet": {
+      "num_bytes": 268541858,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00006-of-00014-d6a12d722a46f987.parquet": {
+      "num_bytes": 271706112,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00007-of-00014-9ca5d4d4fb468d96.parquet": {
+      "num_bytes": 286651952,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00008-of-00014-e6e5e9598b224507.parquet": {
+      "num_bytes": 282522639,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00009-of-00014-a8ecf1b869b5fa7c.parquet": {
+      "num_bytes": 267958223,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00010-of-00014-7dfefa88147121dc.parquet": {
+      "num_bytes": 298926548,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00011-of-00014-19c764033d9839eb.parquet": {
+      "num_bytes": 287813481,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00012-of-00014-8d37c841e68119d0.parquet": {
+      "num_bytes": 271499277,
+      "checksum": null
+    },
+    "hf://datasets/lucadiliello/bookcorpusopen@edb74e6c88abb38f0a0fc993a7068ab00a32db45/data/train-00013-of-00014-523aa82c71ab4ceb.parquet": {
+      "num_bytes": 268326367,
+      "checksum": null
+    }
+  },
+  "download_size": 3940589290,
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "overflow_to_sample_mapping": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 10584024122,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 6643434832,
+      "num_examples": 17868,
+      "shard_lengths": [
+        1277,
+        2554,
+        2553,
+        2552,
+        2552,
+        2552,
+        2552,
+        1276
+      ],
+      "dataset_name": "bookcorpusopen"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
data/train/state.json
ADDED
@@ -0,0 +1,67 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00019.arrow"
+    },
+    {
+      "filename": "data-00001-of-00019.arrow"
+    },
+    {
+      "filename": "data-00002-of-00019.arrow"
+    },
+    {
+      "filename": "data-00003-of-00019.arrow"
+    },
+    {
+      "filename": "data-00004-of-00019.arrow"
+    },
+    {
+      "filename": "data-00005-of-00019.arrow"
+    },
+    {
+      "filename": "data-00006-of-00019.arrow"
+    },
+    {
+      "filename": "data-00007-of-00019.arrow"
+    },
+    {
+      "filename": "data-00008-of-00019.arrow"
+    },
+    {
+      "filename": "data-00009-of-00019.arrow"
+    },
+    {
+      "filename": "data-00010-of-00019.arrow"
+    },
+    {
+      "filename": "data-00011-of-00019.arrow"
+    },
+    {
+      "filename": "data-00012-of-00019.arrow"
+    },
+    {
+      "filename": "data-00013-of-00019.arrow"
+    },
+    {
+      "filename": "data-00014-of-00019.arrow"
+    },
+    {
+      "filename": "data-00015-of-00019.arrow"
+    },
+    {
+      "filename": "data-00016-of-00019.arrow"
+    },
+    {
+      "filename": "data-00017-of-00019.arrow"
+    },
+    {
+      "filename": "data-00018-of-00019.arrow"
+    }
+  ],
+  "_fingerprint": "214c03786ad46a46",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
pre_training.py
CHANGED
@@ -11,6 +11,10 @@ from datasets import load_from_disk
 from configuration_gpt1 import GPT1Config
 from modeling_gpt1 import GPT1Model, GPT1ForCausalLM
 
+# a few more things to try to get the model to train (in this order):
+# actually manually check the input (the books) and the tokenizer output (I
+# don't know if it tokenizes correctly, if it adds eos_token etc.)
+
 
 GPT1Config.register_for_auto_class()
 GPT1Model.register_for_auto_class('AutoModel')
@@ -19,6 +23,9 @@ GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
 # load the already tokenized dataset (see training_preprocessing.py)
 tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
 
+# shuffle for good measure
+tokenized_datasets = tokenized_datasets.shuffle(seed=42)
+
 print(tokenized_datasets)
 
 tokenizer = AutoTokenizer.from_pretrained('.')
@@ -30,7 +37,7 @@ print(model)
 _total_params = sum(p.numel() for p in model.parameters())
 print(f"Model parameters: {_total_params}")
 
-batch_size =
+batch_size = 16
 epochs = 100
 
 tokenizer.pad_token = tokenizer.eos_token
@@ -39,18 +46,22 @@ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)
 scheduler = get_scheduler('cosine',
                           optimizer=optimizer,
-                          num_warmup_steps=
+                          num_warmup_steps=2000,
                           num_training_steps=epochs * len(tokenized_datasets['train']))
 
 args = TrainingArguments(
     output_dir='checkpoints',
-    per_device_train_batch_size=batch_size,
-    per_device_eval_batch_size=batch_size,
+    per_device_train_batch_size=batch_size,  # divide by number of GPUs
+    per_device_eval_batch_size=batch_size,  # divide by number of GPUs
     evaluation_strategy='epoch',
-    gradient_accumulation_steps=
+    gradient_accumulation_steps=4,
     num_train_epochs=epochs,
    save_total_limit=10,
     max_grad_norm=1.0,
+    logging_strategy='steps',
+    logging_steps=100,
+    logging_first_step=True,
+    logging_nan_inf_filter=False,
     fp16=False,
 )
 
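The diff above stops at the `TrainingArguments`; the wiring of the model, data
collator, and the custom optimizer/scheduler pair into a `Trainer` lies outside
the shown hunks. Below is a hedged sketch of how these pieces are typically
connected (the actual tail of `pre_training.py` may differ). Note that the
effective batch size is `per_device_train_batch_size` ×
`gradient_accumulation_steps` × number of GPUs, i.e. 64 per GPU with the values
above:

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    # pass the Adam optimizer and cosine schedule defined above
    optimizers=(optimizer, scheduler),
)

trainer.train()
```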