AlekseyKorshuk commited on
Commit
ba0f519
1 Parent(s): 84101b5

huggingartists

Browse files
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - huggingartists/kendrick-lamar
5
+ tags:
6
+ - huggingartists
7
+ - lyrics
8
+ - lm-head
9
+ - causal-lm
10
+ widget:
11
+ - text: "I am"
12
+ ---
13
+
14
+ <div class="inline-flex flex-col" style="line-height: 1.5;">
15
+ <div class="flex">
16
+ <div
17
+ style="display:block; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/d6d96651b423fa5a83c38ee2a4c6c939.1000x1000x1.jpg&#39;)">
18
+ </div>
19
+ </div>
20
+ <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
21
+ <div style="text-align: center; font-size: 16px; font-weight: 800">Kendrick Lamar</div>
22
+ <a href="https://genius.com/artists/kendrick-lamar">
23
+ <div style="text-align: center; font-size: 14px;">@kendrick-lamar</div>
24
+ </a>
25
+ </div>
26
+
27
+ I was made with [huggingartists](https://github.com/AlekseyKorshuk/huggingartists).
28
+
29
+ Create your own bot based on your favorite artist with [the demo](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb)!
30
+
31
+ ## How does it work?
32
+
33
+ To understand how the model was developed, check the [W&B report](https://wandb.ai/huggingartists/huggingartists/reportlist).
34
+
35
+ ## Training data
36
+
37
+ The model was trained on lyrics from Kendrick Lamar.
38
+
39
+ Dataset is available [here](https://huggingface.co/datasets/huggingartists/kendrick-lamar).
40
+ And can be used with:
41
+
42
+ ```python
43
+ from datasets import load_dataset
44
+
45
+ dataset = load_dataset("huggingartists/kendrick-lamar")
46
+ ```
47
+
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
+
50
+ ## Training procedure
51
+
52
+ The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Kendrick Lamar's lyrics.
53
+
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists) for full transparency and reproducibility.
55
+
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/artifacts) is logged and versioned.
57
+
58
+ ## How to use
59
+
60
+ You can use this model directly with a pipeline for text generation:
61
+
62
+ ```python
63
+ from transformers import pipeline
64
+ generator = pipeline('text-generation',
65
+ model='huggingartists/kendrick-lamar')
66
+ generator("I am", num_return_sequences=5)
67
+ ```
68
+
69
+ Or with the Transformers library:
70
+
71
+ ```python
72
+ from transformers import AutoTokenizer, AutoModelForCausalLM
73
+
74
+ tokenizer = AutoTokenizer.from_pretrained("huggingartists/kendrick-lamar")
75
+
76
+ model = AutoModelForCausalLM.from_pretrained("huggingartists/kendrick-lamar")
77
+ ```
78
+
79
+ ## Limitations and bias
80
+
81
+ The model suffers from [the same limitations and bias as GPT-2](https://huggingface.co/gpt2#limitations-and-bias).
82
+
83
+ In addition, the data present in the artist's lyrics further affects the text generated by the model.
84
+
85
+ ## About
86
+
87
+ *Built by Aleksey Korshuk*
88
+
89
+ [![Follow](https://img.shields.io/github/followers/AlekseyKorshuk?style=social)](https://github.com/AlekseyKorshuk)
90
+
91
+ [![Follow](https://img.shields.io/twitter/follow/alekseykorshuk?style=social)](https://twitter.com/intent/follow?screen_name=alekseykorshuk)
92
+
93
+ [![Follow](https://img.shields.io/badge/dynamic/json?color=blue&label=Telegram%20Channel&query=%24.result&url=https%3A%2F%2Fapi.telegram.org%2Fbot1929545866%3AAAFGhV-KKnegEcLiyYJxsc4zV6C-bdPEBtQ%2FgetChatMemberCount%3Fchat_id%3D-1001253621662&style=social&logo=telegram)](https://t.me/joinchat/_CQ04KjcJ-4yZTky)
94
+
95
+ For more details, visit the project repository.
96
+
97
+ [![GitHub stars](https://img.shields.io/github/stars/AlekseyKorshuk/huggingartists?style=social)](https://github.com/AlekseyKorshuk/huggingartists)
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "kendrick-lamar",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 200,
33
+ "min_length": 100,
34
+ "temperature": 1.0,
35
+ "top_p": 0.95
36
+ }
37
+ },
38
+ "torch_dtype": "float32",
39
+ "transformers_version": "4.19.2",
40
+ "use_cache": true,
41
+ "vocab_size": 50257
42
+ }
evaluation.txt ADDED
@@ -0,0 +1 @@
 
1
+ {"eval_loss": 3.553039312362671, "eval_runtime": 9.9332, "eval_samples_per_second": 40.37, "eval_steps_per_second": 5.134, "epoch": 3.0}
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88fc23ca7d4d4e6ba8e123b49d156d1727bcfc4fa1e29aa441617c39cbb7f743
3
+ size 497764120
merges.txt ADDED
The diff for this file is too large to render. See raw diff
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9a75d06ce33952bf6ca8d64e0619aceac8ed838c873cd199b0ca47dfda1bef
3
+ size 995604017
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e00f508adfd0c8522fb8f01e38f222108699a66adc07ba0dff84341e730981a0
3
+ size 510396521
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e651a57091df8587e175f62b4d70295e46b37b34655319dc46699d0ad3ac85b
3
+ size 14503
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f059015de4876d2096cc91d2af4c7461eb3e5f4a09f212da6550c2c160d8775
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,1018 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.553039312362671,
3
+ "best_model_checkpoint": "output/kendrick-lamar/checkpoint-819",
4
+ "epoch": 3.0,
5
+ "global_step": 819,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 0.0001370864757190084,
13
+ "loss": 4.2722,
14
+ "step": 5
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 0.00013674627861254597,
19
+ "loss": 4.3361,
20
+ "step": 10
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "learning_rate": 0.00013618053464655754,
25
+ "loss": 4.2033,
26
+ "step": 15
27
+ },
28
+ {
29
+ "epoch": 0.07,
30
+ "learning_rate": 0.00013539111628975924,
31
+ "loss": 4.0514,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.09,
36
+ "learning_rate": 0.0001343806363162431,
37
+ "loss": 4.1327,
38
+ "step": 25
39
+ },
40
+ {
41
+ "epoch": 0.11,
42
+ "learning_rate": 0.00013315243915785902,
43
+ "loss": 3.96,
44
+ "step": 30
45
+ },
46
+ {
47
+ "epoch": 0.13,
48
+ "learning_rate": 0.00013171058983499535,
49
+ "loss": 3.9217,
50
+ "step": 35
51
+ },
52
+ {
53
+ "epoch": 0.15,
54
+ "learning_rate": 0.0001300598605023948,
55
+ "loss": 3.7032,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 0.16,
60
+ "learning_rate": 0.00012820571465453544,
61
+ "loss": 3.9717,
62
+ "step": 45
63
+ },
64
+ {
65
+ "epoch": 0.18,
66
+ "learning_rate": 0.00012615428904285254,
67
+ "loss": 4.0279,
68
+ "step": 50
69
+ },
70
+ {
71
+ "epoch": 0.2,
72
+ "learning_rate": 0.0001239123733646515,
73
+ "loss": 3.8994,
74
+ "step": 55
75
+ },
76
+ {
77
+ "epoch": 0.22,
78
+ "learning_rate": 0.0001214873877909356,
79
+ "loss": 4.0263,
80
+ "step": 60
81
+ },
82
+ {
83
+ "epoch": 0.24,
84
+ "learning_rate": 0.00011888735840752609,
85
+ "loss": 3.9161,
86
+ "step": 65
87
+ },
88
+ {
89
+ "epoch": 0.26,
90
+ "learning_rate": 0.00011612089065075853,
91
+ "loss": 3.8483,
92
+ "step": 70
93
+ },
94
+ {
95
+ "epoch": 0.27,
96
+ "learning_rate": 0.00011319714082567585,
97
+ "loss": 3.8609,
98
+ "step": 75
99
+ },
100
+ {
101
+ "epoch": 0.29,
102
+ "learning_rate": 0.00011012578580098631,
103
+ "loss": 3.9798,
104
+ "step": 80
105
+ },
106
+ {
107
+ "epoch": 0.31,
108
+ "learning_rate": 0.00010691699098108779,
109
+ "loss": 3.7969,
110
+ "step": 85
111
+ },
112
+ {
113
+ "epoch": 0.33,
114
+ "learning_rate": 0.00010358137666116333,
115
+ "loss": 3.9596,
116
+ "step": 90
117
+ },
118
+ {
119
+ "epoch": 0.35,
120
+ "learning_rate": 0.00010012998287670373,
121
+ "loss": 4.1164,
122
+ "step": 95
123
+ },
124
+ {
125
+ "epoch": 0.37,
126
+ "learning_rate": 9.657423286379646e-05,
127
+ "loss": 3.7924,
128
+ "step": 100
129
+ },
130
+ {
131
+ "epoch": 0.38,
132
+ "learning_rate": 9.292589525111794e-05,
133
+ "loss": 4.0304,
134
+ "step": 105
135
+ },
136
+ {
137
+ "epoch": 0.4,
138
+ "learning_rate": 8.91970451087642e-05,
139
+ "loss": 3.8561,
140
+ "step": 110
141
+ },
142
+ {
143
+ "epoch": 0.42,
144
+ "learning_rate": 8.540002398283833e-05,
145
+ "loss": 3.8825,
146
+ "step": 115
147
+ },
148
+ {
149
+ "epoch": 0.44,
150
+ "learning_rate": 8.154739904807008e-05,
151
+ "loss": 3.8513,
152
+ "step": 120
153
+ },
154
+ {
155
+ "epoch": 0.46,
156
+ "learning_rate": 7.765192151366211e-05,
157
+ "loss": 3.752,
158
+ "step": 125
159
+ },
160
+ {
161
+ "epoch": 0.48,
162
+ "learning_rate": 7.372648442002871e-05,
163
+ "loss": 3.6579,
164
+ "step": 130
165
+ },
166
+ {
167
+ "epoch": 0.49,
168
+ "learning_rate": 6.978407996610966e-05,
169
+ "loss": 3.595,
170
+ "step": 135
171
+ },
172
+ {
173
+ "epoch": 0.51,
174
+ "learning_rate": 6.583775650849414e-05,
175
+ "loss": 3.8015,
176
+ "step": 140
177
+ },
178
+ {
179
+ "epoch": 0.53,
180
+ "learning_rate": 6.190057537467733e-05,
181
+ "loss": 3.756,
182
+ "step": 145
183
+ },
184
+ {
185
+ "epoch": 0.55,
186
+ "learning_rate": 5.79855676333867e-05,
187
+ "loss": 3.8087,
188
+ "step": 150
189
+ },
190
+ {
191
+ "epoch": 0.57,
192
+ "learning_rate": 5.410569096505683e-05,
193
+ "loss": 3.6749,
194
+ "step": 155
195
+ },
196
+ {
197
+ "epoch": 0.59,
198
+ "learning_rate": 5.0273786775201065e-05,
199
+ "loss": 3.9157,
200
+ "step": 160
201
+ },
202
+ {
203
+ "epoch": 0.6,
204
+ "learning_rate": 4.6502537692623556e-05,
205
+ "loss": 3.6094,
206
+ "step": 165
207
+ },
208
+ {
209
+ "epoch": 0.62,
210
+ "learning_rate": 4.2804425593141775e-05,
211
+ "loss": 3.8656,
212
+ "step": 170
213
+ },
214
+ {
215
+ "epoch": 0.64,
216
+ "learning_rate": 3.9191690287750474e-05,
217
+ "loss": 3.8787,
218
+ "step": 175
219
+ },
220
+ {
221
+ "epoch": 0.66,
222
+ "learning_rate": 3.567628901195867e-05,
223
+ "loss": 3.7476,
224
+ "step": 180
225
+ },
226
+ {
227
+ "epoch": 0.68,
228
+ "learning_rate": 3.226985685037943e-05,
229
+ "loss": 3.6491,
230
+ "step": 185
231
+ },
232
+ {
233
+ "epoch": 0.7,
234
+ "learning_rate": 2.898366822755775e-05,
235
+ "loss": 3.7995,
236
+ "step": 190
237
+ },
238
+ {
239
+ "epoch": 0.71,
240
+ "learning_rate": 2.5828599592490882e-05,
241
+ "loss": 3.8123,
242
+ "step": 195
243
+ },
244
+ {
245
+ "epoch": 0.73,
246
+ "learning_rate": 2.2815093420347238e-05,
247
+ "loss": 3.9104,
248
+ "step": 200
249
+ },
250
+ {
251
+ "epoch": 0.75,
252
+ "learning_rate": 1.9953123650527866e-05,
253
+ "loss": 3.8532,
254
+ "step": 205
255
+ },
256
+ {
257
+ "epoch": 0.77,
258
+ "learning_rate": 1.725216267546246e-05,
259
+ "loss": 3.7647,
260
+ "step": 210
261
+ },
262
+ {
263
+ "epoch": 0.79,
264
+ "learning_rate": 1.472114998939829e-05,
265
+ "loss": 3.6447,
266
+ "step": 215
267
+ },
268
+ {
269
+ "epoch": 0.81,
270
+ "learning_rate": 1.2368462600946557e-05,
271
+ "loss": 3.8278,
272
+ "step": 220
273
+ },
274
+ {
275
+ "epoch": 0.82,
276
+ "learning_rate": 1.0201887307313513e-05,
277
+ "loss": 3.7164,
278
+ "step": 225
279
+ },
280
+ {
281
+ "epoch": 0.84,
282
+ "learning_rate": 8.228594921980851e-06,
283
+ "loss": 3.7653,
284
+ "step": 230
285
+ },
286
+ {
287
+ "epoch": 0.86,
288
+ "learning_rate": 6.455116541136077e-06,
289
+ "loss": 3.6369,
290
+ "step": 235
291
+ },
292
+ {
293
+ "epoch": 0.88,
294
+ "learning_rate": 4.88732192740426e-06,
295
+ "loss": 3.8294,
296
+ "step": 240
297
+ },
298
+ {
299
+ "epoch": 0.9,
300
+ "learning_rate": 3.53040008242582e-06,
301
+ "loss": 3.9352,
302
+ "step": 245
303
+ },
304
+ {
305
+ "epoch": 0.92,
306
+ "learning_rate": 2.3888420725801435e-06,
307
+ "loss": 3.7973,
308
+ "step": 250
309
+ },
310
+ {
311
+ "epoch": 0.93,
312
+ "learning_rate": 1.4664261646974657e-06,
313
+ "loss": 3.892,
314
+ "step": 255
315
+ },
316
+ {
317
+ "epoch": 0.95,
318
+ "learning_rate": 7.662053209561833e-07,
319
+ "loss": 3.7137,
320
+ "step": 260
321
+ },
322
+ {
323
+ "epoch": 0.97,
324
+ "learning_rate": 2.9049709435396624e-07,
325
+ "loss": 3.7595,
326
+ "step": 265
327
+ },
328
+ {
329
+ "epoch": 0.99,
330
+ "learning_rate": 4.087595819657002e-08,
331
+ "loss": 3.7287,
332
+ "step": 270
333
+ },
334
+ {
335
+ "epoch": 1.0,
336
+ "eval_loss": 3.6416754722595215,
337
+ "eval_runtime": 10.1198,
338
+ "eval_samples_per_second": 39.625,
339
+ "eval_steps_per_second": 5.04,
340
+ "step": 273
341
+ },
342
+ {
343
+ "epoch": 1.01,
344
+ "learning_rate": 1.816809499134402e-08,
345
+ "loss": 3.6986,
346
+ "step": 275
347
+ },
348
+ {
349
+ "epoch": 1.03,
350
+ "learning_rate": 2.2244866199319123e-07,
351
+ "loss": 3.6869,
352
+ "step": 280
353
+ },
354
+ {
355
+ "epoch": 1.04,
356
+ "learning_rate": 6.530415424531654e-07,
357
+ "loss": 3.6129,
358
+ "step": 285
359
+ },
360
+ {
361
+ "epoch": 1.06,
362
+ "learning_rate": 1.3085215833929946e-06,
363
+ "loss": 3.6142,
364
+ "step": 290
365
+ },
366
+ {
367
+ "epoch": 1.08,
368
+ "learning_rate": 2.1867193124992013e-06,
369
+ "loss": 3.6029,
370
+ "step": 295
371
+ },
372
+ {
373
+ "epoch": 1.1,
374
+ "learning_rate": 3.2847281185250116e-06,
375
+ "loss": 3.5627,
376
+ "step": 300
377
+ },
378
+ {
379
+ "epoch": 1.12,
380
+ "learning_rate": 4.598913871434634e-06,
381
+ "loss": 3.4652,
382
+ "step": 305
383
+ },
384
+ {
385
+ "epoch": 1.14,
386
+ "learning_rate": 6.124926950450006e-06,
387
+ "loss": 3.632,
388
+ "step": 310
389
+ },
390
+ {
391
+ "epoch": 1.15,
392
+ "learning_rate": 7.857716640189785e-06,
393
+ "loss": 3.6625,
394
+ "step": 315
395
+ },
396
+ {
397
+ "epoch": 1.17,
398
+ "learning_rate": 9.791547847253513e-06,
399
+ "loss": 3.6645,
400
+ "step": 320
401
+ },
402
+ {
403
+ "epoch": 1.19,
404
+ "learning_rate": 1.1920020081922749e-05,
405
+ "loss": 3.6207,
406
+ "step": 325
407
+ },
408
+ {
409
+ "epoch": 1.21,
410
+ "learning_rate": 1.4236088642155179e-05,
411
+ "loss": 3.738,
412
+ "step": 330
413
+ },
414
+ {
415
+ "epoch": 1.23,
416
+ "learning_rate": 1.6732087929757627e-05,
417
+ "loss": 3.4442,
418
+ "step": 335
419
+ },
420
+ {
421
+ "epoch": 1.25,
422
+ "learning_rate": 1.9399756821567315e-05,
423
+ "loss": 3.7553,
424
+ "step": 340
425
+ },
426
+ {
427
+ "epoch": 1.26,
428
+ "learning_rate": 2.223026601166984e-05,
429
+ "loss": 3.5033,
430
+ "step": 345
431
+ },
432
+ {
433
+ "epoch": 1.28,
434
+ "learning_rate": 2.5214247234157134e-05,
435
+ "loss": 3.6562,
436
+ "step": 350
437
+ },
438
+ {
439
+ "epoch": 1.3,
440
+ "learning_rate": 2.8341824269706243e-05,
441
+ "loss": 3.7108,
442
+ "step": 355
443
+ },
444
+ {
445
+ "epoch": 1.32,
446
+ "learning_rate": 3.1602645633354905e-05,
447
+ "loss": 3.6002,
448
+ "step": 360
449
+ },
450
+ {
451
+ "epoch": 1.34,
452
+ "learning_rate": 3.4985918835285396e-05,
453
+ "loss": 3.6699,
454
+ "step": 365
455
+ },
456
+ {
457
+ "epoch": 1.36,
458
+ "learning_rate": 3.848044610122224e-05,
459
+ "loss": 3.6041,
460
+ "step": 370
461
+ },
462
+ {
463
+ "epoch": 1.37,
464
+ "learning_rate": 4.207466143421867e-05,
465
+ "loss": 3.5919,
466
+ "step": 375
467
+ },
468
+ {
469
+ "epoch": 1.39,
470
+ "learning_rate": 4.5756668895166686e-05,
471
+ "loss": 3.7467,
472
+ "step": 380
473
+ },
474
+ {
475
+ "epoch": 1.41,
476
+ "learning_rate": 4.9514281975331363e-05,
477
+ "loss": 3.5324,
478
+ "step": 385
479
+ },
480
+ {
481
+ "epoch": 1.43,
482
+ "learning_rate": 5.333506393059682e-05,
483
+ "loss": 3.6562,
484
+ "step": 390
485
+ },
486
+ {
487
+ "epoch": 1.45,
488
+ "learning_rate": 5.720636894392822e-05,
489
+ "loss": 3.6013,
490
+ "step": 395
491
+ },
492
+ {
493
+ "epoch": 1.47,
494
+ "learning_rate": 6.11153839798114e-05,
495
+ "loss": 3.6502,
496
+ "step": 400
497
+ },
498
+ {
499
+ "epoch": 1.48,
500
+ "learning_rate": 6.504917119214327e-05,
501
+ "loss": 3.469,
502
+ "step": 405
503
+ },
504
+ {
505
+ "epoch": 1.5,
506
+ "learning_rate": 6.899471074521375e-05,
507
+ "loss": 3.5713,
508
+ "step": 410
509
+ },
510
+ {
511
+ "epoch": 1.52,
512
+ "learning_rate": 7.293894390605244e-05,
513
+ "loss": 3.5885,
514
+ "step": 415
515
+ },
516
+ {
517
+ "epoch": 1.54,
518
+ "learning_rate": 7.686881626551516e-05,
519
+ "loss": 3.6544,
520
+ "step": 420
521
+ },
522
+ {
523
+ "epoch": 1.56,
524
+ "learning_rate": 8.077132094505965e-05,
525
+ "loss": 3.5478,
526
+ "step": 425
527
+ },
528
+ {
529
+ "epoch": 1.58,
530
+ "learning_rate": 8.463354164620745e-05,
531
+ "loss": 3.5412,
532
+ "step": 430
533
+ },
534
+ {
535
+ "epoch": 1.59,
536
+ "learning_rate": 8.844269540020853e-05,
537
+ "loss": 3.4616,
538
+ "step": 435
539
+ },
540
+ {
541
+ "epoch": 1.61,
542
+ "learning_rate": 9.218617487641824e-05,
543
+ "loss": 3.6015,
544
+ "step": 440
545
+ },
546
+ {
547
+ "epoch": 1.63,
548
+ "learning_rate": 9.585159010935713e-05,
549
+ "loss": 3.5197,
550
+ "step": 445
551
+ },
552
+ {
553
+ "epoch": 1.65,
554
+ "learning_rate": 9.942680950634723e-05,
555
+ "loss": 3.4169,
556
+ "step": 450
557
+ },
558
+ {
559
+ "epoch": 1.67,
560
+ "learning_rate": 0.00010290000000000001,
561
+ "loss": 3.6481,
562
+ "step": 455
563
+ },
564
+ {
565
+ "epoch": 1.68,
566
+ "learning_rate": 0.00010625966621266112,
567
+ "loss": 3.5313,
568
+ "step": 460
569
+ },
570
+ {
571
+ "epoch": 1.7,
572
+ "learning_rate": 0.00010949468850318882,
573
+ "loss": 3.5705,
574
+ "step": 465
575
+ },
576
+ {
577
+ "epoch": 1.72,
578
+ "learning_rate": 0.00011259435977013931,
579
+ "loss": 3.6504,
580
+ "step": 470
581
+ },
582
+ {
583
+ "epoch": 1.74,
584
+ "learning_rate": 0.00011554842088955072,
585
+ "loss": 3.5506,
586
+ "step": 475
587
+ },
588
+ {
589
+ "epoch": 1.76,
590
+ "learning_rate": 0.00011834709467003562,
591
+ "loss": 3.6447,
592
+ "step": 480
593
+ },
594
+ {
595
+ "epoch": 1.78,
596
+ "learning_rate": 0.00012098111821279934,
597
+ "loss": 3.6195,
598
+ "step": 485
599
+ },
600
+ {
601
+ "epoch": 1.79,
602
+ "learning_rate": 0.0001234417735694802,
603
+ "loss": 3.6632,
604
+ "step": 490
605
+ },
606
+ {
607
+ "epoch": 1.81,
608
+ "learning_rate": 0.00012572091659634235,
609
+ "loss": 3.4833,
610
+ "step": 495
611
+ },
612
+ {
613
+ "epoch": 1.83,
614
+ "learning_rate": 0.0001278110039093217,
615
+ "loss": 3.5942,
616
+ "step": 500
617
+ },
618
+ {
619
+ "epoch": 1.85,
620
+ "learning_rate": 0.0001297051178507093,
621
+ "loss": 3.599,
622
+ "step": 505
623
+ },
624
+ {
625
+ "epoch": 1.87,
626
+ "learning_rate": 0.00013139698938484013,
627
+ "loss": 3.6315,
628
+ "step": 510
629
+ },
630
+ {
631
+ "epoch": 1.89,
632
+ "learning_rate": 0.00013288101884700735,
633
+ "loss": 3.6382,
634
+ "step": 515
635
+ },
636
+ {
637
+ "epoch": 1.9,
638
+ "learning_rate": 0.00013415229447692924,
639
+ "loss": 3.6227,
640
+ "step": 520
641
+ },
642
+ {
643
+ "epoch": 1.92,
644
+ "learning_rate": 0.00013520660867542716,
645
+ "loss": 3.4538,
646
+ "step": 525
647
+ },
648
+ {
649
+ "epoch": 1.94,
650
+ "learning_rate": 0.00013604047193050914,
651
+ "loss": 3.4112,
652
+ "step": 530
653
+ },
654
+ {
655
+ "epoch": 1.96,
656
+ "learning_rate": 0.00013665112436676765,
657
+ "loss": 3.6708,
658
+ "step": 535
659
+ },
660
+ {
661
+ "epoch": 1.98,
662
+ "learning_rate": 0.00013703654487986559,
663
+ "loss": 3.5041,
664
+ "step": 540
665
+ },
666
+ {
667
+ "epoch": 2.0,
668
+ "learning_rate": 0.00013719545782587793,
669
+ "loss": 3.6424,
670
+ "step": 545
671
+ },
672
+ {
673
+ "epoch": 2.0,
674
+ "eval_loss": 3.619663715362549,
675
+ "eval_runtime": 10.1067,
676
+ "eval_samples_per_second": 39.676,
677
+ "eval_steps_per_second": 5.046,
678
+ "step": 546
679
+ },
680
+ {
681
+ "epoch": 2.01,
682
+ "learning_rate": 0.0001371273372433488,
683
+ "loss": 3.4712,
684
+ "step": 550
685
+ },
686
+ {
687
+ "epoch": 2.03,
688
+ "learning_rate": 0.0001368324085940902,
689
+ "loss": 3.4434,
690
+ "step": 555
691
+ },
692
+ {
693
+ "epoch": 2.05,
694
+ "learning_rate": 0.00013631164801696085,
695
+ "loss": 3.3224,
696
+ "step": 560
697
+ },
698
+ {
699
+ "epoch": 2.07,
700
+ "learning_rate": 0.00013556677909709434,
701
+ "loss": 3.3818,
702
+ "step": 565
703
+ },
704
+ {
705
+ "epoch": 2.09,
706
+ "learning_rate": 0.000134600267161271,
707
+ "loss": 3.3693,
708
+ "step": 570
709
+ },
710
+ {
711
+ "epoch": 2.11,
712
+ "learning_rate": 0.00013341531111831246,
713
+ "loss": 3.5516,
714
+ "step": 575
715
+ },
716
+ {
717
+ "epoch": 2.12,
718
+ "learning_rate": 0.00013201583287150687,
719
+ "loss": 3.3951,
720
+ "step": 580
721
+ },
722
+ {
723
+ "epoch": 2.14,
724
+ "learning_rate": 0.00013040646433810595,
725
+ "loss": 3.4224,
726
+ "step": 585
727
+ },
728
+ {
729
+ "epoch": 2.16,
730
+ "learning_rate": 0.00012859253211885616,
731
+ "loss": 3.4472,
732
+ "step": 590
733
+ },
734
+ {
735
+ "epoch": 2.18,
736
+ "learning_rate": 0.00012658003986830435,
737
+ "loss": 3.5578,
738
+ "step": 595
739
+ },
740
+ {
741
+ "epoch": 2.2,
742
+ "learning_rate": 0.00012437564842422732,
743
+ "loss": 3.3637,
744
+ "step": 600
745
+ },
746
+ {
747
+ "epoch": 2.22,
748
+ "learning_rate": 0.00012198665376195207,
749
+ "loss": 3.4449,
750
+ "step": 605
751
+ },
752
+ {
753
+ "epoch": 2.23,
754
+ "learning_rate": 0.00011942096284653183,
755
+ "loss": 3.4946,
756
+ "step": 610
757
+ },
758
+ {
759
+ "epoch": 2.25,
760
+ "learning_rate": 0.00011668706746270184,
761
+ "loss": 3.4818,
762
+ "step": 615
763
+ },
764
+ {
765
+ "epoch": 2.27,
766
+ "learning_rate": 0.00011379401610923057,
767
+ "loss": 3.4296,
768
+ "step": 620
769
+ },
770
+ {
771
+ "epoch": 2.29,
772
+ "learning_rate": 0.00011075138405068863,
773
+ "loss": 3.4033,
774
+ "step": 625
775
+ },
776
+ {
777
+ "epoch": 2.31,
778
+ "learning_rate": 0.00010756924162575734,
779
+ "loss": 3.3291,
780
+ "step": 630
781
+ },
782
+ {
783
+ "epoch": 2.33,
784
+ "learning_rate": 0.00010425812091696729,
785
+ "loss": 3.1571,
786
+ "step": 635
787
+ },
788
+ {
789
+ "epoch": 2.34,
790
+ "learning_rate": 0.00010082898089218288,
791
+ "loss": 3.5678,
792
+ "step": 640
793
+ },
794
+ {
795
+ "epoch": 2.36,
796
+ "learning_rate": 9.72931711332038e-05,
797
+ "loss": 3.3972,
798
+ "step": 645
799
+ },
800
+ {
801
+ "epoch": 2.38,
802
+ "learning_rate": 9.36623942715347e-05,
803
+ "loss": 3.3953,
804
+ "step": 650
805
+ },
806
+ {
807
+ "epoch": 2.4,
808
+ "learning_rate": 8.99486672556498e-05,
809
+ "loss": 3.446,
810
+ "step": 655
811
+ },
812
+ {
813
+ "epoch": 2.42,
814
+ "learning_rate": 8.616428157794779e-05,
815
+ "loss": 3.3233,
816
+ "step": 660
817
+ },
818
+ {
819
+ "epoch": 2.44,
820
+ "learning_rate": 8.232176259303673e-05,
821
+ "loss": 3.3524,
822
+ "step": 665
823
+ },
824
+ {
825
+ "epoch": 2.45,
826
+ "learning_rate": 7.843382806199401e-05,
827
+ "loss": 3.3982,
828
+ "step": 670
829
+ },
830
+ {
831
+ "epoch": 2.47,
832
+ "learning_rate": 7.451334605981051e-05,
833
+ "loss": 3.5036,
834
+ "step": 675
835
+ },
836
+ {
837
+ "epoch": 2.49,
838
+ "learning_rate": 7.05732923853327e-05,
839
+ "loss": 3.3534,
840
+ "step": 680
841
+ },
842
+ {
843
+ "epoch": 2.51,
844
+ "learning_rate": 6.662670761466734e-05,
845
+ "loss": 3.4981,
846
+ "step": 685
847
+ },
848
+ {
849
+ "epoch": 2.53,
850
+ "learning_rate": 6.268665394018953e-05,
851
+ "loss": 3.3561,
852
+ "step": 690
853
+ },
854
+ {
855
+ "epoch": 2.55,
856
+ "learning_rate": 5.876617193800604e-05,
857
+ "loss": 3.3629,
858
+ "step": 695
859
+ },
860
+ {
861
+ "epoch": 2.56,
862
+ "learning_rate": 5.4878237406963316e-05,
863
+ "loss": 3.478,
864
+ "step": 700
865
+ },
866
+ {
867
+ "epoch": 2.58,
868
+ "learning_rate": 5.103571842205231e-05,
869
+ "loss": 3.2104,
870
+ "step": 705
871
+ },
872
+ {
873
+ "epoch": 2.6,
874
+ "learning_rate": 4.7251332744350254e-05,
875
+ "loss": 3.5289,
876
+ "step": 710
877
+ },
878
+ {
879
+ "epoch": 2.62,
880
+ "learning_rate": 4.3537605728465284e-05,
881
+ "loss": 3.2297,
882
+ "step": 715
883
+ },
884
+ {
885
+ "epoch": 2.64,
886
+ "learning_rate": 3.990682886679629e-05,
887
+ "loss": 3.4077,
888
+ "step": 720
889
+ },
890
+ {
891
+ "epoch": 2.66,
892
+ "learning_rate": 3.637101910781716e-05,
893
+ "loss": 3.4158,
894
+ "step": 725
895
+ },
896
+ {
897
+ "epoch": 2.67,
898
+ "learning_rate": 3.294187908303268e-05,
899
+ "loss": 3.3786,
900
+ "step": 730
901
+ },
902
+ {
903
+ "epoch": 2.69,
904
+ "learning_rate": 2.9630758374242683e-05,
905
+ "loss": 3.4328,
906
+ "step": 735
907
+ },
908
+ {
909
+ "epoch": 2.71,
910
+ "learning_rate": 2.6448615949311343e-05,
911
+ "loss": 3.469,
912
+ "step": 740
913
+ },
914
+ {
915
+ "epoch": 2.73,
916
+ "learning_rate": 2.340598389076947e-05,
917
+ "loss": 3.3519,
918
+ "step": 745
919
+ },
920
+ {
921
+ "epoch": 2.75,
922
+ "learning_rate": 2.051293253729814e-05,
923
+ "loss": 3.2819,
924
+ "step": 750
925
+ },
926
+ {
927
+ "epoch": 2.77,
928
+ "learning_rate": 1.7779037153468233e-05,
929
+ "loss": 3.2787,
930
+ "step": 755
931
+ },
932
+ {
933
+ "epoch": 2.78,
934
+ "learning_rate": 1.521334623804796e-05,
935
+ "loss": 3.2639,
936
+ "step": 760
937
+ },
938
+ {
939
+ "epoch": 2.8,
940
+ "learning_rate": 1.2824351575772677e-05,
941
+ "loss": 3.3124,
942
+ "step": 765
943
+ },
944
+ {
945
+ "epoch": 2.82,
946
+ "learning_rate": 1.0619960131695668e-05,
947
+ "loss": 3.2562,
948
+ "step": 770
949
+ },
950
+ {
951
+ "epoch": 2.84,
952
+ "learning_rate": 8.607467881143831e-06,
953
+ "loss": 3.1959,
954
+ "step": 775
955
+ },
956
+ {
957
+ "epoch": 2.86,
958
+ "learning_rate": 6.793535661894062e-06,
959
+ "loss": 3.2322,
960
+ "step": 780
961
+ },
962
+ {
963
+ "epoch": 2.88,
964
+ "learning_rate": 5.184167128493107e-06,
965
+ "loss": 3.5785,
966
+ "step": 785
967
+ },
968
+ {
969
+ "epoch": 2.89,
970
+ "learning_rate": 3.784688881687565e-06,
971
+ "loss": 3.4374,
972
+ "step": 790
973
+ },
974
+ {
975
+ "epoch": 2.91,
976
+ "learning_rate": 2.599732838729015e-06,
977
+ "loss": 3.4859,
978
+ "step": 795
979
+ },
980
+ {
981
+ "epoch": 2.93,
982
+ "learning_rate": 1.6332209029056513e-06,
983
+ "loss": 3.396,
984
+ "step": 800
985
+ },
986
+ {
987
+ "epoch": 2.95,
988
+ "learning_rate": 8.883519830391712e-07,
989
+ "loss": 3.3425,
990
+ "step": 805
991
+ },
992
+ {
993
+ "epoch": 2.97,
994
+ "learning_rate": 3.6759140590977833e-07,
995
+ "loss": 3.241,
996
+ "step": 810
997
+ },
998
+ {
999
+ "epoch": 2.99,
1000
+ "learning_rate": 7.266275665120308e-08,
1001
+ "loss": 3.3081,
1002
+ "step": 815
1003
+ },
1004
+ {
1005
+ "epoch": 3.0,
1006
+ "eval_loss": 3.553039312362671,
1007
+ "eval_runtime": 10.0835,
1008
+ "eval_samples_per_second": 39.768,
1009
+ "eval_steps_per_second": 5.058,
1010
+ "step": 819
1011
+ }
1012
+ ],
1013
+ "max_steps": 819,
1014
+ "num_train_epochs": 3,
1015
+ "total_flos": 855992696832000.0,
1016
+ "trial_name": null,
1017
+ "trial_params": null
1018
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5403dc0eac98a0673a2985e0248bb53a70fadc52734e92f64f79f2e5d172b8bb
3
+ size 3247
vocab.json ADDED
The diff for this file is too large to render. See raw diff