PaulDrm commited on
Commit
9d26fad
1 Parent(s): a5a4154

first commit model

Browse files
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.13.0.dev0",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d13e896a4176b834de04ead67ac56f152ef86c826e79e6cc214457c9853869d
3
+ size 997696473
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f791190f60ceb242946db533b01262507c89022f36eb3746e39c9ab65e60c9d
3
+ size 498875371
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e49c7fbb73c8f0a1ba658bbde48e5c945b75819c14ba504d0242181057af6aa8
3
+ size 17563
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d0c776f9430e2857f421df871588edf488262f8e998f8a9e39cccc73085f1c
3
+ size 623
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,1096 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 35.026963262554766,
5
+ "global_step": 12960,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.32,
12
+ "learning_rate": 8e-05,
13
+ "loss": 1.9241,
14
+ "step": 120
15
+ },
16
+ {
17
+ "epoch": 0.65,
18
+ "learning_rate": 0.00016,
19
+ "loss": 1.8026,
20
+ "step": 240
21
+ },
22
+ {
23
+ "epoch": 0.65,
24
+ "eval_loss": 1.7006735801696777,
25
+ "eval_runtime": 120.5725,
26
+ "eval_samples_per_second": 46.437,
27
+ "eval_steps_per_second": 0.73,
28
+ "step": 240
29
+ },
30
+ {
31
+ "epoch": 0.97,
32
+ "learning_rate": 0.0002,
33
+ "loss": 1.7588,
34
+ "step": 360
35
+ },
36
+ {
37
+ "epoch": 1.3,
38
+ "learning_rate": 0.0002,
39
+ "loss": 1.7242,
40
+ "step": 480
41
+ },
42
+ {
43
+ "epoch": 1.3,
44
+ "eval_loss": 1.6368365287780762,
45
+ "eval_runtime": 123.5326,
46
+ "eval_samples_per_second": 45.324,
47
+ "eval_steps_per_second": 0.712,
48
+ "step": 480
49
+ },
50
+ {
51
+ "epoch": 1.62,
52
+ "learning_rate": 0.0002,
53
+ "loss": 1.6797,
54
+ "step": 600
55
+ },
56
+ {
57
+ "epoch": 1.94,
58
+ "learning_rate": 0.0002,
59
+ "loss": 1.6544,
60
+ "step": 720
61
+ },
62
+ {
63
+ "epoch": 1.94,
64
+ "eval_loss": 1.589858889579773,
65
+ "eval_runtime": 121.9204,
66
+ "eval_samples_per_second": 45.923,
67
+ "eval_steps_per_second": 0.722,
68
+ "step": 720
69
+ },
70
+ {
71
+ "epoch": 2.27,
72
+ "learning_rate": 0.0002,
73
+ "loss": 1.639,
74
+ "step": 840
75
+ },
76
+ {
77
+ "epoch": 2.59,
78
+ "learning_rate": 0.0002,
79
+ "loss": 1.6103,
80
+ "step": 960
81
+ },
82
+ {
83
+ "epoch": 2.59,
84
+ "eval_loss": 1.559193730354309,
85
+ "eval_runtime": 118.3836,
86
+ "eval_samples_per_second": 47.295,
87
+ "eval_steps_per_second": 0.743,
88
+ "step": 960
89
+ },
90
+ {
91
+ "epoch": 2.92,
92
+ "learning_rate": 0.0002,
93
+ "loss": 1.5982,
94
+ "step": 1080
95
+ },
96
+ {
97
+ "epoch": 3.24,
98
+ "learning_rate": 0.0002,
99
+ "loss": 1.5858,
100
+ "step": 1200
101
+ },
102
+ {
103
+ "epoch": 3.24,
104
+ "eval_loss": 1.5362491607666016,
105
+ "eval_runtime": 123.4422,
106
+ "eval_samples_per_second": 45.357,
107
+ "eval_steps_per_second": 0.713,
108
+ "step": 1200
109
+ },
110
+ {
111
+ "epoch": 3.57,
112
+ "learning_rate": 0.0002,
113
+ "loss": 1.5684,
114
+ "step": 1320
115
+ },
116
+ {
117
+ "epoch": 3.89,
118
+ "learning_rate": 0.0002,
119
+ "loss": 1.5566,
120
+ "step": 1440
121
+ },
122
+ {
123
+ "epoch": 3.89,
124
+ "eval_loss": 1.51528799533844,
125
+ "eval_runtime": 120.2858,
126
+ "eval_samples_per_second": 46.547,
127
+ "eval_steps_per_second": 0.732,
128
+ "step": 1440
129
+ },
130
+ {
131
+ "epoch": 4.22,
132
+ "learning_rate": 0.0002,
133
+ "loss": 1.5593,
134
+ "step": 1560
135
+ },
136
+ {
137
+ "epoch": 4.54,
138
+ "learning_rate": 0.0002,
139
+ "loss": 1.5322,
140
+ "step": 1680
141
+ },
142
+ {
143
+ "epoch": 4.54,
144
+ "eval_loss": 1.5114836692810059,
145
+ "eval_runtime": 117.8482,
146
+ "eval_samples_per_second": 47.51,
147
+ "eval_steps_per_second": 0.747,
148
+ "step": 1680
149
+ },
150
+ {
151
+ "epoch": 4.86,
152
+ "learning_rate": 0.0002,
153
+ "loss": 1.5285,
154
+ "step": 1800
155
+ },
156
+ {
157
+ "epoch": 5.19,
158
+ "learning_rate": 0.0002,
159
+ "loss": 1.5359,
160
+ "step": 1920
161
+ },
162
+ {
163
+ "epoch": 5.19,
164
+ "eval_loss": 1.48625910282135,
165
+ "eval_runtime": 123.7493,
166
+ "eval_samples_per_second": 45.245,
167
+ "eval_steps_per_second": 0.711,
168
+ "step": 1920
169
+ },
170
+ {
171
+ "epoch": 5.51,
172
+ "learning_rate": 0.0002,
173
+ "loss": 1.5207,
174
+ "step": 2040
175
+ },
176
+ {
177
+ "epoch": 5.84,
178
+ "learning_rate": 0.0002,
179
+ "loss": 1.5079,
180
+ "step": 2160
181
+ },
182
+ {
183
+ "epoch": 5.84,
184
+ "eval_loss": 1.4822603464126587,
185
+ "eval_runtime": 119.2091,
186
+ "eval_samples_per_second": 46.968,
187
+ "eval_steps_per_second": 0.738,
188
+ "step": 2160
189
+ },
190
+ {
191
+ "epoch": 6.16,
192
+ "learning_rate": 0.0002,
193
+ "loss": 1.51,
194
+ "step": 2280
195
+ },
196
+ {
197
+ "epoch": 6.49,
198
+ "learning_rate": 0.0002,
199
+ "loss": 1.4909,
200
+ "step": 2400
201
+ },
202
+ {
203
+ "epoch": 6.49,
204
+ "eval_loss": 1.4646539688110352,
205
+ "eval_runtime": 122.8427,
206
+ "eval_samples_per_second": 45.579,
207
+ "eval_steps_per_second": 0.716,
208
+ "step": 2400
209
+ },
210
+ {
211
+ "epoch": 6.81,
212
+ "learning_rate": 0.0002,
213
+ "loss": 1.4869,
214
+ "step": 2520
215
+ },
216
+ {
217
+ "epoch": 7.13,
218
+ "learning_rate": 0.0002,
219
+ "loss": 1.4894,
220
+ "step": 2640
221
+ },
222
+ {
223
+ "epoch": 7.13,
224
+ "eval_loss": 1.4567737579345703,
225
+ "eval_runtime": 112.4698,
226
+ "eval_samples_per_second": 49.782,
227
+ "eval_steps_per_second": 0.782,
228
+ "step": 2640
229
+ },
230
+ {
231
+ "epoch": 7.46,
232
+ "learning_rate": 0.0002,
233
+ "loss": 1.4705,
234
+ "step": 2760
235
+ },
236
+ {
237
+ "epoch": 7.78,
238
+ "learning_rate": 0.0002,
239
+ "loss": 1.469,
240
+ "step": 2880
241
+ },
242
+ {
243
+ "epoch": 7.78,
244
+ "eval_loss": 1.447322130203247,
245
+ "eval_runtime": 124.434,
246
+ "eval_samples_per_second": 44.996,
247
+ "eval_steps_per_second": 0.707,
248
+ "step": 2880
249
+ },
250
+ {
251
+ "epoch": 8.11,
252
+ "learning_rate": 0.0002,
253
+ "loss": 1.4716,
254
+ "step": 3000
255
+ },
256
+ {
257
+ "epoch": 8.43,
258
+ "learning_rate": 0.0002,
259
+ "loss": 1.4525,
260
+ "step": 3120
261
+ },
262
+ {
263
+ "epoch": 8.43,
264
+ "eval_loss": 1.4480490684509277,
265
+ "eval_runtime": 120.9825,
266
+ "eval_samples_per_second": 46.279,
267
+ "eval_steps_per_second": 0.727,
268
+ "step": 3120
269
+ },
270
+ {
271
+ "epoch": 8.75,
272
+ "learning_rate": 0.0002,
273
+ "loss": 1.452,
274
+ "step": 3240
275
+ },
276
+ {
277
+ "epoch": 9.08,
278
+ "learning_rate": 0.0002,
279
+ "loss": 1.4552,
280
+ "step": 3360
281
+ },
282
+ {
283
+ "epoch": 9.08,
284
+ "eval_loss": 1.4297771453857422,
285
+ "eval_runtime": 119.4349,
286
+ "eval_samples_per_second": 46.879,
287
+ "eval_steps_per_second": 0.737,
288
+ "step": 3360
289
+ },
290
+ {
291
+ "epoch": 9.4,
292
+ "learning_rate": 0.0002,
293
+ "loss": 1.4369,
294
+ "step": 3480
295
+ },
296
+ {
297
+ "epoch": 9.73,
298
+ "learning_rate": 0.0002,
299
+ "loss": 1.4357,
300
+ "step": 3600
301
+ },
302
+ {
303
+ "epoch": 9.73,
304
+ "eval_loss": 1.4253787994384766,
305
+ "eval_runtime": 123.7286,
306
+ "eval_samples_per_second": 45.252,
307
+ "eval_steps_per_second": 0.711,
308
+ "step": 3600
309
+ },
310
+ {
311
+ "epoch": 10.05,
312
+ "learning_rate": 0.0002,
313
+ "loss": 1.4449,
314
+ "step": 3720
315
+ },
316
+ {
317
+ "epoch": 10.38,
318
+ "learning_rate": 0.0002,
319
+ "loss": 1.4245,
320
+ "step": 3840
321
+ },
322
+ {
323
+ "epoch": 10.38,
324
+ "eval_loss": 1.419893741607666,
325
+ "eval_runtime": 122.5962,
326
+ "eval_samples_per_second": 45.67,
327
+ "eval_steps_per_second": 0.718,
328
+ "step": 3840
329
+ },
330
+ {
331
+ "epoch": 10.7,
332
+ "learning_rate": 0.0002,
333
+ "loss": 1.4259,
334
+ "step": 3960
335
+ },
336
+ {
337
+ "epoch": 11.03,
338
+ "learning_rate": 0.0002,
339
+ "loss": 1.4317,
340
+ "step": 4080
341
+ },
342
+ {
343
+ "epoch": 11.03,
344
+ "eval_loss": 1.4151264429092407,
345
+ "eval_runtime": 120.6018,
346
+ "eval_samples_per_second": 46.426,
347
+ "eval_steps_per_second": 0.73,
348
+ "step": 4080
349
+ },
350
+ {
351
+ "epoch": 11.35,
352
+ "learning_rate": 0.0002,
353
+ "loss": 1.4133,
354
+ "step": 4200
355
+ },
356
+ {
357
+ "epoch": 11.67,
358
+ "learning_rate": 0.0002,
359
+ "loss": 1.4119,
360
+ "step": 4320
361
+ },
362
+ {
363
+ "epoch": 11.67,
364
+ "eval_loss": 1.4069455862045288,
365
+ "eval_runtime": 123.9031,
366
+ "eval_samples_per_second": 45.189,
367
+ "eval_steps_per_second": 0.71,
368
+ "step": 4320
369
+ },
370
+ {
371
+ "epoch": 12.0,
372
+ "learning_rate": 0.0002,
373
+ "loss": 1.4096,
374
+ "step": 4440
375
+ },
376
+ {
377
+ "epoch": 12.32,
378
+ "learning_rate": 0.0002,
379
+ "loss": 1.4086,
380
+ "step": 4560
381
+ },
382
+ {
383
+ "epoch": 12.32,
384
+ "eval_loss": 1.4099173545837402,
385
+ "eval_runtime": 121.1011,
386
+ "eval_samples_per_second": 46.234,
387
+ "eval_steps_per_second": 0.727,
388
+ "step": 4560
389
+ },
390
+ {
391
+ "epoch": 12.65,
392
+ "learning_rate": 0.0002,
393
+ "loss": 1.4031,
394
+ "step": 4680
395
+ },
396
+ {
397
+ "epoch": 12.97,
398
+ "learning_rate": 0.0002,
399
+ "loss": 1.401,
400
+ "step": 4800
401
+ },
402
+ {
403
+ "epoch": 12.97,
404
+ "eval_loss": 1.4046831130981445,
405
+ "eval_runtime": 121.8177,
406
+ "eval_samples_per_second": 45.962,
407
+ "eval_steps_per_second": 0.722,
408
+ "step": 4800
409
+ },
410
+ {
411
+ "epoch": 13.3,
412
+ "learning_rate": 0.0002,
413
+ "loss": 1.4031,
414
+ "step": 4920
415
+ },
416
+ {
417
+ "epoch": 13.62,
418
+ "learning_rate": 0.0002,
419
+ "loss": 1.394,
420
+ "step": 5040
421
+ },
422
+ {
423
+ "epoch": 13.62,
424
+ "eval_loss": 1.401537299156189,
425
+ "eval_runtime": 121.4356,
426
+ "eval_samples_per_second": 46.107,
427
+ "eval_steps_per_second": 0.725,
428
+ "step": 5040
429
+ },
430
+ {
431
+ "epoch": 13.94,
432
+ "learning_rate": 0.0002,
433
+ "loss": 1.3922,
434
+ "step": 5160
435
+ },
436
+ {
437
+ "epoch": 14.27,
438
+ "learning_rate": 0.0002,
439
+ "loss": 1.3945,
440
+ "step": 5280
441
+ },
442
+ {
443
+ "epoch": 14.27,
444
+ "eval_loss": 1.3918230533599854,
445
+ "eval_runtime": 119.2233,
446
+ "eval_samples_per_second": 46.962,
447
+ "eval_steps_per_second": 0.738,
448
+ "step": 5280
449
+ },
450
+ {
451
+ "epoch": 14.59,
452
+ "learning_rate": 0.0002,
453
+ "loss": 1.3836,
454
+ "step": 5400
455
+ },
456
+ {
457
+ "epoch": 14.92,
458
+ "learning_rate": 0.0002,
459
+ "loss": 1.3838,
460
+ "step": 5520
461
+ },
462
+ {
463
+ "epoch": 14.92,
464
+ "eval_loss": 1.385350227355957,
465
+ "eval_runtime": 113.4489,
466
+ "eval_samples_per_second": 49.353,
467
+ "eval_steps_per_second": 0.776,
468
+ "step": 5520
469
+ },
470
+ {
471
+ "epoch": 15.24,
472
+ "learning_rate": 0.0002,
473
+ "loss": 1.387,
474
+ "step": 5640
475
+ },
476
+ {
477
+ "epoch": 15.57,
478
+ "learning_rate": 0.0002,
479
+ "loss": 1.3722,
480
+ "step": 5760
481
+ },
482
+ {
483
+ "epoch": 15.57,
484
+ "eval_loss": 1.379088282585144,
485
+ "eval_runtime": 116.4932,
486
+ "eval_samples_per_second": 48.063,
487
+ "eval_steps_per_second": 0.755,
488
+ "step": 5760
489
+ },
490
+ {
491
+ "epoch": 15.89,
492
+ "learning_rate": 0.0002,
493
+ "loss": 1.3757,
494
+ "step": 5880
495
+ },
496
+ {
497
+ "epoch": 16.22,
498
+ "learning_rate": 0.0002,
499
+ "loss": 1.3775,
500
+ "step": 6000
501
+ },
502
+ {
503
+ "epoch": 16.22,
504
+ "eval_loss": 1.384007453918457,
505
+ "eval_runtime": 115.8099,
506
+ "eval_samples_per_second": 48.346,
507
+ "eval_steps_per_second": 0.76,
508
+ "step": 6000
509
+ },
510
+ {
511
+ "epoch": 16.54,
512
+ "learning_rate": 0.0002,
513
+ "loss": 1.3683,
514
+ "step": 6120
515
+ },
516
+ {
517
+ "epoch": 16.86,
518
+ "learning_rate": 0.0002,
519
+ "loss": 1.3675,
520
+ "step": 6240
521
+ },
522
+ {
523
+ "epoch": 16.86,
524
+ "eval_loss": 1.3760778903961182,
525
+ "eval_runtime": 113.2638,
526
+ "eval_samples_per_second": 49.433,
527
+ "eval_steps_per_second": 0.777,
528
+ "step": 6240
529
+ },
530
+ {
531
+ "epoch": 17.19,
532
+ "learning_rate": 0.0002,
533
+ "loss": 1.375,
534
+ "step": 6360
535
+ },
536
+ {
537
+ "epoch": 17.51,
538
+ "learning_rate": 0.0002,
539
+ "loss": 1.358,
540
+ "step": 6480
541
+ },
542
+ {
543
+ "epoch": 17.51,
544
+ "eval_loss": 1.3729970455169678,
545
+ "eval_runtime": 119.1962,
546
+ "eval_samples_per_second": 46.973,
547
+ "eval_steps_per_second": 0.738,
548
+ "step": 6480
549
+ },
550
+ {
551
+ "epoch": 17.84,
552
+ "learning_rate": 0.0002,
553
+ "loss": 1.3617,
554
+ "step": 6600
555
+ },
556
+ {
557
+ "epoch": 18.16,
558
+ "learning_rate": 0.0002,
559
+ "loss": 1.3679,
560
+ "step": 6720
561
+ },
562
+ {
563
+ "epoch": 18.16,
564
+ "eval_loss": 1.3826600313186646,
565
+ "eval_runtime": 118.9849,
566
+ "eval_samples_per_second": 47.056,
567
+ "eval_steps_per_second": 0.74,
568
+ "step": 6720
569
+ },
570
+ {
571
+ "epoch": 18.49,
572
+ "learning_rate": 0.0002,
573
+ "loss": 1.3592,
574
+ "step": 6840
575
+ },
576
+ {
577
+ "epoch": 18.81,
578
+ "learning_rate": 0.0002,
579
+ "loss": 1.3602,
580
+ "step": 6960
581
+ },
582
+ {
583
+ "epoch": 18.81,
584
+ "eval_loss": 1.3659363985061646,
585
+ "eval_runtime": 120.7081,
586
+ "eval_samples_per_second": 46.385,
587
+ "eval_steps_per_second": 0.729,
588
+ "step": 6960
589
+ },
590
+ {
591
+ "epoch": 19.13,
592
+ "learning_rate": 0.0002,
593
+ "loss": 1.3633,
594
+ "step": 7080
595
+ },
596
+ {
597
+ "epoch": 19.46,
598
+ "learning_rate": 0.0002,
599
+ "loss": 1.3522,
600
+ "step": 7200
601
+ },
602
+ {
603
+ "epoch": 19.46,
604
+ "eval_loss": 1.372406244277954,
605
+ "eval_runtime": 113.6178,
606
+ "eval_samples_per_second": 49.279,
607
+ "eval_steps_per_second": 0.775,
608
+ "step": 7200
609
+ },
610
+ {
611
+ "epoch": 19.78,
612
+ "learning_rate": 0.0002,
613
+ "loss": 1.345,
614
+ "step": 7320
615
+ },
616
+ {
617
+ "epoch": 20.11,
618
+ "learning_rate": 0.0002,
619
+ "loss": 1.3555,
620
+ "step": 7440
621
+ },
622
+ {
623
+ "epoch": 20.11,
624
+ "eval_loss": 1.368371844291687,
625
+ "eval_runtime": 118.9369,
626
+ "eval_samples_per_second": 47.075,
627
+ "eval_steps_per_second": 0.74,
628
+ "step": 7440
629
+ },
630
+ {
631
+ "epoch": 20.43,
632
+ "learning_rate": 0.0002,
633
+ "loss": 1.3396,
634
+ "step": 7560
635
+ },
636
+ {
637
+ "epoch": 20.75,
638
+ "learning_rate": 0.0002,
639
+ "loss": 1.3536,
640
+ "step": 7680
641
+ },
642
+ {
643
+ "epoch": 20.75,
644
+ "eval_loss": 1.3611598014831543,
645
+ "eval_runtime": 119.3386,
646
+ "eval_samples_per_second": 46.917,
647
+ "eval_steps_per_second": 0.737,
648
+ "step": 7680
649
+ },
650
+ {
651
+ "epoch": 21.08,
652
+ "learning_rate": 0.0002,
653
+ "loss": 1.3506,
654
+ "step": 7800
655
+ },
656
+ {
657
+ "epoch": 21.4,
658
+ "learning_rate": 0.0002,
659
+ "loss": 1.3347,
660
+ "step": 7920
661
+ },
662
+ {
663
+ "epoch": 21.4,
664
+ "eval_loss": 1.3598804473876953,
665
+ "eval_runtime": 114.0961,
666
+ "eval_samples_per_second": 49.073,
667
+ "eval_steps_per_second": 0.771,
668
+ "step": 7920
669
+ },
670
+ {
671
+ "epoch": 21.73,
672
+ "learning_rate": 0.0002,
673
+ "loss": 1.338,
674
+ "step": 8040
675
+ },
676
+ {
677
+ "epoch": 22.05,
678
+ "learning_rate": 0.0002,
679
+ "loss": 1.3463,
680
+ "step": 8160
681
+ },
682
+ {
683
+ "epoch": 22.05,
684
+ "eval_loss": 1.3614617586135864,
685
+ "eval_runtime": 121.7757,
686
+ "eval_samples_per_second": 45.978,
687
+ "eval_steps_per_second": 0.723,
688
+ "step": 8160
689
+ },
690
+ {
691
+ "epoch": 22.38,
692
+ "learning_rate": 0.0002,
693
+ "loss": 1.3305,
694
+ "step": 8280
695
+ },
696
+ {
697
+ "epoch": 22.7,
698
+ "learning_rate": 0.0002,
699
+ "loss": 1.3296,
700
+ "step": 8400
701
+ },
702
+ {
703
+ "epoch": 22.7,
704
+ "eval_loss": 1.359055519104004,
705
+ "eval_runtime": 113.3148,
706
+ "eval_samples_per_second": 49.411,
707
+ "eval_steps_per_second": 0.777,
708
+ "step": 8400
709
+ },
710
+ {
711
+ "epoch": 23.03,
712
+ "learning_rate": 0.0002,
713
+ "loss": 1.344,
714
+ "step": 8520
715
+ },
716
+ {
717
+ "epoch": 23.35,
718
+ "learning_rate": 0.0002,
719
+ "loss": 1.3201,
720
+ "step": 8640
721
+ },
722
+ {
723
+ "epoch": 23.35,
724
+ "eval_loss": 1.358960509300232,
725
+ "eval_runtime": 122.2886,
726
+ "eval_samples_per_second": 45.785,
727
+ "eval_steps_per_second": 0.72,
728
+ "step": 8640
729
+ },
730
+ {
731
+ "epoch": 23.67,
732
+ "learning_rate": 0.0002,
733
+ "loss": 1.3302,
734
+ "step": 8760
735
+ },
736
+ {
737
+ "epoch": 24.0,
738
+ "learning_rate": 0.0002,
739
+ "loss": 1.3292,
740
+ "step": 8880
741
+ },
742
+ {
743
+ "epoch": 24.0,
744
+ "eval_loss": 1.3509206771850586,
745
+ "eval_runtime": 99.6058,
746
+ "eval_samples_per_second": 56.212,
747
+ "eval_steps_per_second": 0.883,
748
+ "step": 8880
749
+ },
750
+ {
751
+ "epoch": 24.32,
752
+ "learning_rate": 0.0002,
753
+ "loss": 1.3294,
754
+ "step": 9000
755
+ },
756
+ {
757
+ "epoch": 24.65,
758
+ "learning_rate": 0.0002,
759
+ "loss": 1.3207,
760
+ "step": 9120
761
+ },
762
+ {
763
+ "epoch": 24.65,
764
+ "eval_loss": 1.357851505279541,
765
+ "eval_runtime": 105.9073,
766
+ "eval_samples_per_second": 52.867,
767
+ "eval_steps_per_second": 0.831,
768
+ "step": 9120
769
+ },
770
+ {
771
+ "epoch": 24.97,
772
+ "learning_rate": 0.0002,
773
+ "loss": 1.3215,
774
+ "step": 9240
775
+ },
776
+ {
777
+ "epoch": 25.3,
778
+ "learning_rate": 0.0002,
779
+ "loss": 1.3231,
780
+ "step": 9360
781
+ },
782
+ {
783
+ "epoch": 25.3,
784
+ "eval_loss": 1.3393853902816772,
785
+ "eval_runtime": 99.7219,
786
+ "eval_samples_per_second": 56.146,
787
+ "eval_steps_per_second": 0.882,
788
+ "step": 9360
789
+ },
790
+ {
791
+ "epoch": 25.62,
792
+ "learning_rate": 0.0002,
793
+ "loss": 1.3121,
794
+ "step": 9480
795
+ },
796
+ {
797
+ "epoch": 25.94,
798
+ "learning_rate": 0.0002,
799
+ "loss": 1.3176,
800
+ "step": 9600
801
+ },
802
+ {
803
+ "epoch": 25.94,
804
+ "eval_loss": 1.3441215753555298,
805
+ "eval_runtime": 101.3937,
806
+ "eval_samples_per_second": 55.22,
807
+ "eval_steps_per_second": 0.868,
808
+ "step": 9600
809
+ },
810
+ {
811
+ "epoch": 26.27,
812
+ "learning_rate": 0.0002,
813
+ "loss": 1.3188,
814
+ "step": 9720
815
+ },
816
+ {
817
+ "epoch": 26.59,
818
+ "learning_rate": 0.0002,
819
+ "loss": 1.3103,
820
+ "step": 9840
821
+ },
822
+ {
823
+ "epoch": 26.59,
824
+ "eval_loss": 1.3429008722305298,
825
+ "eval_runtime": 100.8116,
826
+ "eval_samples_per_second": 55.539,
827
+ "eval_steps_per_second": 0.873,
828
+ "step": 9840
829
+ },
830
+ {
831
+ "epoch": 26.92,
832
+ "learning_rate": 0.0002,
833
+ "loss": 1.313,
834
+ "step": 9960
835
+ },
836
+ {
837
+ "epoch": 27.24,
838
+ "learning_rate": 0.0002,
839
+ "loss": 1.3156,
840
+ "step": 10080
841
+ },
842
+ {
843
+ "epoch": 27.24,
844
+ "eval_loss": 1.3400343656539917,
845
+ "eval_runtime": 98.2948,
846
+ "eval_samples_per_second": 56.961,
847
+ "eval_steps_per_second": 0.895,
848
+ "step": 10080
849
+ },
850
+ {
851
+ "epoch": 27.57,
852
+ "learning_rate": 0.0002,
853
+ "loss": 1.3064,
854
+ "step": 10200
855
+ },
856
+ {
857
+ "epoch": 27.89,
858
+ "learning_rate": 0.0002,
859
+ "loss": 1.306,
860
+ "step": 10320
861
+ },
862
+ {
863
+ "epoch": 27.89,
864
+ "eval_loss": 1.339460015296936,
865
+ "eval_runtime": 97.8707,
866
+ "eval_samples_per_second": 57.208,
867
+ "eval_steps_per_second": 0.899,
868
+ "step": 10320
869
+ },
870
+ {
871
+ "epoch": 28.22,
872
+ "learning_rate": 0.0002,
873
+ "loss": 1.3093,
874
+ "step": 10440
875
+ },
876
+ {
877
+ "epoch": 28.54,
878
+ "learning_rate": 0.0002,
879
+ "loss": 1.3026,
880
+ "step": 10560
881
+ },
882
+ {
883
+ "epoch": 28.54,
884
+ "eval_loss": 1.3380861282348633,
885
+ "eval_runtime": 99.7827,
886
+ "eval_samples_per_second": 56.112,
887
+ "eval_steps_per_second": 0.882,
888
+ "step": 10560
889
+ },
890
+ {
891
+ "epoch": 28.86,
892
+ "learning_rate": 0.0002,
893
+ "loss": 1.3014,
894
+ "step": 10680
895
+ },
896
+ {
897
+ "epoch": 29.19,
898
+ "learning_rate": 0.0002,
899
+ "loss": 1.3093,
900
+ "step": 10800
901
+ },
902
+ {
903
+ "epoch": 29.19,
904
+ "eval_loss": 1.335351824760437,
905
+ "eval_runtime": 99.7514,
906
+ "eval_samples_per_second": 56.13,
907
+ "eval_steps_per_second": 0.882,
908
+ "step": 10800
909
+ },
910
+ {
911
+ "epoch": 29.51,
912
+ "learning_rate": 0.0002,
913
+ "loss": 1.2954,
914
+ "step": 10920
915
+ },
916
+ {
917
+ "epoch": 29.84,
918
+ "learning_rate": 0.0002,
919
+ "loss": 1.2982,
920
+ "step": 11040
921
+ },
922
+ {
923
+ "epoch": 29.84,
924
+ "eval_loss": 1.33037269115448,
925
+ "eval_runtime": 111.392,
926
+ "eval_samples_per_second": 50.264,
927
+ "eval_steps_per_second": 0.79,
928
+ "step": 11040
929
+ },
930
+ {
931
+ "epoch": 30.16,
932
+ "learning_rate": 0.0002,
933
+ "loss": 1.3032,
934
+ "step": 11160
935
+ },
936
+ {
937
+ "epoch": 30.49,
938
+ "learning_rate": 0.0002,
939
+ "loss": 1.2927,
940
+ "step": 11280
941
+ },
942
+ {
943
+ "epoch": 30.49,
944
+ "eval_loss": 1.3423055410385132,
945
+ "eval_runtime": 110.815,
946
+ "eval_samples_per_second": 50.526,
947
+ "eval_steps_per_second": 0.794,
948
+ "step": 11280
949
+ },
950
+ {
951
+ "epoch": 30.81,
952
+ "learning_rate": 0.0002,
953
+ "loss": 1.2968,
954
+ "step": 11400
955
+ },
956
+ {
957
+ "epoch": 31.13,
958
+ "learning_rate": 0.0002,
959
+ "loss": 1.3003,
960
+ "step": 11520
961
+ },
962
+ {
963
+ "epoch": 31.13,
964
+ "eval_loss": 1.3345474004745483,
965
+ "eval_runtime": 100.6956,
966
+ "eval_samples_per_second": 55.603,
967
+ "eval_steps_per_second": 0.874,
968
+ "step": 11520
969
+ },
970
+ {
971
+ "epoch": 31.46,
972
+ "learning_rate": 0.0002,
973
+ "loss": 1.2865,
974
+ "step": 11640
975
+ },
976
+ {
977
+ "epoch": 31.78,
978
+ "learning_rate": 0.0002,
979
+ "loss": 1.2928,
980
+ "step": 11760
981
+ },
982
+ {
983
+ "epoch": 31.78,
984
+ "eval_loss": 1.337437629699707,
985
+ "eval_runtime": 97.2235,
986
+ "eval_samples_per_second": 57.589,
987
+ "eval_steps_per_second": 0.905,
988
+ "step": 11760
989
+ },
990
+ {
991
+ "epoch": 32.11,
992
+ "learning_rate": 0.0002,
993
+ "loss": 1.2981,
994
+ "step": 11880
995
+ },
996
+ {
997
+ "epoch": 32.43,
998
+ "learning_rate": 0.0002,
999
+ "loss": 1.2847,
1000
+ "step": 12000
1001
+ },
1002
+ {
1003
+ "epoch": 32.43,
1004
+ "eval_loss": 1.3236644268035889,
1005
+ "eval_runtime": 97.4026,
1006
+ "eval_samples_per_second": 57.483,
1007
+ "eval_steps_per_second": 0.903,
1008
+ "step": 12000
1009
+ },
1010
+ {
1011
+ "epoch": 32.75,
1012
+ "learning_rate": 0.0002,
1013
+ "loss": 1.2871,
1014
+ "step": 12120
1015
+ },
1016
+ {
1017
+ "epoch": 33.08,
1018
+ "learning_rate": 0.0002,
1019
+ "loss": 1.2966,
1020
+ "step": 12240
1021
+ },
1022
+ {
1023
+ "epoch": 33.08,
1024
+ "eval_loss": 1.332656741142273,
1025
+ "eval_runtime": 97.3643,
1026
+ "eval_samples_per_second": 57.506,
1027
+ "eval_steps_per_second": 0.904,
1028
+ "step": 12240
1029
+ },
1030
+ {
1031
+ "epoch": 33.4,
1032
+ "learning_rate": 0.0002,
1033
+ "loss": 1.2789,
1034
+ "step": 12360
1035
+ },
1036
+ {
1037
+ "epoch": 33.73,
1038
+ "learning_rate": 0.0002,
1039
+ "loss": 1.2829,
1040
+ "step": 12480
1041
+ },
1042
+ {
1043
+ "epoch": 33.73,
1044
+ "eval_loss": 1.3252918720245361,
1045
+ "eval_runtime": 104.7279,
1046
+ "eval_samples_per_second": 53.462,
1047
+ "eval_steps_per_second": 0.84,
1048
+ "step": 12480
1049
+ },
1050
+ {
1051
+ "epoch": 34.05,
1052
+ "learning_rate": 0.0002,
1053
+ "loss": 1.2926,
1054
+ "step": 12600
1055
+ },
1056
+ {
1057
+ "epoch": 34.38,
1058
+ "learning_rate": 0.0002,
1059
+ "loss": 1.2756,
1060
+ "step": 12720
1061
+ },
1062
+ {
1063
+ "epoch": 34.38,
1064
+ "eval_loss": 1.326663613319397,
1065
+ "eval_runtime": 98.2526,
1066
+ "eval_samples_per_second": 56.986,
1067
+ "eval_steps_per_second": 0.896,
1068
+ "step": 12720
1069
+ },
1070
+ {
1071
+ "epoch": 34.7,
1072
+ "learning_rate": 0.0002,
1073
+ "loss": 1.2801,
1074
+ "step": 12840
1075
+ },
1076
+ {
1077
+ "epoch": 35.03,
1078
+ "learning_rate": 0.0002,
1079
+ "loss": 1.2919,
1080
+ "step": 12960
1081
+ },
1082
+ {
1083
+ "epoch": 35.03,
1084
+ "eval_loss": 1.3183717727661133,
1085
+ "eval_runtime": 99.1376,
1086
+ "eval_samples_per_second": 56.477,
1087
+ "eval_steps_per_second": 0.888,
1088
+ "step": 12960
1089
+ }
1090
+ ],
1091
+ "max_steps": 14000,
1092
+ "num_train_epochs": 38,
1093
+ "total_flos": 1.7505797492048026e+18,
1094
+ "trial_name": null,
1095
+ "trial_params": null
1096
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f036c9bbb0eda92a44e02d76db6576bc9d95c2b29eb730aa7ceb1ee9e9f0b599
3
+ size 2991
vocab.json ADDED
The diff for this file is too large to render. See raw diff