fpuentes commited on
Commit
abe4e8f
1 Parent(s): 1219953

Model save

Browse files
last-checkpoint/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "/home/pcjf/CESGA/works/lmodels/models/tiny",
3
- "architectures": [
4
- "RobertaForMaskedLM"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": 0,
8
- "classifier_dropout": null,
9
- "eos_token_id": 2,
10
- "gradient_checkpointing": false,
11
- "hidden_act": "gelu",
12
- "hidden_dropout_prob": 0.1,
13
- "hidden_size": 768,
14
- "initializer_range": 0.02,
15
- "intermediate_size": 3072,
16
- "layer_norm_eps": 1e-12,
17
- "max_position_embeddings": 514,
18
- "model_type": "roberta",
19
- "num_attention_heads": 12,
20
- "num_hidden_layers": 6,
21
- "pad_token_id": 1,
22
- "position_embedding_type": "absolute",
23
- "torch_dtype": "float32",
24
- "transformers_version": "4.24.0",
25
- "type_vocab_size": 1,
26
- "use_cache": true,
27
- "vocab_size": 31002
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c31deffa3cbd4ce7680c9f958b65badb5e318d2f77aa59f3a9adaeda5aa198bd
3
- size 538943941
 
 
 
 
last-checkpoint/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fb1bec33c4431f9607772bf7db9d5ab7b3cddb8480fac34b5229ab54e6b0616
3
- size 269468281
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6354b3a5422c28a3a5a73b71565a42829474a3c347615e37c20c87de92d294e2
3
- size 14575
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff1be153872ceab362bc8f896bf3f611b155e54edf151eccfc448653a32209d
3
- size 627
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "<s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "</s>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "<mask>",
25
- "lstrip": true,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "<pad>",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "</s>",
39
- "lstrip": false,
40
- "normalized": true,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
- "unk_token": {
45
- "content": "<unk>",
46
- "lstrip": false,
47
- "normalized": true,
48
- "rstrip": false,
49
- "single_word": false
50
- }
51
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "bos_token": {
4
- "__type": "AddedToken",
5
- "content": "<s>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false
10
- },
11
- "cls_token": {
12
- "__type": "AddedToken",
13
- "content": "<s>",
14
- "lstrip": false,
15
- "normalized": true,
16
- "rstrip": false,
17
- "single_word": false
18
- },
19
- "eos_token": {
20
- "__type": "AddedToken",
21
- "content": "</s>",
22
- "lstrip": false,
23
- "normalized": true,
24
- "rstrip": false,
25
- "single_word": false
26
- },
27
- "errors": "replace",
28
- "mask_token": {
29
- "__type": "AddedToken",
30
- "content": "<mask>",
31
- "lstrip": true,
32
- "normalized": true,
33
- "rstrip": false,
34
- "single_word": false
35
- },
36
- "max_len": 512,
37
- "name_or_path": "/home/pcjf/CESGA/works/lmodels/models/tiny",
38
- "pad_token": {
39
- "__type": "AddedToken",
40
- "content": "<pad>",
41
- "lstrip": false,
42
- "normalized": true,
43
- "rstrip": false,
44
- "single_word": false
45
- },
46
- "sep_token": {
47
- "__type": "AddedToken",
48
- "content": "</s>",
49
- "lstrip": false,
50
- "normalized": true,
51
- "rstrip": false,
52
- "single_word": false
53
- },
54
- "special_tokens_map_file": null,
55
- "tokenizer_class": "RobertaTokenizer",
56
- "trim_offsets": true,
57
- "unk_token": {
58
- "__type": "AddedToken",
59
- "content": "<unk>",
60
- "lstrip": false,
61
- "normalized": true,
62
- "rstrip": false,
63
- "single_word": false
64
- }
65
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,982 +0,0 @@
1
- {
2
- "best_metric": 1.4411557912826538,
3
- "best_model_checkpoint": "/home/pcjf/CESGA/works/lmodels/models/tiny/checkpoint-1500",
4
- "epoch": 14.902730598086016,
5
- "global_step": 103500,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.22,
12
- "learning_rate": 9.856011519078475e-06,
13
- "loss": 1.5017,
14
- "step": 1500
15
- },
16
- {
17
- "epoch": 0.22,
18
- "eval_loss": 1.4411557912826538,
19
- "eval_runtime": 28.0714,
20
- "eval_samples_per_second": 646.423,
21
- "eval_steps_per_second": 80.83,
22
- "step": 1500
23
- },
24
- {
25
- "epoch": 0.43,
26
- "learning_rate": 9.712023038156948e-06,
27
- "loss": 1.5201,
28
- "step": 3000
29
- },
30
- {
31
- "epoch": 0.43,
32
- "eval_loss": 1.4574847221374512,
33
- "eval_runtime": 28.0255,
34
- "eval_samples_per_second": 647.481,
35
- "eval_steps_per_second": 80.962,
36
- "step": 3000
37
- },
38
- {
39
- "epoch": 0.65,
40
- "learning_rate": 9.568034557235422e-06,
41
- "loss": 1.5413,
42
- "step": 4500
43
- },
44
- {
45
- "epoch": 0.65,
46
- "eval_loss": 1.4590709209442139,
47
- "eval_runtime": 28.3842,
48
- "eval_samples_per_second": 639.299,
49
- "eval_steps_per_second": 79.939,
50
- "step": 4500
51
- },
52
- {
53
- "epoch": 0.86,
54
- "learning_rate": 9.424046076313895e-06,
55
- "loss": 1.5538,
56
- "step": 6000
57
- },
58
- {
59
- "epoch": 0.86,
60
- "eval_loss": 1.4813467264175415,
61
- "eval_runtime": 28.197,
62
- "eval_samples_per_second": 643.543,
63
- "eval_steps_per_second": 80.469,
64
- "step": 6000
65
- },
66
- {
67
- "epoch": 1.08,
68
- "learning_rate": 9.28005759539237e-06,
69
- "loss": 1.5678,
70
- "step": 7500
71
- },
72
- {
73
- "epoch": 1.08,
74
- "eval_loss": 1.4866962432861328,
75
- "eval_runtime": 28.5007,
76
- "eval_samples_per_second": 636.687,
77
- "eval_steps_per_second": 79.612,
78
- "step": 7500
79
- },
80
- {
81
- "epoch": 1.3,
82
- "learning_rate": 9.136069114470844e-06,
83
- "loss": 1.5764,
84
- "step": 9000
85
- },
86
- {
87
- "epoch": 1.3,
88
- "eval_loss": 1.513939619064331,
89
- "eval_runtime": 29.225,
90
- "eval_samples_per_second": 620.907,
91
- "eval_steps_per_second": 77.639,
92
- "step": 9000
93
- },
94
- {
95
- "epoch": 1.51,
96
- "learning_rate": 8.992080633549316e-06,
97
- "loss": 1.5846,
98
- "step": 10500
99
- },
100
- {
101
- "epoch": 1.51,
102
- "eval_loss": 1.5256775617599487,
103
- "eval_runtime": 29.0447,
104
- "eval_samples_per_second": 624.761,
105
- "eval_steps_per_second": 78.121,
106
- "step": 10500
107
- },
108
- {
109
- "epoch": 1.73,
110
- "learning_rate": 8.84809215262779e-06,
111
- "loss": 1.5979,
112
- "step": 12000
113
- },
114
- {
115
- "epoch": 1.73,
116
- "eval_loss": 1.5338753461837769,
117
- "eval_runtime": 28.1783,
118
- "eval_samples_per_second": 643.97,
119
- "eval_steps_per_second": 80.523,
120
- "step": 12000
121
- },
122
- {
123
- "epoch": 1.94,
124
- "learning_rate": 8.704103671706265e-06,
125
- "loss": 1.6076,
126
- "step": 13500
127
- },
128
- {
129
- "epoch": 1.94,
130
- "eval_loss": 1.5177161693572998,
131
- "eval_runtime": 28.0148,
132
- "eval_samples_per_second": 647.728,
133
- "eval_steps_per_second": 80.993,
134
- "step": 13500
135
- },
136
- {
137
- "epoch": 2.16,
138
- "learning_rate": 8.560115190784738e-06,
139
- "loss": 1.6124,
140
- "step": 15000
141
- },
142
- {
143
- "epoch": 2.16,
144
- "eval_loss": 1.535063624382019,
145
- "eval_runtime": 27.7415,
146
- "eval_samples_per_second": 654.111,
147
- "eval_steps_per_second": 81.791,
148
- "step": 15000
149
- },
150
- {
151
- "epoch": 2.38,
152
- "learning_rate": 8.416126709863212e-06,
153
- "loss": 1.6205,
154
- "step": 16500
155
- },
156
- {
157
- "epoch": 2.38,
158
- "eval_loss": 1.5186307430267334,
159
- "eval_runtime": 31.4398,
160
- "eval_samples_per_second": 577.166,
161
- "eval_steps_per_second": 72.17,
162
- "step": 16500
163
- },
164
- {
165
- "epoch": 2.59,
166
- "learning_rate": 8.272138228941685e-06,
167
- "loss": 1.63,
168
- "step": 18000
169
- },
170
- {
171
- "epoch": 2.59,
172
- "eval_loss": 1.5418357849121094,
173
- "eval_runtime": 32.7447,
174
- "eval_samples_per_second": 554.166,
175
- "eval_steps_per_second": 69.294,
176
- "step": 18000
177
- },
178
- {
179
- "epoch": 2.81,
180
- "learning_rate": 8.12814974802016e-06,
181
- "loss": 1.6344,
182
- "step": 19500
183
- },
184
- {
185
- "epoch": 2.81,
186
- "eval_loss": 1.5568139553070068,
187
- "eval_runtime": 36.2173,
188
- "eval_samples_per_second": 501.032,
189
- "eval_steps_per_second": 62.65,
190
- "step": 19500
191
- },
192
- {
193
- "epoch": 3.02,
194
- "learning_rate": 7.984161267098632e-06,
195
- "loss": 1.6422,
196
- "step": 21000
197
- },
198
- {
199
- "epoch": 3.02,
200
- "eval_loss": 1.5598657131195068,
201
- "eval_runtime": 36.4049,
202
- "eval_samples_per_second": 498.45,
203
- "eval_steps_per_second": 62.327,
204
- "step": 21000
205
- },
206
- {
207
- "epoch": 3.24,
208
- "learning_rate": 7.840172786177106e-06,
209
- "loss": 1.645,
210
- "step": 22500
211
- },
212
- {
213
- "epoch": 3.24,
214
- "eval_loss": 1.5735074281692505,
215
- "eval_runtime": 35.5905,
216
- "eval_samples_per_second": 509.856,
217
- "eval_steps_per_second": 63.753,
218
- "step": 22500
219
- },
220
- {
221
- "epoch": 3.46,
222
- "learning_rate": 7.69618430525558e-06,
223
- "loss": 1.6501,
224
- "step": 24000
225
- },
226
- {
227
- "epoch": 3.46,
228
- "eval_loss": 1.5522310733795166,
229
- "eval_runtime": 32.903,
230
- "eval_samples_per_second": 551.499,
231
- "eval_steps_per_second": 68.96,
232
- "step": 24000
233
- },
234
- {
235
- "epoch": 3.67,
236
- "learning_rate": 7.552195824334054e-06,
237
- "loss": 1.6549,
238
- "step": 25500
239
- },
240
- {
241
- "epoch": 3.67,
242
- "eval_loss": 1.5705277919769287,
243
- "eval_runtime": 37.9709,
244
- "eval_samples_per_second": 477.893,
245
- "eval_steps_per_second": 59.756,
246
- "step": 25500
247
- },
248
- {
249
- "epoch": 3.89,
250
- "learning_rate": 7.408207343412528e-06,
251
- "loss": 1.6613,
252
- "step": 27000
253
- },
254
- {
255
- "epoch": 3.89,
256
- "eval_loss": 1.571619987487793,
257
- "eval_runtime": 30.4083,
258
- "eval_samples_per_second": 596.745,
259
- "eval_steps_per_second": 74.618,
260
- "step": 27000
261
- },
262
- {
263
- "epoch": 4.1,
264
- "learning_rate": 7.264218862491001e-06,
265
- "loss": 1.6694,
266
- "step": 28500
267
- },
268
- {
269
- "epoch": 4.1,
270
- "eval_loss": 1.5655138492584229,
271
- "eval_runtime": 35.5369,
272
- "eval_samples_per_second": 510.624,
273
- "eval_steps_per_second": 63.849,
274
- "step": 28500
275
- },
276
- {
277
- "epoch": 4.32,
278
- "learning_rate": 7.1202303815694755e-06,
279
- "loss": 1.6698,
280
- "step": 30000
281
- },
282
- {
283
- "epoch": 4.32,
284
- "eval_loss": 1.5825392007827759,
285
- "eval_runtime": 31.4716,
286
- "eval_samples_per_second": 576.584,
287
- "eval_steps_per_second": 72.097,
288
- "step": 30000
289
- },
290
- {
291
- "epoch": 4.54,
292
- "learning_rate": 6.976241900647949e-06,
293
- "loss": 1.6723,
294
- "step": 31500
295
- },
296
- {
297
- "epoch": 4.54,
298
- "eval_loss": 1.5868340730667114,
299
- "eval_runtime": 27.534,
300
- "eval_samples_per_second": 659.039,
301
- "eval_steps_per_second": 82.407,
302
- "step": 31500
303
- },
304
- {
305
- "epoch": 4.75,
306
- "learning_rate": 6.8322534197264226e-06,
307
- "loss": 1.6763,
308
- "step": 33000
309
- },
310
- {
311
- "epoch": 4.75,
312
- "eval_loss": 1.58354914188385,
313
- "eval_runtime": 27.5271,
314
- "eval_samples_per_second": 659.204,
315
- "eval_steps_per_second": 82.428,
316
- "step": 33000
317
- },
318
- {
319
- "epoch": 4.97,
320
- "learning_rate": 6.688264938804896e-06,
321
- "loss": 1.6802,
322
- "step": 34500
323
- },
324
- {
325
- "epoch": 4.97,
326
- "eval_loss": 1.589853286743164,
327
- "eval_runtime": 27.5963,
328
- "eval_samples_per_second": 657.553,
329
- "eval_steps_per_second": 82.221,
330
- "step": 34500
331
- },
332
- {
333
- "epoch": 5.18,
334
- "learning_rate": 6.54427645788337e-06,
335
- "loss": 1.6831,
336
- "step": 36000
337
- },
338
- {
339
- "epoch": 5.18,
340
- "eval_loss": 1.5898144245147705,
341
- "eval_runtime": 27.7596,
342
- "eval_samples_per_second": 653.684,
343
- "eval_steps_per_second": 81.737,
344
- "step": 36000
345
- },
346
- {
347
- "epoch": 5.4,
348
- "learning_rate": 6.400287976961843e-06,
349
- "loss": 1.686,
350
- "step": 37500
351
- },
352
- {
353
- "epoch": 5.4,
354
- "eval_loss": 1.60897696018219,
355
- "eval_runtime": 28.0066,
356
- "eval_samples_per_second": 647.918,
357
- "eval_steps_per_second": 81.017,
358
- "step": 37500
359
- },
360
- {
361
- "epoch": 5.62,
362
- "learning_rate": 6.2562994960403175e-06,
363
- "loss": 1.6908,
364
- "step": 39000
365
- },
366
- {
367
- "epoch": 5.62,
368
- "eval_loss": 1.5985779762268066,
369
- "eval_runtime": 28.4873,
370
- "eval_samples_per_second": 636.986,
371
- "eval_steps_per_second": 79.65,
372
- "step": 39000
373
- },
374
- {
375
- "epoch": 5.83,
376
- "learning_rate": 6.112311015118791e-06,
377
- "loss": 1.696,
378
- "step": 40500
379
- },
380
- {
381
- "epoch": 5.83,
382
- "eval_loss": 1.5870490074157715,
383
- "eval_runtime": 28.3305,
384
- "eval_samples_per_second": 640.511,
385
- "eval_steps_per_second": 80.09,
386
- "step": 40500
387
- },
388
- {
389
- "epoch": 6.05,
390
- "learning_rate": 5.968322534197265e-06,
391
- "loss": 1.6947,
392
- "step": 42000
393
- },
394
- {
395
- "epoch": 6.05,
396
- "eval_loss": 1.6037499904632568,
397
- "eval_runtime": 28.3691,
398
- "eval_samples_per_second": 639.639,
399
- "eval_steps_per_second": 79.981,
400
- "step": 42000
401
- },
402
- {
403
- "epoch": 6.26,
404
- "learning_rate": 5.824334053275739e-06,
405
- "loss": 1.6989,
406
- "step": 43500
407
- },
408
- {
409
- "epoch": 6.26,
410
- "eval_loss": 1.6043579578399658,
411
- "eval_runtime": 28.3097,
412
- "eval_samples_per_second": 640.983,
413
- "eval_steps_per_second": 80.149,
414
- "step": 43500
415
- },
416
- {
417
- "epoch": 6.48,
418
- "learning_rate": 5.6803455723542124e-06,
419
- "loss": 1.6996,
420
- "step": 45000
421
- },
422
- {
423
- "epoch": 6.48,
424
- "eval_loss": 1.6131685972213745,
425
- "eval_runtime": 28.2412,
426
- "eval_samples_per_second": 642.536,
427
- "eval_steps_per_second": 80.344,
428
- "step": 45000
429
- },
430
- {
431
- "epoch": 6.7,
432
- "learning_rate": 5.536357091432686e-06,
433
- "loss": 1.7022,
434
- "step": 46500
435
- },
436
- {
437
- "epoch": 6.7,
438
- "eval_loss": 1.6012641191482544,
439
- "eval_runtime": 28.4538,
440
- "eval_samples_per_second": 637.736,
441
- "eval_steps_per_second": 79.743,
442
- "step": 46500
443
- },
444
- {
445
- "epoch": 6.91,
446
- "learning_rate": 5.3923686105111595e-06,
447
- "loss": 1.7063,
448
- "step": 48000
449
- },
450
- {
451
- "epoch": 6.91,
452
- "eval_loss": 1.6121569871902466,
453
- "eval_runtime": 28.5191,
454
- "eval_samples_per_second": 636.275,
455
- "eval_steps_per_second": 79.561,
456
- "step": 48000
457
- },
458
- {
459
- "epoch": 7.13,
460
- "learning_rate": 5.248380129589633e-06,
461
- "loss": 1.7081,
462
- "step": 49500
463
- },
464
- {
465
- "epoch": 7.13,
466
- "eval_loss": 1.611207127571106,
467
- "eval_runtime": 28.6681,
468
- "eval_samples_per_second": 632.969,
469
- "eval_steps_per_second": 79.147,
470
- "step": 49500
471
- },
472
- {
473
- "epoch": 7.34,
474
- "learning_rate": 5.1043916486681065e-06,
475
- "loss": 1.7124,
476
- "step": 51000
477
- },
478
- {
479
- "epoch": 7.34,
480
- "eval_loss": 1.6293696165084839,
481
- "eval_runtime": 28.5537,
482
- "eval_samples_per_second": 635.504,
483
- "eval_steps_per_second": 79.464,
484
- "step": 51000
485
- },
486
- {
487
- "epoch": 7.56,
488
- "learning_rate": 4.960403167746581e-06,
489
- "loss": 1.711,
490
- "step": 52500
491
- },
492
- {
493
- "epoch": 7.56,
494
- "eval_loss": 1.620770812034607,
495
- "eval_runtime": 28.4979,
496
- "eval_samples_per_second": 636.75,
497
- "eval_steps_per_second": 79.62,
498
- "step": 52500
499
- },
500
- {
501
- "epoch": 7.78,
502
- "learning_rate": 4.8164146868250544e-06,
503
- "loss": 1.7139,
504
- "step": 54000
505
- },
506
- {
507
- "epoch": 7.78,
508
- "eval_loss": 1.6238549947738647,
509
- "eval_runtime": 28.7129,
510
- "eval_samples_per_second": 631.98,
511
- "eval_steps_per_second": 79.024,
512
- "step": 54000
513
- },
514
- {
515
- "epoch": 7.99,
516
- "learning_rate": 4.672426205903528e-06,
517
- "loss": 1.7166,
518
- "step": 55500
519
- },
520
- {
521
- "epoch": 7.99,
522
- "eval_loss": 1.61404550075531,
523
- "eval_runtime": 28.7524,
524
- "eval_samples_per_second": 631.112,
525
- "eval_steps_per_second": 78.915,
526
- "step": 55500
527
- },
528
- {
529
- "epoch": 8.21,
530
- "learning_rate": 4.5284377249820015e-06,
531
- "loss": 1.7133,
532
- "step": 57000
533
- },
534
- {
535
- "epoch": 8.21,
536
- "eval_loss": 1.6073957681655884,
537
- "eval_runtime": 28.4507,
538
- "eval_samples_per_second": 637.804,
539
- "eval_steps_per_second": 79.752,
540
- "step": 57000
541
- },
542
- {
543
- "epoch": 8.42,
544
- "learning_rate": 4.384449244060476e-06,
545
- "loss": 1.7193,
546
- "step": 58500
547
- },
548
- {
549
- "epoch": 8.42,
550
- "eval_loss": 1.6162116527557373,
551
- "eval_runtime": 32.1502,
552
- "eval_samples_per_second": 564.413,
553
- "eval_steps_per_second": 70.575,
554
- "step": 58500
555
- },
556
- {
557
- "epoch": 8.64,
558
- "learning_rate": 4.240460763138949e-06,
559
- "loss": 1.7206,
560
- "step": 60000
561
- },
562
- {
563
- "epoch": 8.64,
564
- "eval_loss": 1.6276147365570068,
565
- "eval_runtime": 29.6834,
566
- "eval_samples_per_second": 611.319,
567
- "eval_steps_per_second": 76.44,
568
- "step": 60000
569
- },
570
- {
571
- "epoch": 8.86,
572
- "learning_rate": 4.096472282217423e-06,
573
- "loss": 1.7209,
574
- "step": 61500
575
- },
576
- {
577
- "epoch": 8.86,
578
- "eval_loss": 1.6199073791503906,
579
- "eval_runtime": 28.6726,
580
- "eval_samples_per_second": 632.869,
581
- "eval_steps_per_second": 79.135,
582
- "step": 61500
583
- },
584
- {
585
- "epoch": 9.07,
586
- "learning_rate": 3.952483801295896e-06,
587
- "loss": 1.7207,
588
- "step": 63000
589
- },
590
- {
591
- "epoch": 9.07,
592
- "eval_loss": 1.6250064373016357,
593
- "eval_runtime": 29.0567,
594
- "eval_samples_per_second": 624.503,
595
- "eval_steps_per_second": 78.089,
596
- "step": 63000
597
- },
598
- {
599
- "epoch": 9.29,
600
- "learning_rate": 3.8084953203743704e-06,
601
- "loss": 1.722,
602
- "step": 64500
603
- },
604
- {
605
- "epoch": 9.29,
606
- "eval_loss": 1.622145175933838,
607
- "eval_runtime": 28.9996,
608
- "eval_samples_per_second": 625.733,
609
- "eval_steps_per_second": 78.242,
610
- "step": 64500
611
- },
612
- {
613
- "epoch": 9.5,
614
- "learning_rate": 3.664506839452844e-06,
615
- "loss": 1.7268,
616
- "step": 66000
617
- },
618
- {
619
- "epoch": 9.5,
620
- "eval_loss": 1.623546838760376,
621
- "eval_runtime": 28.675,
622
- "eval_samples_per_second": 632.815,
623
- "eval_steps_per_second": 79.128,
624
- "step": 66000
625
- },
626
- {
627
- "epoch": 9.72,
628
- "learning_rate": 3.520518358531318e-06,
629
- "loss": 1.7255,
630
- "step": 67500
631
- },
632
- {
633
- "epoch": 9.72,
634
- "eval_loss": 1.6309912204742432,
635
- "eval_runtime": 28.5477,
636
- "eval_samples_per_second": 635.637,
637
- "eval_steps_per_second": 79.481,
638
- "step": 67500
639
- },
640
- {
641
- "epoch": 9.94,
642
- "learning_rate": 3.3765298776097914e-06,
643
- "loss": 1.7295,
644
- "step": 69000
645
- },
646
- {
647
- "epoch": 9.94,
648
- "eval_loss": 1.6271131038665771,
649
- "eval_runtime": 28.8046,
650
- "eval_samples_per_second": 629.968,
651
- "eval_steps_per_second": 78.772,
652
- "step": 69000
653
- },
654
- {
655
- "epoch": 10.15,
656
- "learning_rate": 3.2325413966882653e-06,
657
- "loss": 1.7267,
658
- "step": 70500
659
- },
660
- {
661
- "epoch": 10.15,
662
- "eval_loss": 1.6185855865478516,
663
- "eval_runtime": 28.5157,
664
- "eval_samples_per_second": 636.351,
665
- "eval_steps_per_second": 79.57,
666
- "step": 70500
667
- },
668
- {
669
- "epoch": 10.37,
670
- "learning_rate": 3.088552915766739e-06,
671
- "loss": 1.7238,
672
- "step": 72000
673
- },
674
- {
675
- "epoch": 10.37,
676
- "eval_loss": 1.6290473937988281,
677
- "eval_runtime": 28.937,
678
- "eval_samples_per_second": 627.087,
679
- "eval_steps_per_second": 78.412,
680
- "step": 72000
681
- },
682
- {
683
- "epoch": 10.58,
684
- "learning_rate": 2.9445644348452123e-06,
685
- "loss": 1.7272,
686
- "step": 73500
687
- },
688
- {
689
- "epoch": 10.58,
690
- "eval_loss": 1.6252139806747437,
691
- "eval_runtime": 28.8164,
692
- "eval_samples_per_second": 629.71,
693
- "eval_steps_per_second": 78.74,
694
- "step": 73500
695
- },
696
- {
697
- "epoch": 10.8,
698
- "learning_rate": 2.8005759539236867e-06,
699
- "loss": 1.7316,
700
- "step": 75000
701
- },
702
- {
703
- "epoch": 10.8,
704
- "eval_loss": 1.6189124584197998,
705
- "eval_runtime": 28.4148,
706
- "eval_samples_per_second": 638.611,
707
- "eval_steps_per_second": 79.853,
708
- "step": 75000
709
- },
710
- {
711
- "epoch": 11.02,
712
- "learning_rate": 2.6565874730021602e-06,
713
- "loss": 1.7335,
714
- "step": 76500
715
- },
716
- {
717
- "epoch": 11.02,
718
- "eval_loss": 1.6274147033691406,
719
- "eval_runtime": 28.5118,
720
- "eval_samples_per_second": 636.438,
721
- "eval_steps_per_second": 79.581,
722
- "step": 76500
723
- },
724
- {
725
- "epoch": 11.23,
726
- "learning_rate": 2.5125989920806338e-06,
727
- "loss": 1.7327,
728
- "step": 78000
729
- },
730
- {
731
- "epoch": 11.23,
732
- "eval_loss": 1.6355476379394531,
733
- "eval_runtime": 28.5252,
734
- "eval_samples_per_second": 636.138,
735
- "eval_steps_per_second": 79.544,
736
- "step": 78000
737
- },
738
- {
739
- "epoch": 11.45,
740
- "learning_rate": 2.3686105111591073e-06,
741
- "loss": 1.7332,
742
- "step": 79500
743
- },
744
- {
745
- "epoch": 11.45,
746
- "eval_loss": 1.6285896301269531,
747
- "eval_runtime": 28.4389,
748
- "eval_samples_per_second": 638.069,
749
- "eval_steps_per_second": 79.785,
750
- "step": 79500
751
- },
752
- {
753
- "epoch": 11.66,
754
- "learning_rate": 2.2246220302375812e-06,
755
- "loss": 1.7311,
756
- "step": 81000
757
- },
758
- {
759
- "epoch": 11.66,
760
- "eval_loss": 1.6327883005142212,
761
- "eval_runtime": 28.5404,
762
- "eval_samples_per_second": 635.8,
763
- "eval_steps_per_second": 79.501,
764
- "step": 81000
765
- },
766
- {
767
- "epoch": 11.88,
768
- "learning_rate": 2.0806335493160548e-06,
769
- "loss": 1.7359,
770
- "step": 82500
771
- },
772
- {
773
- "epoch": 11.88,
774
- "eval_loss": 1.6401711702346802,
775
- "eval_runtime": 28.2502,
776
- "eval_samples_per_second": 642.331,
777
- "eval_steps_per_second": 80.318,
778
- "step": 82500
779
- },
780
- {
781
- "epoch": 12.1,
782
- "learning_rate": 1.9366450683945287e-06,
783
- "loss": 1.7393,
784
- "step": 84000
785
- },
786
- {
787
- "epoch": 12.1,
788
- "eval_loss": 1.6506874561309814,
789
- "eval_runtime": 28.3732,
790
- "eval_samples_per_second": 639.547,
791
- "eval_steps_per_second": 79.97,
792
- "step": 84000
793
- },
794
- {
795
- "epoch": 12.31,
796
- "learning_rate": 1.7926565874730022e-06,
797
- "loss": 1.7337,
798
- "step": 85500
799
- },
800
- {
801
- "epoch": 12.31,
802
- "eval_loss": 1.640535593032837,
803
- "eval_runtime": 28.3112,
804
- "eval_samples_per_second": 640.947,
805
- "eval_steps_per_second": 80.145,
806
- "step": 85500
807
- },
808
- {
809
- "epoch": 12.53,
810
- "learning_rate": 1.648668106551476e-06,
811
- "loss": 1.7346,
812
- "step": 87000
813
- },
814
- {
815
- "epoch": 12.53,
816
- "eval_loss": 1.635541558265686,
817
- "eval_runtime": 28.5411,
818
- "eval_samples_per_second": 635.785,
819
- "eval_steps_per_second": 79.499,
820
- "step": 87000
821
- },
822
- {
823
- "epoch": 12.74,
824
- "learning_rate": 1.5046796256299497e-06,
825
- "loss": 1.7371,
826
- "step": 88500
827
- },
828
- {
829
- "epoch": 12.74,
830
- "eval_loss": 1.6363788843154907,
831
- "eval_runtime": 28.2122,
832
- "eval_samples_per_second": 643.197,
833
- "eval_steps_per_second": 80.426,
834
- "step": 88500
835
- },
836
- {
837
- "epoch": 12.96,
838
- "learning_rate": 1.3606911447084234e-06,
839
- "loss": 1.7374,
840
- "step": 90000
841
- },
842
- {
843
- "epoch": 12.96,
844
- "eval_loss": 1.6403627395629883,
845
- "eval_runtime": 28.3587,
846
- "eval_samples_per_second": 639.875,
847
- "eval_steps_per_second": 80.011,
848
- "step": 90000
849
- },
850
- {
851
- "epoch": 13.17,
852
- "learning_rate": 1.2167026637868972e-06,
853
- "loss": 1.7365,
854
- "step": 91500
855
- },
856
- {
857
- "epoch": 13.17,
858
- "eval_loss": 1.639408826828003,
859
- "eval_runtime": 28.3926,
860
- "eval_samples_per_second": 639.111,
861
- "eval_steps_per_second": 79.915,
862
- "step": 91500
863
- },
864
- {
865
- "epoch": 13.39,
866
- "learning_rate": 1.072714182865371e-06,
867
- "loss": 1.7356,
868
- "step": 93000
869
- },
870
- {
871
- "epoch": 13.39,
872
- "eval_loss": 1.6273094415664673,
873
- "eval_runtime": 28.5221,
874
- "eval_samples_per_second": 636.208,
875
- "eval_steps_per_second": 79.552,
876
- "step": 93000
877
- },
878
- {
879
- "epoch": 13.61,
880
- "learning_rate": 9.287257019438446e-07,
881
- "loss": 1.7364,
882
- "step": 94500
883
- },
884
- {
885
- "epoch": 13.61,
886
- "eval_loss": 1.6499429941177368,
887
- "eval_runtime": 28.4528,
888
- "eval_samples_per_second": 637.758,
889
- "eval_steps_per_second": 79.746,
890
- "step": 94500
891
- },
892
- {
893
- "epoch": 13.82,
894
- "learning_rate": 7.847372210223183e-07,
895
- "loss": 1.7395,
896
- "step": 96000
897
- },
898
- {
899
- "epoch": 13.82,
900
- "eval_loss": 1.6389094591140747,
901
- "eval_runtime": 28.3271,
902
- "eval_samples_per_second": 640.588,
903
- "eval_steps_per_second": 80.1,
904
- "step": 96000
905
- },
906
- {
907
- "epoch": 14.04,
908
- "learning_rate": 6.40748740100792e-07,
909
- "loss": 1.7369,
910
- "step": 97500
911
- },
912
- {
913
- "epoch": 14.04,
914
- "eval_loss": 1.6465275287628174,
915
- "eval_runtime": 28.5374,
916
- "eval_samples_per_second": 635.866,
917
- "eval_steps_per_second": 79.51,
918
- "step": 97500
919
- },
920
- {
921
- "epoch": 14.25,
922
- "learning_rate": 4.967602591792657e-07,
923
- "loss": 1.7371,
924
- "step": 99000
925
- },
926
- {
927
- "epoch": 14.25,
928
- "eval_loss": 1.6305017471313477,
929
- "eval_runtime": 28.4576,
930
- "eval_samples_per_second": 637.651,
931
- "eval_steps_per_second": 79.733,
932
- "step": 99000
933
- },
934
- {
935
- "epoch": 14.47,
936
- "learning_rate": 3.5277177825773936e-07,
937
- "loss": 1.7376,
938
- "step": 100500
939
- },
940
- {
941
- "epoch": 14.47,
942
- "eval_loss": 1.6379024982452393,
943
- "eval_runtime": 28.9349,
944
- "eval_samples_per_second": 627.131,
945
- "eval_steps_per_second": 78.417,
946
- "step": 100500
947
- },
948
- {
949
- "epoch": 14.69,
950
- "learning_rate": 2.0878329733621312e-07,
951
- "loss": 1.7377,
952
- "step": 102000
953
- },
954
- {
955
- "epoch": 14.69,
956
- "eval_loss": 1.6268378496170044,
957
- "eval_runtime": 28.3974,
958
- "eval_samples_per_second": 639.001,
959
- "eval_steps_per_second": 79.902,
960
- "step": 102000
961
- },
962
- {
963
- "epoch": 14.9,
964
- "learning_rate": 6.479481641468683e-08,
965
- "loss": 1.7343,
966
- "step": 103500
967
- },
968
- {
969
- "epoch": 14.9,
970
- "eval_loss": 1.6236845254898071,
971
- "eval_runtime": 28.5403,
972
- "eval_samples_per_second": 635.804,
973
- "eval_steps_per_second": 79.502,
974
- "step": 103500
975
- }
976
- ],
977
- "max_steps": 104175,
978
- "num_train_epochs": 15,
979
- "total_flos": 6.475717648203267e+17,
980
- "trial_name": null,
981
- "trial_params": null
982
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf00dfba82d3ac10ad9debd6763f43bf6a5b78e8b7425c40a36ad3d1be98fc26
3
- size 3451
 
 
 
 
last-checkpoint/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fb1bec33c4431f9607772bf7db9d5ab7b3cddb8480fac34b5229ab54e6b0616
3
  size 269468281
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0692640dbdac167fdc61252012f23bef86652860782cce1435700a221f51750
3
  size 269468281
runs/Feb05_20-23-46_turing/events.out.tfevents.1675625038.turing.943015.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28d15ccd7adf8cc11c43ddd3883cd5b4db9fb9c299c465b71f22e4d982c16614
3
- size 33952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9b4ef88fbbbd48cb525f3e17e116c1b3b386e61cbc450b777dec805e86bc2d0
3
+ size 34312