fpuentes commited on
Commit
9b1e414
1 Parent(s): b530fae

Model save

Browse files
last-checkpoint/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "/home/pcjf/CESGA/works/lmodels/models/tiny",
3
- "architectures": [
4
- "RobertaForMaskedLM"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": 0,
8
- "classifier_dropout": null,
9
- "eos_token_id": 2,
10
- "gradient_checkpointing": false,
11
- "hidden_act": "gelu",
12
- "hidden_dropout_prob": 0.1,
13
- "hidden_size": 768,
14
- "initializer_range": 0.02,
15
- "intermediate_size": 3072,
16
- "layer_norm_eps": 1e-12,
17
- "max_position_embeddings": 514,
18
- "model_type": "roberta",
19
- "num_attention_heads": 12,
20
- "num_hidden_layers": 6,
21
- "pad_token_id": 1,
22
- "position_embedding_type": "absolute",
23
- "torch_dtype": "float32",
24
- "transformers_version": "4.24.0",
25
- "type_vocab_size": 1,
26
- "use_cache": true,
27
- "vocab_size": 31002
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2cfe56d5d4409b47893b4520846ba211ccadf119b94d737232a5fda1170af33
3
- size 538943941
 
 
 
 
last-checkpoint/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0f6d7dfce7499a39f72cf6615b36e67ad548848682144813bc3f2dd834f0220
3
- size 269468281
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6354b3a5422c28a3a5a73b71565a42829474a3c347615e37c20c87de92d294e2
3
- size 14575
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff1be153872ceab362bc8f896bf3f611b155e54edf151eccfc448653a32209d
3
- size 627
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "<s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "</s>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "<mask>",
25
- "lstrip": true,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "<pad>",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "</s>",
39
- "lstrip": false,
40
- "normalized": true,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
- "unk_token": {
45
- "content": "<unk>",
46
- "lstrip": false,
47
- "normalized": true,
48
- "rstrip": false,
49
- "single_word": false
50
- }
51
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "bos_token": {
4
- "__type": "AddedToken",
5
- "content": "<s>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false
10
- },
11
- "cls_token": {
12
- "__type": "AddedToken",
13
- "content": "<s>",
14
- "lstrip": false,
15
- "normalized": true,
16
- "rstrip": false,
17
- "single_word": false
18
- },
19
- "eos_token": {
20
- "__type": "AddedToken",
21
- "content": "</s>",
22
- "lstrip": false,
23
- "normalized": true,
24
- "rstrip": false,
25
- "single_word": false
26
- },
27
- "errors": "replace",
28
- "mask_token": {
29
- "__type": "AddedToken",
30
- "content": "<mask>",
31
- "lstrip": true,
32
- "normalized": true,
33
- "rstrip": false,
34
- "single_word": false
35
- },
36
- "max_len": 512,
37
- "name_or_path": "/home/pcjf/CESGA/works/lmodels/models/tiny",
38
- "pad_token": {
39
- "__type": "AddedToken",
40
- "content": "<pad>",
41
- "lstrip": false,
42
- "normalized": true,
43
- "rstrip": false,
44
- "single_word": false
45
- },
46
- "sep_token": {
47
- "__type": "AddedToken",
48
- "content": "</s>",
49
- "lstrip": false,
50
- "normalized": true,
51
- "rstrip": false,
52
- "single_word": false
53
- },
54
- "special_tokens_map_file": null,
55
- "tokenizer_class": "RobertaTokenizer",
56
- "trim_offsets": true,
57
- "unk_token": {
58
- "__type": "AddedToken",
59
- "content": "<unk>",
60
- "lstrip": false,
61
- "normalized": true,
62
- "rstrip": false,
63
- "single_word": false
64
- }
65
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,982 +0,0 @@
1
- {
2
- "best_metric": 1.4634919166564941,
3
- "best_model_checkpoint": "/home/pcjf/CESGA/works/lmodels/models/tiny/checkpoint-1500",
4
- "epoch": 14.902730598086016,
5
- "global_step": 103500,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.22,
12
- "learning_rate": 9.856011519078475e-06,
13
- "loss": 1.4486,
14
- "step": 1500
15
- },
16
- {
17
- "epoch": 0.22,
18
- "eval_loss": 1.4634919166564941,
19
- "eval_runtime": 31.815,
20
- "eval_samples_per_second": 570.36,
21
- "eval_steps_per_second": 71.319,
22
- "step": 1500
23
- },
24
- {
25
- "epoch": 0.43,
26
- "learning_rate": 9.712023038156948e-06,
27
- "loss": 1.5424,
28
- "step": 3000
29
- },
30
- {
31
- "epoch": 0.43,
32
- "eval_loss": 1.473360300064087,
33
- "eval_runtime": 36.4429,
34
- "eval_samples_per_second": 497.929,
35
- "eval_steps_per_second": 62.262,
36
- "step": 3000
37
- },
38
- {
39
- "epoch": 0.65,
40
- "learning_rate": 9.568034557235422e-06,
41
- "loss": 1.5598,
42
- "step": 4500
43
- },
44
- {
45
- "epoch": 0.65,
46
- "eval_loss": 1.4726251363754272,
47
- "eval_runtime": 28.71,
48
- "eval_samples_per_second": 632.044,
49
- "eval_steps_per_second": 79.032,
50
- "step": 4500
51
- },
52
- {
53
- "epoch": 0.86,
54
- "learning_rate": 9.424046076313895e-06,
55
- "loss": 1.5698,
56
- "step": 6000
57
- },
58
- {
59
- "epoch": 0.86,
60
- "eval_loss": 1.4929977655410767,
61
- "eval_runtime": 28.7672,
62
- "eval_samples_per_second": 630.789,
63
- "eval_steps_per_second": 78.875,
64
- "step": 6000
65
- },
66
- {
67
- "epoch": 1.08,
68
- "learning_rate": 9.28005759539237e-06,
69
- "loss": 1.5818,
70
- "step": 7500
71
- },
72
- {
73
- "epoch": 1.08,
74
- "eval_loss": 1.4972467422485352,
75
- "eval_runtime": 28.7771,
76
- "eval_samples_per_second": 630.57,
77
- "eval_steps_per_second": 78.847,
78
- "step": 7500
79
- },
80
- {
81
- "epoch": 1.3,
82
- "learning_rate": 9.136069114470844e-06,
83
- "loss": 1.5887,
84
- "step": 9000
85
- },
86
- {
87
- "epoch": 1.3,
88
- "eval_loss": 1.5235066413879395,
89
- "eval_runtime": 28.8715,
90
- "eval_samples_per_second": 628.509,
91
- "eval_steps_per_second": 78.59,
92
- "step": 9000
93
- },
94
- {
95
- "epoch": 1.51,
96
- "learning_rate": 8.992080633549316e-06,
97
- "loss": 1.5958,
98
- "step": 10500
99
- },
100
- {
101
- "epoch": 1.51,
102
- "eval_loss": 1.5344377756118774,
103
- "eval_runtime": 28.3499,
104
- "eval_samples_per_second": 640.073,
105
- "eval_steps_per_second": 80.036,
106
- "step": 10500
107
- },
108
- {
109
- "epoch": 1.73,
110
- "learning_rate": 8.84809215262779e-06,
111
- "loss": 1.6083,
112
- "step": 12000
113
- },
114
- {
115
- "epoch": 1.73,
116
- "eval_loss": 1.5420113801956177,
117
- "eval_runtime": 28.6377,
118
- "eval_samples_per_second": 633.641,
119
- "eval_steps_per_second": 79.231,
120
- "step": 12000
121
- },
122
- {
123
- "epoch": 1.94,
124
- "learning_rate": 8.704103671706265e-06,
125
- "loss": 1.6173,
126
- "step": 13500
127
- },
128
- {
129
- "epoch": 1.94,
130
- "eval_loss": 1.525267243385315,
131
- "eval_runtime": 28.6529,
132
- "eval_samples_per_second": 633.305,
133
- "eval_steps_per_second": 79.189,
134
- "step": 13500
135
- },
136
- {
137
- "epoch": 2.16,
138
- "learning_rate": 8.560115190784738e-06,
139
- "loss": 1.6215,
140
- "step": 15000
141
- },
142
- {
143
- "epoch": 2.16,
144
- "eval_loss": 1.5423375368118286,
145
- "eval_runtime": 28.5473,
146
- "eval_samples_per_second": 635.647,
147
- "eval_steps_per_second": 79.482,
148
- "step": 15000
149
- },
150
- {
151
- "epoch": 2.38,
152
- "learning_rate": 8.416126709863212e-06,
153
- "loss": 1.629,
154
- "step": 16500
155
- },
156
- {
157
- "epoch": 2.38,
158
- "eval_loss": 1.5254136323928833,
159
- "eval_runtime": 28.2417,
160
- "eval_samples_per_second": 642.525,
161
- "eval_steps_per_second": 80.342,
162
- "step": 16500
163
- },
164
- {
165
- "epoch": 2.59,
166
- "learning_rate": 8.272138228941685e-06,
167
- "loss": 1.6382,
168
- "step": 18000
169
- },
170
- {
171
- "epoch": 2.59,
172
- "eval_loss": 1.54853355884552,
173
- "eval_runtime": 28.4675,
174
- "eval_samples_per_second": 637.43,
175
- "eval_steps_per_second": 79.705,
176
- "step": 18000
177
- },
178
- {
179
- "epoch": 2.81,
180
- "learning_rate": 8.12814974802016e-06,
181
- "loss": 1.6422,
182
- "step": 19500
183
- },
184
- {
185
- "epoch": 2.81,
186
- "eval_loss": 1.562892198562622,
187
- "eval_runtime": 28.1975,
188
- "eval_samples_per_second": 643.532,
189
- "eval_steps_per_second": 80.468,
190
- "step": 19500
191
- },
192
- {
193
- "epoch": 3.02,
194
- "learning_rate": 7.984161267098632e-06,
195
- "loss": 1.6497,
196
- "step": 21000
197
- },
198
- {
199
- "epoch": 3.02,
200
- "eval_loss": 1.5657176971435547,
201
- "eval_runtime": 28.3817,
202
- "eval_samples_per_second": 639.355,
203
- "eval_steps_per_second": 79.946,
204
- "step": 21000
205
- },
206
- {
207
- "epoch": 3.24,
208
- "learning_rate": 7.840172786177106e-06,
209
- "loss": 1.6521,
210
- "step": 22500
211
- },
212
- {
213
- "epoch": 3.24,
214
- "eval_loss": 1.5792163610458374,
215
- "eval_runtime": 28.6284,
216
- "eval_samples_per_second": 633.846,
217
- "eval_steps_per_second": 79.257,
218
- "step": 22500
219
- },
220
- {
221
- "epoch": 3.46,
222
- "learning_rate": 7.69618430525558e-06,
223
- "loss": 1.657,
224
- "step": 24000
225
- },
226
- {
227
- "epoch": 3.46,
228
- "eval_loss": 1.5577658414840698,
229
- "eval_runtime": 28.7026,
230
- "eval_samples_per_second": 632.207,
231
- "eval_steps_per_second": 79.052,
232
- "step": 24000
233
- },
234
- {
235
- "epoch": 3.67,
236
- "learning_rate": 7.552195824334054e-06,
237
- "loss": 1.6614,
238
- "step": 25500
239
- },
240
- {
241
- "epoch": 3.67,
242
- "eval_loss": 1.5757924318313599,
243
- "eval_runtime": 28.4551,
244
- "eval_samples_per_second": 637.708,
245
- "eval_steps_per_second": 79.74,
246
- "step": 25500
247
- },
248
- {
249
- "epoch": 3.89,
250
- "learning_rate": 7.408207343412528e-06,
251
- "loss": 1.6676,
252
- "step": 27000
253
- },
254
- {
255
- "epoch": 3.89,
256
- "eval_loss": 1.5766253471374512,
257
- "eval_runtime": 28.7151,
258
- "eval_samples_per_second": 631.932,
259
- "eval_steps_per_second": 79.018,
260
- "step": 27000
261
- },
262
- {
263
- "epoch": 4.1,
264
- "learning_rate": 7.264218862491001e-06,
265
- "loss": 1.6755,
266
- "step": 28500
267
- },
268
- {
269
- "epoch": 4.1,
270
- "eval_loss": 1.5704692602157593,
271
- "eval_runtime": 28.1595,
272
- "eval_samples_per_second": 644.401,
273
- "eval_steps_per_second": 80.577,
274
- "step": 28500
275
- },
276
- {
277
- "epoch": 4.32,
278
- "learning_rate": 7.1202303815694755e-06,
279
- "loss": 1.6758,
280
- "step": 30000
281
- },
282
- {
283
- "epoch": 4.32,
284
- "eval_loss": 1.587284803390503,
285
- "eval_runtime": 29.4826,
286
- "eval_samples_per_second": 615.482,
287
- "eval_steps_per_second": 76.961,
288
- "step": 30000
289
- },
290
- {
291
- "epoch": 4.54,
292
- "learning_rate": 6.976241900647949e-06,
293
- "loss": 1.678,
294
- "step": 31500
295
- },
296
- {
297
- "epoch": 4.54,
298
- "eval_loss": 1.591480016708374,
299
- "eval_runtime": 28.324,
300
- "eval_samples_per_second": 640.658,
301
- "eval_steps_per_second": 80.109,
302
- "step": 31500
303
- },
304
- {
305
- "epoch": 4.75,
306
- "learning_rate": 6.8322534197264226e-06,
307
- "loss": 1.6818,
308
- "step": 33000
309
- },
310
- {
311
- "epoch": 4.75,
312
- "eval_loss": 1.5879828929901123,
313
- "eval_runtime": 29.4058,
314
- "eval_samples_per_second": 617.089,
315
- "eval_steps_per_second": 77.162,
316
- "step": 33000
317
- },
318
- {
319
- "epoch": 4.97,
320
- "learning_rate": 6.688264938804896e-06,
321
- "loss": 1.6857,
322
- "step": 34500
323
- },
324
- {
325
- "epoch": 4.97,
326
- "eval_loss": 1.5941790342330933,
327
- "eval_runtime": 28.3554,
328
- "eval_samples_per_second": 639.948,
329
- "eval_steps_per_second": 80.02,
330
- "step": 34500
331
- },
332
- {
333
- "epoch": 5.18,
334
- "learning_rate": 6.54427645788337e-06,
335
- "loss": 1.6884,
336
- "step": 36000
337
- },
338
- {
339
- "epoch": 5.18,
340
- "eval_loss": 1.5941451787948608,
341
- "eval_runtime": 28.5176,
342
- "eval_samples_per_second": 636.309,
343
- "eval_steps_per_second": 79.565,
344
- "step": 36000
345
- },
346
- {
347
- "epoch": 5.4,
348
- "learning_rate": 6.400287976961843e-06,
349
- "loss": 1.6911,
350
- "step": 37500
351
- },
352
- {
353
- "epoch": 5.4,
354
- "eval_loss": 1.6129869222640991,
355
- "eval_runtime": 28.3069,
356
- "eval_samples_per_second": 641.045,
357
- "eval_steps_per_second": 80.157,
358
- "step": 37500
359
- },
360
- {
361
- "epoch": 5.62,
362
- "learning_rate": 6.2562994960403175e-06,
363
- "loss": 1.6958,
364
- "step": 39000
365
- },
366
- {
367
- "epoch": 5.62,
368
- "eval_loss": 1.602686882019043,
369
- "eval_runtime": 28.3684,
370
- "eval_samples_per_second": 639.656,
371
- "eval_steps_per_second": 79.983,
372
- "step": 39000
373
- },
374
- {
375
- "epoch": 5.83,
376
- "learning_rate": 6.112311015118791e-06,
377
- "loss": 1.7009,
378
- "step": 40500
379
- },
380
- {
381
- "epoch": 5.83,
382
- "eval_loss": 1.5910372734069824,
383
- "eval_runtime": 28.4126,
384
- "eval_samples_per_second": 638.659,
385
- "eval_steps_per_second": 79.859,
386
- "step": 40500
387
- },
388
- {
389
- "epoch": 6.05,
390
- "learning_rate": 5.968322534197265e-06,
391
- "loss": 1.6995,
392
- "step": 42000
393
- },
394
- {
395
- "epoch": 6.05,
396
- "eval_loss": 1.607581377029419,
397
- "eval_runtime": 28.5017,
398
- "eval_samples_per_second": 636.665,
399
- "eval_steps_per_second": 79.609,
400
- "step": 42000
401
- },
402
- {
403
- "epoch": 6.26,
404
- "learning_rate": 5.824334053275739e-06,
405
- "loss": 1.7036,
406
- "step": 43500
407
- },
408
- {
409
- "epoch": 6.26,
410
- "eval_loss": 1.608154296875,
411
- "eval_runtime": 28.3159,
412
- "eval_samples_per_second": 640.841,
413
- "eval_steps_per_second": 80.132,
414
- "step": 43500
415
- },
416
- {
417
- "epoch": 6.48,
418
- "learning_rate": 5.6803455723542124e-06,
419
- "loss": 1.7042,
420
- "step": 45000
421
- },
422
- {
423
- "epoch": 6.48,
424
- "eval_loss": 1.6170202493667603,
425
- "eval_runtime": 28.3,
426
- "eval_samples_per_second": 641.202,
427
- "eval_steps_per_second": 80.177,
428
- "step": 45000
429
- },
430
- {
431
- "epoch": 6.7,
432
- "learning_rate": 5.536357091432686e-06,
433
- "loss": 1.7067,
434
- "step": 46500
435
- },
436
- {
437
- "epoch": 6.7,
438
- "eval_loss": 1.6049327850341797,
439
- "eval_runtime": 28.1995,
440
- "eval_samples_per_second": 643.486,
441
- "eval_steps_per_second": 80.462,
442
- "step": 46500
443
- },
444
- {
445
- "epoch": 6.91,
446
- "learning_rate": 5.3923686105111595e-06,
447
- "loss": 1.7108,
448
- "step": 48000
449
- },
450
- {
451
- "epoch": 6.91,
452
- "eval_loss": 1.6157801151275635,
453
- "eval_runtime": 28.3319,
454
- "eval_samples_per_second": 640.481,
455
- "eval_steps_per_second": 80.087,
456
- "step": 48000
457
- },
458
- {
459
- "epoch": 7.13,
460
- "learning_rate": 5.248380129589633e-06,
461
- "loss": 1.7125,
462
- "step": 49500
463
- },
464
- {
465
- "epoch": 7.13,
466
- "eval_loss": 1.6146913766860962,
467
- "eval_runtime": 28.3126,
468
- "eval_samples_per_second": 640.917,
469
- "eval_steps_per_second": 80.141,
470
- "step": 49500
471
- },
472
- {
473
- "epoch": 7.34,
474
- "learning_rate": 5.1043916486681065e-06,
475
- "loss": 1.7166,
476
- "step": 51000
477
- },
478
- {
479
- "epoch": 7.34,
480
- "eval_loss": 1.6328694820404053,
481
- "eval_runtime": 28.3985,
482
- "eval_samples_per_second": 638.977,
483
- "eval_steps_per_second": 79.898,
484
- "step": 51000
485
- },
486
- {
487
- "epoch": 7.56,
488
- "learning_rate": 4.960403167746581e-06,
489
- "loss": 1.7152,
490
- "step": 52500
491
- },
492
- {
493
- "epoch": 7.56,
494
- "eval_loss": 1.6241958141326904,
495
- "eval_runtime": 28.4178,
496
- "eval_samples_per_second": 638.543,
497
- "eval_steps_per_second": 79.844,
498
- "step": 52500
499
- },
500
- {
501
- "epoch": 7.78,
502
- "learning_rate": 4.8164146868250544e-06,
503
- "loss": 1.718,
504
- "step": 54000
505
- },
506
- {
507
- "epoch": 7.78,
508
- "eval_loss": 1.6272053718566895,
509
- "eval_runtime": 28.607,
510
- "eval_samples_per_second": 634.32,
511
- "eval_steps_per_second": 79.316,
512
- "step": 54000
513
- },
514
- {
515
- "epoch": 7.99,
516
- "learning_rate": 4.672426205903528e-06,
517
- "loss": 1.7206,
518
- "step": 55500
519
- },
520
- {
521
- "epoch": 7.99,
522
- "eval_loss": 1.61719810962677,
523
- "eval_runtime": 28.3806,
524
- "eval_samples_per_second": 639.381,
525
- "eval_steps_per_second": 79.949,
526
- "step": 55500
527
- },
528
- {
529
- "epoch": 8.21,
530
- "learning_rate": 4.5284377249820015e-06,
531
- "loss": 1.7172,
532
- "step": 57000
533
- },
534
- {
535
- "epoch": 8.21,
536
- "eval_loss": 1.6106241941452026,
537
- "eval_runtime": 28.4803,
538
- "eval_samples_per_second": 637.141,
539
- "eval_steps_per_second": 79.669,
540
- "step": 57000
541
- },
542
- {
543
- "epoch": 8.42,
544
- "learning_rate": 4.384449244060476e-06,
545
- "loss": 1.7232,
546
- "step": 58500
547
- },
548
- {
549
- "epoch": 8.42,
550
- "eval_loss": 1.6194111108779907,
551
- "eval_runtime": 29.8037,
552
- "eval_samples_per_second": 608.851,
553
- "eval_steps_per_second": 76.131,
554
- "step": 58500
555
- },
556
- {
557
- "epoch": 8.64,
558
- "learning_rate": 4.240460763138949e-06,
559
- "loss": 1.7245,
560
- "step": 60000
561
- },
562
- {
563
- "epoch": 8.64,
564
- "eval_loss": 1.6307227611541748,
565
- "eval_runtime": 28.3851,
566
- "eval_samples_per_second": 639.28,
567
- "eval_steps_per_second": 79.936,
568
- "step": 60000
569
- },
570
- {
571
- "epoch": 8.86,
572
- "learning_rate": 4.096472282217423e-06,
573
- "loss": 1.7246,
574
- "step": 61500
575
- },
576
- {
577
- "epoch": 8.86,
578
- "eval_loss": 1.6230413913726807,
579
- "eval_runtime": 28.1883,
580
- "eval_samples_per_second": 643.741,
581
- "eval_steps_per_second": 80.494,
582
- "step": 61500
583
- },
584
- {
585
- "epoch": 9.07,
586
- "learning_rate": 3.952483801295896e-06,
587
- "loss": 1.7244,
588
- "step": 63000
589
- },
590
- {
591
- "epoch": 9.07,
592
- "eval_loss": 1.6280677318572998,
593
- "eval_runtime": 28.3706,
594
- "eval_samples_per_second": 639.607,
595
- "eval_steps_per_second": 79.977,
596
- "step": 63000
597
- },
598
- {
599
- "epoch": 9.29,
600
- "learning_rate": 3.8084953203743704e-06,
601
- "loss": 1.7257,
602
- "step": 64500
603
- },
604
- {
605
- "epoch": 9.29,
606
- "eval_loss": 1.6250897645950317,
607
- "eval_runtime": 28.4547,
608
- "eval_samples_per_second": 637.715,
609
- "eval_steps_per_second": 79.741,
610
- "step": 64500
611
- },
612
- {
613
- "epoch": 9.5,
614
- "learning_rate": 3.664506839452844e-06,
615
- "loss": 1.7304,
616
- "step": 66000
617
- },
618
- {
619
- "epoch": 9.5,
620
- "eval_loss": 1.626404881477356,
621
- "eval_runtime": 28.4535,
622
- "eval_samples_per_second": 637.743,
623
- "eval_steps_per_second": 79.744,
624
- "step": 66000
625
- },
626
- {
627
- "epoch": 9.72,
628
- "learning_rate": 3.520518358531318e-06,
629
- "loss": 1.729,
630
- "step": 67500
631
- },
632
- {
633
- "epoch": 9.72,
634
- "eval_loss": 1.6339225769042969,
635
- "eval_runtime": 28.8358,
636
- "eval_samples_per_second": 629.287,
637
- "eval_steps_per_second": 78.687,
638
- "step": 67500
639
- },
640
- {
641
- "epoch": 9.94,
642
- "learning_rate": 3.3765298776097914e-06,
643
- "loss": 1.7331,
644
- "step": 69000
645
- },
646
- {
647
- "epoch": 9.94,
648
- "eval_loss": 1.6299844980239868,
649
- "eval_runtime": 28.7305,
650
- "eval_samples_per_second": 631.594,
651
- "eval_steps_per_second": 78.975,
652
- "step": 69000
653
- },
654
- {
655
- "epoch": 10.15,
656
- "learning_rate": 3.2325413966882653e-06,
657
- "loss": 1.7302,
658
- "step": 70500
659
- },
660
- {
661
- "epoch": 10.15,
662
- "eval_loss": 1.6214041709899902,
663
- "eval_runtime": 28.4725,
664
- "eval_samples_per_second": 637.317,
665
- "eval_steps_per_second": 79.691,
666
- "step": 70500
667
- },
668
- {
669
- "epoch": 10.37,
670
- "learning_rate": 3.088552915766739e-06,
671
- "loss": 1.7272,
672
- "step": 72000
673
- },
674
- {
675
- "epoch": 10.37,
676
- "eval_loss": 1.6317757368087769,
677
- "eval_runtime": 28.623,
678
- "eval_samples_per_second": 633.966,
679
- "eval_steps_per_second": 79.272,
680
- "step": 72000
681
- },
682
- {
683
- "epoch": 10.58,
684
- "learning_rate": 2.9445644348452123e-06,
685
- "loss": 1.7306,
686
- "step": 73500
687
- },
688
- {
689
- "epoch": 10.58,
690
- "eval_loss": 1.6280230283737183,
691
- "eval_runtime": 28.4735,
692
- "eval_samples_per_second": 637.294,
693
- "eval_steps_per_second": 79.688,
694
- "step": 73500
695
- },
696
- {
697
- "epoch": 10.8,
698
- "learning_rate": 2.8005759539236867e-06,
699
- "loss": 1.735,
700
- "step": 75000
701
- },
702
- {
703
- "epoch": 10.8,
704
- "eval_loss": 1.6216455698013306,
705
- "eval_runtime": 28.5783,
706
- "eval_samples_per_second": 634.957,
707
- "eval_steps_per_second": 79.396,
708
- "step": 75000
709
- },
710
- {
711
- "epoch": 11.02,
712
- "learning_rate": 2.6565874730021602e-06,
713
- "loss": 1.7368,
714
- "step": 76500
715
- },
716
- {
717
- "epoch": 11.02,
718
- "eval_loss": 1.6300657987594604,
719
- "eval_runtime": 29.3173,
720
- "eval_samples_per_second": 618.952,
721
- "eval_steps_per_second": 77.395,
722
- "step": 76500
723
- },
724
- {
725
- "epoch": 11.23,
726
- "learning_rate": 2.5125989920806338e-06,
727
- "loss": 1.736,
728
- "step": 78000
729
- },
730
- {
731
- "epoch": 11.23,
732
- "eval_loss": 1.6382640600204468,
733
- "eval_runtime": 29.0698,
734
- "eval_samples_per_second": 624.221,
735
- "eval_steps_per_second": 78.053,
736
- "step": 78000
737
- },
738
- {
739
- "epoch": 11.45,
740
- "learning_rate": 2.3686105111591073e-06,
741
- "loss": 1.7364,
742
- "step": 79500
743
- },
744
- {
745
- "epoch": 11.45,
746
- "eval_loss": 1.6312644481658936,
747
- "eval_runtime": 28.6275,
748
- "eval_samples_per_second": 633.866,
749
- "eval_steps_per_second": 79.259,
750
- "step": 79500
751
- },
752
- {
753
- "epoch": 11.66,
754
- "learning_rate": 2.2246220302375812e-06,
755
- "loss": 1.7343,
756
- "step": 81000
757
- },
758
- {
759
- "epoch": 11.66,
760
- "eval_loss": 1.6355253458023071,
761
- "eval_runtime": 28.6773,
762
- "eval_samples_per_second": 632.765,
763
- "eval_steps_per_second": 79.122,
764
- "step": 81000
765
- },
766
- {
767
- "epoch": 11.88,
768
- "learning_rate": 2.0806335493160548e-06,
769
- "loss": 1.7391,
770
- "step": 82500
771
- },
772
- {
773
- "epoch": 11.88,
774
- "eval_loss": 1.6428338289260864,
775
- "eval_runtime": 28.8154,
776
- "eval_samples_per_second": 629.733,
777
- "eval_steps_per_second": 78.743,
778
- "step": 82500
779
- },
780
- {
781
- "epoch": 12.1,
782
- "learning_rate": 1.9366450683945287e-06,
783
- "loss": 1.7425,
784
- "step": 84000
785
- },
786
- {
787
- "epoch": 12.1,
788
- "eval_loss": 1.6533492803573608,
789
- "eval_runtime": 28.8739,
790
- "eval_samples_per_second": 628.457,
791
- "eval_steps_per_second": 78.583,
792
- "step": 84000
793
- },
794
- {
795
- "epoch": 12.31,
796
- "learning_rate": 1.7926565874730022e-06,
797
- "loss": 1.7369,
798
- "step": 85500
799
- },
800
- {
801
- "epoch": 12.31,
802
- "eval_loss": 1.6431362628936768,
803
- "eval_runtime": 28.7145,
804
- "eval_samples_per_second": 631.946,
805
- "eval_steps_per_second": 79.019,
806
- "step": 85500
807
- },
808
- {
809
- "epoch": 12.53,
810
- "learning_rate": 1.648668106551476e-06,
811
- "loss": 1.7377,
812
- "step": 87000
813
- },
814
- {
815
- "epoch": 12.53,
816
- "eval_loss": 1.6380741596221924,
817
- "eval_runtime": 28.5841,
818
- "eval_samples_per_second": 634.828,
819
- "eval_steps_per_second": 79.38,
820
- "step": 87000
821
- },
822
- {
823
- "epoch": 12.74,
824
- "learning_rate": 1.5046796256299497e-06,
825
- "loss": 1.7403,
826
- "step": 88500
827
- },
828
- {
829
- "epoch": 12.74,
830
- "eval_loss": 1.6389954090118408,
831
- "eval_runtime": 29.0449,
832
- "eval_samples_per_second": 624.756,
833
- "eval_steps_per_second": 78.12,
834
- "step": 88500
835
- },
836
- {
837
- "epoch": 12.96,
838
- "learning_rate": 1.3606911447084234e-06,
839
- "loss": 1.7405,
840
- "step": 90000
841
- },
842
- {
843
- "epoch": 12.96,
844
- "eval_loss": 1.6429226398468018,
845
- "eval_runtime": 28.6205,
846
- "eval_samples_per_second": 634.022,
847
- "eval_steps_per_second": 79.279,
848
- "step": 90000
849
- },
850
- {
851
- "epoch": 13.17,
852
- "learning_rate": 1.2167026637868972e-06,
853
- "loss": 1.7395,
854
- "step": 91500
855
- },
856
- {
857
- "epoch": 13.17,
858
- "eval_loss": 1.641994833946228,
859
- "eval_runtime": 28.6175,
860
- "eval_samples_per_second": 634.088,
861
- "eval_steps_per_second": 79.287,
862
- "step": 91500
863
- },
864
- {
865
- "epoch": 13.39,
866
- "learning_rate": 1.072714182865371e-06,
867
- "loss": 1.7387,
868
- "step": 93000
869
- },
870
- {
871
- "epoch": 13.39,
872
- "eval_loss": 1.6299211978912354,
873
- "eval_runtime": 28.8353,
874
- "eval_samples_per_second": 629.298,
875
- "eval_steps_per_second": 78.688,
876
- "step": 93000
877
- },
878
- {
879
- "epoch": 13.61,
880
- "learning_rate": 9.287257019438446e-07,
881
- "loss": 1.7394,
882
- "step": 94500
883
- },
884
- {
885
- "epoch": 13.61,
886
- "eval_loss": 1.652402639389038,
887
- "eval_runtime": 31.7277,
888
- "eval_samples_per_second": 571.929,
889
- "eval_steps_per_second": 71.515,
890
- "step": 94500
891
- },
892
- {
893
- "epoch": 13.82,
894
- "learning_rate": 7.847372210223183e-07,
895
- "loss": 1.7425,
896
- "step": 96000
897
- },
898
- {
899
- "epoch": 13.82,
900
- "eval_loss": 1.6413946151733398,
901
- "eval_runtime": 31.26,
902
- "eval_samples_per_second": 580.487,
903
- "eval_steps_per_second": 72.585,
904
- "step": 96000
905
- },
906
- {
907
- "epoch": 14.04,
908
- "learning_rate": 6.40748740100792e-07,
909
- "loss": 1.74,
910
- "step": 97500
911
- },
912
- {
913
- "epoch": 14.04,
914
- "eval_loss": 1.6491047143936157,
915
- "eval_runtime": 29.6448,
916
- "eval_samples_per_second": 612.115,
917
- "eval_steps_per_second": 76.54,
918
- "step": 97500
919
- },
920
- {
921
- "epoch": 14.25,
922
- "learning_rate": 4.967602591792657e-07,
923
- "loss": 1.7402,
924
- "step": 99000
925
- },
926
- {
927
- "epoch": 14.25,
928
- "eval_loss": 1.6329963207244873,
929
- "eval_runtime": 28.9348,
930
- "eval_samples_per_second": 627.134,
931
- "eval_steps_per_second": 78.418,
932
- "step": 99000
933
- },
934
- {
935
- "epoch": 14.47,
936
- "learning_rate": 3.5277177825773936e-07,
937
- "loss": 1.7407,
938
- "step": 100500
939
- },
940
- {
941
- "epoch": 14.47,
942
- "eval_loss": 1.640425682067871,
943
- "eval_runtime": 28.3457,
944
- "eval_samples_per_second": 640.168,
945
- "eval_steps_per_second": 80.048,
946
- "step": 100500
947
- },
948
- {
949
- "epoch": 14.69,
950
- "learning_rate": 2.0878329733621312e-07,
951
- "loss": 1.7408,
952
- "step": 102000
953
- },
954
- {
955
- "epoch": 14.69,
956
- "eval_loss": 1.6292715072631836,
957
- "eval_runtime": 28.5084,
958
- "eval_samples_per_second": 636.514,
959
- "eval_steps_per_second": 79.591,
960
- "step": 102000
961
- },
962
- {
963
- "epoch": 14.9,
964
- "learning_rate": 6.479481641468683e-08,
965
- "loss": 1.7374,
966
- "step": 103500
967
- },
968
- {
969
- "epoch": 14.9,
970
- "eval_loss": 1.6261721849441528,
971
- "eval_runtime": 28.3539,
972
- "eval_samples_per_second": 639.982,
973
- "eval_steps_per_second": 80.024,
974
- "step": 103500
975
- }
976
- ],
977
- "max_steps": 104175,
978
- "num_train_epochs": 15,
979
- "total_flos": 6.475717648203267e+17,
980
- "trial_name": null,
981
- "trial_params": null
982
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3bdab85c49770ae0a845ff26fd6e9d1edb800500dfa4828e9f9980bf589232e
3
- size 3451
 
 
 
 
last-checkpoint/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0f6d7dfce7499a39f72cf6615b36e67ad548848682144813bc3f2dd834f0220
3
  size 269468281
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08899eedd1ea87c0a4b54195fb6cfdd988c267131f790bf2b7941a057bebf231
3
  size 269468281
runs/Feb07_11-46-56_turing/events.out.tfevents.1675766828.turing.1045087.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d88c3e433aa5a7ecc321f462fd9a2f66a3c65e468a9b4a83a1836da7e40002e
3
- size 33952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39ef39aa285d0107f9c5ca86e64b3ee99bf7f1ad216cb653d28ce90de75ae0b
3
+ size 34312