dq158 committed on
Commit
cba3821
1 Parent(s): 0e6c514

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "dq158/pingusPongus",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
1
  {
2
+ "_name_or_path": "pingusPongus",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd19122e1b6607cdb79a7016978319f5233648dfa21d17f64f4c5ed9ebfab2b1
3
  size 2371770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29ddd9f1252dc57286ac7118b5ddd965377fcf663d8bce9811afcd2b7eac4784
3
  size 2371770
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f9eff7f8c80652672f33d591daec33e74fa91afea6f63662fcce0b413fc45e
3
  size 990409330
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd5ab5ffed749cadfd623aa9fd91034f5f25cd2d5186c64528e111b7e53e547
3
  size 990409330
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b21196dd7454f3b651b63e42d3595da46748179b2ded600b1b8e0ecb74a09883
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6df0e59dd9c797f952cda2a036b91c3ecc642525cb3dda578b892770a07f726
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:429bbb7d8273481822c70b354c32cf6b2c09f778063e5299935515e96827f77b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:575617c2e41c68103ab814e88f286f7f0b21dfb482a487c336b5b023e1ada9ae
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,526 +1,110 @@
1
  {
2
- "best_metric": 1.8215827941894531,
3
- "best_model_checkpoint": "dq158/pingusPongus/checkpoint-3162",
4
- "epoch": 9.0,
5
  "eval_steps": 500,
6
- "global_step": 28458,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.16,
13
- "learning_rate": 5e-05,
14
- "loss": 1.9604,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.32,
19
- "learning_rate": 4.999216501710915e-05,
20
- "loss": 2.0007,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.47,
25
- "learning_rate": 4.996866497939315e-05,
26
- "loss": 2.0079,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.63,
31
- "learning_rate": 4.992951461664347e-05,
32
- "loss": 2.0288,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.79,
37
- "learning_rate": 4.9874738468253904e-05,
38
- "loss": 2.0206,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.95,
43
- "learning_rate": 4.980437086783929e-05,
44
- "loss": 2.0007,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 1.0,
49
- "eval_bleu": 1.0,
50
- "eval_brevity_penalty": 1.0,
51
- "eval_length_ratio": 1.0,
52
- "eval_loss": 1.8215827941894531,
53
- "eval_precisions": [
54
- 1.0,
55
- 1.0,
56
- 1.0,
57
- 1.0
58
- ],
59
- "eval_reference_length": 52443,
60
- "eval_runtime": 584.5964,
61
- "eval_samples_per_second": 4.808,
62
- "eval_steps_per_second": 0.602,
63
- "eval_translation_length": 52443,
64
- "step": 3162
65
- },
66
- {
67
- "epoch": 1.11,
68
- "learning_rate": 4.971845592171524e-05,
69
- "loss": 1.9799,
70
  "step": 3500
71
  },
72
  {
73
- "epoch": 1.27,
74
- "learning_rate": 4.961704748125239e-05,
75
- "loss": 1.9853,
76
  "step": 4000
77
  },
78
  {
79
- "epoch": 1.42,
80
- "learning_rate": 4.9500209109122444e-05,
81
- "loss": 1.9636,
82
  "step": 4500
83
  },
84
  {
85
- "epoch": 1.58,
86
- "learning_rate": 4.936801403945711e-05,
87
- "loss": 1.9677,
88
  "step": 5000
89
  },
90
  {
91
- "epoch": 1.74,
92
- "learning_rate": 4.922054513194513e-05,
93
- "loss": 1.9714,
94
  "step": 5500
95
  },
96
  {
97
- "epoch": 1.9,
98
- "learning_rate": 4.905789481989587e-05,
99
- "loss": 1.9937,
100
  "step": 6000
101
  },
102
  {
103
- "epoch": 2.0,
104
- "eval_bleu": 1.0,
105
- "eval_brevity_penalty": 1.0,
106
- "eval_length_ratio": 1.0,
107
- "eval_loss": 1.8286010026931763,
108
- "eval_precisions": [
109
- 1.0,
110
- 1.0,
111
- 1.0,
112
- 1.0
113
- ],
114
- "eval_reference_length": 52025,
115
- "eval_runtime": 580.992,
116
- "eval_samples_per_second": 4.838,
117
- "eval_steps_per_second": 0.606,
118
- "eval_translation_length": 52025,
119
- "step": 6324
120
- },
121
- {
122
- "epoch": 2.06,
123
- "learning_rate": 4.888016505230231e-05,
124
- "loss": 1.9751,
125
- "step": 6500
126
- },
127
- {
128
- "epoch": 2.21,
129
- "learning_rate": 4.868746722993951e-05,
130
- "loss": 1.9371,
131
- "step": 7000
132
- },
133
- {
134
- "epoch": 2.37,
135
- "learning_rate": 4.847992213553878e-05,
136
- "loss": 1.9736,
137
- "step": 7500
138
- },
139
- {
140
- "epoch": 2.53,
141
- "learning_rate": 4.82576598580812e-05,
142
- "loss": 1.935,
143
- "step": 8000
144
- },
145
- {
146
- "epoch": 2.69,
147
- "learning_rate": 4.802081971125809e-05,
148
- "loss": 1.938,
149
- "step": 8500
150
- },
151
- {
152
- "epoch": 2.85,
153
- "learning_rate": 4.7769550146149295e-05,
154
- "loss": 1.9218,
155
- "step": 9000
156
- },
157
- {
158
- "epoch": 3.0,
159
- "eval_bleu": 1.0,
160
- "eval_brevity_penalty": 1.0,
161
- "eval_length_ratio": 1.0,
162
- "eval_loss": 1.8345717191696167,
163
- "eval_precisions": [
164
- 1.0,
165
- 1.0,
166
- 1.0,
167
- 1.0
168
- ],
169
- "eval_reference_length": 52929,
170
- "eval_runtime": 581.6683,
171
- "eval_samples_per_second": 4.833,
172
- "eval_steps_per_second": 0.605,
173
- "eval_translation_length": 52929,
174
- "step": 9486
175
- },
176
- {
177
- "epoch": 3.0,
178
- "learning_rate": 4.75040086581743e-05,
179
- "loss": 1.9531,
180
- "step": 9500
181
- },
182
- {
183
- "epoch": 3.16,
184
- "learning_rate": 4.7224361688374306e-05,
185
- "loss": 1.892,
186
- "step": 10000
187
- },
188
- {
189
- "epoch": 3.32,
190
- "learning_rate": 4.6930784519087246e-05,
191
- "loss": 1.9357,
192
- "step": 10500
193
- },
194
- {
195
- "epoch": 3.48,
196
- "learning_rate": 4.662346116408098e-05,
197
- "loss": 1.9117,
198
- "step": 11000
199
- },
200
- {
201
- "epoch": 3.64,
202
- "learning_rate": 4.630258425321379e-05,
203
- "loss": 1.9082,
204
- "step": 11500
205
- },
206
- {
207
- "epoch": 3.8,
208
- "learning_rate": 4.596835491169421e-05,
209
- "loss": 1.9105,
210
- "step": 12000
211
- },
212
- {
213
- "epoch": 3.95,
214
- "learning_rate": 4.562098263401604e-05,
215
- "loss": 1.9282,
216
- "step": 12500
217
- },
218
- {
219
- "epoch": 4.0,
220
- "eval_bleu": 1.0,
221
- "eval_brevity_penalty": 1.0,
222
- "eval_length_ratio": 1.0,
223
- "eval_loss": 1.8385423421859741,
224
- "eval_precisions": [
225
- 1.0,
226
- 1.0,
227
- 1.0,
228
- 1.0
229
- ],
230
- "eval_reference_length": 51997,
231
- "eval_runtime": 581.4732,
232
- "eval_samples_per_second": 4.834,
233
- "eval_steps_per_second": 0.605,
234
- "eval_translation_length": 51997,
235
- "step": 12648
236
- },
237
- {
238
- "epoch": 4.11,
239
- "learning_rate": 4.526068515264746e-05,
240
- "loss": 1.9217,
241
- "step": 13000
242
- },
243
- {
244
- "epoch": 4.27,
245
- "learning_rate": 4.488768830155665e-05,
246
- "loss": 1.9057,
247
- "step": 13500
248
- },
249
- {
250
- "epoch": 4.43,
251
- "learning_rate": 4.450222587465934e-05,
252
- "loss": 1.8887,
253
- "step": 14000
254
- },
255
- {
256
- "epoch": 4.59,
257
- "learning_rate": 4.4104539479277104e-05,
258
- "loss": 1.8943,
259
- "step": 14500
260
- },
261
- {
262
- "epoch": 4.74,
263
- "learning_rate": 4.3694878384698255e-05,
264
- "loss": 1.8776,
265
- "step": 15000
266
- },
267
- {
268
- "epoch": 4.9,
269
- "learning_rate": 4.327349936593615e-05,
270
- "loss": 1.874,
271
- "step": 15500
272
- },
273
- {
274
- "epoch": 5.0,
275
- "eval_bleu": 1.0,
276
- "eval_brevity_penalty": 1.0,
277
- "eval_length_ratio": 1.0,
278
- "eval_loss": 1.83934485912323,
279
- "eval_precisions": [
280
- 1.0,
281
- 1.0,
282
- 1.0,
283
- 1.0
284
- ],
285
- "eval_reference_length": 52639,
286
- "eval_runtime": 583.0754,
287
- "eval_samples_per_second": 4.821,
288
- "eval_steps_per_second": 0.604,
289
- "eval_translation_length": 52639,
290
- "step": 15810
291
- },
292
- {
293
- "epoch": 5.06,
294
- "learning_rate": 4.284066654278301e-05,
295
- "loss": 1.8739,
296
- "step": 16000
297
- },
298
- {
299
- "epoch": 5.22,
300
- "learning_rate": 4.239665121425993e-05,
301
- "loss": 1.8427,
302
- "step": 16500
303
- },
304
- {
305
- "epoch": 5.38,
306
- "learning_rate": 4.1941731688567106e-05,
307
- "loss": 1.8612,
308
- "step": 17000
309
- },
310
- {
311
- "epoch": 5.53,
312
- "learning_rate": 4.147619310864058e-05,
313
- "loss": 1.8802,
314
- "step": 17500
315
- },
316
- {
317
- "epoch": 5.69,
318
- "learning_rate": 4.100032727342505e-05,
319
- "loss": 1.8496,
320
- "step": 18000
321
- },
322
- {
323
- "epoch": 5.85,
324
- "learning_rate": 4.0514432454974695e-05,
325
- "loss": 1.8791,
326
- "step": 18500
327
- },
328
- {
329
- "epoch": 6.0,
330
- "eval_bleu": 1.0,
331
- "eval_brevity_penalty": 1.0,
332
- "eval_length_ratio": 1.0,
333
- "eval_loss": 1.8424808979034424,
334
- "eval_precisions": [
335
- 1.0,
336
- 1.0,
337
- 1.0,
338
- 1.0
339
- ],
340
- "eval_reference_length": 52322,
341
- "eval_runtime": 579.9659,
342
- "eval_samples_per_second": 4.847,
343
- "eval_steps_per_second": 0.607,
344
- "eval_translation_length": 52322,
345
- "step": 18972
346
- },
347
- {
348
- "epoch": 6.01,
349
- "learning_rate": 4.001881321149665e-05,
350
- "loss": 1.8549,
351
- "step": 19000
352
- },
353
- {
354
- "epoch": 6.17,
355
- "learning_rate": 3.9513780196454384e-05,
356
- "loss": 1.8317,
357
- "step": 19500
358
- },
359
- {
360
- "epoch": 6.33,
361
- "learning_rate": 3.899964996385045e-05,
362
- "loss": 1.8271,
363
- "step": 20000
364
- },
365
- {
366
- "epoch": 6.48,
367
- "learning_rate": 3.8476744769810936e-05,
368
- "loss": 1.8169,
369
- "step": 20500
370
- },
371
- {
372
- "epoch": 6.64,
373
- "learning_rate": 3.7945392370595755e-05,
374
- "loss": 1.847,
375
- "step": 21000
376
- },
377
- {
378
- "epoch": 6.8,
379
- "learning_rate": 3.740592581716146e-05,
380
- "loss": 1.857,
381
- "step": 21500
382
- },
383
- {
384
- "epoch": 6.96,
385
- "learning_rate": 3.6858683246405354e-05,
386
- "loss": 1.85,
387
- "step": 22000
388
- },
389
- {
390
- "epoch": 7.0,
391
- "eval_bleu": 1.0,
392
- "eval_brevity_penalty": 1.0,
393
- "eval_length_ratio": 1.0,
394
- "eval_loss": 1.8449796438217163,
395
- "eval_precisions": [
396
- 1.0,
397
- 1.0,
398
- 1.0,
399
- 1.0
400
- ],
401
- "eval_reference_length": 52368,
402
- "eval_runtime": 581.0567,
403
- "eval_samples_per_second": 4.838,
404
- "eval_steps_per_second": 0.606,
405
- "eval_translation_length": 52368,
406
- "step": 22134
407
- },
408
- {
409
- "epoch": 7.12,
410
- "learning_rate": 3.6304007669221754e-05,
411
- "loss": 1.8025,
412
- "step": 22500
413
- },
414
- {
415
- "epoch": 7.27,
416
- "learning_rate": 3.574224675550324e-05,
417
- "loss": 1.8139,
418
- "step": 23000
419
- },
420
- {
421
- "epoch": 7.43,
422
- "learning_rate": 3.517375261622165e-05,
423
- "loss": 1.8069,
424
- "step": 23500
425
- },
426
- {
427
- "epoch": 7.59,
428
- "learning_rate": 3.459888158272534e-05,
429
- "loss": 1.8264,
430
- "step": 24000
431
- },
432
- {
433
- "epoch": 7.75,
434
- "learning_rate": 3.401799398339127e-05,
435
- "loss": 1.7979,
436
- "step": 24500
437
- },
438
- {
439
- "epoch": 7.91,
440
- "learning_rate": 3.343145391777163e-05,
441
- "loss": 1.8242,
442
- "step": 25000
443
- },
444
- {
445
- "epoch": 8.0,
446
- "eval_bleu": 1.0,
447
- "eval_brevity_penalty": 1.0,
448
- "eval_length_ratio": 1.0,
449
- "eval_loss": 1.8492226600646973,
450
- "eval_precisions": [
451
- 1.0,
452
- 1.0,
453
- 1.0,
454
- 1.0
455
- ],
456
- "eval_reference_length": 52579,
457
- "eval_runtime": 581.3976,
458
- "eval_samples_per_second": 4.835,
459
- "eval_steps_per_second": 0.605,
460
- "eval_translation_length": 52579,
461
- "step": 25296
462
- },
463
- {
464
- "epoch": 8.06,
465
- "learning_rate": 3.283962902837673e-05,
466
- "loss": 1.8288,
467
- "step": 25500
468
- },
469
- {
470
- "epoch": 8.22,
471
- "learning_rate": 3.22428902702372e-05,
472
- "loss": 1.7989,
473
- "step": 26000
474
- },
475
- {
476
- "epoch": 8.38,
477
- "learning_rate": 3.164161167838985e-05,
478
- "loss": 1.795,
479
- "step": 26500
480
- },
481
- {
482
- "epoch": 8.54,
483
- "learning_rate": 3.103617013343307e-05,
484
- "loss": 1.7786,
485
- "step": 27000
486
- },
487
- {
488
- "epoch": 8.7,
489
- "learning_rate": 3.0426945125298563e-05,
490
- "loss": 1.7924,
491
- "step": 27500
492
- },
493
- {
494
- "epoch": 8.86,
495
- "learning_rate": 2.9814318515387547e-05,
496
- "loss": 1.8087,
497
- "step": 28000
498
- },
499
- {
500
- "epoch": 9.0,
501
  "eval_bleu": 1.0,
502
  "eval_brevity_penalty": 1.0,
503
  "eval_length_ratio": 1.0,
504
- "eval_loss": 1.8519303798675537,
505
  "eval_precisions": [
506
  1.0,
507
  1.0,
508
  1.0,
509
  1.0
510
  ],
511
- "eval_reference_length": 52432,
512
- "eval_runtime": 580.9392,
513
- "eval_samples_per_second": 4.839,
514
- "eval_steps_per_second": 0.606,
515
- "eval_translation_length": 52432,
516
- "step": 28458
517
  }
518
  ],
519
  "logging_steps": 500,
520
- "max_steps": 63240,
521
  "num_train_epochs": 20,
522
  "save_steps": 500,
523
- "total_flos": 1.558637838068613e+17,
524
  "trial_name": null,
525
  "trial_params": null
526
  }
 
1
  {
2
+ "best_metric": 1.5654487609863281,
3
+ "best_model_checkpoint": "dq158/pingusPongus/checkpoint-6323",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 6323,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08,
13
+ "learning_rate": 5e-06,
14
+ "loss": 1.8585,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.16,
19
+ "learning_rate": 4.999805607800008e-06,
20
+ "loss": 1.823,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.24,
25
+ "learning_rate": 4.999222461430692e-06,
26
+ "loss": 1.8388,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.32,
31
+ "learning_rate": 4.998250651579336e-06,
32
+ "loss": 1.8372,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.4,
37
+ "learning_rate": 4.996890329375747e-06,
38
+ "loss": 1.8066,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.47,
43
+ "learning_rate": 4.995141706368742e-06,
44
+ "loss": 1.8485,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.55,
49
+ "learning_rate": 4.993005054493262e-06,
50
+ "loss": 1.8243,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.63,
55
+ "learning_rate": 4.990480706028073e-06,
56
+ "loss": 1.8278,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.71,
61
+ "learning_rate": 4.987569053544098e-06,
62
+ "loss": 1.8126,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.79,
67
+ "learning_rate": 4.98427054984336e-06,
68
+ "loss": 1.8277,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.87,
73
+ "learning_rate": 4.980585707888573e-06,
74
+ "loss": 1.8475,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.95,
79
+ "learning_rate": 4.976515100723365e-06,
80
+ "loss": 1.8441,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  "eval_bleu": 1.0,
86
  "eval_brevity_penalty": 1.0,
87
  "eval_length_ratio": 1.0,
88
+ "eval_loss": 1.5654487609863281,
89
  "eval_precisions": [
90
  1.0,
91
  1.0,
92
  1.0,
93
  1.0
94
  ],
95
+ "eval_reference_length": 52412,
96
+ "eval_runtime": 683.1649,
97
+ "eval_samples_per_second": 4.115,
98
+ "eval_steps_per_second": 1.029,
99
+ "eval_translation_length": 52412,
100
+ "step": 6323
101
  }
102
  ],
103
  "logging_steps": 500,
104
+ "max_steps": 126460,
105
  "num_train_epochs": 20,
106
  "save_steps": 500,
107
+ "total_flos": 1.7318198200762368e+16,
108
  "trial_name": null,
109
  "trial_params": null
110
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8eb6624817ce9b56d424478fc6d1a6e01d7373d5d6bcbca9eb534f2108fc942
3
  size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce6df3970ff39f84beab7b635dd3c941539643a1d32dedf54263683eef40519
3
  size 4664