JulienRPA commited on
Commit
63417c4
1 Parent(s): b53669d

End of training

Browse files
all_results.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "epoch": 8.0,
3
- "eval_bleu": 77.0649,
4
- "eval_em": 0.0548,
5
- "eval_gen_len": 47.473,
6
- "eval_loss": 1.024766206741333,
7
  "eval_rm": 0.22,
8
- "eval_runtime": 396.6503,
9
  "eval_samples": 1205,
10
- "eval_samples_per_second": 3.038,
11
- "eval_steps_per_second": 0.381,
12
- "train_loss": 0.15130291744847968,
13
- "train_runtime": 1637.6947,
14
  "train_samples": 22890,
15
- "train_samples_per_second": 111.816,
16
- "train_steps_per_second": 6.99
17
  }
 
1
  {
2
  "epoch": 8.0,
3
+ "eval_bleu": 76.5118,
4
+ "eval_em": 0.044,
5
+ "eval_gen_len": 46.7278,
6
+ "eval_loss": 1.04942786693573,
7
  "eval_rm": 0.22,
8
+ "eval_runtime": 359.6936,
9
  "eval_samples": 1205,
10
+ "eval_samples_per_second": 3.35,
11
+ "eval_steps_per_second": 0.42,
12
+ "train_loss": 6.2339570505647375,
13
+ "train_runtime": 11756.9791,
14
  "train_samples": 22890,
15
+ "train_samples_per_second": 15.575,
16
+ "train_steps_per_second": 0.974
17
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 8.0,
3
- "eval_bleu": 77.0649,
4
- "eval_em": 0.0548,
5
- "eval_gen_len": 47.473,
6
- "eval_loss": 1.024766206741333,
7
- "eval_runtime": 396.6503,
8
  "eval_samples": 1205,
9
- "eval_samples_per_second": 3.038,
10
- "eval_steps_per_second": 0.381
11
  }
 
1
  {
2
  "epoch": 8.0,
3
+ "eval_bleu": 76.5118,
4
+ "eval_em": 0.044,
5
+ "eval_gen_len": 46.7278,
6
+ "eval_loss": 1.04942786693573,
7
+ "eval_runtime": 359.6936,
8
  "eval_samples": 1205,
9
+ "eval_samples_per_second": 3.35,
10
+ "eval_steps_per_second": 0.42
11
  }
runs/Jun05_10-45-59_0a95bf9de5ac/events.out.tfevents.1685974998.0a95bf9de5ac.3272.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e36e5f894c9bf1e3b8fe15f812413ec4a1ea301a59a369726903034638e7e02
3
+ size 504
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 8.0,
3
- "train_loss": 0.15130291744847968,
4
- "train_runtime": 1637.6947,
5
  "train_samples": 22890,
6
- "train_samples_per_second": 111.816,
7
- "train_steps_per_second": 6.99
8
  }
 
1
  {
2
  "epoch": 8.0,
3
+ "train_loss": 6.2339570505647375,
4
+ "train_runtime": 11756.9791,
5
  "train_samples": 22890,
6
+ "train_samples_per_second": 15.575,
7
+ "train_steps_per_second": 0.974
8
  }
trainer_state.json CHANGED
@@ -10,755 +10,755 @@
10
  {
11
  "epoch": 0.07,
12
  "learning_rate": 2.0000000000000003e-06,
13
- "loss": 86.0421,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.14,
18
  "learning_rate": 4.000000000000001e-06,
19
- "loss": 51.6706,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.21,
24
  "learning_rate": 6e-06,
25
- "loss": 41.1349,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.28,
30
  "learning_rate": 8.000000000000001e-06,
31
- "loss": 36.5061,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.35,
36
  "learning_rate": 1e-05,
37
- "loss": 33.1858,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.42,
42
  "learning_rate": 1.2e-05,
43
- "loss": 30.5206,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.49,
48
  "learning_rate": 1.4000000000000001e-05,
49
- "loss": 28.0073,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.56,
54
  "learning_rate": 1.6000000000000003e-05,
55
- "loss": 26.1939,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.63,
60
  "learning_rate": 1.8e-05,
61
- "loss": 24.3465,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.7,
66
  "learning_rate": 2e-05,
67
- "loss": 22.5759,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.77,
72
  "learning_rate": 2.2000000000000003e-05,
73
- "loss": 20.9294,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.84,
78
  "learning_rate": 2.4e-05,
79
- "loss": 19.3762,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.91,
84
  "learning_rate": 2.6000000000000002e-05,
85
- "loss": 17.72,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.98,
90
  "learning_rate": 2.8000000000000003e-05,
91
- "loss": 15.7901,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 1.05,
96
  "learning_rate": 3e-05,
97
- "loss": 14.0008,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 1.12,
102
  "learning_rate": 3.2000000000000005e-05,
103
- "loss": 12.3777,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 1.19,
108
  "learning_rate": 3.4000000000000007e-05,
109
- "loss": 10.7261,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 1.26,
114
  "learning_rate": 3.6e-05,
115
- "loss": 9.1024,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 1.33,
120
  "learning_rate": 3.8e-05,
121
- "loss": 7.4676,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 1.4,
126
  "learning_rate": 4e-05,
127
- "loss": 6.6044,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 1.4,
132
- "eval_bleu": 3.8045,
133
  "eval_em": 0.0,
134
- "eval_gen_len": 158.8473,
135
- "eval_loss": 6.171305179595947,
136
- "eval_runtime": 1556.9786,
137
- "eval_samples_per_second": 0.774,
138
- "eval_steps_per_second": 0.097,
139
  "step": 2000
140
  },
141
  {
142
  "epoch": 1.47,
143
  "learning_rate": 4.2e-05,
144
- "loss": 6.0941,
145
  "step": 2100
146
  },
147
  {
148
  "epoch": 1.54,
149
  "learning_rate": 4.4000000000000006e-05,
150
- "loss": 5.6741,
151
  "step": 2200
152
  },
153
  {
154
  "epoch": 1.61,
155
  "learning_rate": 4.600000000000001e-05,
156
- "loss": 5.4757,
157
  "step": 2300
158
  },
159
  {
160
  "epoch": 1.68,
161
  "learning_rate": 4.8e-05,
162
- "loss": 5.242,
163
  "step": 2400
164
  },
165
  {
166
  "epoch": 1.75,
167
  "learning_rate": 5e-05,
168
- "loss": 5.0108,
169
  "step": 2500
170
  },
171
  {
172
  "epoch": 1.82,
173
  "learning_rate": 4.944121591417077e-05,
174
- "loss": 4.8595,
175
  "step": 2600
176
  },
177
  {
178
  "epoch": 1.89,
179
  "learning_rate": 4.888243182834153e-05,
180
- "loss": 4.695,
181
  "step": 2700
182
  },
183
  {
184
  "epoch": 1.96,
185
  "learning_rate": 4.8323647742512295e-05,
186
- "loss": 4.5706,
187
  "step": 2800
188
  },
189
  {
190
  "epoch": 2.03,
191
  "learning_rate": 4.776486365668306e-05,
192
- "loss": 4.2498,
193
  "step": 2900
194
  },
195
  {
196
  "epoch": 2.1,
197
  "learning_rate": 4.720607957085382e-05,
198
- "loss": 4.1223,
199
  "step": 3000
200
  },
201
  {
202
  "epoch": 2.17,
203
  "learning_rate": 4.664729548502459e-05,
204
- "loss": 4.0181,
205
  "step": 3100
206
  },
207
  {
208
  "epoch": 2.24,
209
  "learning_rate": 4.6088511399195353e-05,
210
- "loss": 3.8722,
211
  "step": 3200
212
  },
213
  {
214
  "epoch": 2.31,
215
  "learning_rate": 4.552972731336611e-05,
216
- "loss": 3.7786,
217
  "step": 3300
218
  },
219
  {
220
  "epoch": 2.38,
221
  "learning_rate": 4.497094322753688e-05,
222
- "loss": 3.6403,
223
  "step": 3400
224
  },
225
  {
226
  "epoch": 2.45,
227
  "learning_rate": 4.4412159141707646e-05,
228
- "loss": 3.5437,
229
  "step": 3500
230
  },
231
  {
232
  "epoch": 2.52,
233
  "learning_rate": 4.385337505587841e-05,
234
- "loss": 3.389,
235
  "step": 3600
236
  },
237
  {
238
  "epoch": 2.59,
239
  "learning_rate": 4.329459097004918e-05,
240
- "loss": 3.2395,
241
  "step": 3700
242
  },
243
  {
244
  "epoch": 2.66,
245
  "learning_rate": 4.2735806884219945e-05,
246
- "loss": 3.1786,
247
  "step": 3800
248
  },
249
  {
250
  "epoch": 2.73,
251
  "learning_rate": 4.2177022798390704e-05,
252
- "loss": 3.0657,
253
  "step": 3900
254
  },
255
  {
256
  "epoch": 2.8,
257
  "learning_rate": 4.161823871256147e-05,
258
- "loss": 3.032,
259
  "step": 4000
260
  },
261
  {
262
  "epoch": 2.8,
263
- "eval_bleu": 27.701,
264
  "eval_em": 0.0,
265
- "eval_gen_len": 33.9568,
266
- "eval_loss": 2.904534339904785,
267
- "eval_runtime": 297.5953,
268
- "eval_samples_per_second": 4.049,
269
- "eval_steps_per_second": 0.507,
270
  "step": 4000
271
  },
272
  {
273
  "epoch": 2.87,
274
  "learning_rate": 4.105945462673223e-05,
275
- "loss": 2.8755,
276
  "step": 4100
277
  },
278
  {
279
  "epoch": 2.94,
280
  "learning_rate": 4.0500670540903e-05,
281
- "loss": 2.8396,
282
  "step": 4200
283
  },
284
  {
285
  "epoch": 3.0,
286
  "learning_rate": 3.994188645507376e-05,
287
- "loss": 2.7454,
288
  "step": 4300
289
  },
290
  {
291
  "epoch": 3.07,
292
  "learning_rate": 3.938310236924452e-05,
293
- "loss": 2.5218,
294
  "step": 4400
295
  },
296
  {
297
  "epoch": 3.14,
298
  "learning_rate": 3.882431828341529e-05,
299
- "loss": 2.4895,
300
  "step": 4500
301
  },
302
  {
303
  "epoch": 3.21,
304
  "learning_rate": 3.8265534197586055e-05,
305
- "loss": 2.4554,
306
  "step": 4600
307
  },
308
  {
309
  "epoch": 3.28,
310
  "learning_rate": 3.7706750111756815e-05,
311
- "loss": 2.3573,
312
  "step": 4700
313
  },
314
  {
315
  "epoch": 3.35,
316
  "learning_rate": 3.714796602592758e-05,
317
- "loss": 2.2979,
318
  "step": 4800
319
  },
320
  {
321
  "epoch": 3.42,
322
  "learning_rate": 3.658918194009835e-05,
323
- "loss": 2.1874,
324
  "step": 4900
325
  },
326
  {
327
  "epoch": 3.49,
328
  "learning_rate": 3.603039785426911e-05,
329
- "loss": 2.1803,
330
  "step": 5000
331
  },
332
  {
333
  "epoch": 3.56,
334
  "learning_rate": 3.5471613768439874e-05,
335
- "loss": 2.1553,
336
  "step": 5100
337
  },
338
  {
339
  "epoch": 3.63,
340
  "learning_rate": 3.491282968261064e-05,
341
- "loss": 2.0567,
342
  "step": 5200
343
  },
344
  {
345
  "epoch": 3.7,
346
  "learning_rate": 3.4354045596781406e-05,
347
- "loss": 2.0147,
348
  "step": 5300
349
  },
350
  {
351
  "epoch": 3.77,
352
  "learning_rate": 3.379526151095217e-05,
353
- "loss": 1.9817,
354
  "step": 5400
355
  },
356
  {
357
  "epoch": 3.84,
358
  "learning_rate": 3.323647742512294e-05,
359
- "loss": 1.8843,
360
  "step": 5500
361
  },
362
  {
363
  "epoch": 3.91,
364
  "learning_rate": 3.26776933392937e-05,
365
- "loss": 1.8849,
366
  "step": 5600
367
  },
368
  {
369
  "epoch": 3.98,
370
  "learning_rate": 3.2118909253464465e-05,
371
- "loss": 1.8232,
372
  "step": 5700
373
  },
374
  {
375
  "epoch": 4.05,
376
  "learning_rate": 3.156012516763523e-05,
377
- "loss": 1.6714,
378
  "step": 5800
379
  },
380
  {
381
  "epoch": 4.12,
382
  "learning_rate": 3.100134108180599e-05,
383
- "loss": 1.6047,
384
  "step": 5900
385
  },
386
  {
387
  "epoch": 4.19,
388
  "learning_rate": 3.0442556995976757e-05,
389
- "loss": 1.5893,
390
  "step": 6000
391
  },
392
  {
393
  "epoch": 4.19,
394
- "eval_bleu": 57.7509,
395
- "eval_em": 0.0033,
396
- "eval_gen_len": 44.132,
397
- "eval_loss": 1.7326730489730835,
398
- "eval_runtime": 426.2734,
399
- "eval_samples_per_second": 2.827,
400
- "eval_steps_per_second": 0.354,
401
  "step": 6000
402
  },
403
  {
404
  "epoch": 4.26,
405
  "learning_rate": 2.9883772910147524e-05,
406
- "loss": 1.5864,
407
  "step": 6100
408
  },
409
  {
410
  "epoch": 4.33,
411
  "learning_rate": 2.9324988824318283e-05,
412
- "loss": 1.5608,
413
  "step": 6200
414
  },
415
  {
416
  "epoch": 4.4,
417
  "learning_rate": 2.876620473848905e-05,
418
- "loss": 1.5144,
419
  "step": 6300
420
  },
421
  {
422
  "epoch": 4.47,
423
  "learning_rate": 2.8207420652659816e-05,
424
- "loss": 1.4582,
425
  "step": 6400
426
  },
427
  {
428
  "epoch": 4.54,
429
  "learning_rate": 2.7648636566830576e-05,
430
- "loss": 1.4793,
431
  "step": 6500
432
  },
433
  {
434
  "epoch": 4.61,
435
  "learning_rate": 2.7089852481001342e-05,
436
- "loss": 1.472,
437
  "step": 6600
438
  },
439
  {
440
  "epoch": 4.68,
441
  "learning_rate": 2.653106839517211e-05,
442
- "loss": 1.4424,
443
  "step": 6700
444
  },
445
  {
446
  "epoch": 4.75,
447
  "learning_rate": 2.597228430934287e-05,
448
- "loss": 1.3779,
449
  "step": 6800
450
  },
451
  {
452
  "epoch": 4.82,
453
  "learning_rate": 2.5413500223513638e-05,
454
- "loss": 1.3611,
455
  "step": 6900
456
  },
457
  {
458
  "epoch": 4.89,
459
  "learning_rate": 2.48547161376844e-05,
460
- "loss": 1.3311,
461
  "step": 7000
462
  },
463
  {
464
  "epoch": 4.96,
465
  "learning_rate": 2.4295932051855164e-05,
466
- "loss": 1.3164,
467
  "step": 7100
468
  },
469
  {
470
  "epoch": 5.03,
471
  "learning_rate": 2.373714796602593e-05,
472
- "loss": 1.2119,
473
  "step": 7200
474
  },
475
  {
476
  "epoch": 5.1,
477
  "learning_rate": 2.3178363880196693e-05,
478
- "loss": 1.1122,
479
  "step": 7300
480
  },
481
  {
482
  "epoch": 5.17,
483
  "learning_rate": 2.2619579794367456e-05,
484
- "loss": 1.1198,
485
  "step": 7400
486
  },
487
  {
488
  "epoch": 5.24,
489
  "learning_rate": 2.206079570853822e-05,
490
- "loss": 1.0416,
491
  "step": 7500
492
  },
493
  {
494
  "epoch": 5.31,
495
  "learning_rate": 2.1502011622708985e-05,
496
- "loss": 1.1042,
497
  "step": 7600
498
  },
499
  {
500
  "epoch": 5.38,
501
  "learning_rate": 2.0943227536879752e-05,
502
- "loss": 1.0715,
503
  "step": 7700
504
  },
505
  {
506
  "epoch": 5.45,
507
  "learning_rate": 2.0384443451050515e-05,
508
- "loss": 1.0815,
509
  "step": 7800
510
  },
511
  {
512
  "epoch": 5.52,
513
  "learning_rate": 1.982565936522128e-05,
514
- "loss": 1.0445,
515
  "step": 7900
516
  },
517
  {
518
  "epoch": 5.59,
519
  "learning_rate": 1.9266875279392044e-05,
520
- "loss": 1.0512,
521
  "step": 8000
522
  },
523
  {
524
  "epoch": 5.59,
525
- "eval_bleu": 71.1272,
526
- "eval_em": 0.0241,
527
- "eval_gen_len": 46.0672,
528
- "eval_loss": 1.2382431030273438,
529
- "eval_runtime": 358.1089,
530
- "eval_samples_per_second": 3.365,
531
- "eval_steps_per_second": 0.422,
532
  "step": 8000
533
  },
534
  {
535
  "epoch": 5.66,
536
  "learning_rate": 1.8708091193562807e-05,
537
- "loss": 1.0201,
538
  "step": 8100
539
  },
540
  {
541
  "epoch": 5.73,
542
  "learning_rate": 1.8149307107733573e-05,
543
- "loss": 1.0377,
544
  "step": 8200
545
  },
546
  {
547
  "epoch": 5.8,
548
  "learning_rate": 1.7590523021904336e-05,
549
- "loss": 0.986,
550
  "step": 8300
551
  },
552
  {
553
  "epoch": 5.87,
554
  "learning_rate": 1.70317389360751e-05,
555
- "loss": 1.0244,
556
  "step": 8400
557
  },
558
  {
559
  "epoch": 5.94,
560
  "learning_rate": 1.6472954850245866e-05,
561
- "loss": 0.9654,
562
  "step": 8500
563
  },
564
  {
565
  "epoch": 6.01,
566
  "learning_rate": 1.5914170764416632e-05,
567
- "loss": 0.9454,
568
  "step": 8600
569
  },
570
  {
571
  "epoch": 6.08,
572
  "learning_rate": 1.5355386678587395e-05,
573
- "loss": 0.8179,
574
  "step": 8700
575
  },
576
  {
577
  "epoch": 6.15,
578
  "learning_rate": 1.479660259275816e-05,
579
- "loss": 0.8433,
580
  "step": 8800
581
  },
582
  {
583
  "epoch": 6.22,
584
  "learning_rate": 1.4237818506928924e-05,
585
- "loss": 0.8235,
586
  "step": 8900
587
  },
588
  {
589
  "epoch": 6.29,
590
  "learning_rate": 1.3679034421099687e-05,
591
- "loss": 0.832,
592
  "step": 9000
593
  },
594
  {
595
  "epoch": 6.36,
596
  "learning_rate": 1.312025033527045e-05,
597
- "loss": 0.8019,
598
  "step": 9100
599
  },
600
  {
601
  "epoch": 6.43,
602
  "learning_rate": 1.2561466249441217e-05,
603
- "loss": 0.806,
604
  "step": 9200
605
  },
606
  {
607
  "epoch": 6.5,
608
  "learning_rate": 1.2002682163611981e-05,
609
- "loss": 0.7985,
610
  "step": 9300
611
  },
612
  {
613
  "epoch": 6.57,
614
  "learning_rate": 1.1443898077782746e-05,
615
- "loss": 0.801,
616
  "step": 9400
617
  },
618
  {
619
  "epoch": 6.64,
620
  "learning_rate": 1.0885113991953509e-05,
621
- "loss": 0.7758,
622
  "step": 9500
623
  },
624
  {
625
  "epoch": 6.71,
626
  "learning_rate": 1.0326329906124274e-05,
627
- "loss": 0.804,
628
  "step": 9600
629
  },
630
  {
631
  "epoch": 6.78,
632
  "learning_rate": 9.767545820295038e-06,
633
- "loss": 0.7846,
634
  "step": 9700
635
  },
636
  {
637
  "epoch": 6.85,
638
  "learning_rate": 9.208761734465803e-06,
639
- "loss": 0.7736,
640
  "step": 9800
641
  },
642
  {
643
  "epoch": 6.92,
644
  "learning_rate": 8.649977648636568e-06,
645
- "loss": 0.7625,
646
  "step": 9900
647
  },
648
  {
649
  "epoch": 6.99,
650
  "learning_rate": 8.09119356280733e-06,
651
- "loss": 0.7515,
652
  "step": 10000
653
  },
654
  {
655
  "epoch": 6.99,
656
- "eval_bleu": 75.7356,
657
- "eval_em": 0.0432,
658
- "eval_gen_len": 47.2896,
659
- "eval_loss": 1.057088017463684,
660
- "eval_runtime": 378.6301,
661
- "eval_samples_per_second": 3.183,
662
- "eval_steps_per_second": 0.399,
663
  "step": 10000
664
  },
665
  {
666
  "epoch": 7.06,
667
  "learning_rate": 7.532409476978096e-06,
668
- "loss": 0.6703,
669
  "step": 10100
670
  },
671
  {
672
  "epoch": 7.13,
673
  "learning_rate": 6.973625391148861e-06,
674
- "loss": 0.6731,
675
  "step": 10200
676
  },
677
  {
678
  "epoch": 7.2,
679
  "learning_rate": 6.414841305319625e-06,
680
- "loss": 0.6584,
681
  "step": 10300
682
  },
683
  {
684
  "epoch": 7.27,
685
  "learning_rate": 5.856057219490389e-06,
686
- "loss": 0.6758,
687
  "step": 10400
688
  },
689
  {
690
  "epoch": 7.34,
691
  "learning_rate": 5.297273133661153e-06,
692
- "loss": 0.6801,
693
  "step": 10500
694
  },
695
  {
696
  "epoch": 7.41,
697
  "learning_rate": 4.738489047831918e-06,
698
- "loss": 0.6556,
699
  "step": 10600
700
  },
701
  {
702
  "epoch": 7.48,
703
  "learning_rate": 4.1797049620026825e-06,
704
- "loss": 0.6413,
705
  "step": 10700
706
  },
707
  {
708
  "epoch": 7.55,
709
  "learning_rate": 3.6209208761734468e-06,
710
- "loss": 0.6548,
711
  "step": 10800
712
  },
713
  {
714
  "epoch": 7.62,
715
  "learning_rate": 3.062136790344211e-06,
716
- "loss": 0.6631,
717
  "step": 10900
718
  },
719
  {
720
  "epoch": 7.69,
721
  "learning_rate": 2.5033527045149757e-06,
722
- "loss": 0.6458,
723
  "step": 11000
724
  },
725
  {
726
  "epoch": 7.76,
727
  "learning_rate": 1.94456861868574e-06,
728
- "loss": 0.629,
729
  "step": 11100
730
  },
731
  {
732
  "epoch": 7.83,
733
  "learning_rate": 1.3857845328565042e-06,
734
- "loss": 0.643,
735
  "step": 11200
736
  },
737
  {
738
  "epoch": 7.9,
739
  "learning_rate": 8.270004470272687e-07,
740
- "loss": 0.6475,
741
  "step": 11300
742
  },
743
  {
744
  "epoch": 7.97,
745
  "learning_rate": 2.682163611980331e-07,
746
- "loss": 0.633,
747
  "step": 11400
748
  },
749
  {
750
  "epoch": 8.0,
751
  "step": 11448,
752
- "total_flos": 7528840539566280.0,
753
- "train_loss": 0.15130291744847968,
754
- "train_runtime": 1637.6947,
755
- "train_samples_per_second": 111.816,
756
- "train_steps_per_second": 6.99
757
  }
758
  ],
759
  "max_steps": 11448,
760
  "num_train_epochs": 8,
761
- "total_flos": 7528840539566280.0,
762
  "trial_name": null,
763
  "trial_params": null
764
  }
 
10
  {
11
  "epoch": 0.07,
12
  "learning_rate": 2.0000000000000003e-06,
13
+ "loss": 87.2627,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.14,
18
  "learning_rate": 4.000000000000001e-06,
19
+ "loss": 52.4123,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.21,
24
  "learning_rate": 6e-06,
25
+ "loss": 41.3932,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.28,
30
  "learning_rate": 8.000000000000001e-06,
31
+ "loss": 36.7185,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.35,
36
  "learning_rate": 1e-05,
37
+ "loss": 33.4597,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.42,
42
  "learning_rate": 1.2e-05,
43
+ "loss": 31.1323,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.49,
48
  "learning_rate": 1.4000000000000001e-05,
49
+ "loss": 28.9204,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.56,
54
  "learning_rate": 1.6000000000000003e-05,
55
+ "loss": 27.0128,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.63,
60
  "learning_rate": 1.8e-05,
61
+ "loss": 25.1703,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.7,
66
  "learning_rate": 2e-05,
67
+ "loss": 23.5486,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.77,
72
  "learning_rate": 2.2000000000000003e-05,
73
+ "loss": 21.8419,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.84,
78
  "learning_rate": 2.4e-05,
79
+ "loss": 20.3387,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.91,
84
  "learning_rate": 2.6000000000000002e-05,
85
+ "loss": 18.7216,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.98,
90
  "learning_rate": 2.8000000000000003e-05,
91
+ "loss": 16.7862,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 1.05,
96
  "learning_rate": 3e-05,
97
+ "loss": 15.034,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 1.12,
102
  "learning_rate": 3.2000000000000005e-05,
103
+ "loss": 13.3747,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 1.19,
108
  "learning_rate": 3.4000000000000007e-05,
109
+ "loss": 11.7581,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 1.26,
114
  "learning_rate": 3.6e-05,
115
+ "loss": 9.9702,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 1.33,
120
  "learning_rate": 3.8e-05,
121
+ "loss": 8.3363,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 1.4,
126
  "learning_rate": 4e-05,
127
+ "loss": 6.9199,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 1.4,
132
+ "eval_bleu": 3.4801,
133
  "eval_em": 0.0,
134
+ "eval_gen_len": 220.966,
135
+ "eval_loss": 6.3751444816589355,
136
+ "eval_runtime": 1604.7901,
137
+ "eval_samples_per_second": 0.751,
138
+ "eval_steps_per_second": 0.094,
139
  "step": 2000
140
  },
141
  {
142
  "epoch": 1.47,
143
  "learning_rate": 4.2e-05,
144
+ "loss": 6.2524,
145
  "step": 2100
146
  },
147
  {
148
  "epoch": 1.54,
149
  "learning_rate": 4.4000000000000006e-05,
150
+ "loss": 5.8029,
151
  "step": 2200
152
  },
153
  {
154
  "epoch": 1.61,
155
  "learning_rate": 4.600000000000001e-05,
156
+ "loss": 5.606,
157
  "step": 2300
158
  },
159
  {
160
  "epoch": 1.68,
161
  "learning_rate": 4.8e-05,
162
+ "loss": 5.3757,
163
  "step": 2400
164
  },
165
  {
166
  "epoch": 1.75,
167
  "learning_rate": 5e-05,
168
+ "loss": 5.1043,
169
  "step": 2500
170
  },
171
  {
172
  "epoch": 1.82,
173
  "learning_rate": 4.944121591417077e-05,
174
+ "loss": 4.9414,
175
  "step": 2600
176
  },
177
  {
178
  "epoch": 1.89,
179
  "learning_rate": 4.888243182834153e-05,
180
+ "loss": 4.7381,
181
  "step": 2700
182
  },
183
  {
184
  "epoch": 1.96,
185
  "learning_rate": 4.8323647742512295e-05,
186
+ "loss": 4.6214,
187
  "step": 2800
188
  },
189
  {
190
  "epoch": 2.03,
191
  "learning_rate": 4.776486365668306e-05,
192
+ "loss": 4.2971,
193
  "step": 2900
194
  },
195
  {
196
  "epoch": 2.1,
197
  "learning_rate": 4.720607957085382e-05,
198
+ "loss": 4.1602,
199
  "step": 3000
200
  },
201
  {
202
  "epoch": 2.17,
203
  "learning_rate": 4.664729548502459e-05,
204
+ "loss": 4.0391,
205
  "step": 3100
206
  },
207
  {
208
  "epoch": 2.24,
209
  "learning_rate": 4.6088511399195353e-05,
210
+ "loss": 3.9211,
211
  "step": 3200
212
  },
213
  {
214
  "epoch": 2.31,
215
  "learning_rate": 4.552972731336611e-05,
216
+ "loss": 3.7642,
217
  "step": 3300
218
  },
219
  {
220
  "epoch": 2.38,
221
  "learning_rate": 4.497094322753688e-05,
222
+ "loss": 3.6698,
223
  "step": 3400
224
  },
225
  {
226
  "epoch": 2.45,
227
  "learning_rate": 4.4412159141707646e-05,
228
+ "loss": 3.5409,
229
  "step": 3500
230
  },
231
  {
232
  "epoch": 2.52,
233
  "learning_rate": 4.385337505587841e-05,
234
+ "loss": 3.4016,
235
  "step": 3600
236
  },
237
  {
238
  "epoch": 2.59,
239
  "learning_rate": 4.329459097004918e-05,
240
+ "loss": 3.2761,
241
  "step": 3700
242
  },
243
  {
244
  "epoch": 2.66,
245
  "learning_rate": 4.2735806884219945e-05,
246
+ "loss": 3.1708,
247
  "step": 3800
248
  },
249
  {
250
  "epoch": 2.73,
251
  "learning_rate": 4.2177022798390704e-05,
252
+ "loss": 3.0849,
253
  "step": 3900
254
  },
255
  {
256
  "epoch": 2.8,
257
  "learning_rate": 4.161823871256147e-05,
258
+ "loss": 3.0222,
259
  "step": 4000
260
  },
261
  {
262
  "epoch": 2.8,
263
+ "eval_bleu": 27.8543,
264
  "eval_em": 0.0,
265
+ "eval_gen_len": 36.8,
266
+ "eval_loss": 2.8796441555023193,
267
+ "eval_runtime": 501.7382,
268
+ "eval_samples_per_second": 2.402,
269
+ "eval_steps_per_second": 0.301,
270
  "step": 4000
271
  },
272
  {
273
  "epoch": 2.87,
274
  "learning_rate": 4.105945462673223e-05,
275
+ "loss": 2.9079,
276
  "step": 4100
277
  },
278
  {
279
  "epoch": 2.94,
280
  "learning_rate": 4.0500670540903e-05,
281
+ "loss": 2.8688,
282
  "step": 4200
283
  },
284
  {
285
  "epoch": 3.0,
286
  "learning_rate": 3.994188645507376e-05,
287
+ "loss": 2.7398,
288
  "step": 4300
289
  },
290
  {
291
  "epoch": 3.07,
292
  "learning_rate": 3.938310236924452e-05,
293
+ "loss": 2.5503,
294
  "step": 4400
295
  },
296
  {
297
  "epoch": 3.14,
298
  "learning_rate": 3.882431828341529e-05,
299
+ "loss": 2.5121,
300
  "step": 4500
301
  },
302
  {
303
  "epoch": 3.21,
304
  "learning_rate": 3.8265534197586055e-05,
305
+ "loss": 2.4625,
306
  "step": 4600
307
  },
308
  {
309
  "epoch": 3.28,
310
  "learning_rate": 3.7706750111756815e-05,
311
+ "loss": 2.3833,
312
  "step": 4700
313
  },
314
  {
315
  "epoch": 3.35,
316
  "learning_rate": 3.714796602592758e-05,
317
+ "loss": 2.3133,
318
  "step": 4800
319
  },
320
  {
321
  "epoch": 3.42,
322
  "learning_rate": 3.658918194009835e-05,
323
+ "loss": 2.2152,
324
  "step": 4900
325
  },
326
  {
327
  "epoch": 3.49,
328
  "learning_rate": 3.603039785426911e-05,
329
+ "loss": 2.2304,
330
  "step": 5000
331
  },
332
  {
333
  "epoch": 3.56,
334
  "learning_rate": 3.5471613768439874e-05,
335
+ "loss": 2.1563,
336
  "step": 5100
337
  },
338
  {
339
  "epoch": 3.63,
340
  "learning_rate": 3.491282968261064e-05,
341
+ "loss": 2.0991,
342
  "step": 5200
343
  },
344
  {
345
  "epoch": 3.7,
346
  "learning_rate": 3.4354045596781406e-05,
347
+ "loss": 2.0702,
348
  "step": 5300
349
  },
350
  {
351
  "epoch": 3.77,
352
  "learning_rate": 3.379526151095217e-05,
353
+ "loss": 1.9914,
354
  "step": 5400
355
  },
356
  {
357
  "epoch": 3.84,
358
  "learning_rate": 3.323647742512294e-05,
359
+ "loss": 1.9235,
360
  "step": 5500
361
  },
362
  {
363
  "epoch": 3.91,
364
  "learning_rate": 3.26776933392937e-05,
365
+ "loss": 1.8922,
366
  "step": 5600
367
  },
368
  {
369
  "epoch": 3.98,
370
  "learning_rate": 3.2118909253464465e-05,
371
+ "loss": 1.8512,
372
  "step": 5700
373
  },
374
  {
375
  "epoch": 4.05,
376
  "learning_rate": 3.156012516763523e-05,
377
+ "loss": 1.6966,
378
  "step": 5800
379
  },
380
  {
381
  "epoch": 4.12,
382
  "learning_rate": 3.100134108180599e-05,
383
+ "loss": 1.6399,
384
  "step": 5900
385
  },
386
  {
387
  "epoch": 4.19,
388
  "learning_rate": 3.0442556995976757e-05,
389
+ "loss": 1.5982,
390
  "step": 6000
391
  },
392
  {
393
  "epoch": 4.19,
394
+ "eval_bleu": 56.0747,
395
+ "eval_em": 0.0017,
396
+ "eval_gen_len": 43.9021,
397
+ "eval_loss": 1.7495189905166626,
398
+ "eval_runtime": 423.2589,
399
+ "eval_samples_per_second": 2.847,
400
+ "eval_steps_per_second": 0.357,
401
  "step": 6000
402
  },
403
  {
404
  "epoch": 4.26,
405
  "learning_rate": 2.9883772910147524e-05,
406
+ "loss": 1.5968,
407
  "step": 6100
408
  },
409
  {
410
  "epoch": 4.33,
411
  "learning_rate": 2.9324988824318283e-05,
412
+ "loss": 1.5813,
413
  "step": 6200
414
  },
415
  {
416
  "epoch": 4.4,
417
  "learning_rate": 2.876620473848905e-05,
418
+ "loss": 1.5421,
419
  "step": 6300
420
  },
421
  {
422
  "epoch": 4.47,
423
  "learning_rate": 2.8207420652659816e-05,
424
+ "loss": 1.4852,
425
  "step": 6400
426
  },
427
  {
428
  "epoch": 4.54,
429
  "learning_rate": 2.7648636566830576e-05,
430
+ "loss": 1.5027,
431
  "step": 6500
432
  },
433
  {
434
  "epoch": 4.61,
435
  "learning_rate": 2.7089852481001342e-05,
436
+ "loss": 1.4951,
437
  "step": 6600
438
  },
439
  {
440
  "epoch": 4.68,
441
  "learning_rate": 2.653106839517211e-05,
442
+ "loss": 1.4803,
443
  "step": 6700
444
  },
445
  {
446
  "epoch": 4.75,
447
  "learning_rate": 2.597228430934287e-05,
448
+ "loss": 1.4127,
449
  "step": 6800
450
  },
451
  {
452
  "epoch": 4.82,
453
  "learning_rate": 2.5413500223513638e-05,
454
+ "loss": 1.3896,
455
  "step": 6900
456
  },
457
  {
458
  "epoch": 4.89,
459
  "learning_rate": 2.48547161376844e-05,
460
+ "loss": 1.3656,
461
  "step": 7000
462
  },
463
  {
464
  "epoch": 4.96,
465
  "learning_rate": 2.4295932051855164e-05,
466
+ "loss": 1.3432,
467
  "step": 7100
468
  },
469
  {
470
  "epoch": 5.03,
471
  "learning_rate": 2.373714796602593e-05,
472
+ "loss": 1.2224,
473
  "step": 7200
474
  },
475
  {
476
  "epoch": 5.1,
477
  "learning_rate": 2.3178363880196693e-05,
478
+ "loss": 1.1396,
479
  "step": 7300
480
  },
481
  {
482
  "epoch": 5.17,
483
  "learning_rate": 2.2619579794367456e-05,
484
+ "loss": 1.1475,
485
  "step": 7400
486
  },
487
  {
488
  "epoch": 5.24,
489
  "learning_rate": 2.206079570853822e-05,
490
+ "loss": 1.0669,
491
  "step": 7500
492
  },
493
  {
494
  "epoch": 5.31,
495
  "learning_rate": 2.1502011622708985e-05,
496
+ "loss": 1.1356,
497
  "step": 7600
498
  },
499
  {
500
  "epoch": 5.38,
501
  "learning_rate": 2.0943227536879752e-05,
502
+ "loss": 1.0965,
503
  "step": 7700
504
  },
505
  {
506
  "epoch": 5.45,
507
  "learning_rate": 2.0384443451050515e-05,
508
+ "loss": 1.1086,
509
  "step": 7800
510
  },
511
  {
512
  "epoch": 5.52,
513
  "learning_rate": 1.982565936522128e-05,
514
+ "loss": 1.0642,
515
  "step": 7900
516
  },
517
  {
518
  "epoch": 5.59,
519
  "learning_rate": 1.9266875279392044e-05,
520
+ "loss": 1.0717,
521
  "step": 8000
522
  },
523
  {
524
  "epoch": 5.59,
525
+ "eval_bleu": 69.9606,
526
+ "eval_em": 0.0199,
527
+ "eval_gen_len": 46.0722,
528
+ "eval_loss": 1.26251220703125,
529
+ "eval_runtime": 355.6397,
530
+ "eval_samples_per_second": 3.388,
531
+ "eval_steps_per_second": 0.425,
532
  "step": 8000
533
  },
534
  {
535
  "epoch": 5.66,
536
  "learning_rate": 1.8708091193562807e-05,
537
+ "loss": 1.0546,
538
  "step": 8100
539
  },
540
  {
541
  "epoch": 5.73,
542
  "learning_rate": 1.8149307107733573e-05,
543
+ "loss": 1.0579,
544
  "step": 8200
545
  },
546
  {
547
  "epoch": 5.8,
548
  "learning_rate": 1.7590523021904336e-05,
549
+ "loss": 1.0204,
550
  "step": 8300
551
  },
552
  {
553
  "epoch": 5.87,
554
  "learning_rate": 1.70317389360751e-05,
555
+ "loss": 1.0398,
556
  "step": 8400
557
  },
558
  {
559
  "epoch": 5.94,
560
  "learning_rate": 1.6472954850245866e-05,
561
+ "loss": 0.9992,
562
  "step": 8500
563
  },
564
  {
565
  "epoch": 6.01,
566
  "learning_rate": 1.5914170764416632e-05,
567
+ "loss": 0.9756,
568
  "step": 8600
569
  },
570
  {
571
  "epoch": 6.08,
572
  "learning_rate": 1.5355386678587395e-05,
573
+ "loss": 0.8385,
574
  "step": 8700
575
  },
576
  {
577
  "epoch": 6.15,
578
  "learning_rate": 1.479660259275816e-05,
579
+ "loss": 0.8815,
580
  "step": 8800
581
  },
582
  {
583
  "epoch": 6.22,
584
  "learning_rate": 1.4237818506928924e-05,
585
+ "loss": 0.8447,
586
  "step": 8900
587
  },
588
  {
589
  "epoch": 6.29,
590
  "learning_rate": 1.3679034421099687e-05,
591
+ "loss": 0.8553,
592
  "step": 9000
593
  },
594
  {
595
  "epoch": 6.36,
596
  "learning_rate": 1.312025033527045e-05,
597
+ "loss": 0.8188,
598
  "step": 9100
599
  },
600
  {
601
  "epoch": 6.43,
602
  "learning_rate": 1.2561466249441217e-05,
603
+ "loss": 0.8241,
604
  "step": 9200
605
  },
606
  {
607
  "epoch": 6.5,
608
  "learning_rate": 1.2002682163611981e-05,
609
+ "loss": 0.8118,
610
  "step": 9300
611
  },
612
  {
613
  "epoch": 6.57,
614
  "learning_rate": 1.1443898077782746e-05,
615
+ "loss": 0.8357,
616
  "step": 9400
617
  },
618
  {
619
  "epoch": 6.64,
620
  "learning_rate": 1.0885113991953509e-05,
621
+ "loss": 0.8063,
622
  "step": 9500
623
  },
624
  {
625
  "epoch": 6.71,
626
  "learning_rate": 1.0326329906124274e-05,
627
+ "loss": 0.8263,
628
  "step": 9600
629
  },
630
  {
631
  "epoch": 6.78,
632
  "learning_rate": 9.767545820295038e-06,
633
+ "loss": 0.8064,
634
  "step": 9700
635
  },
636
  {
637
  "epoch": 6.85,
638
  "learning_rate": 9.208761734465803e-06,
639
+ "loss": 0.7858,
640
  "step": 9800
641
  },
642
  {
643
  "epoch": 6.92,
644
  "learning_rate": 8.649977648636568e-06,
645
+ "loss": 0.7854,
646
  "step": 9900
647
  },
648
  {
649
  "epoch": 6.99,
650
  "learning_rate": 8.09119356280733e-06,
651
+ "loss": 0.7765,
652
  "step": 10000
653
  },
654
  {
655
  "epoch": 6.99,
656
+ "eval_bleu": 74.7723,
657
+ "eval_em": 0.0349,
658
+ "eval_gen_len": 46.1685,
659
+ "eval_loss": 1.0809996128082275,
660
+ "eval_runtime": 352.8566,
661
+ "eval_samples_per_second": 3.415,
662
+ "eval_steps_per_second": 0.428,
663
  "step": 10000
664
  },
665
  {
666
  "epoch": 7.06,
667
  "learning_rate": 7.532409476978096e-06,
668
+ "loss": 0.696,
669
  "step": 10100
670
  },
671
  {
672
  "epoch": 7.13,
673
  "learning_rate": 6.973625391148861e-06,
674
+ "loss": 0.6991,
675
  "step": 10200
676
  },
677
  {
678
  "epoch": 7.2,
679
  "learning_rate": 6.414841305319625e-06,
680
+ "loss": 0.6795,
681
  "step": 10300
682
  },
683
  {
684
  "epoch": 7.27,
685
  "learning_rate": 5.856057219490389e-06,
686
+ "loss": 0.6953,
687
  "step": 10400
688
  },
689
  {
690
  "epoch": 7.34,
691
  "learning_rate": 5.297273133661153e-06,
692
+ "loss": 0.6854,
693
  "step": 10500
694
  },
695
  {
696
  "epoch": 7.41,
697
  "learning_rate": 4.738489047831918e-06,
698
+ "loss": 0.6798,
699
  "step": 10600
700
  },
701
  {
702
  "epoch": 7.48,
703
  "learning_rate": 4.1797049620026825e-06,
704
+ "loss": 0.6701,
705
  "step": 10700
706
  },
707
  {
708
  "epoch": 7.55,
709
  "learning_rate": 3.6209208761734468e-06,
710
+ "loss": 0.684,
711
  "step": 10800
712
  },
713
  {
714
  "epoch": 7.62,
715
  "learning_rate": 3.062136790344211e-06,
716
+ "loss": 0.686,
717
  "step": 10900
718
  },
719
  {
720
  "epoch": 7.69,
721
  "learning_rate": 2.5033527045149757e-06,
722
+ "loss": 0.6764,
723
  "step": 11000
724
  },
725
  {
726
  "epoch": 7.76,
727
  "learning_rate": 1.94456861868574e-06,
728
+ "loss": 0.6461,
729
  "step": 11100
730
  },
731
  {
732
  "epoch": 7.83,
733
  "learning_rate": 1.3857845328565042e-06,
734
+ "loss": 0.6723,
735
  "step": 11200
736
  },
737
  {
738
  "epoch": 7.9,
739
  "learning_rate": 8.270004470272687e-07,
740
+ "loss": 0.6453,
741
  "step": 11300
742
  },
743
  {
744
  "epoch": 7.97,
745
  "learning_rate": 2.682163611980331e-07,
746
+ "loss": 0.6544,
747
  "step": 11400
748
  },
749
  {
750
  "epoch": 8.0,
751
  "step": 11448,
752
+ "total_flos": 7528078235838336.0,
753
+ "train_loss": 6.2339570505647375,
754
+ "train_runtime": 11756.9791,
755
+ "train_samples_per_second": 15.575,
756
+ "train_steps_per_second": 0.974
757
  }
758
  ],
759
  "max_steps": 11448,
760
  "num_train_epochs": 8,
761
+ "total_flos": 7528078235838336.0,
762
  "trial_name": null,
763
  "trial_params": null
764
  }