jimypbr commited on
Commit
217da73
1 Parent(s): 052112e

End of training

Browse files
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.9829009433962265,
4
- "eval_loss": 0.08111572265625,
5
- "eval_runtime": 86.2363,
6
- "eval_samples_per_second": 78.83,
7
- "eval_steps_per_second": 0.626,
8
- "train_loss": 0.3196671349661691,
9
- "train_runtime": 293.2603,
10
- "train_samples_per_second": 871.137,
11
- "train_steps_per_second": 6.803
12
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.9821639150943396,
4
+ "eval_loss": 0.0848388671875,
5
+ "eval_runtime": 229.7479,
6
+ "eval_samples_per_second": 29.589,
7
+ "eval_steps_per_second": 0.235,
8
+ "train_loss": 0.32014525516290115,
9
+ "train_runtime": 392.8902,
10
+ "train_samples_per_second": 650.233,
11
+ "train_steps_per_second": 5.078
12
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.9829009433962265,
4
- "eval_loss": 0.08111572265625,
5
- "eval_runtime": 86.2363,
6
- "eval_samples_per_second": 78.83,
7
- "eval_steps_per_second": 0.626
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.9821639150943396,
4
+ "eval_loss": 0.0848388671875,
5
+ "eval_runtime": 229.7479,
6
+ "eval_samples_per_second": 29.589,
7
+ "eval_steps_per_second": 0.235
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:193c40d3b1a1016a9d31b695e5571158a9f8cd811351c7dc1113e059a126e26f
3
  size 189210261
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2821bc9de1117450dbce5c0cf271cbd824e1fe6097d3bb2161b0f4189ec04635
3
  size 189210261
runs/May23_23-28-14_gbnwp-pod015-2.ipu.graphcore.ai/events.out.tfevents.1653345392.gbnwp-pod015-2.ipu.graphcore.ai CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da258078781c7cbcaf73f21869d38ba05be597f13e5c997e9fb9607fd11d361c
3
- size 27702
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6df7c6b9168aa97507327127d587a5420b73d64ab5e6485d4638a498216df1
3
+ size 35906
runs/May23_23-28-14_gbnwp-pod015-2.ipu.graphcore.ai/events.out.tfevents.1653346022.gbnwp-pod015-2.ipu.graphcore.ai ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ffcaf02470b31d560fa1dfff56228916ee00c9f71e92db41d5e6279039b982e
3
+ size 40
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 5.0,
3
- "train_loss": 0.3196671349661691,
4
- "train_runtime": 293.2603,
5
- "train_samples_per_second": 871.137,
6
- "train_steps_per_second": 6.803
7
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "train_loss": 0.32014525516290115,
4
+ "train_runtime": 392.8902,
5
+ "train_samples_per_second": 650.233,
6
+ "train_steps_per_second": 5.078
7
  }
trainer_state.json CHANGED
@@ -10,55 +10,55 @@
10
  {
11
  "epoch": 0.03,
12
  "learning_rate": 1.5e-06,
13
- "loss": 2.4701,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.05,
18
  "learning_rate": 3e-06,
19
- "loss": 2.4543,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.08,
24
  "learning_rate": 4.5e-06,
25
- "loss": 2.3809,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.1,
30
  "learning_rate": 6e-06,
31
- "loss": 2.2195,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.13,
36
  "learning_rate": 7.5e-06,
37
- "loss": 2.0812,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.15,
42
  "learning_rate": 9e-06,
43
- "loss": 1.9122,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.18,
48
  "learning_rate": 1.05e-05,
49
- "loss": 1.8156,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.2,
54
  "learning_rate": 1.2e-05,
55
- "loss": 1.8081,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.23,
60
  "learning_rate": 1.3500000000000001e-05,
61
- "loss": 1.5834,
62
  "step": 90
63
  },
64
  {
@@ -70,19 +70,19 @@
70
  {
71
  "epoch": 0.28,
72
  "learning_rate": 1.65e-05,
73
- "loss": 1.5589,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.3,
78
  "learning_rate": 1.8e-05,
79
- "loss": 1.6076,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.33,
84
  "learning_rate": 1.95e-05,
85
- "loss": 1.3708,
86
  "step": 130
87
  },
88
  {
@@ -94,709 +94,709 @@
94
  {
95
  "epoch": 0.38,
96
  "learning_rate": 2.25e-05,
97
- "loss": 1.4704,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.4,
102
  "learning_rate": 2.4e-05,
103
- "loss": 1.5823,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.43,
108
  "learning_rate": 2.55e-05,
109
- "loss": 1.4415,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.45,
114
  "learning_rate": 2.7000000000000002e-05,
115
- "loss": 1.3505,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.48,
120
  "learning_rate": 2.8499999999999998e-05,
121
- "loss": 1.492,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.5,
126
  "learning_rate": 3e-05,
127
- "loss": 1.1947,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.53,
132
  "learning_rate": 2.9832869080779945e-05,
133
- "loss": 1.344,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.55,
138
  "learning_rate": 2.9665738161559886e-05,
139
- "loss": 1.2266,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.58,
144
  "learning_rate": 2.9498607242339834e-05,
145
- "loss": 1.2471,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.6,
150
  "learning_rate": 2.933147632311978e-05,
151
- "loss": 1.0699,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.63,
156
  "learning_rate": 2.916434540389972e-05,
157
- "loss": 1.0953,
158
  "step": 250
159
  },
160
  {
161
  "epoch": 0.65,
162
  "learning_rate": 2.8997214484679665e-05,
163
- "loss": 1.1088,
164
  "step": 260
165
  },
166
  {
167
  "epoch": 0.68,
168
  "learning_rate": 2.8830083565459613e-05,
169
- "loss": 0.9984,
170
  "step": 270
171
  },
172
  {
173
  "epoch": 0.7,
174
  "learning_rate": 2.8662952646239554e-05,
175
- "loss": 0.7705,
176
  "step": 280
177
  },
178
  {
179
  "epoch": 0.73,
180
  "learning_rate": 2.84958217270195e-05,
181
- "loss": 0.8448,
182
  "step": 290
183
  },
184
  {
185
  "epoch": 0.75,
186
  "learning_rate": 2.8328690807799443e-05,
187
- "loss": 0.9973,
188
  "step": 300
189
  },
190
  {
191
  "epoch": 0.78,
192
  "learning_rate": 2.8161559888579388e-05,
193
- "loss": 0.7903,
194
  "step": 310
195
  },
196
  {
197
  "epoch": 0.8,
198
  "learning_rate": 2.7994428969359332e-05,
199
- "loss": 0.7426,
200
  "step": 320
201
  },
202
  {
203
  "epoch": 0.83,
204
  "learning_rate": 2.7827298050139277e-05,
205
- "loss": 0.6204,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 0.85,
210
  "learning_rate": 2.7660167130919218e-05,
211
- "loss": 0.5555,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 0.88,
216
  "learning_rate": 2.7493036211699166e-05,
217
- "loss": 0.5791,
218
  "step": 350
219
  },
220
  {
221
  "epoch": 0.9,
222
  "learning_rate": 2.732590529247911e-05,
223
- "loss": 0.5195,
224
  "step": 360
225
  },
226
  {
227
  "epoch": 0.93,
228
  "learning_rate": 2.7158774373259055e-05,
229
- "loss": 0.4228,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 0.95,
234
  "learning_rate": 2.6991643454038996e-05,
235
- "loss": 0.3886,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 0.98,
240
  "learning_rate": 2.6824512534818944e-05,
241
- "loss": 0.3633,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 1.0,
246
  "learning_rate": 2.665738161559889e-05,
247
- "loss": 0.3384,
248
  "step": 400
249
  },
250
  {
251
  "epoch": 1.03,
252
  "learning_rate": 2.649025069637883e-05,
253
- "loss": 0.2494,
254
  "step": 410
255
  },
256
  {
257
  "epoch": 1.05,
258
  "learning_rate": 2.6323119777158774e-05,
259
- "loss": 0.2741,
260
  "step": 420
261
  },
262
  {
263
  "epoch": 1.08,
264
  "learning_rate": 2.6155988857938722e-05,
265
- "loss": 0.2752,
266
  "step": 430
267
  },
268
  {
269
  "epoch": 1.1,
270
  "learning_rate": 2.5988857938718663e-05,
271
- "loss": 0.2718,
272
  "step": 440
273
  },
274
  {
275
  "epoch": 1.13,
276
  "learning_rate": 2.5821727019498608e-05,
277
- "loss": 0.2862,
278
  "step": 450
279
  },
280
  {
281
  "epoch": 1.15,
282
  "learning_rate": 2.5654596100278553e-05,
283
- "loss": 0.2188,
284
  "step": 460
285
  },
286
  {
287
  "epoch": 1.18,
288
  "learning_rate": 2.5487465181058497e-05,
289
- "loss": 0.1573,
290
  "step": 470
291
  },
292
  {
293
  "epoch": 1.2,
294
  "learning_rate": 2.532033426183844e-05,
295
- "loss": 0.1884,
296
  "step": 480
297
  },
298
  {
299
  "epoch": 1.23,
300
  "learning_rate": 2.5153203342618386e-05,
301
- "loss": 0.2707,
302
  "step": 490
303
  },
304
  {
305
  "epoch": 1.25,
306
  "learning_rate": 2.4986072423398327e-05,
307
- "loss": 0.1957,
308
  "step": 500
309
  },
310
  {
311
  "epoch": 1.28,
312
  "learning_rate": 2.4818941504178275e-05,
313
- "loss": 0.157,
314
  "step": 510
315
  },
316
  {
317
  "epoch": 1.3,
318
  "learning_rate": 2.465181058495822e-05,
319
- "loss": 0.2192,
320
  "step": 520
321
  },
322
  {
323
  "epoch": 1.33,
324
  "learning_rate": 2.448467966573816e-05,
325
- "loss": 0.1871,
326
  "step": 530
327
  },
328
  {
329
  "epoch": 1.35,
330
  "learning_rate": 2.4317548746518106e-05,
331
- "loss": 0.0955,
332
  "step": 540
333
  },
334
  {
335
  "epoch": 1.38,
336
  "learning_rate": 2.415041782729805e-05,
337
- "loss": 0.1621,
338
  "step": 550
339
  },
340
  {
341
  "epoch": 1.4,
342
  "learning_rate": 2.3983286908077995e-05,
343
- "loss": 0.1034,
344
  "step": 560
345
  },
346
  {
347
  "epoch": 1.43,
348
  "learning_rate": 2.381615598885794e-05,
349
- "loss": 0.2276,
350
  "step": 570
351
  },
352
  {
353
  "epoch": 1.45,
354
  "learning_rate": 2.3649025069637884e-05,
355
- "loss": 0.0837,
356
  "step": 580
357
  },
358
  {
359
  "epoch": 1.48,
360
  "learning_rate": 2.3481894150417825e-05,
361
- "loss": 0.0966,
362
  "step": 590
363
  },
364
  {
365
  "epoch": 1.5,
366
  "learning_rate": 2.3314763231197773e-05,
367
- "loss": 0.1502,
368
  "step": 600
369
  },
370
  {
371
  "epoch": 1.53,
372
  "learning_rate": 2.3147632311977718e-05,
373
- "loss": 0.3077,
374
  "step": 610
375
  },
376
  {
377
  "epoch": 1.55,
378
  "learning_rate": 2.298050139275766e-05,
379
- "loss": 0.1133,
380
  "step": 620
381
  },
382
  {
383
  "epoch": 1.58,
384
  "learning_rate": 2.2813370473537603e-05,
385
- "loss": 0.1835,
386
  "step": 630
387
  },
388
  {
389
  "epoch": 1.6,
390
  "learning_rate": 2.264623955431755e-05,
391
- "loss": 0.0869,
392
  "step": 640
393
  },
394
  {
395
  "epoch": 1.63,
396
  "learning_rate": 2.2479108635097492e-05,
397
- "loss": 0.1023,
398
  "step": 650
399
  },
400
  {
401
  "epoch": 1.65,
402
  "learning_rate": 2.2311977715877437e-05,
403
- "loss": 0.25,
404
  "step": 660
405
  },
406
  {
407
  "epoch": 1.68,
408
  "learning_rate": 2.214484679665738e-05,
409
- "loss": 0.0932,
410
  "step": 670
411
  },
412
  {
413
  "epoch": 1.7,
414
  "learning_rate": 2.1977715877437326e-05,
415
- "loss": 0.0431,
416
  "step": 680
417
  },
418
  {
419
  "epoch": 1.73,
420
  "learning_rate": 2.181058495821727e-05,
421
- "loss": 0.1125,
422
  "step": 690
423
  },
424
  {
425
  "epoch": 1.75,
426
  "learning_rate": 2.1643454038997215e-05,
427
- "loss": 0.0932,
428
  "step": 700
429
  },
430
  {
431
  "epoch": 1.78,
432
  "learning_rate": 2.147632311977716e-05,
433
- "loss": 0.1224,
434
  "step": 710
435
  },
436
  {
437
  "epoch": 1.8,
438
  "learning_rate": 2.1309192200557104e-05,
439
- "loss": 0.1396,
440
  "step": 720
441
  },
442
  {
443
  "epoch": 1.83,
444
  "learning_rate": 2.114206128133705e-05,
445
- "loss": 0.0411,
446
  "step": 730
447
  },
448
  {
449
  "epoch": 1.85,
450
  "learning_rate": 2.0974930362116993e-05,
451
- "loss": 0.0565,
452
  "step": 740
453
  },
454
  {
455
  "epoch": 1.88,
456
  "learning_rate": 2.0807799442896935e-05,
457
- "loss": 0.0429,
458
  "step": 750
459
  },
460
  {
461
  "epoch": 1.9,
462
  "learning_rate": 2.0640668523676883e-05,
463
- "loss": 0.1253,
464
  "step": 760
465
  },
466
  {
467
  "epoch": 1.93,
468
  "learning_rate": 2.0473537604456827e-05,
469
- "loss": 0.0591,
470
  "step": 770
471
  },
472
  {
473
  "epoch": 1.95,
474
  "learning_rate": 2.0306406685236768e-05,
475
- "loss": 0.0623,
476
  "step": 780
477
  },
478
  {
479
  "epoch": 1.98,
480
  "learning_rate": 2.0139275766016713e-05,
481
- "loss": 0.109,
482
  "step": 790
483
  },
484
  {
485
  "epoch": 2.01,
486
  "learning_rate": 1.997214484679666e-05,
487
- "loss": 0.0324,
488
  "step": 800
489
  },
490
  {
491
  "epoch": 2.03,
492
  "learning_rate": 1.9805013927576602e-05,
493
- "loss": 0.0791,
494
  "step": 810
495
  },
496
  {
497
  "epoch": 2.06,
498
  "learning_rate": 1.9637883008356546e-05,
499
- "loss": 0.0351,
500
  "step": 820
501
  },
502
  {
503
  "epoch": 2.08,
504
  "learning_rate": 1.947075208913649e-05,
505
- "loss": 0.1256,
506
  "step": 830
507
  },
508
  {
509
  "epoch": 2.11,
510
  "learning_rate": 1.9303621169916436e-05,
511
- "loss": 0.0627,
512
  "step": 840
513
  },
514
  {
515
  "epoch": 2.13,
516
  "learning_rate": 1.913649025069638e-05,
517
- "loss": 0.1514,
518
  "step": 850
519
  },
520
  {
521
  "epoch": 2.16,
522
  "learning_rate": 1.8969359331476325e-05,
523
- "loss": 0.056,
524
  "step": 860
525
  },
526
  {
527
  "epoch": 2.18,
528
  "learning_rate": 1.8802228412256266e-05,
529
- "loss": 0.112,
530
  "step": 870
531
  },
532
  {
533
  "epoch": 2.21,
534
  "learning_rate": 1.863509749303621e-05,
535
- "loss": 0.0336,
536
  "step": 880
537
  },
538
  {
539
  "epoch": 2.23,
540
  "learning_rate": 1.846796657381616e-05,
541
- "loss": 0.0529,
542
  "step": 890
543
  },
544
  {
545
  "epoch": 2.26,
546
  "learning_rate": 1.83008356545961e-05,
547
- "loss": 0.1006,
548
  "step": 900
549
  },
550
  {
551
  "epoch": 2.28,
552
  "learning_rate": 1.8133704735376044e-05,
553
- "loss": 0.0707,
554
  "step": 910
555
  },
556
  {
557
  "epoch": 2.31,
558
  "learning_rate": 1.796657381615599e-05,
559
- "loss": 0.0624,
560
  "step": 920
561
  },
562
  {
563
  "epoch": 2.33,
564
  "learning_rate": 1.7799442896935933e-05,
565
- "loss": 0.0785,
566
  "step": 930
567
  },
568
  {
569
  "epoch": 2.36,
570
  "learning_rate": 1.7632311977715878e-05,
571
- "loss": 0.0327,
572
  "step": 940
573
  },
574
  {
575
  "epoch": 2.38,
576
  "learning_rate": 1.7465181058495822e-05,
577
- "loss": 0.0635,
578
  "step": 950
579
  },
580
  {
581
  "epoch": 2.41,
582
  "learning_rate": 1.7298050139275764e-05,
583
- "loss": 0.1434,
584
  "step": 960
585
  },
586
  {
587
  "epoch": 2.43,
588
  "learning_rate": 1.713091922005571e-05,
589
- "loss": 0.0561,
590
  "step": 970
591
  },
592
  {
593
  "epoch": 2.46,
594
  "learning_rate": 1.6963788300835656e-05,
595
- "loss": 0.0499,
596
  "step": 980
597
  },
598
  {
599
  "epoch": 2.48,
600
  "learning_rate": 1.6796657381615597e-05,
601
- "loss": 0.022,
602
  "step": 990
603
  },
604
  {
605
  "epoch": 2.51,
606
  "learning_rate": 1.6629526462395542e-05,
607
- "loss": 0.0243,
608
  "step": 1000
609
  },
610
  {
611
  "epoch": 2.53,
612
  "learning_rate": 1.646239554317549e-05,
613
- "loss": 0.0651,
614
  "step": 1010
615
  },
616
  {
617
  "epoch": 2.56,
618
  "learning_rate": 1.6295264623955434e-05,
619
- "loss": 0.1027,
620
  "step": 1020
621
  },
622
  {
623
  "epoch": 2.58,
624
  "learning_rate": 1.6128133704735375e-05,
625
- "loss": 0.0382,
626
  "step": 1030
627
  },
628
  {
629
  "epoch": 2.61,
630
  "learning_rate": 1.596100278551532e-05,
631
- "loss": 0.0663,
632
  "step": 1040
633
  },
634
  {
635
  "epoch": 2.63,
636
  "learning_rate": 1.5793871866295268e-05,
637
- "loss": 0.0272,
638
  "step": 1050
639
  },
640
  {
641
  "epoch": 2.66,
642
  "learning_rate": 1.562674094707521e-05,
643
- "loss": 0.0229,
644
  "step": 1060
645
  },
646
  {
647
  "epoch": 2.68,
648
  "learning_rate": 1.5459610027855154e-05,
649
- "loss": 0.0721,
650
  "step": 1070
651
  },
652
  {
653
  "epoch": 2.71,
654
  "learning_rate": 1.5292479108635098e-05,
655
- "loss": 0.0606,
656
  "step": 1080
657
  },
658
  {
659
  "epoch": 2.73,
660
  "learning_rate": 1.5125348189415043e-05,
661
- "loss": 0.0648,
662
  "step": 1090
663
  },
664
  {
665
  "epoch": 2.76,
666
  "learning_rate": 1.4958217270194987e-05,
667
- "loss": 0.1202,
668
  "step": 1100
669
  },
670
  {
671
  "epoch": 2.78,
672
  "learning_rate": 1.479108635097493e-05,
673
- "loss": 0.0333,
674
  "step": 1110
675
  },
676
  {
677
  "epoch": 2.81,
678
  "learning_rate": 1.4623955431754876e-05,
679
- "loss": 0.0617,
680
  "step": 1120
681
  },
682
  {
683
  "epoch": 2.83,
684
  "learning_rate": 1.445682451253482e-05,
685
- "loss": 0.0138,
686
  "step": 1130
687
  },
688
  {
689
  "epoch": 2.86,
690
  "learning_rate": 1.4289693593314764e-05,
691
- "loss": 0.1383,
692
  "step": 1140
693
  },
694
  {
695
  "epoch": 2.88,
696
  "learning_rate": 1.4122562674094708e-05,
697
- "loss": 0.0859,
698
  "step": 1150
699
  },
700
  {
701
  "epoch": 2.91,
702
  "learning_rate": 1.3955431754874653e-05,
703
- "loss": 0.053,
704
  "step": 1160
705
  },
706
  {
707
  "epoch": 2.93,
708
  "learning_rate": 1.3788300835654596e-05,
709
- "loss": 0.1055,
710
  "step": 1170
711
  },
712
  {
713
  "epoch": 2.96,
714
  "learning_rate": 1.362116991643454e-05,
715
- "loss": 0.0642,
716
  "step": 1180
717
  },
718
  {
719
  "epoch": 2.98,
720
  "learning_rate": 1.3454038997214485e-05,
721
- "loss": 0.1085,
722
  "step": 1190
723
  },
724
  {
725
  "epoch": 3.01,
726
  "learning_rate": 1.3286908077994428e-05,
727
- "loss": 0.0576,
728
  "step": 1200
729
  },
730
  {
731
  "epoch": 3.03,
732
  "learning_rate": 1.3119777158774374e-05,
733
- "loss": 0.0386,
734
  "step": 1210
735
  },
736
  {
737
  "epoch": 3.06,
738
  "learning_rate": 1.2952646239554317e-05,
739
- "loss": 0.0949,
740
  "step": 1220
741
  },
742
  {
743
  "epoch": 3.08,
744
  "learning_rate": 1.2785515320334262e-05,
745
- "loss": 0.1058,
746
  "step": 1230
747
  },
748
  {
749
  "epoch": 3.11,
750
  "learning_rate": 1.2618384401114206e-05,
751
- "loss": 0.0586,
752
  "step": 1240
753
  },
754
  {
755
  "epoch": 3.13,
756
  "learning_rate": 1.245125348189415e-05,
757
- "loss": 0.1403,
758
  "step": 1250
759
  },
760
  {
761
  "epoch": 3.16,
762
  "learning_rate": 1.2284122562674095e-05,
763
- "loss": 0.0833,
764
  "step": 1260
765
  },
766
  {
767
  "epoch": 3.18,
768
  "learning_rate": 1.211699164345404e-05,
769
- "loss": 0.0247,
770
  "step": 1270
771
  },
772
  {
773
  "epoch": 3.21,
774
  "learning_rate": 1.1949860724233983e-05,
775
- "loss": 0.01,
776
  "step": 1280
777
  },
778
  {
779
  "epoch": 3.23,
780
  "learning_rate": 1.1782729805013929e-05,
781
- "loss": 0.1515,
782
  "step": 1290
783
  },
784
  {
785
  "epoch": 3.26,
786
  "learning_rate": 1.1615598885793872e-05,
787
- "loss": 0.0516,
788
  "step": 1300
789
  },
790
  {
791
  "epoch": 3.28,
792
  "learning_rate": 1.1448467966573816e-05,
793
- "loss": 0.0531,
794
  "step": 1310
795
  },
796
  {
797
  "epoch": 3.31,
798
  "learning_rate": 1.1281337047353761e-05,
799
- "loss": 0.013,
800
  "step": 1320
801
  },
802
  {
@@ -808,367 +808,367 @@
808
  {
809
  "epoch": 3.36,
810
  "learning_rate": 1.0947075208913648e-05,
811
- "loss": 0.0122,
812
  "step": 1340
813
  },
814
  {
815
  "epoch": 3.38,
816
  "learning_rate": 1.0779944289693595e-05,
817
- "loss": 0.0666,
818
  "step": 1350
819
  },
820
  {
821
  "epoch": 3.41,
822
  "learning_rate": 1.0612813370473537e-05,
823
- "loss": 0.0597,
824
  "step": 1360
825
  },
826
  {
827
  "epoch": 3.43,
828
  "learning_rate": 1.0445682451253482e-05,
829
- "loss": 0.0525,
830
  "step": 1370
831
  },
832
  {
833
  "epoch": 3.46,
834
  "learning_rate": 1.0278551532033427e-05,
835
- "loss": 0.0161,
836
  "step": 1380
837
  },
838
  {
839
  "epoch": 3.48,
840
  "learning_rate": 1.0111420612813371e-05,
841
- "loss": 0.0161,
842
  "step": 1390
843
  },
844
  {
845
  "epoch": 3.51,
846
  "learning_rate": 9.944289693593314e-06,
847
- "loss": 0.0116,
848
  "step": 1400
849
  },
850
  {
851
  "epoch": 3.53,
852
  "learning_rate": 9.77715877437326e-06,
853
- "loss": 0.01,
854
  "step": 1410
855
  },
856
  {
857
  "epoch": 3.56,
858
  "learning_rate": 9.610027855153203e-06,
859
- "loss": 0.01,
860
  "step": 1420
861
  },
862
  {
863
  "epoch": 3.58,
864
  "learning_rate": 9.44289693593315e-06,
865
- "loss": 0.022,
866
  "step": 1430
867
  },
868
  {
869
  "epoch": 3.61,
870
  "learning_rate": 9.275766016713092e-06,
871
- "loss": 0.0628,
872
  "step": 1440
873
  },
874
  {
875
  "epoch": 3.63,
876
  "learning_rate": 9.108635097493037e-06,
877
- "loss": 0.0526,
878
  "step": 1450
879
  },
880
  {
881
  "epoch": 3.66,
882
  "learning_rate": 8.941504178272981e-06,
883
- "loss": 0.0526,
884
  "step": 1460
885
  },
886
  {
887
  "epoch": 3.68,
888
  "learning_rate": 8.774373259052926e-06,
889
- "loss": 0.0323,
890
  "step": 1470
891
  },
892
  {
893
  "epoch": 3.71,
894
  "learning_rate": 8.607242339832869e-06,
895
- "loss": 0.1494,
896
  "step": 1480
897
  },
898
  {
899
  "epoch": 3.73,
900
  "learning_rate": 8.440111420612815e-06,
901
- "loss": 0.0288,
902
  "step": 1490
903
  },
904
  {
905
  "epoch": 3.76,
906
  "learning_rate": 8.272980501392758e-06,
907
- "loss": 0.0159,
908
  "step": 1500
909
  },
910
  {
911
  "epoch": 3.78,
912
  "learning_rate": 8.1058495821727e-06,
913
- "loss": 0.119,
914
  "step": 1510
915
  },
916
  {
917
  "epoch": 3.81,
918
  "learning_rate": 7.938718662952647e-06,
919
- "loss": 0.0519,
920
  "step": 1520
921
  },
922
  {
923
  "epoch": 3.83,
924
  "learning_rate": 7.77158774373259e-06,
925
- "loss": 0.0186,
926
  "step": 1530
927
  },
928
  {
929
  "epoch": 3.86,
930
  "learning_rate": 7.604456824512535e-06,
931
- "loss": 0.0975,
932
  "step": 1540
933
  },
934
  {
935
  "epoch": 3.88,
936
  "learning_rate": 7.43732590529248e-06,
937
- "loss": 0.0193,
938
  "step": 1550
939
  },
940
  {
941
  "epoch": 3.91,
942
  "learning_rate": 7.2701949860724235e-06,
943
- "loss": 0.012,
944
  "step": 1560
945
  },
946
  {
947
  "epoch": 3.93,
948
  "learning_rate": 7.103064066852368e-06,
949
- "loss": 0.1069,
950
  "step": 1570
951
  },
952
  {
953
  "epoch": 3.96,
954
  "learning_rate": 6.935933147632313e-06,
955
- "loss": 0.0862,
956
  "step": 1580
957
  },
958
  {
959
  "epoch": 3.98,
960
  "learning_rate": 6.768802228412256e-06,
961
- "loss": 0.0577,
962
  "step": 1590
963
  },
964
  {
965
  "epoch": 4.01,
966
  "learning_rate": 6.601671309192201e-06,
967
- "loss": 0.0097,
968
  "step": 1600
969
  },
970
  {
971
  "epoch": 4.04,
972
  "learning_rate": 6.4345403899721455e-06,
973
- "loss": 0.0739,
974
  "step": 1610
975
  },
976
  {
977
  "epoch": 4.06,
978
  "learning_rate": 6.267409470752089e-06,
979
- "loss": 0.0929,
980
  "step": 1620
981
  },
982
  {
983
  "epoch": 4.09,
984
  "learning_rate": 6.100278551532034e-06,
985
- "loss": 0.0097,
986
  "step": 1630
987
  },
988
  {
989
  "epoch": 4.11,
990
  "learning_rate": 5.933147632311978e-06,
991
- "loss": 0.1369,
992
  "step": 1640
993
  },
994
  {
995
  "epoch": 4.14,
996
  "learning_rate": 5.766016713091923e-06,
997
- "loss": 0.0422,
998
  "step": 1650
999
  },
1000
  {
1001
  "epoch": 4.16,
1002
  "learning_rate": 5.598885793871867e-06,
1003
- "loss": 0.0513,
1004
  "step": 1660
1005
  },
1006
  {
1007
  "epoch": 4.19,
1008
  "learning_rate": 5.43175487465181e-06,
1009
- "loss": 0.0122,
1010
  "step": 1670
1011
  },
1012
  {
1013
  "epoch": 4.21,
1014
  "learning_rate": 5.264623955431755e-06,
1015
- "loss": 0.0325,
1016
  "step": 1680
1017
  },
1018
  {
1019
  "epoch": 4.24,
1020
  "learning_rate": 5.0974930362116986e-06,
1021
- "loss": 0.0119,
1022
  "step": 1690
1023
  },
1024
  {
1025
  "epoch": 4.26,
1026
  "learning_rate": 4.930362116991643e-06,
1027
- "loss": 0.048,
1028
  "step": 1700
1029
  },
1030
  {
1031
  "epoch": 4.29,
1032
  "learning_rate": 4.763231197771588e-06,
1033
- "loss": 0.0461,
1034
  "step": 1710
1035
  },
1036
  {
1037
  "epoch": 4.31,
1038
  "learning_rate": 4.596100278551532e-06,
1039
- "loss": 0.0925,
1040
  "step": 1720
1041
  },
1042
  {
1043
  "epoch": 4.34,
1044
  "learning_rate": 4.428969359331476e-06,
1045
- "loss": 0.0166,
1046
  "step": 1730
1047
  },
1048
  {
1049
  "epoch": 4.36,
1050
  "learning_rate": 4.2618384401114205e-06,
1051
- "loss": 0.0954,
1052
  "step": 1740
1053
  },
1054
  {
1055
  "epoch": 4.39,
1056
  "learning_rate": 4.094707520891365e-06,
1057
- "loss": 0.147,
1058
  "step": 1750
1059
  },
1060
  {
1061
  "epoch": 4.41,
1062
  "learning_rate": 3.927576601671309e-06,
1063
- "loss": 0.0107,
1064
  "step": 1760
1065
  },
1066
  {
1067
  "epoch": 4.44,
1068
  "learning_rate": 3.7604456824512533e-06,
1069
- "loss": 0.0108,
1070
  "step": 1770
1071
  },
1072
  {
1073
  "epoch": 4.46,
1074
  "learning_rate": 3.593314763231198e-06,
1075
- "loss": 0.0116,
1076
  "step": 1780
1077
  },
1078
  {
1079
  "epoch": 4.49,
1080
  "learning_rate": 3.426183844011142e-06,
1081
- "loss": 0.0617,
1082
  "step": 1790
1083
  },
1084
  {
1085
  "epoch": 4.51,
1086
  "learning_rate": 3.259052924791086e-06,
1087
- "loss": 0.0468,
1088
  "step": 1800
1089
  },
1090
  {
1091
  "epoch": 4.54,
1092
  "learning_rate": 3.0919220055710307e-06,
1093
- "loss": 0.0709,
1094
  "step": 1810
1095
  },
1096
  {
1097
  "epoch": 4.56,
1098
  "learning_rate": 2.924791086350975e-06,
1099
- "loss": 0.019,
1100
  "step": 1820
1101
  },
1102
  {
1103
  "epoch": 4.59,
1104
  "learning_rate": 2.7576601671309194e-06,
1105
- "loss": 0.0206,
1106
  "step": 1830
1107
  },
1108
  {
1109
  "epoch": 4.61,
1110
  "learning_rate": 2.5905292479108636e-06,
1111
- "loss": 0.0108,
1112
  "step": 1840
1113
  },
1114
  {
1115
  "epoch": 4.64,
1116
  "learning_rate": 2.4233983286908077e-06,
1117
- "loss": 0.0608,
1118
  "step": 1850
1119
  },
1120
  {
1121
  "epoch": 4.66,
1122
  "learning_rate": 2.2562674094707523e-06,
1123
- "loss": 0.0877,
1124
  "step": 1860
1125
  },
1126
  {
1127
  "epoch": 4.69,
1128
  "learning_rate": 2.0891364902506964e-06,
1129
- "loss": 0.0191,
1130
  "step": 1870
1131
  },
1132
  {
1133
  "epoch": 4.71,
1134
  "learning_rate": 1.922005571030641e-06,
1135
- "loss": 0.0099,
1136
  "step": 1880
1137
  },
1138
  {
1139
  "epoch": 4.74,
1140
  "learning_rate": 1.7548746518105849e-06,
1141
- "loss": 0.0121,
1142
  "step": 1890
1143
  },
1144
  {
1145
  "epoch": 4.76,
1146
  "learning_rate": 1.5877437325905292e-06,
1147
- "loss": 0.0605,
1148
  "step": 1900
1149
  },
1150
  {
1151
  "epoch": 4.79,
1152
  "learning_rate": 1.4206128133704736e-06,
1153
- "loss": 0.0096,
1154
  "step": 1910
1155
  },
1156
  {
1157
  "epoch": 4.81,
1158
  "learning_rate": 1.253481894150418e-06,
1159
- "loss": 0.1002,
1160
  "step": 1920
1161
  },
1162
  {
1163
  "epoch": 4.84,
1164
  "learning_rate": 1.0863509749303623e-06,
1165
- "loss": 0.0533,
1166
  "step": 1930
1167
  },
1168
  {
1169
  "epoch": 4.86,
1170
  "learning_rate": 9.192200557103064e-07,
1171
- "loss": 0.0479,
1172
  "step": 1940
1173
  },
1174
  {
@@ -1186,29 +1186,29 @@
1186
  {
1187
  "epoch": 4.94,
1188
  "learning_rate": 4.178272980501393e-07,
1189
- "loss": 0.0069,
1190
  "step": 1970
1191
  },
1192
  {
1193
  "epoch": 4.96,
1194
  "learning_rate": 2.506963788300836e-07,
1195
- "loss": 0.0171,
1196
  "step": 1980
1197
  },
1198
  {
1199
  "epoch": 4.99,
1200
  "learning_rate": 8.356545961002785e-08,
1201
- "loss": 0.0211,
1202
  "step": 1990
1203
  },
1204
  {
1205
  "epoch": 5.0,
1206
  "step": 1995,
1207
  "total_flos": 1.4837528359600128e+20,
1208
- "train_loss": 0.3196671349661691,
1209
- "train_runtime": 293.2603,
1210
- "train_samples_per_second": 871.137,
1211
- "train_steps_per_second": 6.803
1212
  }
1213
  ],
1214
  "max_steps": 1995,
 
10
  {
11
  "epoch": 0.03,
12
  "learning_rate": 1.5e-06,
13
+ "loss": 2.4689,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.05,
18
  "learning_rate": 3e-06,
19
+ "loss": 2.4537,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.08,
24
  "learning_rate": 4.5e-06,
25
+ "loss": 2.3789,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.1,
30
  "learning_rate": 6e-06,
31
+ "loss": 2.2205,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.13,
36
  "learning_rate": 7.5e-06,
37
+ "loss": 2.0825,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.15,
42
  "learning_rate": 9e-06,
43
+ "loss": 1.9133,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.18,
48
  "learning_rate": 1.05e-05,
49
+ "loss": 1.8169,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.2,
54
  "learning_rate": 1.2e-05,
55
+ "loss": 1.8079,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.23,
60
  "learning_rate": 1.3500000000000001e-05,
61
+ "loss": 1.5825,
62
  "step": 90
63
  },
64
  {
 
70
  {
71
  "epoch": 0.28,
72
  "learning_rate": 1.65e-05,
73
+ "loss": 1.5597,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.3,
78
  "learning_rate": 1.8e-05,
79
+ "loss": 1.6068,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.33,
84
  "learning_rate": 1.95e-05,
85
+ "loss": 1.3703,
86
  "step": 130
87
  },
88
  {
 
94
  {
95
  "epoch": 0.38,
96
  "learning_rate": 2.25e-05,
97
+ "loss": 1.4707,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.4,
102
  "learning_rate": 2.4e-05,
103
+ "loss": 1.5798,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.43,
108
  "learning_rate": 2.55e-05,
109
+ "loss": 1.4417,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.45,
114
  "learning_rate": 2.7000000000000002e-05,
115
+ "loss": 1.3562,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.48,
120
  "learning_rate": 2.8499999999999998e-05,
121
+ "loss": 1.4646,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.5,
126
  "learning_rate": 3e-05,
127
+ "loss": 1.1936,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.53,
132
  "learning_rate": 2.9832869080779945e-05,
133
+ "loss": 1.3362,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.55,
138
  "learning_rate": 2.9665738161559886e-05,
139
+ "loss": 1.1757,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.58,
144
  "learning_rate": 2.9498607242339834e-05,
145
+ "loss": 1.1686,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.6,
150
  "learning_rate": 2.933147632311978e-05,
151
+ "loss": 1.147,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.63,
156
  "learning_rate": 2.916434540389972e-05,
157
+ "loss": 1.0784,
158
  "step": 250
159
  },
160
  {
161
  "epoch": 0.65,
162
  "learning_rate": 2.8997214484679665e-05,
163
+ "loss": 1.0562,
164
  "step": 260
165
  },
166
  {
167
  "epoch": 0.68,
168
  "learning_rate": 2.8830083565459613e-05,
169
+ "loss": 0.9743,
170
  "step": 270
171
  },
172
  {
173
  "epoch": 0.7,
174
  "learning_rate": 2.8662952646239554e-05,
175
+ "loss": 0.7418,
176
  "step": 280
177
  },
178
  {
179
  "epoch": 0.73,
180
  "learning_rate": 2.84958217270195e-05,
181
+ "loss": 0.814,
182
  "step": 290
183
  },
184
  {
185
  "epoch": 0.75,
186
  "learning_rate": 2.8328690807799443e-05,
187
+ "loss": 0.9595,
188
  "step": 300
189
  },
190
  {
191
  "epoch": 0.78,
192
  "learning_rate": 2.8161559888579388e-05,
193
+ "loss": 0.7455,
194
  "step": 310
195
  },
196
  {
197
  "epoch": 0.8,
198
  "learning_rate": 2.7994428969359332e-05,
199
+ "loss": 0.616,
200
  "step": 320
201
  },
202
  {
203
  "epoch": 0.83,
204
  "learning_rate": 2.7827298050139277e-05,
205
+ "loss": 0.6448,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 0.85,
210
  "learning_rate": 2.7660167130919218e-05,
211
+ "loss": 0.5373,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 0.88,
216
  "learning_rate": 2.7493036211699166e-05,
217
+ "loss": 0.5143,
218
  "step": 350
219
  },
220
  {
221
  "epoch": 0.9,
222
  "learning_rate": 2.732590529247911e-05,
223
+ "loss": 0.5485,
224
  "step": 360
225
  },
226
  {
227
  "epoch": 0.93,
228
  "learning_rate": 2.7158774373259055e-05,
229
+ "loss": 0.3994,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 0.95,
234
  "learning_rate": 2.6991643454038996e-05,
235
+ "loss": 0.4221,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 0.98,
240
  "learning_rate": 2.6824512534818944e-05,
241
+ "loss": 0.3865,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 1.0,
246
  "learning_rate": 2.665738161559889e-05,
247
+ "loss": 0.3756,
248
  "step": 400
249
  },
250
  {
251
  "epoch": 1.03,
252
  "learning_rate": 2.649025069637883e-05,
253
+ "loss": 0.2686,
254
  "step": 410
255
  },
256
  {
257
  "epoch": 1.05,
258
  "learning_rate": 2.6323119777158774e-05,
259
+ "loss": 0.2645,
260
  "step": 420
261
  },
262
  {
263
  "epoch": 1.08,
264
  "learning_rate": 2.6155988857938722e-05,
265
+ "loss": 0.2747,
266
  "step": 430
267
  },
268
  {
269
  "epoch": 1.1,
270
  "learning_rate": 2.5988857938718663e-05,
271
+ "loss": 0.3327,
272
  "step": 440
273
  },
274
  {
275
  "epoch": 1.13,
276
  "learning_rate": 2.5821727019498608e-05,
277
+ "loss": 0.2937,
278
  "step": 450
279
  },
280
  {
281
  "epoch": 1.15,
282
  "learning_rate": 2.5654596100278553e-05,
283
+ "loss": 0.2593,
284
  "step": 460
285
  },
286
  {
287
  "epoch": 1.18,
288
  "learning_rate": 2.5487465181058497e-05,
289
+ "loss": 0.1705,
290
  "step": 470
291
  },
292
  {
293
  "epoch": 1.2,
294
  "learning_rate": 2.532033426183844e-05,
295
+ "loss": 0.1641,
296
  "step": 480
297
  },
298
  {
299
  "epoch": 1.23,
300
  "learning_rate": 2.5153203342618386e-05,
301
+ "loss": 0.3088,
302
  "step": 490
303
  },
304
  {
305
  "epoch": 1.25,
306
  "learning_rate": 2.4986072423398327e-05,
307
+ "loss": 0.2023,
308
  "step": 500
309
  },
310
  {
311
  "epoch": 1.28,
312
  "learning_rate": 2.4818941504178275e-05,
313
+ "loss": 0.1896,
314
  "step": 510
315
  },
316
  {
317
  "epoch": 1.3,
318
  "learning_rate": 2.465181058495822e-05,
319
+ "loss": 0.1476,
320
  "step": 520
321
  },
322
  {
323
  "epoch": 1.33,
324
  "learning_rate": 2.448467966573816e-05,
325
+ "loss": 0.2023,
326
  "step": 530
327
  },
328
  {
329
  "epoch": 1.35,
330
  "learning_rate": 2.4317548746518106e-05,
331
+ "loss": 0.1005,
332
  "step": 540
333
  },
334
  {
335
  "epoch": 1.38,
336
  "learning_rate": 2.415041782729805e-05,
337
+ "loss": 0.2535,
338
  "step": 550
339
  },
340
  {
341
  "epoch": 1.4,
342
  "learning_rate": 2.3983286908077995e-05,
343
+ "loss": 0.1387,
344
  "step": 560
345
  },
346
  {
347
  "epoch": 1.43,
348
  "learning_rate": 2.381615598885794e-05,
349
+ "loss": 0.2567,
350
  "step": 570
351
  },
352
  {
353
  "epoch": 1.45,
354
  "learning_rate": 2.3649025069637884e-05,
355
+ "loss": 0.0976,
356
  "step": 580
357
  },
358
  {
359
  "epoch": 1.48,
360
  "learning_rate": 2.3481894150417825e-05,
361
+ "loss": 0.1436,
362
  "step": 590
363
  },
364
  {
365
  "epoch": 1.5,
366
  "learning_rate": 2.3314763231197773e-05,
367
+ "loss": 0.0673,
368
  "step": 600
369
  },
370
  {
371
  "epoch": 1.53,
372
  "learning_rate": 2.3147632311977718e-05,
373
+ "loss": 0.1552,
374
  "step": 610
375
  },
376
  {
377
  "epoch": 1.55,
378
  "learning_rate": 2.298050139275766e-05,
379
+ "loss": 0.1199,
380
  "step": 620
381
  },
382
  {
383
  "epoch": 1.58,
384
  "learning_rate": 2.2813370473537603e-05,
385
+ "loss": 0.1526,
386
  "step": 630
387
  },
388
  {
389
  "epoch": 1.6,
390
  "learning_rate": 2.264623955431755e-05,
391
+ "loss": 0.065,
392
  "step": 640
393
  },
394
  {
395
  "epoch": 1.63,
396
  "learning_rate": 2.2479108635097492e-05,
397
+ "loss": 0.1841,
398
  "step": 650
399
  },
400
  {
401
  "epoch": 1.65,
402
  "learning_rate": 2.2311977715877437e-05,
403
+ "loss": 0.3076,
404
  "step": 660
405
  },
406
  {
407
  "epoch": 1.68,
408
  "learning_rate": 2.214484679665738e-05,
409
+ "loss": 0.0943,
410
  "step": 670
411
  },
412
  {
413
  "epoch": 1.7,
414
  "learning_rate": 2.1977715877437326e-05,
415
+ "loss": 0.0507,
416
  "step": 680
417
  },
418
  {
419
  "epoch": 1.73,
420
  "learning_rate": 2.181058495821727e-05,
421
+ "loss": 0.0748,
422
  "step": 690
423
  },
424
  {
425
  "epoch": 1.75,
426
  "learning_rate": 2.1643454038997215e-05,
427
+ "loss": 0.0612,
428
  "step": 700
429
  },
430
  {
431
  "epoch": 1.78,
432
  "learning_rate": 2.147632311977716e-05,
433
+ "loss": 0.1332,
434
  "step": 710
435
  },
436
  {
437
  "epoch": 1.8,
438
  "learning_rate": 2.1309192200557104e-05,
439
+ "loss": 0.1505,
440
  "step": 720
441
  },
442
  {
443
  "epoch": 1.83,
444
  "learning_rate": 2.114206128133705e-05,
445
+ "loss": 0.0881,
446
  "step": 730
447
  },
448
  {
449
  "epoch": 1.85,
450
  "learning_rate": 2.0974930362116993e-05,
451
+ "loss": 0.1235,
452
  "step": 740
453
  },
454
  {
455
  "epoch": 1.88,
456
  "learning_rate": 2.0807799442896935e-05,
457
+ "loss": 0.0482,
458
  "step": 750
459
  },
460
  {
461
  "epoch": 1.9,
462
  "learning_rate": 2.0640668523676883e-05,
463
+ "loss": 0.1344,
464
  "step": 760
465
  },
466
  {
467
  "epoch": 1.93,
468
  "learning_rate": 2.0473537604456827e-05,
469
+ "loss": 0.05,
470
  "step": 770
471
  },
472
  {
473
  "epoch": 1.95,
474
  "learning_rate": 2.0306406685236768e-05,
475
+ "loss": 0.0645,
476
  "step": 780
477
  },
478
  {
479
  "epoch": 1.98,
480
  "learning_rate": 2.0139275766016713e-05,
481
+ "loss": 0.0952,
482
  "step": 790
483
  },
484
  {
485
  "epoch": 2.01,
486
  "learning_rate": 1.997214484679666e-05,
487
+ "loss": 0.0444,
488
  "step": 800
489
  },
490
  {
491
  "epoch": 2.03,
492
  "learning_rate": 1.9805013927576602e-05,
493
+ "loss": 0.0892,
494
  "step": 810
495
  },
496
  {
497
  "epoch": 2.06,
498
  "learning_rate": 1.9637883008356546e-05,
499
+ "loss": 0.0361,
500
  "step": 820
501
  },
502
  {
503
  "epoch": 2.08,
504
  "learning_rate": 1.947075208913649e-05,
505
+ "loss": 0.1341,
506
  "step": 830
507
  },
508
  {
509
  "epoch": 2.11,
510
  "learning_rate": 1.9303621169916436e-05,
511
+ "loss": 0.101,
512
  "step": 840
513
  },
514
  {
515
  "epoch": 2.13,
516
  "learning_rate": 1.913649025069638e-05,
517
+ "loss": 0.0945,
518
  "step": 850
519
  },
520
  {
521
  "epoch": 2.16,
522
  "learning_rate": 1.8969359331476325e-05,
523
+ "loss": 0.0405,
524
  "step": 860
525
  },
526
  {
527
  "epoch": 2.18,
528
  "learning_rate": 1.8802228412256266e-05,
529
+ "loss": 0.1336,
530
  "step": 870
531
  },
532
  {
533
  "epoch": 2.21,
534
  "learning_rate": 1.863509749303621e-05,
535
+ "loss": 0.0359,
536
  "step": 880
537
  },
538
  {
539
  "epoch": 2.23,
540
  "learning_rate": 1.846796657381616e-05,
541
+ "loss": 0.0371,
542
  "step": 890
543
  },
544
  {
545
  "epoch": 2.26,
546
  "learning_rate": 1.83008356545961e-05,
547
+ "loss": 0.09,
548
  "step": 900
549
  },
550
  {
551
  "epoch": 2.28,
552
  "learning_rate": 1.8133704735376044e-05,
553
+ "loss": 0.079,
554
  "step": 910
555
  },
556
  {
557
  "epoch": 2.31,
558
  "learning_rate": 1.796657381615599e-05,
559
+ "loss": 0.04,
560
  "step": 920
561
  },
562
  {
563
  "epoch": 2.33,
564
  "learning_rate": 1.7799442896935933e-05,
565
+ "loss": 0.0835,
566
  "step": 930
567
  },
568
  {
569
  "epoch": 2.36,
570
  "learning_rate": 1.7632311977715878e-05,
571
+ "loss": 0.0233,
572
  "step": 940
573
  },
574
  {
575
  "epoch": 2.38,
576
  "learning_rate": 1.7465181058495822e-05,
577
+ "loss": 0.0377,
578
  "step": 950
579
  },
580
  {
581
  "epoch": 2.41,
582
  "learning_rate": 1.7298050139275764e-05,
583
+ "loss": 0.0422,
584
  "step": 960
585
  },
586
  {
587
  "epoch": 2.43,
588
  "learning_rate": 1.713091922005571e-05,
589
+ "loss": 0.052,
590
  "step": 970
591
  },
592
  {
593
  "epoch": 2.46,
594
  "learning_rate": 1.6963788300835656e-05,
595
+ "loss": 0.0272,
596
  "step": 980
597
  },
598
  {
599
  "epoch": 2.48,
600
  "learning_rate": 1.6796657381615597e-05,
601
+ "loss": 0.0206,
602
  "step": 990
603
  },
604
  {
605
  "epoch": 2.51,
606
  "learning_rate": 1.6629526462395542e-05,
607
+ "loss": 0.0296,
608
  "step": 1000
609
  },
610
  {
611
  "epoch": 2.53,
612
  "learning_rate": 1.646239554317549e-05,
613
+ "loss": 0.0632,
614
  "step": 1010
615
  },
616
  {
617
  "epoch": 2.56,
618
  "learning_rate": 1.6295264623955434e-05,
619
+ "loss": 0.0719,
620
  "step": 1020
621
  },
622
  {
623
  "epoch": 2.58,
624
  "learning_rate": 1.6128133704735375e-05,
625
+ "loss": 0.0835,
626
  "step": 1030
627
  },
628
  {
629
  "epoch": 2.61,
630
  "learning_rate": 1.596100278551532e-05,
631
+ "loss": 0.0492,
632
  "step": 1040
633
  },
634
  {
635
  "epoch": 2.63,
636
  "learning_rate": 1.5793871866295268e-05,
637
+ "loss": 0.0275,
638
  "step": 1050
639
  },
640
  {
641
  "epoch": 2.66,
642
  "learning_rate": 1.562674094707521e-05,
643
+ "loss": 0.0242,
644
  "step": 1060
645
  },
646
  {
647
  "epoch": 2.68,
648
  "learning_rate": 1.5459610027855154e-05,
649
+ "loss": 0.0278,
650
  "step": 1070
651
  },
652
  {
653
  "epoch": 2.71,
654
  "learning_rate": 1.5292479108635098e-05,
655
+ "loss": 0.0592,
656
  "step": 1080
657
  },
658
  {
659
  "epoch": 2.73,
660
  "learning_rate": 1.5125348189415043e-05,
661
+ "loss": 0.0734,
662
  "step": 1090
663
  },
664
  {
665
  "epoch": 2.76,
666
  "learning_rate": 1.4958217270194987e-05,
667
+ "loss": 0.2112,
668
  "step": 1100
669
  },
670
  {
671
  "epoch": 2.78,
672
  "learning_rate": 1.479108635097493e-05,
673
+ "loss": 0.0753,
674
  "step": 1110
675
  },
676
  {
677
  "epoch": 2.81,
678
  "learning_rate": 1.4623955431754876e-05,
679
+ "loss": 0.072,
680
  "step": 1120
681
  },
682
  {
683
  "epoch": 2.83,
684
  "learning_rate": 1.445682451253482e-05,
685
+ "loss": 0.0146,
686
  "step": 1130
687
  },
688
  {
689
  "epoch": 2.86,
690
  "learning_rate": 1.4289693593314764e-05,
691
+ "loss": 0.1671,
692
  "step": 1140
693
  },
694
  {
695
  "epoch": 2.88,
696
  "learning_rate": 1.4122562674094708e-05,
697
+ "loss": 0.0701,
698
  "step": 1150
699
  },
700
  {
701
  "epoch": 2.91,
702
  "learning_rate": 1.3955431754874653e-05,
703
+ "loss": 0.1404,
704
  "step": 1160
705
  },
706
  {
707
  "epoch": 2.93,
708
  "learning_rate": 1.3788300835654596e-05,
709
+ "loss": 0.08,
710
  "step": 1170
711
  },
712
  {
713
  "epoch": 2.96,
714
  "learning_rate": 1.362116991643454e-05,
715
+ "loss": 0.083,
716
  "step": 1180
717
  },
718
  {
719
  "epoch": 2.98,
720
  "learning_rate": 1.3454038997214485e-05,
721
+ "loss": 0.0619,
722
  "step": 1190
723
  },
724
  {
725
  "epoch": 3.01,
726
  "learning_rate": 1.3286908077994428e-05,
727
+ "loss": 0.0569,
728
  "step": 1200
729
  },
730
  {
731
  "epoch": 3.03,
732
  "learning_rate": 1.3119777158774374e-05,
733
+ "loss": 0.0377,
734
  "step": 1210
735
  },
736
  {
737
  "epoch": 3.06,
738
  "learning_rate": 1.2952646239554317e-05,
739
+ "loss": 0.1295,
740
  "step": 1220
741
  },
742
  {
743
  "epoch": 3.08,
744
  "learning_rate": 1.2785515320334262e-05,
745
+ "loss": 0.1053,
746
  "step": 1230
747
  },
748
  {
749
  "epoch": 3.11,
750
  "learning_rate": 1.2618384401114206e-05,
751
+ "loss": 0.0191,
752
  "step": 1240
753
  },
754
  {
755
  "epoch": 3.13,
756
  "learning_rate": 1.245125348189415e-05,
757
+ "loss": 0.1647,
758
  "step": 1250
759
  },
760
  {
761
  "epoch": 3.16,
762
  "learning_rate": 1.2284122562674095e-05,
763
+ "loss": 0.0731,
764
  "step": 1260
765
  },
766
  {
767
  "epoch": 3.18,
768
  "learning_rate": 1.211699164345404e-05,
769
+ "loss": 0.0238,
770
  "step": 1270
771
  },
772
  {
773
  "epoch": 3.21,
774
  "learning_rate": 1.1949860724233983e-05,
775
+ "loss": 0.0133,
776
  "step": 1280
777
  },
778
  {
779
  "epoch": 3.23,
780
  "learning_rate": 1.1782729805013929e-05,
781
+ "loss": 0.167,
782
  "step": 1290
783
  },
784
  {
785
  "epoch": 3.26,
786
  "learning_rate": 1.1615598885793872e-05,
787
+ "loss": 0.0532,
788
  "step": 1300
789
  },
790
  {
791
  "epoch": 3.28,
792
  "learning_rate": 1.1448467966573816e-05,
793
+ "loss": 0.0524,
794
  "step": 1310
795
  },
796
  {
797
  "epoch": 3.31,
798
  "learning_rate": 1.1281337047353761e-05,
799
+ "loss": 0.0206,
800
  "step": 1320
801
  },
802
  {
 
808
  {
809
  "epoch": 3.36,
810
  "learning_rate": 1.0947075208913648e-05,
811
+ "loss": 0.0127,
812
  "step": 1340
813
  },
814
  {
815
  "epoch": 3.38,
816
  "learning_rate": 1.0779944289693595e-05,
817
+ "loss": 0.0625,
818
  "step": 1350
819
  },
820
  {
821
  "epoch": 3.41,
822
  "learning_rate": 1.0612813370473537e-05,
823
+ "loss": 0.016,
824
  "step": 1360
825
  },
826
  {
827
  "epoch": 3.43,
828
  "learning_rate": 1.0445682451253482e-05,
829
+ "loss": 0.0521,
830
  "step": 1370
831
  },
832
  {
833
  "epoch": 3.46,
834
  "learning_rate": 1.0278551532033427e-05,
835
+ "loss": 0.0317,
836
  "step": 1380
837
  },
838
  {
839
  "epoch": 3.48,
840
  "learning_rate": 1.0111420612813371e-05,
841
+ "loss": 0.0154,
842
  "step": 1390
843
  },
844
  {
845
  "epoch": 3.51,
846
  "learning_rate": 9.944289693593314e-06,
847
+ "loss": 0.0907,
848
  "step": 1400
849
  },
850
  {
851
  "epoch": 3.53,
852
  "learning_rate": 9.77715877437326e-06,
853
+ "loss": 0.0105,
854
  "step": 1410
855
  },
856
  {
857
  "epoch": 3.56,
858
  "learning_rate": 9.610027855153203e-06,
859
+ "loss": 0.0122,
860
  "step": 1420
861
  },
862
  {
863
  "epoch": 3.58,
864
  "learning_rate": 9.44289693593315e-06,
865
+ "loss": 0.0247,
866
  "step": 1430
867
  },
868
  {
869
  "epoch": 3.61,
870
  "learning_rate": 9.275766016713092e-06,
871
+ "loss": 0.0598,
872
  "step": 1440
873
  },
874
  {
875
  "epoch": 3.63,
876
  "learning_rate": 9.108635097493037e-06,
877
+ "loss": 0.0195,
878
  "step": 1450
879
  },
880
  {
881
  "epoch": 3.66,
882
  "learning_rate": 8.941504178272981e-06,
883
+ "loss": 0.0195,
884
  "step": 1460
885
  },
886
  {
887
  "epoch": 3.68,
888
  "learning_rate": 8.774373259052926e-06,
889
+ "loss": 0.0211,
890
  "step": 1470
891
  },
892
  {
893
  "epoch": 3.71,
894
  "learning_rate": 8.607242339832869e-06,
895
+ "loss": 0.1396,
896
  "step": 1480
897
  },
898
  {
899
  "epoch": 3.73,
900
  "learning_rate": 8.440111420612815e-06,
901
+ "loss": 0.0316,
902
  "step": 1490
903
  },
904
  {
905
  "epoch": 3.76,
906
  "learning_rate": 8.272980501392758e-06,
907
+ "loss": 0.0208,
908
  "step": 1500
909
  },
910
  {
911
  "epoch": 3.78,
912
  "learning_rate": 8.1058495821727e-06,
913
+ "loss": 0.1344,
914
  "step": 1510
915
  },
916
  {
917
  "epoch": 3.81,
918
  "learning_rate": 7.938718662952647e-06,
919
+ "loss": 0.0711,
920
  "step": 1520
921
  },
922
  {
923
  "epoch": 3.83,
924
  "learning_rate": 7.77158774373259e-06,
925
+ "loss": 0.0596,
926
  "step": 1530
927
  },
928
  {
929
  "epoch": 3.86,
930
  "learning_rate": 7.604456824512535e-06,
931
+ "loss": 0.0837,
932
  "step": 1540
933
  },
934
  {
935
  "epoch": 3.88,
936
  "learning_rate": 7.43732590529248e-06,
937
+ "loss": 0.0242,
938
  "step": 1550
939
  },
940
  {
941
  "epoch": 3.91,
942
  "learning_rate": 7.2701949860724235e-06,
943
+ "loss": 0.06,
944
  "step": 1560
945
  },
946
  {
947
  "epoch": 3.93,
948
  "learning_rate": 7.103064066852368e-06,
949
+ "loss": 0.1006,
950
  "step": 1570
951
  },
952
  {
953
  "epoch": 3.96,
954
  "learning_rate": 6.935933147632313e-06,
955
+ "loss": 0.0972,
956
  "step": 1580
957
  },
958
  {
959
  "epoch": 3.98,
960
  "learning_rate": 6.768802228412256e-06,
961
+ "loss": 0.0627,
962
  "step": 1590
963
  },
964
  {
965
  "epoch": 4.01,
966
  "learning_rate": 6.601671309192201e-06,
967
+ "loss": 0.01,
968
  "step": 1600
969
  },
970
  {
971
  "epoch": 4.04,
972
  "learning_rate": 6.4345403899721455e-06,
973
+ "loss": 0.0734,
974
  "step": 1610
975
  },
976
  {
977
  "epoch": 4.06,
978
  "learning_rate": 6.267409470752089e-06,
979
+ "loss": 0.0889,
980
  "step": 1620
981
  },
982
  {
983
  "epoch": 4.09,
984
  "learning_rate": 6.100278551532034e-06,
985
+ "loss": 0.0109,
986
  "step": 1630
987
  },
988
  {
989
  "epoch": 4.11,
990
  "learning_rate": 5.933147632311978e-06,
991
+ "loss": 0.0777,
992
  "step": 1640
993
  },
994
  {
995
  "epoch": 4.14,
996
  "learning_rate": 5.766016713091923e-06,
997
+ "loss": 0.0475,
998
  "step": 1650
999
  },
1000
  {
1001
  "epoch": 4.16,
1002
  "learning_rate": 5.598885793871867e-06,
1003
+ "loss": 0.0508,
1004
  "step": 1660
1005
  },
1006
  {
1007
  "epoch": 4.19,
1008
  "learning_rate": 5.43175487465181e-06,
1009
+ "loss": 0.01,
1010
  "step": 1670
1011
  },
1012
  {
1013
  "epoch": 4.21,
1014
  "learning_rate": 5.264623955431755e-06,
1015
+ "loss": 0.0317,
1016
  "step": 1680
1017
  },
1018
  {
1019
  "epoch": 4.24,
1020
  "learning_rate": 5.0974930362116986e-06,
1021
+ "loss": 0.0109,
1022
  "step": 1690
1023
  },
1024
  {
1025
  "epoch": 4.26,
1026
  "learning_rate": 4.930362116991643e-06,
1027
+ "loss": 0.0466,
1028
  "step": 1700
1029
  },
1030
  {
1031
  "epoch": 4.29,
1032
  "learning_rate": 4.763231197771588e-06,
1033
+ "loss": 0.0477,
1034
  "step": 1710
1035
  },
1036
  {
1037
  "epoch": 4.31,
1038
  "learning_rate": 4.596100278551532e-06,
1039
+ "loss": 0.1175,
1040
  "step": 1720
1041
  },
1042
  {
1043
  "epoch": 4.34,
1044
  "learning_rate": 4.428969359331476e-06,
1045
+ "loss": 0.0198,
1046
  "step": 1730
1047
  },
1048
  {
1049
  "epoch": 4.36,
1050
  "learning_rate": 4.2618384401114205e-06,
1051
+ "loss": 0.0833,
1052
  "step": 1740
1053
  },
1054
  {
1055
  "epoch": 4.39,
1056
  "learning_rate": 4.094707520891365e-06,
1057
+ "loss": 0.1487,
1058
  "step": 1750
1059
  },
1060
  {
1061
  "epoch": 4.41,
1062
  "learning_rate": 3.927576601671309e-06,
1063
+ "loss": 0.0096,
1064
  "step": 1760
1065
  },
1066
  {
1067
  "epoch": 4.44,
1068
  "learning_rate": 3.7604456824512533e-06,
1069
+ "loss": 0.0101,
1070
  "step": 1770
1071
  },
1072
  {
1073
  "epoch": 4.46,
1074
  "learning_rate": 3.593314763231198e-06,
1075
+ "loss": 0.01,
1076
  "step": 1780
1077
  },
1078
  {
1079
  "epoch": 4.49,
1080
  "learning_rate": 3.426183844011142e-06,
1081
+ "loss": 0.0578,
1082
  "step": 1790
1083
  },
1084
  {
1085
  "epoch": 4.51,
1086
  "learning_rate": 3.259052924791086e-06,
1087
+ "loss": 0.047,
1088
  "step": 1800
1089
  },
1090
  {
1091
  "epoch": 4.54,
1092
  "learning_rate": 3.0919220055710307e-06,
1093
+ "loss": 0.08,
1094
  "step": 1810
1095
  },
1096
  {
1097
  "epoch": 4.56,
1098
  "learning_rate": 2.924791086350975e-06,
1099
+ "loss": 0.013,
1100
  "step": 1820
1101
  },
1102
  {
1103
  "epoch": 4.59,
1104
  "learning_rate": 2.7576601671309194e-06,
1105
+ "loss": 0.0812,
1106
  "step": 1830
1107
  },
1108
  {
1109
  "epoch": 4.61,
1110
  "learning_rate": 2.5905292479108636e-06,
1111
+ "loss": 0.037,
1112
  "step": 1840
1113
  },
1114
  {
1115
  "epoch": 4.64,
1116
  "learning_rate": 2.4233983286908077e-06,
1117
+ "loss": 0.0472,
1118
  "step": 1850
1119
  },
1120
  {
1121
  "epoch": 4.66,
1122
  "learning_rate": 2.2562674094707523e-06,
1123
+ "loss": 0.0905,
1124
  "step": 1860
1125
  },
1126
  {
1127
  "epoch": 4.69,
1128
  "learning_rate": 2.0891364902506964e-06,
1129
+ "loss": 0.0469,
1130
  "step": 1870
1131
  },
1132
  {
1133
  "epoch": 4.71,
1134
  "learning_rate": 1.922005571030641e-06,
1135
+ "loss": 0.0297,
1136
  "step": 1880
1137
  },
1138
  {
1139
  "epoch": 4.74,
1140
  "learning_rate": 1.7548746518105849e-06,
1141
+ "loss": 0.0126,
1142
  "step": 1890
1143
  },
1144
  {
1145
  "epoch": 4.76,
1146
  "learning_rate": 1.5877437325905292e-06,
1147
+ "loss": 0.094,
1148
  "step": 1900
1149
  },
1150
  {
1151
  "epoch": 4.79,
1152
  "learning_rate": 1.4206128133704736e-06,
1153
+ "loss": 0.0132,
1154
  "step": 1910
1155
  },
1156
  {
1157
  "epoch": 4.81,
1158
  "learning_rate": 1.253481894150418e-06,
1159
+ "loss": 0.1048,
1160
  "step": 1920
1161
  },
1162
  {
1163
  "epoch": 4.84,
1164
  "learning_rate": 1.0863509749303623e-06,
1165
+ "loss": 0.0526,
1166
  "step": 1930
1167
  },
1168
  {
1169
  "epoch": 4.86,
1170
  "learning_rate": 9.192200557103064e-07,
1171
+ "loss": 0.0498,
1172
  "step": 1940
1173
  },
1174
  {
 
1186
  {
1187
  "epoch": 4.94,
1188
  "learning_rate": 4.178272980501393e-07,
1189
+ "loss": 0.0078,
1190
  "step": 1970
1191
  },
1192
  {
1193
  "epoch": 4.96,
1194
  "learning_rate": 2.506963788300836e-07,
1195
+ "loss": 0.0161,
1196
  "step": 1980
1197
  },
1198
  {
1199
  "epoch": 4.99,
1200
  "learning_rate": 8.356545961002785e-08,
1201
+ "loss": 0.0226,
1202
  "step": 1990
1203
  },
1204
  {
1205
  "epoch": 5.0,
1206
  "step": 1995,
1207
  "total_flos": 1.4837528359600128e+20,
1208
+ "train_loss": 0.32014525516290115,
1209
+ "train_runtime": 392.8902,
1210
+ "train_samples_per_second": 650.233,
1211
+ "train_steps_per_second": 5.078
1212
  }
1213
  ],
1214
  "max_steps": 1995,