Joemgu commited on
Commit
557308b
1 Parent(s): 93e6d9d

Training in progress, step 200

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:537b98e9e4b084248e4303645beaeef65f5235a05ae56f3de4fb9728ce91f3d4
3
  size 4736616809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da89bbf56255467fc300c679809ab5415cdbf18df9c2c3fb2070c3e4cd902f34
3
  size 4736616809
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46d927f86c4381f851e153c7cd12d23e9c3b351d0fd5dd23d8dfbedb21e8dbf4
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1649cca59a6a8d74726a76c7340a221823661f3a836db46a53dbc67221b14982
3
  size 2368281769
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:972139d83957a9cf2600cb6eeca17287d7a5377c33a53500ae7e13fe830ad36b
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:829907c2ddd812cf0db97973145bb576e9b9568aa2102106c5f2e9d84f6b1059
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2e0f128c7ae735ba8b14877fdb8cb2ead55b72037741596f80aa07ed1f6f130
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97e255bbc5f7f71168348462c22fbdbbadbc23b19d6869fc621700a4f4ba07b1
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 2.0144975185394287,
3
- "best_model_checkpoint": "output/checkpoint-400",
4
- "epoch": 0.27347359537658705,
5
- "global_step": 400,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -10,2423 +10,1215 @@
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 2.9999999999999997e-06,
13
- "loss": 2.0077,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
  "learning_rate": 5.999999999999999e-06,
19
- "loss": 2.1786,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
  "learning_rate": 8.999999999999999e-06,
25
- "loss": 2.1404,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
  "learning_rate": 1.1999999999999999e-05,
31
- "loss": 2.1009,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
  "learning_rate": 1.4999999999999999e-05,
37
- "loss": 2.1208,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
  "learning_rate": 1.7999999999999997e-05,
43
- "loss": 2.1575,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
  "learning_rate": 2.1e-05,
49
- "loss": 2.1128,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.01,
54
  "learning_rate": 2.3999999999999997e-05,
55
- "loss": 2.2115,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.01,
60
  "learning_rate": 2.6999999999999996e-05,
61
- "loss": 2.151,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.01,
66
  "learning_rate": 2.9999999999999997e-05,
67
- "loss": 2.1026,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.01,
72
  "learning_rate": 3.2999999999999996e-05,
73
- "loss": 2.0163,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.01,
78
  "learning_rate": 3.5999999999999994e-05,
79
- "loss": 2.0976,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.01,
84
  "learning_rate": 3.9e-05,
85
- "loss": 2.0799,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.01,
90
  "learning_rate": 4.2e-05,
91
- "loss": 2.0675,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.01,
96
  "learning_rate": 4.4999999999999996e-05,
97
- "loss": 1.9994,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.01,
102
  "learning_rate": 4.7999999999999994e-05,
103
- "loss": 2.0982,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.01,
108
  "learning_rate": 5.1e-05,
109
- "loss": 2.1543,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.01,
114
  "learning_rate": 5.399999999999999e-05,
115
- "loss": 2.0545,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.01,
120
  "learning_rate": 5.6999999999999996e-05,
121
- "loss": 1.9721,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.01,
126
  "learning_rate": 5.9999999999999995e-05,
127
- "loss": 2.0459,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.01,
132
  "learning_rate": 6.299999999999999e-05,
133
- "loss": 1.9305,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.02,
138
  "learning_rate": 6.599999999999999e-05,
139
- "loss": 2.0157,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.02,
144
  "learning_rate": 6.9e-05,
145
- "loss": 2.0086,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.02,
150
  "learning_rate": 7.199999999999999e-05,
151
- "loss": 2.0465,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.02,
156
  "learning_rate": 7.5e-05,
157
- "loss": 1.9646,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.02,
162
  "learning_rate": 7.8e-05,
163
- "loss": 1.9861,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.02,
168
  "learning_rate": 8.1e-05,
169
- "loss": 2.0234,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.02,
174
  "learning_rate": 8.4e-05,
175
- "loss": 2.0791,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.02,
180
  "learning_rate": 8.699999999999999e-05,
181
- "loss": 2.0444,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.02,
186
  "learning_rate": 8.999999999999999e-05,
187
- "loss": 2.0175,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.02,
192
  "learning_rate": 9.3e-05,
193
- "loss": 2.0487,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.02,
198
  "learning_rate": 9.599999999999999e-05,
199
- "loss": 1.9894,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.02,
204
  "learning_rate": 9.9e-05,
205
- "loss": 2.0274,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.02,
210
  "learning_rate": 0.000102,
211
- "loss": 2.0745,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.02,
216
  "learning_rate": 0.00010499999999999999,
217
- "loss": 2.0062,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.02,
222
  "learning_rate": 0.00010799999999999998,
223
- "loss": 2.0295,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.03,
228
  "learning_rate": 0.00011099999999999999,
229
- "loss": 2.0375,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.03,
234
  "learning_rate": 0.00011399999999999999,
235
- "loss": 2.0231,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.03,
240
  "learning_rate": 0.000117,
241
- "loss": 2.0579,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.03,
246
  "learning_rate": 0.00011999999999999999,
247
- "loss": 2.0871,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.03,
252
  "learning_rate": 0.00012299999999999998,
253
- "loss": 1.9509,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.03,
258
  "learning_rate": 0.00012599999999999997,
259
- "loss": 1.8855,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.03,
264
  "learning_rate": 0.000129,
265
- "loss": 2.0348,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.03,
270
  "learning_rate": 0.00013199999999999998,
271
- "loss": 2.0375,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.03,
276
  "learning_rate": 0.000135,
277
- "loss": 2.0577,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.03,
282
  "learning_rate": 0.000138,
283
- "loss": 2.0738,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.03,
288
  "learning_rate": 0.00014099999999999998,
289
- "loss": 2.0123,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.03,
294
  "learning_rate": 0.00014399999999999998,
295
- "loss": 2.0051,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.03,
300
  "learning_rate": 0.000147,
301
- "loss": 2.0202,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.03,
306
  "learning_rate": 0.00015,
307
- "loss": 2.0341,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.03,
312
  "learning_rate": 0.00015299999999999998,
313
- "loss": 2.0641,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.04,
318
  "learning_rate": 0.000156,
319
- "loss": 2.0062,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.04,
324
  "learning_rate": 0.000159,
325
- "loss": 2.0098,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.04,
330
  "learning_rate": 0.000162,
331
- "loss": 2.0048,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.04,
336
  "learning_rate": 0.000165,
337
- "loss": 2.024,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.04,
342
  "learning_rate": 0.000168,
343
- "loss": 2.1095,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.04,
348
  "learning_rate": 0.00017099999999999998,
349
- "loss": 2.0006,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.04,
354
  "learning_rate": 0.00017399999999999997,
355
- "loss": 1.9974,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.04,
360
  "learning_rate": 0.00017699999999999997,
361
- "loss": 1.9859,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.04,
366
  "learning_rate": 0.00017999999999999998,
367
- "loss": 1.9709,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.04,
372
  "learning_rate": 0.00018299999999999998,
373
- "loss": 2.0321,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.04,
378
  "learning_rate": 0.000186,
379
- "loss": 2.0237,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.04,
384
  "learning_rate": 0.00018899999999999999,
385
- "loss": 2.0283,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.04,
390
  "learning_rate": 0.00019199999999999998,
391
- "loss": 2.0366,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.04,
396
  "learning_rate": 0.000195,
397
- "loss": 1.896,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.05,
402
  "learning_rate": 0.000198,
403
- "loss": 1.9694,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.05,
408
  "learning_rate": 0.000201,
409
- "loss": 2.0214,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.05,
414
  "learning_rate": 0.000204,
415
- "loss": 1.9792,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.05,
420
  "learning_rate": 0.00020699999999999996,
421
- "loss": 1.9212,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.05,
426
  "learning_rate": 0.00020999999999999998,
427
- "loss": 1.8687,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.05,
432
  "learning_rate": 0.00021299999999999997,
433
- "loss": 1.9514,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.05,
438
  "learning_rate": 0.00021599999999999996,
439
- "loss": 1.9373,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.05,
444
  "learning_rate": 0.00021899999999999998,
445
- "loss": 2.0325,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.05,
450
  "learning_rate": 0.00022199999999999998,
451
- "loss": 1.9904,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.05,
456
  "learning_rate": 0.000225,
457
- "loss": 1.9753,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.05,
462
  "learning_rate": 0.00022799999999999999,
463
- "loss": 1.9835,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.05,
468
  "learning_rate": 0.00023099999999999998,
469
- "loss": 1.866,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.05,
474
  "learning_rate": 0.000234,
475
- "loss": 1.9926,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.05,
480
  "learning_rate": 0.000237,
481
- "loss": 1.9915,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.05,
486
  "learning_rate": 0.00023999999999999998,
487
- "loss": 1.9351,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.06,
492
  "learning_rate": 0.000243,
493
- "loss": 2.008,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.06,
498
  "learning_rate": 0.00024599999999999996,
499
- "loss": 1.9728,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.06,
504
  "learning_rate": 0.000249,
505
- "loss": 2.0915,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.06,
510
  "learning_rate": 0.00025199999999999995,
511
- "loss": 2.008,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.06,
516
  "learning_rate": 0.00025499999999999996,
517
- "loss": 1.9635,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.06,
522
  "learning_rate": 0.000258,
523
- "loss": 1.9995,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.06,
528
  "learning_rate": 0.000261,
529
- "loss": 1.8813,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.06,
534
  "learning_rate": 0.00026399999999999997,
535
- "loss": 1.9433,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.06,
540
  "learning_rate": 0.000267,
541
- "loss": 1.9815,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.06,
546
  "learning_rate": 0.00027,
547
- "loss": 1.9644,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.06,
552
  "learning_rate": 0.00027299999999999997,
553
- "loss": 1.9775,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.06,
558
  "learning_rate": 0.000276,
559
- "loss": 1.9424,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.06,
564
  "learning_rate": 0.000279,
565
- "loss": 1.8931,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.06,
570
  "learning_rate": 0.00028199999999999997,
571
- "loss": 1.982,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.06,
576
  "learning_rate": 0.000285,
577
- "loss": 1.9369,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.07,
582
  "learning_rate": 0.00028799999999999995,
583
- "loss": 1.9895,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.07,
588
  "learning_rate": 0.00029099999999999997,
589
- "loss": 1.9581,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.07,
594
  "learning_rate": 0.000294,
595
- "loss": 1.9321,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.07,
600
  "learning_rate": 0.00029699999999999996,
601
- "loss": 1.972,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.07,
606
  "learning_rate": 0.0003,
607
- "loss": 2.0379,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.07,
612
  "learning_rate": 0.000303,
613
- "loss": 1.9495,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.07,
618
  "learning_rate": 0.00030599999999999996,
619
- "loss": 2.0403,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.07,
624
  "learning_rate": 0.000309,
625
- "loss": 1.9609,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.07,
630
  "learning_rate": 0.000312,
631
- "loss": 1.8649,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.07,
636
  "learning_rate": 0.00031499999999999996,
637
- "loss": 1.9075,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.07,
642
  "learning_rate": 0.000318,
643
- "loss": 1.9003,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.07,
648
  "learning_rate": 0.000321,
649
- "loss": 1.9617,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.07,
654
  "learning_rate": 0.000324,
655
- "loss": 2.0252,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.07,
660
  "learning_rate": 0.000327,
661
- "loss": 1.9904,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.08,
666
  "learning_rate": 0.00033,
667
- "loss": 1.9021,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.08,
672
  "learning_rate": 0.000333,
673
- "loss": 1.9953,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.08,
678
  "learning_rate": 0.000336,
679
- "loss": 1.9547,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.08,
684
  "learning_rate": 0.00033899999999999995,
685
- "loss": 1.9819,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.08,
690
  "learning_rate": 0.00034199999999999996,
691
- "loss": 1.9297,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.08,
696
  "learning_rate": 0.00034499999999999993,
697
- "loss": 2.0428,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.08,
702
  "learning_rate": 0.00034799999999999995,
703
- "loss": 1.9725,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.08,
708
  "learning_rate": 0.00035099999999999997,
709
- "loss": 2.011,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.08,
714
  "learning_rate": 0.00035399999999999993,
715
- "loss": 1.9258,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.08,
720
  "learning_rate": 0.00035699999999999995,
721
- "loss": 1.9744,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.08,
726
  "learning_rate": 0.00035999999999999997,
727
- "loss": 1.9285,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.08,
732
  "learning_rate": 0.00036299999999999993,
733
- "loss": 1.9671,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.08,
738
  "learning_rate": 0.00036599999999999995,
739
- "loss": 1.961,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.08,
744
  "learning_rate": 0.00036899999999999997,
745
- "loss": 1.9486,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.08,
750
  "learning_rate": 0.000372,
751
- "loss": 1.9547,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.09,
756
  "learning_rate": 0.00037499999999999995,
757
- "loss": 1.9198,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.09,
762
  "learning_rate": 0.00037799999999999997,
763
- "loss": 2.0241,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.09,
768
  "learning_rate": 0.000381,
769
- "loss": 1.9285,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.09,
774
  "learning_rate": 0.00038399999999999996,
775
- "loss": 2.0473,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.09,
780
  "learning_rate": 0.000387,
781
- "loss": 1.972,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.09,
786
  "learning_rate": 0.00039,
787
- "loss": 2.0174,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.09,
792
  "learning_rate": 0.00039299999999999996,
793
- "loss": 2.042,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.09,
798
  "learning_rate": 0.000396,
799
- "loss": 1.9143,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.09,
804
  "learning_rate": 0.000399,
805
- "loss": 1.9869,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.09,
810
  "learning_rate": 0.000402,
811
- "loss": 1.902,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.09,
816
  "learning_rate": 0.000405,
817
- "loss": 2.0594,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.09,
822
  "learning_rate": 0.000408,
823
- "loss": 1.996,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.09,
828
  "learning_rate": 0.000411,
829
- "loss": 1.9112,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.09,
834
  "learning_rate": 0.0004139999999999999,
835
- "loss": 1.8769,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.1,
840
  "learning_rate": 0.00041699999999999994,
841
- "loss": 1.9617,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.1,
846
  "learning_rate": 0.00041999999999999996,
847
- "loss": 2.0377,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.1,
852
  "learning_rate": 0.00042299999999999993,
853
- "loss": 1.9877,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.1,
858
  "learning_rate": 0.00042599999999999995,
859
- "loss": 1.9169,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.1,
864
  "learning_rate": 0.00042899999999999997,
865
- "loss": 2.0591,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.1,
870
  "learning_rate": 0.00043199999999999993,
871
- "loss": 1.8444,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.1,
876
  "learning_rate": 0.00043499999999999995,
877
- "loss": 1.9824,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.1,
882
  "learning_rate": 0.00043799999999999997,
883
- "loss": 1.9722,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.1,
888
  "learning_rate": 0.00044099999999999993,
889
- "loss": 1.9294,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.1,
894
  "learning_rate": 0.00044399999999999995,
895
- "loss": 1.9214,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.1,
900
  "learning_rate": 0.00044699999999999997,
901
- "loss": 1.9745,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.1,
906
  "learning_rate": 0.00045,
907
- "loss": 2.0246,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.1,
912
  "learning_rate": 0.00045299999999999995,
913
- "loss": 2.0081,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.1,
918
  "learning_rate": 0.00045599999999999997,
919
- "loss": 1.9461,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.1,
924
  "learning_rate": 0.000459,
925
- "loss": 2.0223,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.11,
930
  "learning_rate": 0.00046199999999999995,
931
- "loss": 1.9378,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.11,
936
  "learning_rate": 0.00046499999999999997,
937
- "loss": 2.078,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.11,
942
  "learning_rate": 0.000468,
943
- "loss": 1.8492,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.11,
948
  "learning_rate": 0.00047099999999999996,
949
- "loss": 1.9702,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.11,
954
  "learning_rate": 0.000474,
955
- "loss": 2.0145,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.11,
960
  "learning_rate": 0.000477,
961
- "loss": 1.9919,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.11,
966
  "learning_rate": 0.00047999999999999996,
967
- "loss": 1.859,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.11,
972
  "learning_rate": 0.000483,
973
- "loss": 2.0404,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.11,
978
  "learning_rate": 0.000486,
979
- "loss": 1.9834,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.11,
984
  "learning_rate": 0.000489,
985
- "loss": 1.9675,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.11,
990
  "learning_rate": 0.0004919999999999999,
991
- "loss": 1.9736,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.11,
996
  "learning_rate": 0.0004949999999999999,
997
- "loss": 1.9782,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.11,
1002
  "learning_rate": 0.000498,
1003
- "loss": 1.9871,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.11,
1008
  "learning_rate": 0.0005009999999999999,
1009
- "loss": 1.8851,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.11,
1014
  "learning_rate": 0.0005039999999999999,
1015
- "loss": 1.843,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.12,
1020
  "learning_rate": 0.000507,
1021
- "loss": 2.0048,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.12,
1026
  "learning_rate": 0.0005099999999999999,
1027
- "loss": 1.9889,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.12,
1032
  "learning_rate": 0.0005129999999999999,
1033
- "loss": 1.8957,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.12,
1038
  "learning_rate": 0.000516,
1039
- "loss": 1.973,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.12,
1044
  "learning_rate": 0.0005189999999999999,
1045
- "loss": 1.9994,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.12,
1050
  "learning_rate": 0.000522,
1051
- "loss": 1.8714,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.12,
1056
  "learning_rate": 0.000525,
1057
- "loss": 1.9058,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.12,
1062
  "learning_rate": 0.0005279999999999999,
1063
- "loss": 1.883,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.12,
1068
  "learning_rate": 0.000531,
1069
- "loss": 1.9557,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.12,
1074
  "learning_rate": 0.000534,
1075
- "loss": 1.9734,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.12,
1080
  "learning_rate": 0.0005369999999999999,
1081
- "loss": 1.9472,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.12,
1086
  "learning_rate": 0.00054,
1087
- "loss": 1.8726,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.12,
1092
  "learning_rate": 0.000543,
1093
- "loss": 1.9167,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.12,
1098
  "learning_rate": 0.0005459999999999999,
1099
- "loss": 1.9731,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.13,
1104
  "learning_rate": 0.000549,
1105
- "loss": 1.9897,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.13,
1110
  "learning_rate": 0.000552,
1111
- "loss": 2.0158,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.13,
1116
  "learning_rate": 0.0005549999999999999,
1117
- "loss": 1.8802,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.13,
1122
  "learning_rate": 0.000558,
1123
- "loss": 1.9343,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.13,
1128
  "learning_rate": 0.000561,
1129
- "loss": 1.928,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.13,
1134
  "learning_rate": 0.0005639999999999999,
1135
- "loss": 2.0039,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.13,
1140
  "learning_rate": 0.0005669999999999999,
1141
- "loss": 1.8459,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.13,
1146
  "learning_rate": 0.00057,
1147
- "loss": 1.9679,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.13,
1152
  "learning_rate": 0.0005729999999999999,
1153
- "loss": 1.9395,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.13,
1158
  "learning_rate": 0.0005759999999999999,
1159
- "loss": 1.9677,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.13,
1164
  "learning_rate": 0.000579,
1165
- "loss": 1.9577,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.13,
1170
  "learning_rate": 0.0005819999999999999,
1171
- "loss": 2.0165,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.13,
1176
  "learning_rate": 0.0005849999999999999,
1177
- "loss": 1.9697,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.13,
1182
  "learning_rate": 0.000588,
1183
- "loss": 2.0005,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.13,
1188
  "learning_rate": 0.0005909999999999999,
1189
- "loss": 1.9252,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.14,
1194
  "learning_rate": 0.0005939999999999999,
1195
- "loss": 1.9321,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.14,
1200
  "learning_rate": 0.000597,
1201
- "loss": 1.8474,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.14,
1206
  "learning_rate": 0.0006,
1207
- "loss": 1.9134,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.14,
1212
- "eval_loss": 2.0390162467956543,
1213
- "eval_runtime": 114.9464,
1214
- "eval_samples_per_second": 8.7,
1215
- "eval_steps_per_second": 8.7,
1216
  "step": 200
1217
- },
1218
- {
1219
- "epoch": 0.14,
1220
- "learning_rate": 0.0005998937677053823,
1221
- "loss": 1.9258,
1222
- "step": 201
1223
- },
1224
- {
1225
- "epoch": 0.14,
1226
- "learning_rate": 0.0005997875354107648,
1227
- "loss": 1.9414,
1228
- "step": 202
1229
- },
1230
- {
1231
- "epoch": 0.14,
1232
- "learning_rate": 0.0005996813031161472,
1233
- "loss": 1.9516,
1234
- "step": 203
1235
- },
1236
- {
1237
- "epoch": 0.14,
1238
- "learning_rate": 0.0005995750708215297,
1239
- "loss": 2.0185,
1240
- "step": 204
1241
- },
1242
- {
1243
- "epoch": 0.14,
1244
- "learning_rate": 0.0005994688385269121,
1245
- "loss": 2.0142,
1246
- "step": 205
1247
- },
1248
- {
1249
- "epoch": 0.14,
1250
- "learning_rate": 0.0005993626062322946,
1251
- "loss": 1.9021,
1252
- "step": 206
1253
- },
1254
- {
1255
- "epoch": 0.14,
1256
- "learning_rate": 0.000599256373937677,
1257
- "loss": 1.9439,
1258
- "step": 207
1259
- },
1260
- {
1261
- "epoch": 0.14,
1262
- "learning_rate": 0.0005991501416430595,
1263
- "loss": 1.9489,
1264
- "step": 208
1265
- },
1266
- {
1267
- "epoch": 0.14,
1268
- "learning_rate": 0.0005990439093484419,
1269
- "loss": 1.8585,
1270
- "step": 209
1271
- },
1272
- {
1273
- "epoch": 0.14,
1274
- "learning_rate": 0.0005989376770538244,
1275
- "loss": 1.982,
1276
- "step": 210
1277
- },
1278
- {
1279
- "epoch": 0.14,
1280
- "learning_rate": 0.0005988314447592068,
1281
- "loss": 1.9516,
1282
- "step": 211
1283
- },
1284
- {
1285
- "epoch": 0.14,
1286
- "learning_rate": 0.0005987252124645891,
1287
- "loss": 1.9344,
1288
- "step": 212
1289
- },
1290
- {
1291
- "epoch": 0.15,
1292
- "learning_rate": 0.0005986189801699716,
1293
- "loss": 1.8629,
1294
- "step": 213
1295
- },
1296
- {
1297
- "epoch": 0.15,
1298
- "learning_rate": 0.000598512747875354,
1299
- "loss": 1.9363,
1300
- "step": 214
1301
- },
1302
- {
1303
- "epoch": 0.15,
1304
- "learning_rate": 0.0005984065155807364,
1305
- "loss": 2.0402,
1306
- "step": 215
1307
- },
1308
- {
1309
- "epoch": 0.15,
1310
- "learning_rate": 0.0005983002832861189,
1311
- "loss": 1.9,
1312
- "step": 216
1313
- },
1314
- {
1315
- "epoch": 0.15,
1316
- "learning_rate": 0.0005981940509915014,
1317
- "loss": 1.8588,
1318
- "step": 217
1319
- },
1320
- {
1321
- "epoch": 0.15,
1322
- "learning_rate": 0.0005980878186968838,
1323
- "loss": 1.8654,
1324
- "step": 218
1325
- },
1326
- {
1327
- "epoch": 0.15,
1328
- "learning_rate": 0.0005979815864022663,
1329
- "loss": 1.9509,
1330
- "step": 219
1331
- },
1332
- {
1333
- "epoch": 0.15,
1334
- "learning_rate": 0.0005978753541076487,
1335
- "loss": 1.9325,
1336
- "step": 220
1337
- },
1338
- {
1339
- "epoch": 0.15,
1340
- "learning_rate": 0.0005977691218130311,
1341
- "loss": 1.8617,
1342
- "step": 221
1343
- },
1344
- {
1345
- "epoch": 0.15,
1346
- "learning_rate": 0.0005976628895184136,
1347
- "loss": 1.9164,
1348
- "step": 222
1349
- },
1350
- {
1351
- "epoch": 0.15,
1352
- "learning_rate": 0.000597556657223796,
1353
- "loss": 1.8987,
1354
- "step": 223
1355
- },
1356
- {
1357
- "epoch": 0.15,
1358
- "learning_rate": 0.0005974504249291785,
1359
- "loss": 2.0317,
1360
- "step": 224
1361
- },
1362
- {
1363
- "epoch": 0.15,
1364
- "learning_rate": 0.0005973441926345608,
1365
- "loss": 1.9936,
1366
- "step": 225
1367
- },
1368
- {
1369
- "epoch": 0.15,
1370
- "learning_rate": 0.0005972379603399432,
1371
- "loss": 1.9103,
1372
- "step": 226
1373
- },
1374
- {
1375
- "epoch": 0.16,
1376
- "learning_rate": 0.0005971317280453257,
1377
- "loss": 1.9545,
1378
- "step": 227
1379
- },
1380
- {
1381
- "epoch": 0.16,
1382
- "learning_rate": 0.0005970254957507082,
1383
- "loss": 1.9247,
1384
- "step": 228
1385
- },
1386
- {
1387
- "epoch": 0.16,
1388
- "learning_rate": 0.0005969192634560906,
1389
- "loss": 1.8792,
1390
- "step": 229
1391
- },
1392
- {
1393
- "epoch": 0.16,
1394
- "learning_rate": 0.0005968130311614731,
1395
- "loss": 1.9461,
1396
- "step": 230
1397
- },
1398
- {
1399
- "epoch": 0.16,
1400
- "learning_rate": 0.0005967067988668555,
1401
- "loss": 1.8948,
1402
- "step": 231
1403
- },
1404
- {
1405
- "epoch": 0.16,
1406
- "learning_rate": 0.0005966005665722379,
1407
- "loss": 2.0436,
1408
- "step": 232
1409
- },
1410
- {
1411
- "epoch": 0.16,
1412
- "learning_rate": 0.0005964943342776204,
1413
- "loss": 2.0009,
1414
- "step": 233
1415
- },
1416
- {
1417
- "epoch": 0.16,
1418
- "learning_rate": 0.0005963881019830028,
1419
- "loss": 1.9595,
1420
- "step": 234
1421
- },
1422
- {
1423
- "epoch": 0.16,
1424
- "learning_rate": 0.0005962818696883852,
1425
- "loss": 1.9346,
1426
- "step": 235
1427
- },
1428
- {
1429
- "epoch": 0.16,
1430
- "learning_rate": 0.0005961756373937677,
1431
- "loss": 2.0312,
1432
- "step": 236
1433
- },
1434
- {
1435
- "epoch": 0.16,
1436
- "learning_rate": 0.00059606940509915,
1437
- "loss": 2.0205,
1438
- "step": 237
1439
- },
1440
- {
1441
- "epoch": 0.16,
1442
- "learning_rate": 0.0005959631728045325,
1443
- "loss": 1.8533,
1444
- "step": 238
1445
- },
1446
- {
1447
- "epoch": 0.16,
1448
- "learning_rate": 0.000595856940509915,
1449
- "loss": 1.9943,
1450
- "step": 239
1451
- },
1452
- {
1453
- "epoch": 0.16,
1454
- "learning_rate": 0.0005957507082152974,
1455
- "loss": 1.9002,
1456
- "step": 240
1457
- },
1458
- {
1459
- "epoch": 0.16,
1460
- "learning_rate": 0.0005956444759206798,
1461
- "loss": 2.0044,
1462
- "step": 241
1463
- },
1464
- {
1465
- "epoch": 0.17,
1466
- "learning_rate": 0.0005955382436260623,
1467
- "loss": 2.032,
1468
- "step": 242
1469
- },
1470
- {
1471
- "epoch": 0.17,
1472
- "learning_rate": 0.0005954320113314447,
1473
- "loss": 1.8933,
1474
- "step": 243
1475
- },
1476
- {
1477
- "epoch": 0.17,
1478
- "learning_rate": 0.0005953257790368272,
1479
- "loss": 1.8971,
1480
- "step": 244
1481
- },
1482
- {
1483
- "epoch": 0.17,
1484
- "learning_rate": 0.0005952195467422096,
1485
- "loss": 1.9241,
1486
- "step": 245
1487
- },
1488
- {
1489
- "epoch": 0.17,
1490
- "learning_rate": 0.000595113314447592,
1491
- "loss": 1.9246,
1492
- "step": 246
1493
- },
1494
- {
1495
- "epoch": 0.17,
1496
- "learning_rate": 0.0005950070821529745,
1497
- "loss": 1.8752,
1498
- "step": 247
1499
- },
1500
- {
1501
- "epoch": 0.17,
1502
- "learning_rate": 0.0005949008498583569,
1503
- "loss": 1.8325,
1504
- "step": 248
1505
- },
1506
- {
1507
- "epoch": 0.17,
1508
- "learning_rate": 0.0005947946175637393,
1509
- "loss": 2.0427,
1510
- "step": 249
1511
- },
1512
- {
1513
- "epoch": 0.17,
1514
- "learning_rate": 0.0005946883852691218,
1515
- "loss": 2.0028,
1516
- "step": 250
1517
- },
1518
- {
1519
- "epoch": 0.17,
1520
- "learning_rate": 0.0005945821529745042,
1521
- "loss": 1.8925,
1522
- "step": 251
1523
- },
1524
- {
1525
- "epoch": 0.17,
1526
- "learning_rate": 0.0005944759206798866,
1527
- "loss": 1.9451,
1528
- "step": 252
1529
- },
1530
- {
1531
- "epoch": 0.17,
1532
- "learning_rate": 0.0005943696883852691,
1533
- "loss": 2.0214,
1534
- "step": 253
1535
- },
1536
- {
1537
- "epoch": 0.17,
1538
- "learning_rate": 0.0005942634560906515,
1539
- "loss": 1.9055,
1540
- "step": 254
1541
- },
1542
- {
1543
- "epoch": 0.17,
1544
- "learning_rate": 0.0005941572237960339,
1545
- "loss": 2.0471,
1546
- "step": 255
1547
- },
1548
- {
1549
- "epoch": 0.18,
1550
- "learning_rate": 0.0005940509915014164,
1551
- "loss": 1.9051,
1552
- "step": 256
1553
- },
1554
- {
1555
- "epoch": 0.18,
1556
- "learning_rate": 0.0005939447592067988,
1557
- "loss": 1.9161,
1558
- "step": 257
1559
- },
1560
- {
1561
- "epoch": 0.18,
1562
- "learning_rate": 0.0005938385269121813,
1563
- "loss": 1.9885,
1564
- "step": 258
1565
- },
1566
- {
1567
- "epoch": 0.18,
1568
- "learning_rate": 0.0005937322946175637,
1569
- "loss": 1.8885,
1570
- "step": 259
1571
- },
1572
- {
1573
- "epoch": 0.18,
1574
- "learning_rate": 0.000593626062322946,
1575
- "loss": 1.9312,
1576
- "step": 260
1577
- },
1578
- {
1579
- "epoch": 0.18,
1580
- "learning_rate": 0.0005935198300283285,
1581
- "loss": 2.007,
1582
- "step": 261
1583
- },
1584
- {
1585
- "epoch": 0.18,
1586
- "learning_rate": 0.000593413597733711,
1587
- "loss": 2.0364,
1588
- "step": 262
1589
- },
1590
- {
1591
- "epoch": 0.18,
1592
- "learning_rate": 0.0005933073654390934,
1593
- "loss": 1.9795,
1594
- "step": 263
1595
- },
1596
- {
1597
- "epoch": 0.18,
1598
- "learning_rate": 0.0005932011331444759,
1599
- "loss": 2.0227,
1600
- "step": 264
1601
- },
1602
- {
1603
- "epoch": 0.18,
1604
- "learning_rate": 0.0005930949008498583,
1605
- "loss": 2.0389,
1606
- "step": 265
1607
- },
1608
- {
1609
- "epoch": 0.18,
1610
- "learning_rate": 0.0005929886685552407,
1611
- "loss": 2.0966,
1612
- "step": 266
1613
- },
1614
- {
1615
- "epoch": 0.18,
1616
- "learning_rate": 0.0005928824362606232,
1617
- "loss": 2.0527,
1618
- "step": 267
1619
- },
1620
- {
1621
- "epoch": 0.18,
1622
- "learning_rate": 0.0005927762039660056,
1623
- "loss": 1.9479,
1624
- "step": 268
1625
- },
1626
- {
1627
- "epoch": 0.18,
1628
- "learning_rate": 0.0005926699716713881,
1629
- "loss": 1.9406,
1630
- "step": 269
1631
- },
1632
- {
1633
- "epoch": 0.18,
1634
- "learning_rate": 0.0005925637393767705,
1635
- "loss": 1.9553,
1636
- "step": 270
1637
- },
1638
- {
1639
- "epoch": 0.19,
1640
- "learning_rate": 0.0005924575070821529,
1641
- "loss": 1.9249,
1642
- "step": 271
1643
- },
1644
- {
1645
- "epoch": 0.19,
1646
- "learning_rate": 0.0005923512747875354,
1647
- "loss": 1.8754,
1648
- "step": 272
1649
- },
1650
- {
1651
- "epoch": 0.19,
1652
- "learning_rate": 0.0005922450424929179,
1653
- "loss": 1.9168,
1654
- "step": 273
1655
- },
1656
- {
1657
- "epoch": 0.19,
1658
- "learning_rate": 0.0005921388101983002,
1659
- "loss": 1.8502,
1660
- "step": 274
1661
- },
1662
- {
1663
- "epoch": 0.19,
1664
- "learning_rate": 0.0005920325779036826,
1665
- "loss": 1.909,
1666
- "step": 275
1667
- },
1668
- {
1669
- "epoch": 0.19,
1670
- "learning_rate": 0.0005919263456090651,
1671
- "loss": 1.9713,
1672
- "step": 276
1673
- },
1674
- {
1675
- "epoch": 0.19,
1676
- "learning_rate": 0.0005918201133144475,
1677
- "loss": 1.9817,
1678
- "step": 277
1679
- },
1680
- {
1681
- "epoch": 0.19,
1682
- "learning_rate": 0.00059171388101983,
1683
- "loss": 1.9468,
1684
- "step": 278
1685
- },
1686
- {
1687
- "epoch": 0.19,
1688
- "learning_rate": 0.0005916076487252124,
1689
- "loss": 1.9499,
1690
- "step": 279
1691
- },
1692
- {
1693
- "epoch": 0.19,
1694
- "learning_rate": 0.0005915014164305948,
1695
- "loss": 2.0154,
1696
- "step": 280
1697
- },
1698
- {
1699
- "epoch": 0.19,
1700
- "learning_rate": 0.0005913951841359773,
1701
- "loss": 2.0013,
1702
- "step": 281
1703
- },
1704
- {
1705
- "epoch": 0.19,
1706
- "learning_rate": 0.0005912889518413597,
1707
- "loss": 1.9174,
1708
- "step": 282
1709
- },
1710
- {
1711
- "epoch": 0.19,
1712
- "learning_rate": 0.0005911827195467422,
1713
- "loss": 2.0181,
1714
- "step": 283
1715
- },
1716
- {
1717
- "epoch": 0.19,
1718
- "learning_rate": 0.0005910764872521247,
1719
- "loss": 1.8739,
1720
- "step": 284
1721
- },
1722
- {
1723
- "epoch": 0.19,
1724
- "learning_rate": 0.000590970254957507,
1725
- "loss": 1.9622,
1726
- "step": 285
1727
- },
1728
- {
1729
- "epoch": 0.2,
1730
- "learning_rate": 0.0005908640226628894,
1731
- "loss": 1.992,
1732
- "step": 286
1733
- },
1734
- {
1735
- "epoch": 0.2,
1736
- "learning_rate": 0.0005907577903682719,
1737
- "loss": 1.9311,
1738
- "step": 287
1739
- },
1740
- {
1741
- "epoch": 0.2,
1742
- "learning_rate": 0.0005906515580736543,
1743
- "loss": 1.8938,
1744
- "step": 288
1745
- },
1746
- {
1747
- "epoch": 0.2,
1748
- "learning_rate": 0.0005905453257790368,
1749
- "loss": 1.939,
1750
- "step": 289
1751
- },
1752
- {
1753
- "epoch": 0.2,
1754
- "learning_rate": 0.0005904390934844192,
1755
- "loss": 1.946,
1756
- "step": 290
1757
- },
1758
- {
1759
- "epoch": 0.2,
1760
- "learning_rate": 0.0005903328611898016,
1761
- "loss": 1.969,
1762
- "step": 291
1763
- },
1764
- {
1765
- "epoch": 0.2,
1766
- "learning_rate": 0.0005902266288951841,
1767
- "loss": 1.9189,
1768
- "step": 292
1769
- },
1770
- {
1771
- "epoch": 0.2,
1772
- "learning_rate": 0.0005901203966005665,
1773
- "loss": 1.9114,
1774
- "step": 293
1775
- },
1776
- {
1777
- "epoch": 0.2,
1778
- "learning_rate": 0.000590014164305949,
1779
- "loss": 1.919,
1780
- "step": 294
1781
- },
1782
- {
1783
- "epoch": 0.2,
1784
- "learning_rate": 0.0005899079320113314,
1785
- "loss": 1.9174,
1786
- "step": 295
1787
- },
1788
- {
1789
- "epoch": 0.2,
1790
- "learning_rate": 0.0005898016997167139,
1791
- "loss": 1.9578,
1792
- "step": 296
1793
- },
1794
- {
1795
- "epoch": 0.2,
1796
- "learning_rate": 0.0005896954674220963,
1797
- "loss": 1.9573,
1798
- "step": 297
1799
- },
1800
- {
1801
- "epoch": 0.2,
1802
- "learning_rate": 0.0005895892351274787,
1803
- "loss": 1.9774,
1804
- "step": 298
1805
- },
1806
- {
1807
- "epoch": 0.2,
1808
- "learning_rate": 0.0005894830028328611,
1809
- "loss": 2.0076,
1810
- "step": 299
1811
- },
1812
- {
1813
- "epoch": 0.21,
1814
- "learning_rate": 0.0005893767705382435,
1815
- "loss": 1.9378,
1816
- "step": 300
1817
- },
1818
- {
1819
- "epoch": 0.21,
1820
- "learning_rate": 0.000589270538243626,
1821
- "loss": 1.9127,
1822
- "step": 301
1823
- },
1824
- {
1825
- "epoch": 0.21,
1826
- "learning_rate": 0.0005891643059490084,
1827
- "loss": 1.9028,
1828
- "step": 302
1829
- },
1830
- {
1831
- "epoch": 0.21,
1832
- "learning_rate": 0.0005890580736543909,
1833
- "loss": 2.0307,
1834
- "step": 303
1835
- },
1836
- {
1837
- "epoch": 0.21,
1838
- "learning_rate": 0.0005889518413597733,
1839
- "loss": 1.886,
1840
- "step": 304
1841
- },
1842
- {
1843
- "epoch": 0.21,
1844
- "learning_rate": 0.0005888456090651558,
1845
- "loss": 1.9288,
1846
- "step": 305
1847
- },
1848
- {
1849
- "epoch": 0.21,
1850
- "learning_rate": 0.0005887393767705382,
1851
- "loss": 1.9184,
1852
- "step": 306
1853
- },
1854
- {
1855
- "epoch": 0.21,
1856
- "learning_rate": 0.0005886331444759207,
1857
- "loss": 1.8903,
1858
- "step": 307
1859
- },
1860
- {
1861
- "epoch": 0.21,
1862
- "learning_rate": 0.0005885269121813031,
1863
- "loss": 1.9464,
1864
- "step": 308
1865
- },
1866
- {
1867
- "epoch": 0.21,
1868
- "learning_rate": 0.0005884206798866856,
1869
- "loss": 1.9017,
1870
- "step": 309
1871
- },
1872
- {
1873
- "epoch": 0.21,
1874
- "learning_rate": 0.000588314447592068,
1875
- "loss": 1.9359,
1876
- "step": 310
1877
- },
1878
- {
1879
- "epoch": 0.21,
1880
- "learning_rate": 0.0005882082152974503,
1881
- "loss": 2.07,
1882
- "step": 311
1883
- },
1884
- {
1885
- "epoch": 0.21,
1886
- "learning_rate": 0.0005881019830028328,
1887
- "loss": 1.899,
1888
- "step": 312
1889
- },
1890
- {
1891
- "epoch": 0.21,
1892
- "learning_rate": 0.0005879957507082152,
1893
- "loss": 1.9346,
1894
- "step": 313
1895
- },
1896
- {
1897
- "epoch": 0.21,
1898
- "learning_rate": 0.0005878895184135976,
1899
- "loss": 1.9751,
1900
- "step": 314
1901
- },
1902
- {
1903
- "epoch": 0.22,
1904
- "learning_rate": 0.0005877832861189801,
1905
- "loss": 1.8593,
1906
- "step": 315
1907
- },
1908
- {
1909
- "epoch": 0.22,
1910
- "learning_rate": 0.0005876770538243626,
1911
- "loss": 1.9072,
1912
- "step": 316
1913
- },
1914
- {
1915
- "epoch": 0.22,
1916
- "learning_rate": 0.000587570821529745,
1917
- "loss": 2.0165,
1918
- "step": 317
1919
- },
1920
- {
1921
- "epoch": 0.22,
1922
- "learning_rate": 0.0005874645892351275,
1923
- "loss": 2.0048,
1924
- "step": 318
1925
- },
1926
- {
1927
- "epoch": 0.22,
1928
- "learning_rate": 0.0005873583569405099,
1929
- "loss": 2.0082,
1930
- "step": 319
1931
- },
1932
- {
1933
- "epoch": 0.22,
1934
- "learning_rate": 0.0005872521246458923,
1935
- "loss": 1.9908,
1936
- "step": 320
1937
- },
1938
- {
1939
- "epoch": 0.22,
1940
- "learning_rate": 0.0005871458923512748,
1941
- "loss": 1.8866,
1942
- "step": 321
1943
- },
1944
- {
1945
- "epoch": 0.22,
1946
- "learning_rate": 0.0005870396600566571,
1947
- "loss": 1.8997,
1948
- "step": 322
1949
- },
1950
- {
1951
- "epoch": 0.22,
1952
- "learning_rate": 0.0005869334277620396,
1953
- "loss": 2.0193,
1954
- "step": 323
1955
- },
1956
- {
1957
- "epoch": 0.22,
1958
- "learning_rate": 0.000586827195467422,
1959
- "loss": 2.0157,
1960
- "step": 324
1961
- },
1962
- {
1963
- "epoch": 0.22,
1964
- "learning_rate": 0.0005867209631728044,
1965
- "loss": 1.9045,
1966
- "step": 325
1967
- },
1968
- {
1969
- "epoch": 0.22,
1970
- "learning_rate": 0.0005866147308781869,
1971
- "loss": 1.9727,
1972
- "step": 326
1973
- },
1974
- {
1975
- "epoch": 0.22,
1976
- "learning_rate": 0.0005865084985835694,
1977
- "loss": 1.9071,
1978
- "step": 327
1979
- },
1980
- {
1981
- "epoch": 0.22,
1982
- "learning_rate": 0.0005864022662889518,
1983
- "loss": 1.9013,
1984
- "step": 328
1985
- },
1986
- {
1987
- "epoch": 0.22,
1988
- "learning_rate": 0.0005862960339943343,
1989
- "loss": 1.9227,
1990
- "step": 329
1991
- },
1992
- {
1993
- "epoch": 0.23,
1994
- "learning_rate": 0.0005861898016997167,
1995
- "loss": 1.9121,
1996
- "step": 330
1997
- },
1998
- {
1999
- "epoch": 0.23,
2000
- "learning_rate": 0.0005860835694050991,
2001
- "loss": 1.9515,
2002
- "step": 331
2003
- },
2004
- {
2005
- "epoch": 0.23,
2006
- "learning_rate": 0.0005859773371104816,
2007
- "loss": 1.8773,
2008
- "step": 332
2009
- },
2010
- {
2011
- "epoch": 0.23,
2012
- "learning_rate": 0.000585871104815864,
2013
- "loss": 1.979,
2014
- "step": 333
2015
- },
2016
- {
2017
- "epoch": 0.23,
2018
- "learning_rate": 0.0005857648725212463,
2019
- "loss": 1.9036,
2020
- "step": 334
2021
- },
2022
- {
2023
- "epoch": 0.23,
2024
- "learning_rate": 0.0005856586402266288,
2025
- "loss": 1.9582,
2026
- "step": 335
2027
- },
2028
- {
2029
- "epoch": 0.23,
2030
- "learning_rate": 0.0005855524079320112,
2031
- "loss": 1.8322,
2032
- "step": 336
2033
- },
2034
- {
2035
- "epoch": 0.23,
2036
- "learning_rate": 0.0005854461756373937,
2037
- "loss": 2.0046,
2038
- "step": 337
2039
- },
2040
- {
2041
- "epoch": 0.23,
2042
- "learning_rate": 0.0005853399433427762,
2043
- "loss": 1.9435,
2044
- "step": 338
2045
- },
2046
- {
2047
- "epoch": 0.23,
2048
- "learning_rate": 0.0005852337110481586,
2049
- "loss": 1.942,
2050
- "step": 339
2051
- },
2052
- {
2053
- "epoch": 0.23,
2054
- "learning_rate": 0.000585127478753541,
2055
- "loss": 1.9315,
2056
- "step": 340
2057
- },
2058
- {
2059
- "epoch": 0.23,
2060
- "learning_rate": 0.0005850212464589235,
2061
- "loss": 1.95,
2062
- "step": 341
2063
- },
2064
- {
2065
- "epoch": 0.23,
2066
- "learning_rate": 0.0005849150141643059,
2067
- "loss": 1.9401,
2068
- "step": 342
2069
- },
2070
- {
2071
- "epoch": 0.23,
2072
- "learning_rate": 0.0005848087818696884,
2073
- "loss": 1.98,
2074
- "step": 343
2075
- },
2076
- {
2077
- "epoch": 0.24,
2078
- "learning_rate": 0.0005847025495750708,
2079
- "loss": 1.9448,
2080
- "step": 344
2081
- },
2082
- {
2083
- "epoch": 0.24,
2084
- "learning_rate": 0.0005845963172804532,
2085
- "loss": 1.9539,
2086
- "step": 345
2087
- },
2088
- {
2089
- "epoch": 0.24,
2090
- "learning_rate": 0.0005844900849858357,
2091
- "loss": 2.0188,
2092
- "step": 346
2093
- },
2094
- {
2095
- "epoch": 0.24,
2096
- "learning_rate": 0.000584383852691218,
2097
- "loss": 1.9267,
2098
- "step": 347
2099
- },
2100
- {
2101
- "epoch": 0.24,
2102
- "learning_rate": 0.0005842776203966005,
2103
- "loss": 2.0382,
2104
- "step": 348
2105
- },
2106
- {
2107
- "epoch": 0.24,
2108
- "learning_rate": 0.000584171388101983,
2109
- "loss": 1.9714,
2110
- "step": 349
2111
- },
2112
- {
2113
- "epoch": 0.24,
2114
- "learning_rate": 0.0005840651558073654,
2115
- "loss": 1.9789,
2116
- "step": 350
2117
- },
2118
- {
2119
- "epoch": 0.24,
2120
- "learning_rate": 0.0005839589235127478,
2121
- "loss": 1.8664,
2122
- "step": 351
2123
- },
2124
- {
2125
- "epoch": 0.24,
2126
- "learning_rate": 0.0005838526912181303,
2127
- "loss": 2.0072,
2128
- "step": 352
2129
- },
2130
- {
2131
- "epoch": 0.24,
2132
- "learning_rate": 0.0005837464589235127,
2133
- "loss": 1.936,
2134
- "step": 353
2135
- },
2136
- {
2137
- "epoch": 0.24,
2138
- "learning_rate": 0.0005836402266288951,
2139
- "loss": 1.8285,
2140
- "step": 354
2141
- },
2142
- {
2143
- "epoch": 0.24,
2144
- "learning_rate": 0.0005835339943342776,
2145
- "loss": 1.9294,
2146
- "step": 355
2147
- },
2148
- {
2149
- "epoch": 0.24,
2150
- "learning_rate": 0.00058342776203966,
2151
- "loss": 1.8979,
2152
- "step": 356
2153
- },
2154
- {
2155
- "epoch": 0.24,
2156
- "learning_rate": 0.0005833215297450425,
2157
- "loss": 1.9088,
2158
- "step": 357
2159
- },
2160
- {
2161
- "epoch": 0.24,
2162
- "learning_rate": 0.0005832152974504249,
2163
- "loss": 1.9107,
2164
- "step": 358
2165
- },
2166
- {
2167
- "epoch": 0.25,
2168
- "learning_rate": 0.0005831090651558072,
2169
- "loss": 1.9215,
2170
- "step": 359
2171
- },
2172
- {
2173
- "epoch": 0.25,
2174
- "learning_rate": 0.0005830028328611897,
2175
- "loss": 1.9034,
2176
- "step": 360
2177
- },
2178
- {
2179
- "epoch": 0.25,
2180
- "learning_rate": 0.0005828966005665722,
2181
- "loss": 1.8581,
2182
- "step": 361
2183
- },
2184
- {
2185
- "epoch": 0.25,
2186
- "learning_rate": 0.0005827903682719546,
2187
- "loss": 1.8922,
2188
- "step": 362
2189
- },
2190
- {
2191
- "epoch": 0.25,
2192
- "learning_rate": 0.0005826841359773371,
2193
- "loss": 1.8397,
2194
- "step": 363
2195
- },
2196
- {
2197
- "epoch": 0.25,
2198
- "learning_rate": 0.0005825779036827195,
2199
- "loss": 1.9799,
2200
- "step": 364
2201
- },
2202
- {
2203
- "epoch": 0.25,
2204
- "learning_rate": 0.0005824716713881019,
2205
- "loss": 1.9085,
2206
- "step": 365
2207
- },
2208
- {
2209
- "epoch": 0.25,
2210
- "learning_rate": 0.0005823654390934844,
2211
- "loss": 1.865,
2212
- "step": 366
2213
- },
2214
- {
2215
- "epoch": 0.25,
2216
- "learning_rate": 0.0005822592067988668,
2217
- "loss": 2.0325,
2218
- "step": 367
2219
- },
2220
- {
2221
- "epoch": 0.25,
2222
- "learning_rate": 0.0005821529745042493,
2223
- "loss": 1.9283,
2224
- "step": 368
2225
- },
2226
- {
2227
- "epoch": 0.25,
2228
- "learning_rate": 0.0005820467422096317,
2229
- "loss": 1.8934,
2230
- "step": 369
2231
- },
2232
- {
2233
- "epoch": 0.25,
2234
- "learning_rate": 0.000581940509915014,
2235
- "loss": 1.9225,
2236
- "step": 370
2237
- },
2238
- {
2239
- "epoch": 0.25,
2240
- "learning_rate": 0.0005818342776203965,
2241
- "loss": 1.8816,
2242
- "step": 371
2243
- },
2244
- {
2245
- "epoch": 0.25,
2246
- "learning_rate": 0.000581728045325779,
2247
- "loss": 1.897,
2248
- "step": 372
2249
- },
2250
- {
2251
- "epoch": 0.26,
2252
- "learning_rate": 0.0005816218130311614,
2253
- "loss": 1.9609,
2254
- "step": 373
2255
- },
2256
- {
2257
- "epoch": 0.26,
2258
- "learning_rate": 0.0005815155807365438,
2259
- "loss": 1.9549,
2260
- "step": 374
2261
- },
2262
- {
2263
- "epoch": 0.26,
2264
- "learning_rate": 0.0005814093484419263,
2265
- "loss": 2.0133,
2266
- "step": 375
2267
- },
2268
- {
2269
- "epoch": 0.26,
2270
- "learning_rate": 0.0005813031161473087,
2271
- "loss": 1.8156,
2272
- "step": 376
2273
- },
2274
- {
2275
- "epoch": 0.26,
2276
- "learning_rate": 0.0005811968838526912,
2277
- "loss": 1.9295,
2278
- "step": 377
2279
- },
2280
- {
2281
- "epoch": 0.26,
2282
- "learning_rate": 0.0005810906515580736,
2283
- "loss": 1.9548,
2284
- "step": 378
2285
- },
2286
- {
2287
- "epoch": 0.26,
2288
- "learning_rate": 0.000580984419263456,
2289
- "loss": 1.9483,
2290
- "step": 379
2291
- },
2292
- {
2293
- "epoch": 0.26,
2294
- "learning_rate": 0.0005808781869688385,
2295
- "loss": 1.8704,
2296
- "step": 380
2297
- },
2298
- {
2299
- "epoch": 0.26,
2300
- "learning_rate": 0.0005807719546742209,
2301
- "loss": 1.9807,
2302
- "step": 381
2303
- },
2304
- {
2305
- "epoch": 0.26,
2306
- "learning_rate": 0.0005806657223796034,
2307
- "loss": 1.9642,
2308
- "step": 382
2309
- },
2310
- {
2311
- "epoch": 0.26,
2312
- "learning_rate": 0.0005805594900849859,
2313
- "loss": 1.9625,
2314
- "step": 383
2315
- },
2316
- {
2317
- "epoch": 0.26,
2318
- "learning_rate": 0.0005804532577903682,
2319
- "loss": 1.8797,
2320
- "step": 384
2321
- },
2322
- {
2323
- "epoch": 0.26,
2324
- "learning_rate": 0.0005803470254957506,
2325
- "loss": 1.933,
2326
- "step": 385
2327
- },
2328
- {
2329
- "epoch": 0.26,
2330
- "learning_rate": 0.0005802407932011331,
2331
- "loss": 1.8383,
2332
- "step": 386
2333
- },
2334
- {
2335
- "epoch": 0.26,
2336
- "learning_rate": 0.0005801345609065155,
2337
- "loss": 1.9156,
2338
- "step": 387
2339
- },
2340
- {
2341
- "epoch": 0.27,
2342
- "learning_rate": 0.000580028328611898,
2343
- "loss": 1.9132,
2344
- "step": 388
2345
- },
2346
- {
2347
- "epoch": 0.27,
2348
- "learning_rate": 0.0005799220963172804,
2349
- "loss": 2.038,
2350
- "step": 389
2351
- },
2352
- {
2353
- "epoch": 0.27,
2354
- "learning_rate": 0.0005798158640226628,
2355
- "loss": 1.9887,
2356
- "step": 390
2357
- },
2358
- {
2359
- "epoch": 0.27,
2360
- "learning_rate": 0.0005797096317280453,
2361
- "loss": 1.8482,
2362
- "step": 391
2363
- },
2364
- {
2365
- "epoch": 0.27,
2366
- "learning_rate": 0.0005796033994334277,
2367
- "loss": 1.9472,
2368
- "step": 392
2369
- },
2370
- {
2371
- "epoch": 0.27,
2372
- "learning_rate": 0.0005794971671388101,
2373
- "loss": 1.9624,
2374
- "step": 393
2375
- },
2376
- {
2377
- "epoch": 0.27,
2378
- "learning_rate": 0.0005793909348441927,
2379
- "loss": 1.8782,
2380
- "step": 394
2381
- },
2382
- {
2383
- "epoch": 0.27,
2384
- "learning_rate": 0.000579284702549575,
2385
- "loss": 1.8894,
2386
- "step": 395
2387
- },
2388
- {
2389
- "epoch": 0.27,
2390
- "learning_rate": 0.0005791784702549574,
2391
- "loss": 1.8229,
2392
- "step": 396
2393
- },
2394
- {
2395
- "epoch": 0.27,
2396
- "learning_rate": 0.0005790722379603399,
2397
- "loss": 1.8939,
2398
- "step": 397
2399
- },
2400
- {
2401
- "epoch": 0.27,
2402
- "learning_rate": 0.0005789660056657223,
2403
- "loss": 1.886,
2404
- "step": 398
2405
- },
2406
- {
2407
- "epoch": 0.27,
2408
- "learning_rate": 0.0005788597733711047,
2409
- "loss": 1.8785,
2410
- "step": 399
2411
- },
2412
- {
2413
- "epoch": 0.27,
2414
- "learning_rate": 0.0005787535410764872,
2415
- "loss": 1.8943,
2416
- "step": 400
2417
- },
2418
- {
2419
- "epoch": 0.27,
2420
- "eval_loss": 2.0144975185394287,
2421
- "eval_runtime": 115.2996,
2422
- "eval_samples_per_second": 8.673,
2423
- "eval_steps_per_second": 8.673,
2424
- "step": 400
2425
  }
2426
  ],
2427
- "max_steps": 5848,
2428
- "num_train_epochs": 4,
2429
- "total_flos": 4.8975138397771776e+17,
2430
  "trial_name": null,
2431
  "trial_params": null
2432
  }
 
1
  {
2
+ "best_metric": 2.0633840560913086,
3
+ "best_model_checkpoint": "output/checkpoint-200",
4
+ "epoch": 0.13673679768829353,
5
+ "global_step": 200,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 2.9999999999999997e-06,
13
+ "loss": 2.1853,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
  "learning_rate": 5.999999999999999e-06,
19
+ "loss": 2.3789,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
  "learning_rate": 8.999999999999999e-06,
25
+ "loss": 2.3312,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
  "learning_rate": 1.1999999999999999e-05,
31
+ "loss": 2.2895,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
  "learning_rate": 1.4999999999999999e-05,
37
+ "loss": 2.3325,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
  "learning_rate": 1.7999999999999997e-05,
43
+ "loss": 2.3354,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
  "learning_rate": 2.1e-05,
49
+ "loss": 2.3068,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.01,
54
  "learning_rate": 2.3999999999999997e-05,
55
+ "loss": 2.3766,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.01,
60
  "learning_rate": 2.6999999999999996e-05,
61
+ "loss": 2.3448,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.01,
66
  "learning_rate": 2.9999999999999997e-05,
67
+ "loss": 2.2754,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.01,
72
  "learning_rate": 3.2999999999999996e-05,
73
+ "loss": 2.2119,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.01,
78
  "learning_rate": 3.5999999999999994e-05,
79
+ "loss": 2.3228,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.01,
84
  "learning_rate": 3.9e-05,
85
+ "loss": 2.2545,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.01,
90
  "learning_rate": 4.2e-05,
91
+ "loss": 2.2331,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.01,
96
  "learning_rate": 4.4999999999999996e-05,
97
+ "loss": 2.1402,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.01,
102
  "learning_rate": 4.7999999999999994e-05,
103
+ "loss": 2.2808,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.01,
108
  "learning_rate": 5.1e-05,
109
+ "loss": 2.3187,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.01,
114
  "learning_rate": 5.399999999999999e-05,
115
+ "loss": 2.2397,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.01,
120
  "learning_rate": 5.6999999999999996e-05,
121
+ "loss": 2.1723,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.01,
126
  "learning_rate": 5.9999999999999995e-05,
127
+ "loss": 2.2569,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.01,
132
  "learning_rate": 6.299999999999999e-05,
133
+ "loss": 2.1196,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.02,
138
  "learning_rate": 6.599999999999999e-05,
139
+ "loss": 2.2139,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.02,
144
  "learning_rate": 6.9e-05,
145
+ "loss": 2.1917,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.02,
150
  "learning_rate": 7.199999999999999e-05,
151
+ "loss": 2.2482,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.02,
156
  "learning_rate": 7.5e-05,
157
+ "loss": 2.099,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.02,
162
  "learning_rate": 7.8e-05,
163
+ "loss": 2.1668,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.02,
168
  "learning_rate": 8.1e-05,
169
+ "loss": 2.2079,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.02,
174
  "learning_rate": 8.4e-05,
175
+ "loss": 2.2618,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.02,
180
  "learning_rate": 8.699999999999999e-05,
181
+ "loss": 2.2295,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.02,
186
  "learning_rate": 8.999999999999999e-05,
187
+ "loss": 2.2095,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.02,
192
  "learning_rate": 9.3e-05,
193
+ "loss": 2.219,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.02,
198
  "learning_rate": 9.599999999999999e-05,
199
+ "loss": 2.172,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.02,
204
  "learning_rate": 9.9e-05,
205
+ "loss": 2.2062,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.02,
210
  "learning_rate": 0.000102,
211
+ "loss": 2.2535,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.02,
216
  "learning_rate": 0.00010499999999999999,
217
+ "loss": 2.164,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.02,
222
  "learning_rate": 0.00010799999999999998,
223
+ "loss": 2.2069,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.03,
228
  "learning_rate": 0.00011099999999999999,
229
+ "loss": 2.1966,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.03,
234
  "learning_rate": 0.00011399999999999999,
235
+ "loss": 2.1929,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.03,
240
  "learning_rate": 0.000117,
241
+ "loss": 2.2215,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.03,
246
  "learning_rate": 0.00011999999999999999,
247
+ "loss": 2.2541,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.03,
252
  "learning_rate": 0.00012299999999999998,
253
+ "loss": 2.1316,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.03,
258
  "learning_rate": 0.00012599999999999997,
259
+ "loss": 2.0486,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.03,
264
  "learning_rate": 0.000129,
265
+ "loss": 2.2175,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.03,
270
  "learning_rate": 0.00013199999999999998,
271
+ "loss": 2.2277,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.03,
276
  "learning_rate": 0.000135,
277
+ "loss": 2.2629,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.03,
282
  "learning_rate": 0.000138,
283
+ "loss": 2.2549,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.03,
288
  "learning_rate": 0.00014099999999999998,
289
+ "loss": 2.1836,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.03,
294
  "learning_rate": 0.00014399999999999998,
295
+ "loss": 2.1772,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.03,
300
  "learning_rate": 0.000147,
301
+ "loss": 2.2013,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.03,
306
  "learning_rate": 0.00015,
307
+ "loss": 2.2103,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.03,
312
  "learning_rate": 0.00015299999999999998,
313
+ "loss": 2.244,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.04,
318
  "learning_rate": 0.000156,
319
+ "loss": 2.178,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.04,
324
  "learning_rate": 0.000159,
325
+ "loss": 2.1942,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.04,
330
  "learning_rate": 0.000162,
331
+ "loss": 2.171,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.04,
336
  "learning_rate": 0.000165,
337
+ "loss": 2.2166,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.04,
342
  "learning_rate": 0.000168,
343
+ "loss": 2.2737,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.04,
348
  "learning_rate": 0.00017099999999999998,
349
+ "loss": 2.179,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.04,
354
  "learning_rate": 0.00017399999999999997,
355
+ "loss": 2.173,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.04,
360
  "learning_rate": 0.00017699999999999997,
361
+ "loss": 2.1753,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.04,
366
  "learning_rate": 0.00017999999999999998,
367
+ "loss": 2.1591,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.04,
372
  "learning_rate": 0.00018299999999999998,
373
+ "loss": 2.1936,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.04,
378
  "learning_rate": 0.000186,
379
+ "loss": 2.1971,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.04,
384
  "learning_rate": 0.00018899999999999999,
385
+ "loss": 2.209,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.04,
390
  "learning_rate": 0.00019199999999999998,
391
+ "loss": 2.2032,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.04,
396
  "learning_rate": 0.000195,
397
+ "loss": 2.0609,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.05,
402
  "learning_rate": 0.000198,
403
+ "loss": 2.1394,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.05,
408
  "learning_rate": 0.000201,
409
+ "loss": 2.1906,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.05,
414
  "learning_rate": 0.000204,
415
+ "loss": 2.1658,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.05,
420
  "learning_rate": 0.00020699999999999996,
421
+ "loss": 2.0702,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.05,
426
  "learning_rate": 0.00020999999999999998,
427
+ "loss": 2.0302,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.05,
432
  "learning_rate": 0.00021299999999999997,
433
+ "loss": 2.1205,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.05,
438
  "learning_rate": 0.00021599999999999996,
439
+ "loss": 2.1048,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.05,
444
  "learning_rate": 0.00021899999999999998,
445
+ "loss": 2.2125,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.05,
450
  "learning_rate": 0.00022199999999999998,
451
+ "loss": 2.1554,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.05,
456
  "learning_rate": 0.000225,
457
+ "loss": 2.1242,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.05,
462
  "learning_rate": 0.00022799999999999999,
463
+ "loss": 2.1383,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.05,
468
  "learning_rate": 0.00023099999999999998,
469
+ "loss": 2.0299,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.05,
474
  "learning_rate": 0.000234,
475
+ "loss": 2.1578,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.05,
480
  "learning_rate": 0.000237,
481
+ "loss": 2.1432,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.05,
486
  "learning_rate": 0.00023999999999999998,
487
+ "loss": 2.1129,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.06,
492
  "learning_rate": 0.000243,
493
+ "loss": 2.1672,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.06,
498
  "learning_rate": 0.00024599999999999996,
499
+ "loss": 2.1425,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.06,
504
  "learning_rate": 0.000249,
505
+ "loss": 2.2758,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.06,
510
  "learning_rate": 0.00025199999999999995,
511
+ "loss": 2.173,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.06,
516
  "learning_rate": 0.00025499999999999996,
517
+ "loss": 2.1262,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.06,
522
  "learning_rate": 0.000258,
523
+ "loss": 2.1657,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.06,
528
  "learning_rate": 0.000261,
529
+ "loss": 2.0322,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.06,
534
  "learning_rate": 0.00026399999999999997,
535
+ "loss": 2.0887,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.06,
540
  "learning_rate": 0.000267,
541
+ "loss": 2.135,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.06,
546
  "learning_rate": 0.00027,
547
+ "loss": 2.1048,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.06,
552
  "learning_rate": 0.00027299999999999997,
553
+ "loss": 2.1313,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.06,
558
  "learning_rate": 0.000276,
559
+ "loss": 2.0808,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.06,
564
  "learning_rate": 0.000279,
565
+ "loss": 2.0489,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.06,
570
  "learning_rate": 0.00028199999999999997,
571
+ "loss": 2.1537,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.06,
576
  "learning_rate": 0.000285,
577
+ "loss": 2.1017,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.07,
582
  "learning_rate": 0.00028799999999999995,
583
+ "loss": 2.1312,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.07,
588
  "learning_rate": 0.00029099999999999997,
589
+ "loss": 2.1248,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.07,
594
  "learning_rate": 0.000294,
595
+ "loss": 2.0856,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.07,
600
  "learning_rate": 0.00029699999999999996,
601
+ "loss": 2.1286,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.07,
606
  "learning_rate": 0.0003,
607
+ "loss": 2.202,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.07,
612
  "learning_rate": 0.000303,
613
+ "loss": 2.0967,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.07,
618
  "learning_rate": 0.00030599999999999996,
619
+ "loss": 2.1731,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.07,
624
  "learning_rate": 0.000309,
625
+ "loss": 2.1321,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.07,
630
  "learning_rate": 0.000312,
631
+ "loss": 2.0159,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.07,
636
  "learning_rate": 0.00031499999999999996,
637
+ "loss": 2.071,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.07,
642
  "learning_rate": 0.000318,
643
+ "loss": 2.0542,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.07,
648
  "learning_rate": 0.000321,
649
+ "loss": 2.1085,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.07,
654
  "learning_rate": 0.000324,
655
+ "loss": 2.1758,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.07,
660
  "learning_rate": 0.000327,
661
+ "loss": 2.1243,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.08,
666
  "learning_rate": 0.00033,
667
+ "loss": 2.0375,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.08,
672
  "learning_rate": 0.000333,
673
+ "loss": 2.1409,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.08,
678
  "learning_rate": 0.000336,
679
+ "loss": 2.1071,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.08,
684
  "learning_rate": 0.00033899999999999995,
685
+ "loss": 2.1259,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.08,
690
  "learning_rate": 0.00034199999999999996,
691
+ "loss": 2.0782,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.08,
696
  "learning_rate": 0.00034499999999999993,
697
+ "loss": 2.2005,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.08,
702
  "learning_rate": 0.00034799999999999995,
703
+ "loss": 2.1199,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.08,
708
  "learning_rate": 0.00035099999999999997,
709
+ "loss": 2.1757,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.08,
714
  "learning_rate": 0.00035399999999999993,
715
+ "loss": 2.0705,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.08,
720
  "learning_rate": 0.00035699999999999995,
721
+ "loss": 2.1261,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.08,
726
  "learning_rate": 0.00035999999999999997,
727
+ "loss": 2.0633,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.08,
732
  "learning_rate": 0.00036299999999999993,
733
+ "loss": 2.1012,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.08,
738
  "learning_rate": 0.00036599999999999995,
739
+ "loss": 2.1077,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.08,
744
  "learning_rate": 0.00036899999999999997,
745
+ "loss": 2.0978,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.08,
750
  "learning_rate": 0.000372,
751
+ "loss": 2.0924,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.09,
756
  "learning_rate": 0.00037499999999999995,
757
+ "loss": 2.0565,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.09,
762
  "learning_rate": 0.00037799999999999997,
763
+ "loss": 2.1612,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.09,
768
  "learning_rate": 0.000381,
769
+ "loss": 2.0699,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.09,
774
  "learning_rate": 0.00038399999999999996,
775
+ "loss": 2.1886,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.09,
780
  "learning_rate": 0.000387,
781
+ "loss": 2.1232,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.09,
786
  "learning_rate": 0.00039,
787
+ "loss": 2.1499,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.09,
792
  "learning_rate": 0.00039299999999999996,
793
+ "loss": 2.1805,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.09,
798
  "learning_rate": 0.000396,
799
+ "loss": 2.0487,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.09,
804
  "learning_rate": 0.000399,
805
+ "loss": 2.1278,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.09,
810
  "learning_rate": 0.000402,
811
+ "loss": 2.0348,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.09,
816
  "learning_rate": 0.000405,
817
+ "loss": 2.1963,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.09,
822
  "learning_rate": 0.000408,
823
+ "loss": 2.1453,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.09,
828
  "learning_rate": 0.000411,
829
+ "loss": 2.035,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.09,
834
  "learning_rate": 0.0004139999999999999,
835
+ "loss": 2.0129,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.1,
840
  "learning_rate": 0.00041699999999999994,
841
+ "loss": 2.0832,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.1,
846
  "learning_rate": 0.00041999999999999996,
847
+ "loss": 2.1616,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.1,
852
  "learning_rate": 0.00042299999999999993,
853
+ "loss": 2.1153,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.1,
858
  "learning_rate": 0.00042599999999999995,
859
+ "loss": 2.0437,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.1,
864
  "learning_rate": 0.00042899999999999997,
865
+ "loss": 2.1931,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.1,
870
  "learning_rate": 0.00043199999999999993,
871
+ "loss": 1.9648,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.1,
876
  "learning_rate": 0.00043499999999999995,
877
+ "loss": 2.0986,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.1,
882
  "learning_rate": 0.00043799999999999997,
883
+ "loss": 2.0978,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.1,
888
  "learning_rate": 0.00044099999999999993,
889
+ "loss": 2.0683,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.1,
894
  "learning_rate": 0.00044399999999999995,
895
+ "loss": 2.0536,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.1,
900
  "learning_rate": 0.00044699999999999997,
901
+ "loss": 2.1095,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.1,
906
  "learning_rate": 0.00045,
907
+ "loss": 2.154,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.1,
912
  "learning_rate": 0.00045299999999999995,
913
+ "loss": 2.1351,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.1,
918
  "learning_rate": 0.00045599999999999997,
919
+ "loss": 2.0517,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.1,
924
  "learning_rate": 0.000459,
925
+ "loss": 2.1572,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.11,
930
  "learning_rate": 0.00046199999999999995,
931
+ "loss": 2.0666,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.11,
936
  "learning_rate": 0.00046499999999999997,
937
+ "loss": 2.195,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.11,
942
  "learning_rate": 0.000468,
943
+ "loss": 1.9685,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.11,
948
  "learning_rate": 0.00047099999999999996,
949
+ "loss": 2.11,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.11,
954
  "learning_rate": 0.000474,
955
+ "loss": 2.1476,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.11,
960
  "learning_rate": 0.000477,
961
+ "loss": 2.0957,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.11,
966
  "learning_rate": 0.00047999999999999996,
967
+ "loss": 1.9753,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.11,
972
  "learning_rate": 0.000483,
973
+ "loss": 2.1659,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.11,
978
  "learning_rate": 0.000486,
979
+ "loss": 2.0978,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.11,
984
  "learning_rate": 0.000489,
985
+ "loss": 2.0742,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.11,
990
  "learning_rate": 0.0004919999999999999,
991
+ "loss": 2.0865,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.11,
996
  "learning_rate": 0.0004949999999999999,
997
+ "loss": 2.0911,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.11,
1002
  "learning_rate": 0.000498,
1003
+ "loss": 2.1147,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.11,
1008
  "learning_rate": 0.0005009999999999999,
1009
+ "loss": 1.9884,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.11,
1014
  "learning_rate": 0.0005039999999999999,
1015
+ "loss": 1.9497,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.12,
1020
  "learning_rate": 0.000507,
1021
+ "loss": 2.1134,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.12,
1026
  "learning_rate": 0.0005099999999999999,
1027
+ "loss": 2.1097,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.12,
1032
  "learning_rate": 0.0005129999999999999,
1033
+ "loss": 2.0186,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.12,
1038
  "learning_rate": 0.000516,
1039
+ "loss": 2.0772,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.12,
1044
  "learning_rate": 0.0005189999999999999,
1045
+ "loss": 2.1284,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.12,
1050
  "learning_rate": 0.000522,
1051
+ "loss": 1.9895,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.12,
1056
  "learning_rate": 0.000525,
1057
+ "loss": 2.0206,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.12,
1062
  "learning_rate": 0.0005279999999999999,
1063
+ "loss": 1.9765,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.12,
1068
  "learning_rate": 0.000531,
1069
+ "loss": 2.0687,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.12,
1074
  "learning_rate": 0.000534,
1075
+ "loss": 2.079,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.12,
1080
  "learning_rate": 0.0005369999999999999,
1081
+ "loss": 2.0544,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.12,
1086
  "learning_rate": 0.00054,
1087
+ "loss": 1.9787,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.12,
1092
  "learning_rate": 0.000543,
1093
+ "loss": 2.0328,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.12,
1098
  "learning_rate": 0.0005459999999999999,
1099
+ "loss": 2.0692,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.13,
1104
  "learning_rate": 0.000549,
1105
+ "loss": 2.1062,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.13,
1110
  "learning_rate": 0.000552,
1111
+ "loss": 2.0996,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.13,
1116
  "learning_rate": 0.0005549999999999999,
1117
+ "loss": 1.9696,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.13,
1122
  "learning_rate": 0.000558,
1123
+ "loss": 2.0402,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.13,
1128
  "learning_rate": 0.000561,
1129
+ "loss": 2.0141,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.13,
1134
  "learning_rate": 0.0005639999999999999,
1135
+ "loss": 2.1041,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.13,
1140
  "learning_rate": 0.0005669999999999999,
1141
+ "loss": 1.9539,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.13,
1146
  "learning_rate": 0.00057,
1147
+ "loss": 2.0689,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.13,
1152
  "learning_rate": 0.0005729999999999999,
1153
+ "loss": 2.047,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.13,
1158
  "learning_rate": 0.0005759999999999999,
1159
+ "loss": 2.0751,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.13,
1164
  "learning_rate": 0.000579,
1165
+ "loss": 2.0649,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.13,
1170
  "learning_rate": 0.0005819999999999999,
1171
+ "loss": 2.1111,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.13,
1176
  "learning_rate": 0.0005849999999999999,
1177
+ "loss": 2.0669,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.13,
1182
  "learning_rate": 0.000588,
1183
+ "loss": 2.1075,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.13,
1188
  "learning_rate": 0.0005909999999999999,
1189
+ "loss": 2.0398,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.14,
1194
  "learning_rate": 0.0005939999999999999,
1195
+ "loss": 2.0357,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.14,
1200
  "learning_rate": 0.000597,
1201
+ "loss": 1.953,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.14,
1206
  "learning_rate": 0.0006,
1207
+ "loss": 2.026,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.14,
1212
+ "eval_loss": 2.0633840560913086,
1213
+ "eval_runtime": 1727.3111,
1214
+ "eval_samples_per_second": 5.789,
1215
+ "eval_steps_per_second": 5.789,
1216
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1217
  }
1218
  ],
1219
+ "max_steps": 4386,
1220
+ "num_train_epochs": 3,
1221
+ "total_flos": 2.4532993377759744e+17,
1222
  "trial_name": null,
1223
  "trial_params": null
1224
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81616117252fb3172fd7b27ba4471c6643d6cc085d4f229ee6a2e5de41127204
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2576b0258f7068012c42a5ab049476237477f0fc73e59a3a795dfbe3d1a37f47
3
  size 4091
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46d927f86c4381f851e153c7cd12d23e9c3b351d0fd5dd23d8dfbedb21e8dbf4
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1649cca59a6a8d74726a76c7340a221823661f3a836db46a53dbc67221b14982
3
  size 2368281769
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81616117252fb3172fd7b27ba4471c6643d6cc085d4f229ee6a2e5de41127204
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2576b0258f7068012c42a5ab049476237477f0fc73e59a3a795dfbe3d1a37f47
3
  size 4091