yjwsb233 commited on
Commit
bf1e080
·
1 Parent(s): 1c70adb

Training in progress, step 100

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_accuracy": 0.43737166324435317,
4
- "eval_loss": 2.7726495265960693,
5
- "eval_runtime": 6.4303,
6
- "eval_samples_per_second": 75.735,
7
- "eval_steps_per_second": 9.486,
8
  "total_flos": 1.1959742538326016e+17,
9
- "train_loss": 3.1177605087162594,
10
- "train_runtime": 412.2299,
11
- "train_samples_per_second": 7.549,
12
- "train_steps_per_second": 3.775
13
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_accuracy": 0.31025641025641026,
4
+ "eval_loss": 2.86479115486145,
5
+ "eval_runtime": 5.2198,
6
+ "eval_samples_per_second": 74.716,
7
+ "eval_steps_per_second": 9.387,
8
  "total_flos": 1.1959742538326016e+17,
9
+ "train_loss": 3.140765584832905,
10
+ "train_runtime": 414.3391,
11
+ "train_samples_per_second": 7.511,
12
+ "train_steps_per_second": 3.755
13
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c117e8bb1f9074cb1c08fd0fde2ce1f0ad44ce12fc5ef1e8826260f84328396
3
  size 170850837
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e087dc21cee195e52ecc0a8fad4598f302dda3d3f261baa5f8372944a1914521
3
  size 170850837
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 1.1959742538326016e+17,
4
- "train_loss": 3.1177605087162594,
5
- "train_runtime": 412.2299,
6
- "train_samples_per_second": 7.549,
7
- "train_steps_per_second": 3.775
8
  }
 
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 1.1959742538326016e+17,
4
+ "train_loss": 3.140765584832905,
5
+ "train_runtime": 414.3391,
6
+ "train_samples_per_second": 7.511,
7
+ "train_steps_per_second": 3.755
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 2.783235788345337,
3
  "best_model_checkpoint": "/content/Train/checkpoint-1400",
4
  "epoch": 2.0,
5
  "global_step": 1556,
@@ -10,1084 +10,1084 @@
10
  {
11
  "epoch": 0.01,
12
  "learning_rate": 0.0001994858611825193,
13
- "loss": 3.2724,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.03,
18
  "learning_rate": 0.00019820051413881748,
19
- "loss": 3.2838,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.04,
24
  "learning_rate": 0.0001969151670951157,
25
- "loss": 3.3858,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.05,
30
  "learning_rate": 0.00019562982005141388,
31
- "loss": 3.1207,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.06,
36
- "learning_rate": 0.0001943444730077121,
37
- "loss": 3.4414,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.08,
42
- "learning_rate": 0.0001930591259640103,
43
- "loss": 3.2139,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.09,
48
- "learning_rate": 0.0001917737789203085,
49
- "loss": 3.1755,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.1,
54
- "learning_rate": 0.0001904884318766067,
55
- "loss": 3.7297,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.12,
60
- "learning_rate": 0.0001892030848329049,
61
- "loss": 3.4405,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.13,
66
- "learning_rate": 0.0001879177377892031,
67
- "loss": 3.1529,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.13,
72
- "eval_accuracy": 0.05897435897435897,
73
- "eval_loss": 3.211608648300171,
74
- "eval_runtime": 5.3959,
75
- "eval_samples_per_second": 72.278,
76
- "eval_steps_per_second": 9.081,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.14,
81
- "learning_rate": 0.0001866323907455013,
82
- "loss": 3.1837,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 0.15,
87
- "learning_rate": 0.0001853470437017995,
88
- "loss": 3.2418,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 0.17,
93
- "learning_rate": 0.0001840616966580977,
94
- "loss": 3.2269,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 0.18,
99
- "learning_rate": 0.0001827763496143959,
100
- "loss": 3.1065,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 0.19,
105
- "learning_rate": 0.00018149100257069408,
106
- "loss": 3.3563,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 0.21,
111
  "learning_rate": 0.0001803341902313625,
112
- "loss": 3.2934,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.22,
117
  "learning_rate": 0.00017904884318766068,
118
- "loss": 3.4139,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 0.23,
123
  "learning_rate": 0.0001777634961439589,
124
- "loss": 3.2532,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 0.24,
129
  "learning_rate": 0.00017647814910025708,
130
- "loss": 3.2196,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 0.26,
135
  "learning_rate": 0.00017519280205655527,
136
- "loss": 3.1644,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 0.26,
141
- "eval_accuracy": 0.06923076923076923,
142
- "eval_loss": 3.209580421447754,
143
- "eval_runtime": 4.8409,
144
- "eval_samples_per_second": 80.564,
145
- "eval_steps_per_second": 10.122,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 0.27,
150
  "learning_rate": 0.00017390745501285349,
151
- "loss": 3.3494,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 0.28,
156
  "learning_rate": 0.00017262210796915167,
157
- "loss": 3.336,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 0.3,
162
  "learning_rate": 0.0001713367609254499,
163
- "loss": 3.2151,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 0.31,
168
  "learning_rate": 0.00017005141388174808,
169
- "loss": 3.2835,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 0.32,
174
  "learning_rate": 0.0001687660668380463,
175
- "loss": 3.4358,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 0.33,
180
  "learning_rate": 0.00016748071979434448,
181
- "loss": 3.303,
182
  "step": 260
183
  },
184
  {
185
  "epoch": 0.35,
186
  "learning_rate": 0.00016619537275064267,
187
- "loss": 3.2475,
188
  "step": 270
189
  },
190
  {
191
  "epoch": 0.36,
192
  "learning_rate": 0.00016491002570694088,
193
- "loss": 3.1522,
194
  "step": 280
195
  },
196
  {
197
  "epoch": 0.37,
198
  "learning_rate": 0.00016362467866323907,
199
- "loss": 3.2906,
200
  "step": 290
201
  },
202
  {
203
  "epoch": 0.39,
204
  "learning_rate": 0.00016233933161953728,
205
- "loss": 3.1549,
206
  "step": 300
207
  },
208
  {
209
  "epoch": 0.39,
210
- "eval_accuracy": 0.06923076923076923,
211
- "eval_loss": 3.1985976696014404,
212
- "eval_runtime": 4.6899,
213
- "eval_samples_per_second": 83.157,
214
- "eval_steps_per_second": 10.448,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 0.4,
219
  "learning_rate": 0.0001610539845758355,
220
- "loss": 3.4558,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 0.41,
225
  "learning_rate": 0.00015976863753213369,
226
- "loss": 3.2304,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 0.42,
231
  "learning_rate": 0.0001584832904884319,
232
- "loss": 3.2686,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 0.44,
237
  "learning_rate": 0.0001571979434447301,
238
- "loss": 3.3832,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 0.45,
243
  "learning_rate": 0.00015591259640102828,
244
- "loss": 3.3443,
245
  "step": 350
246
  },
247
  {
248
  "epoch": 0.46,
249
  "learning_rate": 0.0001546272493573265,
250
- "loss": 3.2797,
251
  "step": 360
252
  },
253
  {
254
  "epoch": 0.48,
255
  "learning_rate": 0.00015334190231362468,
256
- "loss": 3.2673,
257
  "step": 370
258
  },
259
  {
260
  "epoch": 0.49,
261
  "learning_rate": 0.0001520565552699229,
262
- "loss": 3.1919,
263
  "step": 380
264
  },
265
  {
266
  "epoch": 0.5,
267
  "learning_rate": 0.00015077120822622108,
268
- "loss": 3.1238,
269
  "step": 390
270
  },
271
  {
272
  "epoch": 0.51,
273
  "learning_rate": 0.0001494858611825193,
274
- "loss": 3.2998,
275
  "step": 400
276
  },
277
  {
278
  "epoch": 0.51,
279
  "eval_accuracy": 0.1076923076923077,
280
- "eval_loss": 3.1967546939849854,
281
- "eval_runtime": 4.6763,
282
- "eval_samples_per_second": 83.399,
283
- "eval_steps_per_second": 10.478,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 0.53,
288
  "learning_rate": 0.00014820051413881748,
289
- "loss": 3.237,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 0.54,
294
  "learning_rate": 0.00014691516709511567,
295
- "loss": 3.1511,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 0.55,
300
  "learning_rate": 0.00014562982005141388,
301
- "loss": 3.1972,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 0.57,
306
  "learning_rate": 0.0001443444730077121,
307
- "loss": 3.3029,
308
  "step": 440
309
  },
310
  {
311
  "epoch": 0.58,
312
  "learning_rate": 0.0001430591259640103,
313
- "loss": 3.2639,
314
  "step": 450
315
  },
316
  {
317
  "epoch": 0.59,
318
  "learning_rate": 0.0001417737789203085,
319
- "loss": 3.2602,
320
  "step": 460
321
  },
322
  {
323
  "epoch": 0.6,
324
  "learning_rate": 0.0001404884318766067,
325
- "loss": 3.1096,
326
  "step": 470
327
  },
328
  {
329
  "epoch": 0.62,
330
  "learning_rate": 0.0001392030848329049,
331
- "loss": 3.2692,
332
  "step": 480
333
  },
334
  {
335
  "epoch": 0.63,
336
  "learning_rate": 0.0001379177377892031,
337
- "loss": 3.3299,
338
  "step": 490
339
  },
340
  {
341
  "epoch": 0.64,
342
  "learning_rate": 0.0001366323907455013,
343
- "loss": 3.1344,
344
  "step": 500
345
  },
346
  {
347
  "epoch": 0.64,
348
- "eval_accuracy": 0.1717948717948718,
349
- "eval_loss": 3.1666767597198486,
350
- "eval_runtime": 4.8058,
351
- "eval_samples_per_second": 81.152,
352
- "eval_steps_per_second": 10.196,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 0.66,
357
  "learning_rate": 0.0001353470437017995,
358
- "loss": 3.0552,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 0.67,
363
  "learning_rate": 0.00013406169665809768,
364
- "loss": 3.3452,
365
  "step": 520
366
  },
367
  {
368
  "epoch": 0.68,
369
  "learning_rate": 0.0001327763496143959,
370
- "loss": 3.3108,
371
  "step": 530
372
  },
373
  {
374
  "epoch": 0.69,
375
  "learning_rate": 0.00013149100257069408,
376
- "loss": 3.0806,
377
  "step": 540
378
  },
379
  {
380
  "epoch": 0.71,
381
  "learning_rate": 0.0001302056555269923,
382
- "loss": 3.4034,
383
  "step": 550
384
  },
385
  {
386
  "epoch": 0.72,
387
  "learning_rate": 0.00012892030848329049,
388
- "loss": 3.0853,
389
  "step": 560
390
  },
391
  {
392
  "epoch": 0.73,
393
  "learning_rate": 0.00012763496143958867,
394
- "loss": 3.1718,
395
  "step": 570
396
  },
397
  {
398
  "epoch": 0.75,
399
  "learning_rate": 0.00012634961439588692,
400
- "loss": 3.1265,
401
  "step": 580
402
  },
403
  {
404
  "epoch": 0.76,
405
  "learning_rate": 0.0001250642673521851,
406
- "loss": 3.183,
407
  "step": 590
408
  },
409
  {
410
  "epoch": 0.77,
411
  "learning_rate": 0.0001237789203084833,
412
- "loss": 3.3638,
413
  "step": 600
414
  },
415
  {
416
  "epoch": 0.77,
417
- "eval_accuracy": 0.17435897435897435,
418
- "eval_loss": 3.1383864879608154,
419
- "eval_runtime": 4.7872,
420
- "eval_samples_per_second": 81.466,
421
- "eval_steps_per_second": 10.236,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 0.78,
426
  "learning_rate": 0.0001224935732647815,
427
- "loss": 3.1725,
428
  "step": 610
429
  },
430
  {
431
  "epoch": 0.8,
432
  "learning_rate": 0.0001212082262210797,
433
- "loss": 3.5329,
434
  "step": 620
435
  },
436
  {
437
  "epoch": 0.81,
438
  "learning_rate": 0.0001199228791773779,
439
- "loss": 3.1399,
440
  "step": 630
441
  },
442
  {
443
  "epoch": 0.82,
444
  "learning_rate": 0.0001186375321336761,
445
- "loss": 3.1998,
446
  "step": 640
447
  },
448
  {
449
  "epoch": 0.84,
450
  "learning_rate": 0.0001173521850899743,
451
- "loss": 3.1562,
452
  "step": 650
453
  },
454
  {
455
  "epoch": 0.85,
456
  "learning_rate": 0.0001160668380462725,
457
- "loss": 3.3428,
458
  "step": 660
459
  },
460
  {
461
  "epoch": 0.86,
462
  "learning_rate": 0.0001147814910025707,
463
- "loss": 3.0586,
464
  "step": 670
465
  },
466
  {
467
  "epoch": 0.87,
468
  "learning_rate": 0.0001134961439588689,
469
- "loss": 3.0464,
470
  "step": 680
471
  },
472
  {
473
  "epoch": 0.89,
474
  "learning_rate": 0.00011221079691516709,
475
- "loss": 3.1097,
476
  "step": 690
477
  },
478
  {
479
  "epoch": 0.9,
480
  "learning_rate": 0.00011092544987146529,
481
- "loss": 3.1482,
482
  "step": 700
483
  },
484
  {
485
  "epoch": 0.9,
486
- "eval_accuracy": 0.20256410256410257,
487
- "eval_loss": 3.096599578857422,
488
- "eval_runtime": 4.7875,
489
- "eval_samples_per_second": 81.462,
490
- "eval_steps_per_second": 10.235,
491
  "step": 700
492
  },
493
  {
494
  "epoch": 0.91,
495
  "learning_rate": 0.00010964010282776349,
496
- "loss": 3.1459,
497
  "step": 710
498
  },
499
  {
500
  "epoch": 0.93,
501
  "learning_rate": 0.00010835475578406172,
502
- "loss": 3.1789,
503
  "step": 720
504
  },
505
  {
506
  "epoch": 0.94,
507
  "learning_rate": 0.0001070694087403599,
508
- "loss": 3.2216,
509
  "step": 730
510
  },
511
  {
512
  "epoch": 0.95,
513
  "learning_rate": 0.0001057840616966581,
514
- "loss": 2.8712,
515
  "step": 740
516
  },
517
  {
518
  "epoch": 0.96,
519
- "learning_rate": 0.00010449871465295631,
520
- "loss": 3.1489,
521
  "step": 750
522
  },
523
  {
524
  "epoch": 0.98,
525
- "learning_rate": 0.00010321336760925451,
526
- "loss": 3.438,
527
  "step": 760
528
  },
529
  {
530
  "epoch": 0.99,
531
- "learning_rate": 0.00010192802056555271,
532
- "loss": 3.23,
533
  "step": 770
534
  },
535
  {
536
  "epoch": 1.0,
537
- "learning_rate": 0.00010064267352185091,
538
- "loss": 3.1966,
539
  "step": 780
540
  },
541
  {
542
  "epoch": 1.02,
543
- "learning_rate": 9.93573264781491e-05,
544
- "loss": 3.0066,
545
  "step": 790
546
  },
547
  {
548
  "epoch": 1.03,
549
- "learning_rate": 9.80719794344473e-05,
550
- "loss": 3.1366,
551
  "step": 800
552
  },
553
  {
554
  "epoch": 1.03,
555
- "eval_accuracy": 0.18974358974358974,
556
- "eval_loss": 3.0483875274658203,
557
- "eval_runtime": 4.6499,
558
- "eval_samples_per_second": 83.872,
559
- "eval_steps_per_second": 10.538,
560
  "step": 800
561
  },
562
  {
563
  "epoch": 1.04,
564
- "learning_rate": 9.67866323907455e-05,
565
- "loss": 3.1106,
566
  "step": 810
567
  },
568
  {
569
  "epoch": 1.05,
570
- "learning_rate": 9.550128534704372e-05,
571
- "loss": 2.9235,
572
  "step": 820
573
  },
574
  {
575
  "epoch": 1.07,
576
- "learning_rate": 9.421593830334192e-05,
577
- "loss": 3.0565,
578
  "step": 830
579
  },
580
  {
581
  "epoch": 1.08,
582
- "learning_rate": 9.29305912596401e-05,
583
- "loss": 3.0503,
584
  "step": 840
585
  },
586
  {
587
  "epoch": 1.09,
588
- "learning_rate": 9.16452442159383e-05,
589
- "loss": 3.2726,
590
  "step": 850
591
  },
592
  {
593
  "epoch": 1.11,
594
- "learning_rate": 9.03598971722365e-05,
595
- "loss": 3.0256,
596
  "step": 860
597
  },
598
  {
599
  "epoch": 1.12,
600
- "learning_rate": 8.907455012853471e-05,
601
- "loss": 3.0892,
602
  "step": 870
603
  },
604
  {
605
  "epoch": 1.13,
606
- "learning_rate": 8.778920308483291e-05,
607
- "loss": 3.1357,
608
  "step": 880
609
  },
610
  {
611
  "epoch": 1.14,
612
- "learning_rate": 8.650385604113111e-05,
613
- "loss": 3.1402,
614
  "step": 890
615
  },
616
  {
617
  "epoch": 1.16,
618
- "learning_rate": 8.521850899742931e-05,
619
- "loss": 3.0206,
620
  "step": 900
621
  },
622
  {
623
  "epoch": 1.16,
624
- "eval_accuracy": 0.30256410256410254,
625
- "eval_loss": 3.016371250152588,
626
- "eval_runtime": 4.6485,
627
- "eval_samples_per_second": 83.899,
628
- "eval_steps_per_second": 10.541,
629
  "step": 900
630
  },
631
  {
632
  "epoch": 1.17,
633
- "learning_rate": 8.393316195372751e-05,
634
- "loss": 3.0057,
635
  "step": 910
636
  },
637
  {
638
  "epoch": 1.18,
639
- "learning_rate": 8.264781491002571e-05,
640
- "loss": 3.1709,
641
  "step": 920
642
  },
643
  {
644
  "epoch": 1.2,
645
- "learning_rate": 8.136246786632391e-05,
646
- "loss": 3.2243,
647
  "step": 930
648
  },
649
  {
650
  "epoch": 1.21,
651
- "learning_rate": 8.007712082262212e-05,
652
- "loss": 3.19,
653
  "step": 940
654
  },
655
  {
656
  "epoch": 1.22,
657
- "learning_rate": 7.87917737789203e-05,
658
- "loss": 3.2785,
659
  "step": 950
660
  },
661
  {
662
  "epoch": 1.23,
663
- "learning_rate": 7.750642673521852e-05,
664
- "loss": 2.8424,
665
  "step": 960
666
  },
667
  {
668
  "epoch": 1.25,
669
- "learning_rate": 7.622107969151672e-05,
670
- "loss": 3.2575,
671
  "step": 970
672
  },
673
  {
674
  "epoch": 1.26,
675
- "learning_rate": 7.493573264781492e-05,
676
- "loss": 2.9012,
677
  "step": 980
678
  },
679
  {
680
  "epoch": 1.27,
681
- "learning_rate": 7.365038560411311e-05,
682
- "loss": 3.1516,
683
  "step": 990
684
  },
685
  {
686
  "epoch": 1.29,
687
- "learning_rate": 7.236503856041131e-05,
688
- "loss": 2.921,
689
  "step": 1000
690
  },
691
  {
692
  "epoch": 1.29,
693
- "eval_accuracy": 0.3230769230769231,
694
- "eval_loss": 2.984600305557251,
695
- "eval_runtime": 4.9539,
696
- "eval_samples_per_second": 78.726,
697
- "eval_steps_per_second": 9.891,
698
  "step": 1000
699
  },
700
  {
701
  "epoch": 1.3,
702
- "learning_rate": 7.107969151670951e-05,
703
- "loss": 2.9828,
704
  "step": 1010
705
  },
706
  {
707
  "epoch": 1.31,
708
- "learning_rate": 6.979434447300771e-05,
709
- "loss": 3.0413,
710
  "step": 1020
711
  },
712
  {
713
  "epoch": 1.32,
714
- "learning_rate": 6.850899742930593e-05,
715
- "loss": 2.9759,
716
  "step": 1030
717
  },
718
  {
719
  "epoch": 1.34,
720
- "learning_rate": 6.722365038560411e-05,
721
- "loss": 2.9434,
722
  "step": 1040
723
  },
724
  {
725
  "epoch": 1.35,
726
- "learning_rate": 6.593830334190231e-05,
727
- "loss": 3.0893,
728
  "step": 1050
729
  },
730
  {
731
  "epoch": 1.36,
732
- "learning_rate": 6.465295629820052e-05,
733
- "loss": 3.0797,
734
  "step": 1060
735
  },
736
  {
737
  "epoch": 1.38,
738
- "learning_rate": 6.336760925449872e-05,
739
- "loss": 2.9291,
740
  "step": 1070
741
  },
742
  {
743
  "epoch": 1.39,
744
- "learning_rate": 6.208226221079692e-05,
745
- "loss": 3.0357,
746
  "step": 1080
747
  },
748
  {
749
  "epoch": 1.4,
750
- "learning_rate": 6.079691516709511e-05,
751
- "loss": 3.0109,
752
  "step": 1090
753
  },
754
  {
755
  "epoch": 1.41,
756
- "learning_rate": 5.951156812339333e-05,
757
- "loss": 3.0027,
758
  "step": 1100
759
  },
760
  {
761
  "epoch": 1.41,
762
- "eval_accuracy": 0.33589743589743587,
763
- "eval_loss": 2.933824062347412,
764
- "eval_runtime": 4.8337,
765
- "eval_samples_per_second": 80.684,
766
- "eval_steps_per_second": 10.137,
767
  "step": 1100
768
  },
769
  {
770
  "epoch": 1.43,
771
- "learning_rate": 5.822622107969152e-05,
772
- "loss": 3.2462,
773
  "step": 1110
774
  },
775
  {
776
  "epoch": 1.44,
777
- "learning_rate": 5.694087403598972e-05,
778
- "loss": 2.9376,
779
  "step": 1120
780
  },
781
  {
782
  "epoch": 1.45,
783
- "learning_rate": 5.5655526992287924e-05,
784
- "loss": 2.9242,
785
  "step": 1130
786
  },
787
  {
788
  "epoch": 1.47,
789
- "learning_rate": 5.437017994858612e-05,
790
- "loss": 2.862,
791
  "step": 1140
792
  },
793
  {
794
  "epoch": 1.48,
795
- "learning_rate": 5.308483290488432e-05,
796
- "loss": 3.1005,
797
  "step": 1150
798
  },
799
  {
800
  "epoch": 1.49,
801
- "learning_rate": 5.1799485861182514e-05,
802
- "loss": 3.1811,
803
  "step": 1160
804
  },
805
  {
806
  "epoch": 1.5,
807
- "learning_rate": 5.051413881748073e-05,
808
- "loss": 3.1254,
809
  "step": 1170
810
  },
811
  {
812
  "epoch": 1.52,
813
- "learning_rate": 4.922879177377892e-05,
814
- "loss": 2.797,
815
  "step": 1180
816
  },
817
  {
818
  "epoch": 1.53,
819
- "learning_rate": 4.7943444730077124e-05,
820
- "loss": 3.1899,
821
  "step": 1190
822
  },
823
  {
824
  "epoch": 1.54,
825
- "learning_rate": 4.6658097686375325e-05,
826
- "loss": 2.9047,
827
  "step": 1200
828
  },
829
  {
830
  "epoch": 1.54,
831
- "eval_accuracy": 0.34615384615384615,
832
- "eval_loss": 2.8916842937469482,
833
- "eval_runtime": 4.6342,
834
- "eval_samples_per_second": 84.157,
835
- "eval_steps_per_second": 10.574,
836
  "step": 1200
837
  },
838
  {
839
  "epoch": 1.56,
840
- "learning_rate": 4.537275064267352e-05,
841
- "loss": 3.1712,
842
  "step": 1210
843
  },
844
  {
845
  "epoch": 1.57,
846
- "learning_rate": 4.408740359897173e-05,
847
- "loss": 2.9201,
848
  "step": 1220
849
  },
850
  {
851
  "epoch": 1.58,
852
- "learning_rate": 4.280205655526993e-05,
853
- "loss": 2.9952,
854
  "step": 1230
855
  },
856
  {
857
  "epoch": 1.59,
858
- "learning_rate": 4.151670951156812e-05,
859
- "loss": 2.8225,
860
  "step": 1240
861
  },
862
  {
863
  "epoch": 1.61,
864
- "learning_rate": 4.0231362467866324e-05,
865
- "loss": 2.9507,
866
  "step": 1250
867
  },
868
  {
869
  "epoch": 1.62,
870
- "learning_rate": 3.8946015424164526e-05,
871
- "loss": 2.96,
872
  "step": 1260
873
  },
874
  {
875
  "epoch": 1.63,
876
- "learning_rate": 3.766066838046273e-05,
877
- "loss": 3.0775,
878
  "step": 1270
879
  },
880
  {
881
  "epoch": 1.65,
882
- "learning_rate": 3.637532133676093e-05,
883
- "loss": 2.9242,
884
  "step": 1280
885
  },
886
  {
887
  "epoch": 1.66,
888
- "learning_rate": 3.508997429305913e-05,
889
- "loss": 2.8814,
890
  "step": 1290
891
  },
892
  {
893
  "epoch": 1.67,
894
- "learning_rate": 3.380462724935733e-05,
895
- "loss": 2.8579,
896
  "step": 1300
897
  },
898
  {
899
  "epoch": 1.67,
900
- "eval_accuracy": 0.4025641025641026,
901
- "eval_loss": 2.8616135120391846,
902
- "eval_runtime": 4.7031,
903
- "eval_samples_per_second": 82.924,
904
- "eval_steps_per_second": 10.419,
905
  "step": 1300
906
  },
907
  {
908
  "epoch": 1.68,
909
- "learning_rate": 3.251928020565553e-05,
910
- "loss": 2.6972,
911
  "step": 1310
912
  },
913
  {
914
  "epoch": 1.7,
915
- "learning_rate": 3.1233933161953726e-05,
916
- "loss": 2.8446,
917
  "step": 1320
918
  },
919
  {
920
  "epoch": 1.71,
921
- "learning_rate": 2.994858611825193e-05,
922
- "loss": 2.915,
923
  "step": 1330
924
  },
925
  {
926
  "epoch": 1.72,
927
- "learning_rate": 2.866323907455013e-05,
928
- "loss": 3.1146,
929
  "step": 1340
930
  },
931
  {
932
  "epoch": 1.74,
933
- "learning_rate": 2.737789203084833e-05,
934
- "loss": 2.9168,
935
  "step": 1350
936
  },
937
  {
938
  "epoch": 1.75,
939
- "learning_rate": 2.6092544987146534e-05,
940
- "loss": 2.8762,
941
  "step": 1360
942
  },
943
  {
944
  "epoch": 1.76,
945
- "learning_rate": 2.480719794344473e-05,
946
- "loss": 2.9499,
947
  "step": 1370
948
  },
949
  {
950
  "epoch": 1.77,
951
- "learning_rate": 2.3521850899742933e-05,
952
- "loss": 2.907,
953
  "step": 1380
954
  },
955
  {
956
  "epoch": 1.79,
957
- "learning_rate": 2.2236503856041134e-05,
958
- "loss": 3.0447,
959
  "step": 1390
960
  },
961
  {
962
  "epoch": 1.8,
963
- "learning_rate": 2.095115681233933e-05,
964
- "loss": 2.988,
965
  "step": 1400
966
  },
967
  {
968
  "epoch": 1.8,
969
- "eval_accuracy": 0.4076923076923077,
970
- "eval_loss": 2.783235788345337,
971
- "eval_runtime": 4.64,
972
- "eval_samples_per_second": 84.052,
973
- "eval_steps_per_second": 10.56,
974
  "step": 1400
975
  },
976
  {
977
  "epoch": 1.81,
978
- "learning_rate": 1.9665809768637533e-05,
979
- "loss": 2.8267,
980
  "step": 1410
981
  },
982
  {
983
  "epoch": 1.83,
984
- "learning_rate": 1.8380462724935734e-05,
985
- "loss": 2.7718,
986
  "step": 1420
987
  },
988
  {
989
  "epoch": 1.84,
990
- "learning_rate": 1.7095115681233935e-05,
991
- "loss": 2.8327,
992
  "step": 1430
993
  },
994
  {
995
  "epoch": 1.85,
996
- "learning_rate": 1.5809768637532136e-05,
997
- "loss": 2.9942,
998
  "step": 1440
999
  },
1000
  {
1001
  "epoch": 1.86,
1002
- "learning_rate": 1.4524421593830334e-05,
1003
- "loss": 3.0647,
1004
  "step": 1450
1005
  },
1006
  {
1007
  "epoch": 1.88,
1008
- "learning_rate": 1.3239074550128535e-05,
1009
- "loss": 2.8267,
1010
  "step": 1460
1011
  },
1012
  {
1013
  "epoch": 1.89,
1014
- "learning_rate": 1.1953727506426736e-05,
1015
- "loss": 2.8312,
1016
  "step": 1470
1017
  },
1018
  {
1019
  "epoch": 1.9,
1020
- "learning_rate": 1.0668380462724936e-05,
1021
- "loss": 3.1317,
1022
  "step": 1480
1023
  },
1024
  {
1025
  "epoch": 1.92,
1026
- "learning_rate": 9.383033419023137e-06,
1027
- "loss": 3.1244,
1028
  "step": 1490
1029
  },
1030
  {
1031
  "epoch": 1.93,
1032
- "learning_rate": 8.097686375321336e-06,
1033
- "loss": 2.8553,
1034
  "step": 1500
1035
  },
1036
  {
1037
  "epoch": 1.93,
1038
- "eval_accuracy": 0.3871794871794872,
1039
- "eval_loss": 2.821709632873535,
1040
- "eval_runtime": 4.6901,
1041
- "eval_samples_per_second": 83.154,
1042
- "eval_steps_per_second": 10.448,
1043
  "step": 1500
1044
  },
1045
  {
1046
  "epoch": 1.94,
1047
- "learning_rate": 6.812339331619537e-06,
1048
- "loss": 2.6944,
1049
  "step": 1510
1050
  },
1051
  {
1052
  "epoch": 1.95,
1053
- "learning_rate": 5.526992287917738e-06,
1054
- "loss": 3.1573,
1055
  "step": 1520
1056
  },
1057
  {
1058
  "epoch": 1.97,
1059
- "learning_rate": 4.241645244215939e-06,
1060
- "loss": 2.8412,
1061
  "step": 1530
1062
  },
1063
  {
1064
  "epoch": 1.98,
1065
- "learning_rate": 2.956298200514139e-06,
1066
- "loss": 2.6937,
1067
  "step": 1540
1068
  },
1069
  {
1070
  "epoch": 1.99,
1071
- "learning_rate": 1.6709511568123394e-06,
1072
- "loss": 2.65,
1073
  "step": 1550
1074
  },
1075
  {
1076
  "epoch": 2.0,
1077
  "step": 1556,
1078
  "total_flos": 1.1959742538326016e+17,
1079
- "train_loss": 3.1177605087162594,
1080
- "train_runtime": 412.2299,
1081
- "train_samples_per_second": 7.549,
1082
- "train_steps_per_second": 3.775
1083
  },
1084
  {
1085
  "epoch": 2.0,
1086
- "eval_accuracy": 0.4076923076923077,
1087
- "eval_loss": 2.783235788345337,
1088
- "eval_runtime": 5.3396,
1089
- "eval_samples_per_second": 73.039,
1090
- "eval_steps_per_second": 9.177,
1091
  "step": 1556
1092
  }
1093
  ],
 
1
  {
2
+ "best_metric": 2.86479115486145,
3
  "best_model_checkpoint": "/content/Train/checkpoint-1400",
4
  "epoch": 2.0,
5
  "global_step": 1556,
 
10
  {
11
  "epoch": 0.01,
12
  "learning_rate": 0.0001994858611825193,
13
+ "loss": 3.2909,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.03,
18
  "learning_rate": 0.00019820051413881748,
19
+ "loss": 3.2775,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.04,
24
  "learning_rate": 0.0001969151670951157,
25
+ "loss": 3.4685,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.05,
30
  "learning_rate": 0.00019562982005141388,
31
+ "loss": 3.1491,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.06,
36
+ "learning_rate": 0.00019447300771208227,
37
+ "loss": 3.5003,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.08,
42
+ "learning_rate": 0.00019318766066838048,
43
+ "loss": 3.2438,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.09,
48
+ "learning_rate": 0.00019190231362467867,
49
+ "loss": 3.2587,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.1,
54
+ "learning_rate": 0.00019061696658097688,
55
+ "loss": 3.8744,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.12,
60
+ "learning_rate": 0.00018933161953727507,
61
+ "loss": 3.5397,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.13,
66
+ "learning_rate": 0.00018804627249357326,
67
+ "loss": 3.2433,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.13,
72
+ "eval_accuracy": 0.046153846153846156,
73
+ "eval_loss": 3.216766834259033,
74
+ "eval_runtime": 4.8707,
75
+ "eval_samples_per_second": 80.071,
76
+ "eval_steps_per_second": 10.06,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.14,
81
+ "learning_rate": 0.00018676092544987147,
82
+ "loss": 3.1843,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 0.15,
87
+ "learning_rate": 0.00018547557840616966,
88
+ "loss": 3.1816,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 0.17,
93
+ "learning_rate": 0.00018419023136246788,
94
+ "loss": 3.247,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 0.18,
99
+ "learning_rate": 0.00018290488431876606,
100
+ "loss": 3.2118,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 0.19,
105
+ "learning_rate": 0.00018161953727506428,
106
+ "loss": 3.3541,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 0.21,
111
  "learning_rate": 0.0001803341902313625,
112
+ "loss": 3.3353,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.22,
117
  "learning_rate": 0.00017904884318766068,
118
+ "loss": 3.4277,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 0.23,
123
  "learning_rate": 0.0001777634961439589,
124
+ "loss": 3.2286,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 0.24,
129
  "learning_rate": 0.00017647814910025708,
130
+ "loss": 3.2597,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 0.26,
135
  "learning_rate": 0.00017519280205655527,
136
+ "loss": 3.2076,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 0.26,
141
+ "eval_accuracy": 0.0641025641025641,
142
+ "eval_loss": 3.2063100337982178,
143
+ "eval_runtime": 4.8973,
144
+ "eval_samples_per_second": 79.635,
145
+ "eval_steps_per_second": 10.005,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 0.27,
150
  "learning_rate": 0.00017390745501285349,
151
+ "loss": 3.3286,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 0.28,
156
  "learning_rate": 0.00017262210796915167,
157
+ "loss": 3.3623,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 0.3,
162
  "learning_rate": 0.0001713367609254499,
163
+ "loss": 3.1635,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 0.31,
168
  "learning_rate": 0.00017005141388174808,
169
+ "loss": 3.3247,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 0.32,
174
  "learning_rate": 0.0001687660668380463,
175
+ "loss": 3.4129,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 0.33,
180
  "learning_rate": 0.00016748071979434448,
181
+ "loss": 3.2366,
182
  "step": 260
183
  },
184
  {
185
  "epoch": 0.35,
186
  "learning_rate": 0.00016619537275064267,
187
+ "loss": 3.164,
188
  "step": 270
189
  },
190
  {
191
  "epoch": 0.36,
192
  "learning_rate": 0.00016491002570694088,
193
+ "loss": 3.1943,
194
  "step": 280
195
  },
196
  {
197
  "epoch": 0.37,
198
  "learning_rate": 0.00016362467866323907,
199
+ "loss": 3.3545,
200
  "step": 290
201
  },
202
  {
203
  "epoch": 0.39,
204
  "learning_rate": 0.00016233933161953728,
205
+ "loss": 3.136,
206
  "step": 300
207
  },
208
  {
209
  "epoch": 0.39,
210
+ "eval_accuracy": 0.10512820512820513,
211
+ "eval_loss": 3.193809986114502,
212
+ "eval_runtime": 5.0459,
213
+ "eval_samples_per_second": 77.29,
214
+ "eval_steps_per_second": 9.711,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 0.4,
219
  "learning_rate": 0.0001610539845758355,
220
+ "loss": 3.3411,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 0.41,
225
  "learning_rate": 0.00015976863753213369,
226
+ "loss": 3.1711,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 0.42,
231
  "learning_rate": 0.0001584832904884319,
232
+ "loss": 3.2203,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 0.44,
237
  "learning_rate": 0.0001571979434447301,
238
+ "loss": 3.3112,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 0.45,
243
  "learning_rate": 0.00015591259640102828,
244
+ "loss": 3.3413,
245
  "step": 350
246
  },
247
  {
248
  "epoch": 0.46,
249
  "learning_rate": 0.0001546272493573265,
250
+ "loss": 3.3321,
251
  "step": 360
252
  },
253
  {
254
  "epoch": 0.48,
255
  "learning_rate": 0.00015334190231362468,
256
+ "loss": 3.2329,
257
  "step": 370
258
  },
259
  {
260
  "epoch": 0.49,
261
  "learning_rate": 0.0001520565552699229,
262
+ "loss": 3.1901,
263
  "step": 380
264
  },
265
  {
266
  "epoch": 0.5,
267
  "learning_rate": 0.00015077120822622108,
268
+ "loss": 3.1793,
269
  "step": 390
270
  },
271
  {
272
  "epoch": 0.51,
273
  "learning_rate": 0.0001494858611825193,
274
+ "loss": 3.2283,
275
  "step": 400
276
  },
277
  {
278
  "epoch": 0.51,
279
  "eval_accuracy": 0.1076923076923077,
280
+ "eval_loss": 3.1784756183624268,
281
+ "eval_runtime": 4.9684,
282
+ "eval_samples_per_second": 78.496,
283
+ "eval_steps_per_second": 9.862,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 0.53,
288
  "learning_rate": 0.00014820051413881748,
289
+ "loss": 3.3054,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 0.54,
294
  "learning_rate": 0.00014691516709511567,
295
+ "loss": 3.1213,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 0.55,
300
  "learning_rate": 0.00014562982005141388,
301
+ "loss": 3.1617,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 0.57,
306
  "learning_rate": 0.0001443444730077121,
307
+ "loss": 3.3134,
308
  "step": 440
309
  },
310
  {
311
  "epoch": 0.58,
312
  "learning_rate": 0.0001430591259640103,
313
+ "loss": 3.1977,
314
  "step": 450
315
  },
316
  {
317
  "epoch": 0.59,
318
  "learning_rate": 0.0001417737789203085,
319
+ "loss": 3.4898,
320
  "step": 460
321
  },
322
  {
323
  "epoch": 0.6,
324
  "learning_rate": 0.0001404884318766067,
325
+ "loss": 3.215,
326
  "step": 470
327
  },
328
  {
329
  "epoch": 0.62,
330
  "learning_rate": 0.0001392030848329049,
331
+ "loss": 3.1978,
332
  "step": 480
333
  },
334
  {
335
  "epoch": 0.63,
336
  "learning_rate": 0.0001379177377892031,
337
+ "loss": 3.1939,
338
  "step": 490
339
  },
340
  {
341
  "epoch": 0.64,
342
  "learning_rate": 0.0001366323907455013,
343
+ "loss": 3.1354,
344
  "step": 500
345
  },
346
  {
347
  "epoch": 0.64,
348
+ "eval_accuracy": 0.1282051282051282,
349
+ "eval_loss": 3.1671173572540283,
350
+ "eval_runtime": 4.8377,
351
+ "eval_samples_per_second": 80.617,
352
+ "eval_steps_per_second": 10.129,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 0.66,
357
  "learning_rate": 0.0001353470437017995,
358
+ "loss": 3.0448,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 0.67,
363
  "learning_rate": 0.00013406169665809768,
364
+ "loss": 3.4826,
365
  "step": 520
366
  },
367
  {
368
  "epoch": 0.68,
369
  "learning_rate": 0.0001327763496143959,
370
+ "loss": 3.2411,
371
  "step": 530
372
  },
373
  {
374
  "epoch": 0.69,
375
  "learning_rate": 0.00013149100257069408,
376
+ "loss": 3.0833,
377
  "step": 540
378
  },
379
  {
380
  "epoch": 0.71,
381
  "learning_rate": 0.0001302056555269923,
382
+ "loss": 3.2685,
383
  "step": 550
384
  },
385
  {
386
  "epoch": 0.72,
387
  "learning_rate": 0.00012892030848329049,
388
+ "loss": 3.0119,
389
  "step": 560
390
  },
391
  {
392
  "epoch": 0.73,
393
  "learning_rate": 0.00012763496143958867,
394
+ "loss": 3.1179,
395
  "step": 570
396
  },
397
  {
398
  "epoch": 0.75,
399
  "learning_rate": 0.00012634961439588692,
400
+ "loss": 3.1146,
401
  "step": 580
402
  },
403
  {
404
  "epoch": 0.76,
405
  "learning_rate": 0.0001250642673521851,
406
+ "loss": 3.2957,
407
  "step": 590
408
  },
409
  {
410
  "epoch": 0.77,
411
  "learning_rate": 0.0001237789203084833,
412
+ "loss": 3.3212,
413
  "step": 600
414
  },
415
  {
416
  "epoch": 0.77,
417
+ "eval_accuracy": 0.15897435897435896,
418
+ "eval_loss": 3.1384565830230713,
419
+ "eval_runtime": 4.7255,
420
+ "eval_samples_per_second": 82.53,
421
+ "eval_steps_per_second": 10.369,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 0.78,
426
  "learning_rate": 0.0001224935732647815,
427
+ "loss": 3.1221,
428
  "step": 610
429
  },
430
  {
431
  "epoch": 0.8,
432
  "learning_rate": 0.0001212082262210797,
433
+ "loss": 3.4999,
434
  "step": 620
435
  },
436
  {
437
  "epoch": 0.81,
438
  "learning_rate": 0.0001199228791773779,
439
+ "loss": 3.2125,
440
  "step": 630
441
  },
442
  {
443
  "epoch": 0.82,
444
  "learning_rate": 0.0001186375321336761,
445
+ "loss": 3.1819,
446
  "step": 640
447
  },
448
  {
449
  "epoch": 0.84,
450
  "learning_rate": 0.0001173521850899743,
451
+ "loss": 3.2923,
452
  "step": 650
453
  },
454
  {
455
  "epoch": 0.85,
456
  "learning_rate": 0.0001160668380462725,
457
+ "loss": 3.3054,
458
  "step": 660
459
  },
460
  {
461
  "epoch": 0.86,
462
  "learning_rate": 0.0001147814910025707,
463
+ "loss": 3.0682,
464
  "step": 670
465
  },
466
  {
467
  "epoch": 0.87,
468
  "learning_rate": 0.0001134961439588689,
469
+ "loss": 3.111,
470
  "step": 680
471
  },
472
  {
473
  "epoch": 0.89,
474
  "learning_rate": 0.00011221079691516709,
475
+ "loss": 3.2225,
476
  "step": 690
477
  },
478
  {
479
  "epoch": 0.9,
480
  "learning_rate": 0.00011092544987146529,
481
+ "loss": 3.3626,
482
  "step": 700
483
  },
484
  {
485
  "epoch": 0.9,
486
+ "eval_accuracy": 0.2128205128205128,
487
+ "eval_loss": 3.1268680095672607,
488
+ "eval_runtime": 4.7092,
489
+ "eval_samples_per_second": 82.816,
490
+ "eval_steps_per_second": 10.405,
491
  "step": 700
492
  },
493
  {
494
  "epoch": 0.91,
495
  "learning_rate": 0.00010964010282776349,
496
+ "loss": 3.1624,
497
  "step": 710
498
  },
499
  {
500
  "epoch": 0.93,
501
  "learning_rate": 0.00010835475578406172,
502
+ "loss": 3.127,
503
  "step": 720
504
  },
505
  {
506
  "epoch": 0.94,
507
  "learning_rate": 0.0001070694087403599,
508
+ "loss": 3.3503,
509
  "step": 730
510
  },
511
  {
512
  "epoch": 0.95,
513
  "learning_rate": 0.0001057840616966581,
514
+ "loss": 2.9137,
515
  "step": 740
516
  },
517
  {
518
  "epoch": 0.96,
519
+ "learning_rate": 0.00010462724935732648,
520
+ "loss": 3.2102,
521
  "step": 750
522
  },
523
  {
524
  "epoch": 0.98,
525
+ "learning_rate": 0.00010334190231362468,
526
+ "loss": 3.3043,
527
  "step": 760
528
  },
529
  {
530
  "epoch": 0.99,
531
+ "learning_rate": 0.00010205655526992288,
532
+ "loss": 3.3108,
533
  "step": 770
534
  },
535
  {
536
  "epoch": 1.0,
537
+ "learning_rate": 0.00010077120822622108,
538
+ "loss": 3.3166,
539
  "step": 780
540
  },
541
  {
542
  "epoch": 1.02,
543
+ "learning_rate": 9.948586118251929e-05,
544
+ "loss": 2.9514,
545
  "step": 790
546
  },
547
  {
548
  "epoch": 1.03,
549
+ "learning_rate": 9.820051413881749e-05,
550
+ "loss": 3.1565,
551
  "step": 800
552
  },
553
  {
554
  "epoch": 1.03,
555
+ "eval_accuracy": 0.1564102564102564,
556
+ "eval_loss": 3.0775740146636963,
557
+ "eval_runtime": 5.0971,
558
+ "eval_samples_per_second": 76.514,
559
+ "eval_steps_per_second": 9.613,
560
  "step": 800
561
  },
562
  {
563
  "epoch": 1.04,
564
+ "learning_rate": 9.691516709511569e-05,
565
+ "loss": 3.0278,
566
  "step": 810
567
  },
568
  {
569
  "epoch": 1.05,
570
+ "learning_rate": 9.562982005141389e-05,
571
+ "loss": 2.931,
572
  "step": 820
573
  },
574
  {
575
  "epoch": 1.07,
576
+ "learning_rate": 9.434447300771209e-05,
577
+ "loss": 3.0111,
578
  "step": 830
579
  },
580
  {
581
  "epoch": 1.08,
582
+ "learning_rate": 9.305912596401029e-05,
583
+ "loss": 3.0158,
584
  "step": 840
585
  },
586
  {
587
  "epoch": 1.09,
588
+ "learning_rate": 9.177377892030848e-05,
589
+ "loss": 3.2079,
590
  "step": 850
591
  },
592
  {
593
  "epoch": 1.11,
594
+ "learning_rate": 9.048843187660668e-05,
595
+ "loss": 3.0987,
596
  "step": 860
597
  },
598
  {
599
  "epoch": 1.12,
600
+ "learning_rate": 8.92030848329049e-05,
601
+ "loss": 3.0854,
602
  "step": 870
603
  },
604
  {
605
  "epoch": 1.13,
606
+ "learning_rate": 8.79177377892031e-05,
607
+ "loss": 3.2495,
608
  "step": 880
609
  },
610
  {
611
  "epoch": 1.14,
612
+ "learning_rate": 8.66323907455013e-05,
613
+ "loss": 3.018,
614
  "step": 890
615
  },
616
  {
617
  "epoch": 1.16,
618
+ "learning_rate": 8.534704370179948e-05,
619
+ "loss": 3.0873,
620
  "step": 900
621
  },
622
  {
623
  "epoch": 1.16,
624
+ "eval_accuracy": 0.21794871794871795,
625
+ "eval_loss": 3.0445914268493652,
626
+ "eval_runtime": 4.9901,
627
+ "eval_samples_per_second": 78.155,
628
+ "eval_steps_per_second": 9.819,
629
  "step": 900
630
  },
631
  {
632
  "epoch": 1.17,
633
+ "learning_rate": 8.406169665809769e-05,
634
+ "loss": 3.0805,
635
  "step": 910
636
  },
637
  {
638
  "epoch": 1.18,
639
+ "learning_rate": 8.277634961439589e-05,
640
+ "loss": 3.2969,
641
  "step": 920
642
  },
643
  {
644
  "epoch": 1.2,
645
+ "learning_rate": 8.149100257069409e-05,
646
+ "loss": 3.1201,
647
  "step": 930
648
  },
649
  {
650
  "epoch": 1.21,
651
+ "learning_rate": 8.02056555269923e-05,
652
+ "loss": 3.2055,
653
  "step": 940
654
  },
655
  {
656
  "epoch": 1.22,
657
+ "learning_rate": 7.892030848329049e-05,
658
+ "loss": 3.1691,
659
  "step": 950
660
  },
661
  {
662
  "epoch": 1.23,
663
+ "learning_rate": 7.763496143958869e-05,
664
+ "loss": 2.9756,
665
  "step": 960
666
  },
667
  {
668
  "epoch": 1.25,
669
+ "learning_rate": 7.634961439588689e-05,
670
+ "loss": 3.1635,
671
  "step": 970
672
  },
673
  {
674
  "epoch": 1.26,
675
+ "learning_rate": 7.50642673521851e-05,
676
+ "loss": 2.9979,
677
  "step": 980
678
  },
679
  {
680
  "epoch": 1.27,
681
+ "learning_rate": 7.37789203084833e-05,
682
+ "loss": 3.1009,
683
  "step": 990
684
  },
685
  {
686
  "epoch": 1.29,
687
+ "learning_rate": 7.24935732647815e-05,
688
+ "loss": 2.9991,
689
  "step": 1000
690
  },
691
  {
692
  "epoch": 1.29,
693
+ "eval_accuracy": 0.2743589743589744,
694
+ "eval_loss": 2.994030475616455,
695
+ "eval_runtime": 5.0834,
696
+ "eval_samples_per_second": 76.721,
697
+ "eval_steps_per_second": 9.639,
698
  "step": 1000
699
  },
700
  {
701
  "epoch": 1.3,
702
+ "learning_rate": 7.12082262210797e-05,
703
+ "loss": 3.0882,
704
  "step": 1010
705
  },
706
  {
707
  "epoch": 1.31,
708
+ "learning_rate": 6.99228791773779e-05,
709
+ "loss": 3.0527,
710
  "step": 1020
711
  },
712
  {
713
  "epoch": 1.32,
714
+ "learning_rate": 6.86375321336761e-05,
715
+ "loss": 2.9187,
716
  "step": 1030
717
  },
718
  {
719
  "epoch": 1.34,
720
+ "learning_rate": 6.73521850899743e-05,
721
+ "loss": 3.011,
722
  "step": 1040
723
  },
724
  {
725
  "epoch": 1.35,
726
+ "learning_rate": 6.606683804627249e-05,
727
+ "loss": 3.138,
728
  "step": 1050
729
  },
730
  {
731
  "epoch": 1.36,
732
+ "learning_rate": 6.478149100257069e-05,
733
+ "loss": 3.0516,
734
  "step": 1060
735
  },
736
  {
737
  "epoch": 1.38,
738
+ "learning_rate": 6.349614395886889e-05,
739
+ "loss": 2.9856,
740
  "step": 1070
741
  },
742
  {
743
  "epoch": 1.39,
744
+ "learning_rate": 6.22107969151671e-05,
745
+ "loss": 2.9641,
746
  "step": 1080
747
  },
748
  {
749
  "epoch": 1.4,
750
+ "learning_rate": 6.09254498714653e-05,
751
+ "loss": 2.9252,
752
  "step": 1090
753
  },
754
  {
755
  "epoch": 1.41,
756
+ "learning_rate": 5.96401028277635e-05,
757
+ "loss": 2.9279,
758
  "step": 1100
759
  },
760
  {
761
  "epoch": 1.41,
762
+ "eval_accuracy": 0.2717948717948718,
763
+ "eval_loss": 2.9565303325653076,
764
+ "eval_runtime": 4.7654,
765
+ "eval_samples_per_second": 81.839,
766
+ "eval_steps_per_second": 10.282,
767
  "step": 1100
768
  },
769
  {
770
  "epoch": 1.43,
771
+ "learning_rate": 5.83547557840617e-05,
772
+ "loss": 3.1913,
773
  "step": 1110
774
  },
775
  {
776
  "epoch": 1.44,
777
+ "learning_rate": 5.7069408740359896e-05,
778
+ "loss": 2.8937,
779
  "step": 1120
780
  },
781
  {
782
  "epoch": 1.45,
783
+ "learning_rate": 5.57840616966581e-05,
784
+ "loss": 3.0555,
785
  "step": 1130
786
  },
787
  {
788
  "epoch": 1.47,
789
+ "learning_rate": 5.44987146529563e-05,
790
+ "loss": 3.0048,
791
  "step": 1140
792
  },
793
  {
794
  "epoch": 1.48,
795
+ "learning_rate": 5.3213367609254506e-05,
796
+ "loss": 3.1386,
797
  "step": 1150
798
  },
799
  {
800
  "epoch": 1.49,
801
+ "learning_rate": 5.192802056555271e-05,
802
+ "loss": 3.1599,
803
  "step": 1160
804
  },
805
  {
806
  "epoch": 1.5,
807
+ "learning_rate": 5.06426735218509e-05,
808
+ "loss": 3.1027,
809
  "step": 1170
810
  },
811
  {
812
  "epoch": 1.52,
813
+ "learning_rate": 4.93573264781491e-05,
814
+ "loss": 2.7227,
815
  "step": 1180
816
  },
817
  {
818
  "epoch": 1.53,
819
+ "learning_rate": 4.80719794344473e-05,
820
+ "loss": 3.2903,
821
  "step": 1190
822
  },
823
  {
824
  "epoch": 1.54,
825
+ "learning_rate": 4.6786632390745505e-05,
826
+ "loss": 2.8635,
827
  "step": 1200
828
  },
829
  {
830
  "epoch": 1.54,
831
+ "eval_accuracy": 0.27692307692307694,
832
+ "eval_loss": 2.9233222007751465,
833
+ "eval_runtime": 4.7217,
834
+ "eval_samples_per_second": 82.598,
835
+ "eval_steps_per_second": 10.378,
836
  "step": 1200
837
  },
838
  {
839
  "epoch": 1.56,
840
+ "learning_rate": 4.5501285347043706e-05,
841
+ "loss": 3.4916,
842
  "step": 1210
843
  },
844
  {
845
  "epoch": 1.57,
846
+ "learning_rate": 4.42159383033419e-05,
847
+ "loss": 2.9104,
848
  "step": 1220
849
  },
850
  {
851
  "epoch": 1.58,
852
+ "learning_rate": 4.293059125964011e-05,
853
+ "loss": 3.0576,
854
  "step": 1230
855
  },
856
  {
857
  "epoch": 1.59,
858
+ "learning_rate": 4.16452442159383e-05,
859
+ "loss": 2.8137,
860
  "step": 1240
861
  },
862
  {
863
  "epoch": 1.61,
864
+ "learning_rate": 4.0359897172236504e-05,
865
+ "loss": 3.1792,
866
  "step": 1250
867
  },
868
  {
869
  "epoch": 1.62,
870
+ "learning_rate": 3.9074550128534705e-05,
871
+ "loss": 3.0377,
872
  "step": 1260
873
  },
874
  {
875
  "epoch": 1.63,
876
+ "learning_rate": 3.7789203084832907e-05,
877
+ "loss": 3.1019,
878
  "step": 1270
879
  },
880
  {
881
  "epoch": 1.65,
882
+ "learning_rate": 3.650385604113111e-05,
883
+ "loss": 2.9101,
884
  "step": 1280
885
  },
886
  {
887
  "epoch": 1.66,
888
+ "learning_rate": 3.521850899742931e-05,
889
+ "loss": 2.9055,
890
  "step": 1290
891
  },
892
  {
893
  "epoch": 1.67,
894
+ "learning_rate": 3.393316195372751e-05,
895
+ "loss": 2.9502,
896
  "step": 1300
897
  },
898
  {
899
  "epoch": 1.67,
900
+ "eval_accuracy": 0.29743589743589743,
901
+ "eval_loss": 2.92795467376709,
902
+ "eval_runtime": 4.8649,
903
+ "eval_samples_per_second": 80.166,
904
+ "eval_steps_per_second": 10.072,
905
  "step": 1300
906
  },
907
  {
908
  "epoch": 1.68,
909
+ "learning_rate": 3.264781491002571e-05,
910
+ "loss": 2.837,
911
  "step": 1310
912
  },
913
  {
914
  "epoch": 1.7,
915
+ "learning_rate": 3.1362467866323906e-05,
916
+ "loss": 3.0021,
917
  "step": 1320
918
  },
919
  {
920
  "epoch": 1.71,
921
+ "learning_rate": 3.0077120822622107e-05,
922
+ "loss": 2.974,
923
  "step": 1330
924
  },
925
  {
926
  "epoch": 1.72,
927
+ "learning_rate": 2.879177377892031e-05,
928
+ "loss": 3.1642,
929
  "step": 1340
930
  },
931
  {
932
  "epoch": 1.74,
933
+ "learning_rate": 2.750642673521851e-05,
934
+ "loss": 2.8847,
935
  "step": 1350
936
  },
937
  {
938
  "epoch": 1.75,
939
+ "learning_rate": 2.622107969151671e-05,
940
+ "loss": 2.963,
941
  "step": 1360
942
  },
943
  {
944
  "epoch": 1.76,
945
+ "learning_rate": 2.493573264781491e-05,
946
+ "loss": 2.9772,
947
  "step": 1370
948
  },
949
  {
950
  "epoch": 1.77,
951
+ "learning_rate": 2.3650385604113112e-05,
952
+ "loss": 2.9531,
953
  "step": 1380
954
  },
955
  {
956
  "epoch": 1.79,
957
+ "learning_rate": 2.236503856041131e-05,
958
+ "loss": 3.0706,
959
  "step": 1390
960
  },
961
  {
962
  "epoch": 1.8,
963
+ "learning_rate": 2.107969151670951e-05,
964
+ "loss": 3.084,
965
  "step": 1400
966
  },
967
  {
968
  "epoch": 1.8,
969
+ "eval_accuracy": 0.31025641025641026,
970
+ "eval_loss": 2.86479115486145,
971
+ "eval_runtime": 4.9511,
972
+ "eval_samples_per_second": 78.77,
973
+ "eval_steps_per_second": 9.897,
974
  "step": 1400
975
  },
976
  {
977
  "epoch": 1.81,
978
+ "learning_rate": 1.9794344473007716e-05,
979
+ "loss": 2.8422,
980
  "step": 1410
981
  },
982
  {
983
  "epoch": 1.83,
984
+ "learning_rate": 1.8508997429305914e-05,
985
+ "loss": 2.8483,
986
  "step": 1420
987
  },
988
  {
989
  "epoch": 1.84,
990
+ "learning_rate": 1.7223650385604115e-05,
991
+ "loss": 2.8629,
992
  "step": 1430
993
  },
994
  {
995
  "epoch": 1.85,
996
+ "learning_rate": 1.5938303341902313e-05,
997
+ "loss": 3.0296,
998
  "step": 1440
999
  },
1000
  {
1001
  "epoch": 1.86,
1002
+ "learning_rate": 1.4652956298200515e-05,
1003
+ "loss": 3.2828,
1004
  "step": 1450
1005
  },
1006
  {
1007
  "epoch": 1.88,
1008
+ "learning_rate": 1.3367609254498715e-05,
1009
+ "loss": 2.8906,
1010
  "step": 1460
1011
  },
1012
  {
1013
  "epoch": 1.89,
1014
+ "learning_rate": 1.2082262210796916e-05,
1015
+ "loss": 2.8501,
1016
  "step": 1470
1017
  },
1018
  {
1019
  "epoch": 1.9,
1020
+ "learning_rate": 1.0796915167095115e-05,
1021
+ "loss": 3.1941,
1022
  "step": 1480
1023
  },
1024
  {
1025
  "epoch": 1.92,
1026
+ "learning_rate": 9.511568123393317e-06,
1027
+ "loss": 3.2227,
1028
  "step": 1490
1029
  },
1030
  {
1031
  "epoch": 1.93,
1032
+ "learning_rate": 8.226221079691516e-06,
1033
+ "loss": 2.8916,
1034
  "step": 1500
1035
  },
1036
  {
1037
  "epoch": 1.93,
1038
+ "eval_accuracy": 0.3,
1039
+ "eval_loss": 2.87473464012146,
1040
+ "eval_runtime": 5.5554,
1041
+ "eval_samples_per_second": 70.202,
1042
+ "eval_steps_per_second": 8.82,
1043
  "step": 1500
1044
  },
1045
  {
1046
  "epoch": 1.94,
1047
+ "learning_rate": 6.940874035989718e-06,
1048
+ "loss": 2.696,
1049
  "step": 1510
1050
  },
1051
  {
1052
  "epoch": 1.95,
1053
+ "learning_rate": 5.6555269922879175e-06,
1054
+ "loss": 3.1394,
1055
  "step": 1520
1056
  },
1057
  {
1058
  "epoch": 1.97,
1059
+ "learning_rate": 4.370179948586119e-06,
1060
+ "loss": 2.8452,
1061
  "step": 1530
1062
  },
1063
  {
1064
  "epoch": 1.98,
1065
+ "learning_rate": 3.084832904884319e-06,
1066
+ "loss": 2.7683,
1067
  "step": 1540
1068
  },
1069
  {
1070
  "epoch": 1.99,
1071
+ "learning_rate": 1.7994858611825194e-06,
1072
+ "loss": 2.681,
1073
  "step": 1550
1074
  },
1075
  {
1076
  "epoch": 2.0,
1077
  "step": 1556,
1078
  "total_flos": 1.1959742538326016e+17,
1079
+ "train_loss": 3.140765584832905,
1080
+ "train_runtime": 414.3391,
1081
+ "train_samples_per_second": 7.511,
1082
+ "train_steps_per_second": 3.755
1083
  },
1084
  {
1085
  "epoch": 2.0,
1086
+ "eval_accuracy": 0.31025641025641026,
1087
+ "eval_loss": 2.86479115486145,
1088
+ "eval_runtime": 5.2198,
1089
+ "eval_samples_per_second": 74.716,
1090
+ "eval_steps_per_second": 9.387,
1091
  "step": 1556
1092
  }
1093
  ],
validation_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_accuracy": 0.4076923076923077,
4
- "eval_loss": 2.783235788345337,
5
- "eval_runtime": 5.3396,
6
- "eval_samples_per_second": 73.039,
7
- "eval_steps_per_second": 9.177
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_accuracy": 0.31025641025641026,
4
+ "eval_loss": 2.86479115486145,
5
+ "eval_runtime": 5.2198,
6
+ "eval_samples_per_second": 74.716,
7
+ "eval_steps_per_second": 9.387
8
  }