gokuls commited on
Commit
7c4ffb1
1 Parent(s): 02f518a

End of training

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ trainer_state.json filter=lfs diff=lfs merge=lfs -text
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 1.53,
3
- "eval_accuracy": 2.9114457332740555e-05,
4
- "eval_loss": NaN,
5
- "eval_runtime": 1244.2688,
6
  "eval_samples": 308356,
7
- "eval_samples_per_second": 247.821,
8
- "eval_steps_per_second": 3.873,
9
- "perplexity": NaN,
10
- "train_loss": 0.3459514281929236,
11
- "train_runtime": 108836.6872,
12
  "train_samples": 5858758,
13
- "train_samples_per_second": 269.154,
14
- "train_steps_per_second": 4.206
15
  }
 
1
  {
2
+ "epoch": 0.85,
3
+ "eval_accuracy": 0.15112417542322987,
4
+ "eval_loss": 6.060698509216309,
5
+ "eval_runtime": 1562.4042,
6
  "eval_samples": 308356,
7
+ "eval_samples_per_second": 197.36,
8
+ "eval_steps_per_second": 4.112,
9
+ "perplexity": 428.6747655797377,
10
+ "train_loss": 6.324463086895448,
11
+ "train_runtime": 89974.5083,
12
  "train_samples": 5858758,
13
+ "train_samples_per_second": 325.579,
14
+ "train_steps_per_second": 6.783
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 1.53,
3
- "eval_accuracy": 2.9114457332740555e-05,
4
- "eval_loss": NaN,
5
- "eval_runtime": 1244.2688,
6
  "eval_samples": 308356,
7
- "eval_samples_per_second": 247.821,
8
- "eval_steps_per_second": 3.873,
9
- "perplexity": NaN
10
  }
 
1
  {
2
+ "epoch": 0.85,
3
+ "eval_accuracy": 0.15112417542322987,
4
+ "eval_loss": 6.060698509216309,
5
+ "eval_runtime": 1562.4042,
6
  "eval_samples": 308356,
7
+ "eval_samples_per_second": 197.36,
8
+ "eval_steps_per_second": 4.112,
9
+ "perplexity": 428.6747655797377
10
  }
logs/events.out.tfevents.1686401208.serv-3318.135844.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5283c3252ebe3cbb3f187742a92785b0b3f6283eda8db2d6cc29049cf9c4bfc3
3
+ size 369
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.53,
3
- "train_loss": 0.3459514281929236,
4
- "train_runtime": 108836.6872,
5
  "train_samples": 5858758,
6
- "train_samples_per_second": 269.154,
7
- "train_steps_per_second": 4.206
8
  }
 
1
  {
2
+ "epoch": 0.85,
3
+ "train_loss": 6.324463086895448,
4
+ "train_runtime": 89974.5083,
5
  "train_samples": 5858758,
6
+ "train_samples_per_second": 325.579,
7
+ "train_steps_per_second": 6.783
8
  }
trainer_state.json CHANGED
@@ -1,1831 +1,3 @@
1
- {
2
- "best_metric": NaN,
3
- "best_model_checkpoint": "add_bert_12_layer_model_complete_training_new/checkpoint-10000",
4
- "epoch": 1.5293301581753038,
5
- "global_step": 140001,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 2.5e-05,
13
- "loss": 8.2896,
14
- "step": 500
15
- },
16
- {
17
- "epoch": 0.01,
18
- "learning_rate": 5e-05,
19
- "loss": 6.8372,
20
- "step": 1000
21
- },
22
- {
23
- "epoch": 0.02,
24
- "learning_rate": 7.5e-05,
25
- "loss": 6.683,
26
- "step": 1500
27
- },
28
- {
29
- "epoch": 0.02,
30
- "learning_rate": 0.0001,
31
- "loss": 6.5955,
32
- "step": 2000
33
- },
34
- {
35
- "epoch": 0.03,
36
- "learning_rate": 0.000125,
37
- "loss": 6.5373,
38
- "step": 2500
39
- },
40
- {
41
- "epoch": 0.03,
42
- "learning_rate": 0.00015,
43
- "loss": 6.4804,
44
- "step": 3000
45
- },
46
- {
47
- "epoch": 0.04,
48
- "learning_rate": 0.000175,
49
- "loss": 6.4463,
50
- "step": 3500
51
- },
52
- {
53
- "epoch": 0.04,
54
- "learning_rate": 0.0002,
55
- "loss": 6.408,
56
- "step": 4000
57
- },
58
- {
59
- "epoch": 0.05,
60
- "learning_rate": 0.00022500000000000002,
61
- "loss": 6.3827,
62
- "step": 4500
63
- },
64
- {
65
- "epoch": 0.05,
66
- "learning_rate": 0.00025,
67
- "loss": 6.3731,
68
- "step": 5000
69
- },
70
- {
71
- "epoch": 0.06,
72
- "learning_rate": 0.000275,
73
- "loss": 6.3479,
74
- "step": 5500
75
- },
76
- {
77
- "epoch": 0.07,
78
- "learning_rate": 0.0003,
79
- "loss": 6.3401,
80
- "step": 6000
81
- },
82
- {
83
- "epoch": 0.07,
84
- "learning_rate": 0.0003239,
85
- "loss": 7.0795,
86
- "step": 6500
87
- },
88
- {
89
- "epoch": 0.08,
90
- "learning_rate": 0.00034155000000000003,
91
- "loss": 10.0665,
92
- "step": 7000
93
- },
94
- {
95
- "epoch": 0.08,
96
- "learning_rate": 0.00036655,
97
- "loss": 0.0,
98
- "step": 7500
99
- },
100
- {
101
- "epoch": 0.09,
102
- "learning_rate": 0.00039155,
103
- "loss": 0.0,
104
- "step": 8000
105
- },
106
- {
107
- "epoch": 0.09,
108
- "learning_rate": 0.00041654999999999996,
109
- "loss": 0.0,
110
- "step": 8500
111
- },
112
- {
113
- "epoch": 0.1,
114
- "learning_rate": 0.00044155,
115
- "loss": 0.0,
116
- "step": 9000
117
- },
118
- {
119
- "epoch": 0.1,
120
- "learning_rate": 0.00046655000000000004,
121
- "loss": 0.0,
122
- "step": 9500
123
- },
124
- {
125
- "epoch": 0.11,
126
- "learning_rate": 0.00049155,
127
- "loss": 0.0,
128
- "step": 10000
129
- },
130
- {
131
- "epoch": 0.11,
132
- "eval_accuracy": 3.0592783198016594e-05,
133
- "eval_loss": NaN,
134
- "eval_runtime": 1245.7969,
135
- "eval_samples_per_second": 247.517,
136
- "eval_steps_per_second": 3.868,
137
- "step": 10000
138
- },
139
- {
140
- "epoch": 0.11,
141
- "learning_rate": 0.0004996303493254713,
142
- "loss": 0.0,
143
- "step": 10500
144
- },
145
- {
146
- "epoch": 0.12,
147
- "learning_rate": 0.0004990719646207451,
148
- "loss": 0.0,
149
- "step": 11000
150
- },
151
- {
152
- "epoch": 0.13,
153
- "learning_rate": 0.000498513579916019,
154
- "loss": 0.0,
155
- "step": 11500
156
- },
157
- {
158
- "epoch": 0.13,
159
- "learning_rate": 0.0004979551952112928,
160
- "loss": 0.0,
161
- "step": 12000
162
- },
163
- {
164
- "epoch": 0.14,
165
- "learning_rate": 0.0004973968105065666,
166
- "loss": 0.0,
167
- "step": 12500
168
- },
169
- {
170
- "epoch": 0.14,
171
- "learning_rate": 0.0004968384258018405,
172
- "loss": 0.0,
173
- "step": 13000
174
- },
175
- {
176
- "epoch": 0.15,
177
- "learning_rate": 0.0004962800410971143,
178
- "loss": 0.0,
179
- "step": 13500
180
- },
181
- {
182
- "epoch": 0.15,
183
- "learning_rate": 0.0004957216563923881,
184
- "loss": 0.0,
185
- "step": 14000
186
- },
187
- {
188
- "epoch": 0.16,
189
- "learning_rate": 0.0004951632716876619,
190
- "loss": 0.0,
191
- "step": 14500
192
- },
193
- {
194
- "epoch": 0.16,
195
- "learning_rate": 0.0004946048869829357,
196
- "loss": 0.0,
197
- "step": 15000
198
- },
199
- {
200
- "epoch": 0.17,
201
- "learning_rate": 0.0004940465022782096,
202
- "loss": 0.0,
203
- "step": 15500
204
- },
205
- {
206
- "epoch": 0.17,
207
- "learning_rate": 0.0004934881175734834,
208
- "loss": 0.0,
209
- "step": 16000
210
- },
211
- {
212
- "epoch": 0.18,
213
- "learning_rate": 0.0004929297328687572,
214
- "loss": 0.0,
215
- "step": 16500
216
- },
217
- {
218
- "epoch": 0.19,
219
- "learning_rate": 0.0004923713481640312,
220
- "loss": 0.0,
221
- "step": 17000
222
- },
223
- {
224
- "epoch": 0.19,
225
- "learning_rate": 0.000491812963459305,
226
- "loss": 0.0,
227
- "step": 17500
228
- },
229
- {
230
- "epoch": 0.2,
231
- "learning_rate": 0.0004912545787545788,
232
- "loss": 0.0,
233
- "step": 18000
234
- },
235
- {
236
- "epoch": 0.2,
237
- "learning_rate": 0.0004906961940498526,
238
- "loss": 0.0,
239
- "step": 18500
240
- },
241
- {
242
- "epoch": 0.21,
243
- "learning_rate": 0.0004901378093451264,
244
- "loss": 0.0,
245
- "step": 19000
246
- },
247
- {
248
- "epoch": 0.21,
249
- "learning_rate": 0.0004895794246404003,
250
- "loss": 0.0,
251
- "step": 19500
252
- },
253
- {
254
- "epoch": 0.22,
255
- "learning_rate": 0.0004890210399356741,
256
- "loss": 0.0,
257
- "step": 20000
258
- },
259
- {
260
- "epoch": 0.22,
261
- "eval_accuracy": 3.5159707526351004e-05,
262
- "eval_loss": NaN,
263
- "eval_runtime": 1244.1268,
264
- "eval_samples_per_second": 247.849,
265
- "eval_steps_per_second": 3.873,
266
- "step": 20000
267
- },
268
- {
269
- "epoch": 0.22,
270
- "learning_rate": 0.0004884626552309479,
271
- "loss": 0.0,
272
- "step": 20500
273
- },
274
- {
275
- "epoch": 0.23,
276
- "learning_rate": 0.00048790427052622175,
277
- "loss": 0.0,
278
- "step": 21000
279
- },
280
- {
281
- "epoch": 0.23,
282
- "learning_rate": 0.00048734588582149556,
283
- "loss": 0.0,
284
- "step": 21500
285
- },
286
- {
287
- "epoch": 0.24,
288
- "learning_rate": 0.0004867875011167694,
289
- "loss": 0.0,
290
- "step": 22000
291
- },
292
- {
293
- "epoch": 0.25,
294
- "learning_rate": 0.0004862291164120433,
295
- "loss": 0.0,
296
- "step": 22500
297
- },
298
- {
299
- "epoch": 0.25,
300
- "learning_rate": 0.0004856707317073171,
301
- "loss": 0.0,
302
- "step": 23000
303
- },
304
- {
305
- "epoch": 0.26,
306
- "learning_rate": 0.00048511234700259094,
307
- "loss": 0.0,
308
- "step": 23500
309
- },
310
- {
311
- "epoch": 0.26,
312
- "learning_rate": 0.00048455396229786475,
313
- "loss": 0.0,
314
- "step": 24000
315
- },
316
- {
317
- "epoch": 0.27,
318
- "learning_rate": 0.00048399557759313855,
319
- "loss": 0.0,
320
- "step": 24500
321
- },
322
- {
323
- "epoch": 0.27,
324
- "learning_rate": 0.0004834371928884124,
325
- "loss": 0.0,
326
- "step": 25000
327
- },
328
- {
329
- "epoch": 0.28,
330
- "learning_rate": 0.0004828788081836862,
331
- "loss": 0.0,
332
- "step": 25500
333
- },
334
- {
335
- "epoch": 0.28,
336
- "learning_rate": 0.0004823204234789601,
337
- "loss": 0.0,
338
- "step": 26000
339
- },
340
- {
341
- "epoch": 0.29,
342
- "learning_rate": 0.00048176203877423393,
343
- "loss": 0.0,
344
- "step": 26500
345
- },
346
- {
347
- "epoch": 0.29,
348
- "learning_rate": 0.00048120365406950774,
349
- "loss": 0.0,
350
- "step": 27000
351
- },
352
- {
353
- "epoch": 0.3,
354
- "learning_rate": 0.0004806452693647816,
355
- "loss": 0.0,
356
- "step": 27500
357
- },
358
- {
359
- "epoch": 0.31,
360
- "learning_rate": 0.0004800868846600554,
361
- "loss": 0.0,
362
- "step": 28000
363
- },
364
- {
365
- "epoch": 0.31,
366
- "learning_rate": 0.0004795284999553292,
367
- "loss": 0.0,
368
- "step": 28500
369
- },
370
- {
371
- "epoch": 0.32,
372
- "learning_rate": 0.00047897011525060306,
373
- "loss": 0.0,
374
- "step": 29000
375
- },
376
- {
377
- "epoch": 0.32,
378
- "learning_rate": 0.00047841173054587687,
379
- "loss": 0.0,
380
- "step": 29500
381
- },
382
- {
383
- "epoch": 0.33,
384
- "learning_rate": 0.0004778533458411508,
385
- "loss": 0.0,
386
- "step": 30000
387
- },
388
- {
389
- "epoch": 0.33,
390
- "eval_accuracy": 3.357992413338889e-05,
391
- "eval_loss": NaN,
392
- "eval_runtime": 1245.2057,
393
- "eval_samples_per_second": 247.635,
394
- "eval_steps_per_second": 3.87,
395
- "step": 30000
396
- },
397
- {
398
- "epoch": 0.33,
399
- "learning_rate": 0.0004772949611364246,
400
- "loss": 0.0,
401
- "step": 30500
402
- },
403
- {
404
- "epoch": 0.34,
405
- "learning_rate": 0.0004767365764316984,
406
- "loss": 0.0,
407
- "step": 31000
408
- },
409
- {
410
- "epoch": 0.34,
411
- "learning_rate": 0.00047617819172697225,
412
- "loss": 0.0,
413
- "step": 31500
414
- },
415
- {
416
- "epoch": 0.35,
417
- "learning_rate": 0.00047561980702224606,
418
- "loss": 0.0,
419
- "step": 32000
420
- },
421
- {
422
- "epoch": 0.36,
423
- "learning_rate": 0.00047506142231751986,
424
- "loss": 0.0,
425
- "step": 32500
426
- },
427
- {
428
- "epoch": 0.36,
429
- "learning_rate": 0.0004745030376127937,
430
- "loss": 0.0,
431
- "step": 33000
432
- },
433
- {
434
- "epoch": 0.37,
435
- "learning_rate": 0.0004739446529080675,
436
- "loss": 0.0,
437
- "step": 33500
438
- },
439
- {
440
- "epoch": 0.37,
441
- "learning_rate": 0.0004733862682033414,
442
- "loss": 0.0,
443
- "step": 34000
444
- },
445
- {
446
- "epoch": 0.38,
447
- "learning_rate": 0.00047282788349861524,
448
- "loss": 0.0,
449
- "step": 34500
450
- },
451
- {
452
- "epoch": 0.38,
453
- "learning_rate": 0.00047226949879388905,
454
- "loss": 0.0,
455
- "step": 35000
456
- },
457
- {
458
- "epoch": 0.39,
459
- "learning_rate": 0.0004717111140891629,
460
- "loss": 0.0,
461
- "step": 35500
462
- },
463
- {
464
- "epoch": 0.39,
465
- "learning_rate": 0.0004711527293844367,
466
- "loss": 0.0,
467
- "step": 36000
468
- },
469
- {
470
- "epoch": 0.4,
471
- "learning_rate": 0.0004705943446797105,
472
- "loss": 0.0,
473
- "step": 36500
474
- },
475
- {
476
- "epoch": 0.4,
477
- "learning_rate": 0.0004700359599749844,
478
- "loss": 0.0,
479
- "step": 37000
480
- },
481
- {
482
- "epoch": 0.41,
483
- "learning_rate": 0.0004694775752702582,
484
- "loss": 0.0,
485
- "step": 37500
486
- },
487
- {
488
- "epoch": 0.42,
489
- "learning_rate": 0.00046891919056553204,
490
- "loss": 0.0,
491
- "step": 38000
492
- },
493
- {
494
- "epoch": 0.42,
495
- "learning_rate": 0.0004683608058608059,
496
- "loss": 0.0,
497
- "step": 38500
498
- },
499
- {
500
- "epoch": 0.43,
501
- "learning_rate": 0.0004678024211560797,
502
- "loss": 0.0,
503
- "step": 39000
504
- },
505
- {
506
- "epoch": 0.43,
507
- "learning_rate": 0.00046724403645135356,
508
- "loss": 0.0,
509
- "step": 39500
510
- },
511
- {
512
- "epoch": 0.44,
513
- "learning_rate": 0.00046668565174662736,
514
- "loss": 0.0,
515
- "step": 40000
516
- },
517
- {
518
- "epoch": 0.44,
519
- "eval_accuracy": 3.1386399970587474e-05,
520
- "eval_loss": NaN,
521
- "eval_runtime": 1244.2525,
522
- "eval_samples_per_second": 247.824,
523
- "eval_steps_per_second": 3.873,
524
- "step": 40000
525
- },
526
- {
527
- "epoch": 0.44,
528
- "learning_rate": 0.00046612726704190117,
529
- "loss": 0.0,
530
- "step": 40500
531
- },
532
- {
533
- "epoch": 0.45,
534
- "learning_rate": 0.00046556888233717503,
535
- "loss": 0.0,
536
- "step": 41000
537
- },
538
- {
539
- "epoch": 0.45,
540
- "learning_rate": 0.00046501049763244883,
541
- "loss": 0.0,
542
- "step": 41500
543
- },
544
- {
545
- "epoch": 0.46,
546
- "learning_rate": 0.0004644521129277227,
547
- "loss": 0.0,
548
- "step": 42000
549
- },
550
- {
551
- "epoch": 0.46,
552
- "learning_rate": 0.00046389372822299655,
553
- "loss": 0.0,
554
- "step": 42500
555
- },
556
- {
557
- "epoch": 0.47,
558
- "learning_rate": 0.00046333534351827036,
559
- "loss": 0.0,
560
- "step": 43000
561
- },
562
- {
563
- "epoch": 0.48,
564
- "learning_rate": 0.0004627769588135442,
565
- "loss": 0.0,
566
- "step": 43500
567
- },
568
- {
569
- "epoch": 0.48,
570
- "learning_rate": 0.000462218574108818,
571
- "loss": 0.0,
572
- "step": 44000
573
- },
574
- {
575
- "epoch": 0.49,
576
- "learning_rate": 0.0004616601894040919,
577
- "loss": 0.0,
578
- "step": 44500
579
- },
580
- {
581
- "epoch": 0.49,
582
- "learning_rate": 0.0004611018046993657,
583
- "loss": 0.0,
584
- "step": 45000
585
- },
586
- {
587
- "epoch": 0.5,
588
- "learning_rate": 0.0004605434199946395,
589
- "loss": 0.0,
590
- "step": 45500
591
- },
592
- {
593
- "epoch": 0.5,
594
- "learning_rate": 0.00045998503528991335,
595
- "loss": 0.0,
596
- "step": 46000
597
- },
598
- {
599
- "epoch": 0.51,
600
- "learning_rate": 0.00045942665058518715,
601
- "loss": 0.0,
602
- "step": 46500
603
- },
604
- {
605
- "epoch": 0.51,
606
- "learning_rate": 0.000458868265880461,
607
- "loss": 0.0,
608
- "step": 47000
609
- },
610
- {
611
- "epoch": 0.52,
612
- "learning_rate": 0.00045830988117573487,
613
- "loss": 0.0,
614
- "step": 47500
615
- },
616
- {
617
- "epoch": 0.52,
618
- "learning_rate": 0.0004577514964710087,
619
- "loss": 0.0,
620
- "step": 48000
621
- },
622
- {
623
- "epoch": 0.53,
624
- "learning_rate": 0.00045719311176628253,
625
- "loss": 0.0,
626
- "step": 48500
627
- },
628
- {
629
- "epoch": 0.54,
630
- "learning_rate": 0.00045663472706155634,
631
- "loss": 0.0,
632
- "step": 49000
633
- },
634
- {
635
- "epoch": 0.54,
636
- "learning_rate": 0.00045607634235683014,
637
- "loss": 0.0,
638
- "step": 49500
639
- },
640
- {
641
- "epoch": 0.55,
642
- "learning_rate": 0.000455517957652104,
643
- "loss": 0.0,
644
- "step": 50000
645
- },
646
- {
647
- "epoch": 0.55,
648
- "eval_accuracy": 2.9965971727327976e-05,
649
- "eval_loss": NaN,
650
- "eval_runtime": 1242.0864,
651
- "eval_samples_per_second": 248.256,
652
- "eval_steps_per_second": 3.88,
653
- "step": 50000
654
- },
655
- {
656
- "epoch": 0.55,
657
- "learning_rate": 0.0004549595729473778,
658
- "loss": 0.0,
659
- "step": 50500
660
- },
661
- {
662
- "epoch": 0.56,
663
- "learning_rate": 0.00045440118824265167,
664
- "loss": 0.0,
665
- "step": 51000
666
- },
667
- {
668
- "epoch": 0.56,
669
- "learning_rate": 0.0004538428035379255,
670
- "loss": 0.0,
671
- "step": 51500
672
- },
673
- {
674
- "epoch": 0.57,
675
- "learning_rate": 0.00045328441883319933,
676
- "loss": 0.0,
677
- "step": 52000
678
- },
679
- {
680
- "epoch": 0.57,
681
- "learning_rate": 0.0004527260341284732,
682
- "loss": 0.0,
683
- "step": 52500
684
- },
685
- {
686
- "epoch": 0.58,
687
- "learning_rate": 0.000452167649423747,
688
- "loss": 0.0,
689
- "step": 53000
690
- },
691
- {
692
- "epoch": 0.58,
693
- "learning_rate": 0.0004516092647190208,
694
- "loss": 0.0,
695
- "step": 53500
696
- },
697
- {
698
- "epoch": 0.59,
699
- "learning_rate": 0.00045105088001429466,
700
- "loss": 0.0,
701
- "step": 54000
702
- },
703
- {
704
- "epoch": 0.6,
705
- "learning_rate": 0.00045049249530956846,
706
- "loss": 0.0,
707
- "step": 54500
708
- },
709
- {
710
- "epoch": 0.6,
711
- "learning_rate": 0.0004499341106048423,
712
- "loss": 0.0,
713
- "step": 55000
714
- },
715
- {
716
- "epoch": 0.61,
717
- "learning_rate": 0.0004493757259001162,
718
- "loss": 0.0,
719
- "step": 55500
720
- },
721
- {
722
- "epoch": 0.61,
723
- "learning_rate": 0.00044881734119539,
724
- "loss": 0.0,
725
- "step": 56000
726
- },
727
- {
728
- "epoch": 0.62,
729
- "learning_rate": 0.00044825895649066384,
730
- "loss": 0.0,
731
- "step": 56500
732
- },
733
- {
734
- "epoch": 0.62,
735
- "learning_rate": 0.00044770057178593765,
736
- "loss": 0.0,
737
- "step": 57000
738
- },
739
- {
740
- "epoch": 0.63,
741
- "learning_rate": 0.00044714218708121145,
742
- "loss": 0.0,
743
- "step": 57500
744
- },
745
- {
746
- "epoch": 0.63,
747
- "learning_rate": 0.0004465838023764853,
748
- "loss": 0.0,
749
- "step": 58000
750
- },
751
- {
752
- "epoch": 0.64,
753
- "learning_rate": 0.0004460254176717591,
754
- "loss": 0.0,
755
- "step": 58500
756
- },
757
- {
758
- "epoch": 0.64,
759
- "learning_rate": 0.00044546703296703303,
760
- "loss": 0.0,
761
- "step": 59000
762
- },
763
- {
764
- "epoch": 0.65,
765
- "learning_rate": 0.00044490864826230683,
766
- "loss": 0.0,
767
- "step": 59500
768
- },
769
- {
770
- "epoch": 0.66,
771
- "learning_rate": 0.00044435026355758064,
772
- "loss": 0.0,
773
- "step": 60000
774
- },
775
- {
776
- "epoch": 0.66,
777
- "eval_accuracy": 3.177880317382685e-05,
778
- "eval_loss": NaN,
779
- "eval_runtime": 1238.4379,
780
- "eval_samples_per_second": 248.988,
781
- "eval_steps_per_second": 3.891,
782
- "step": 60000
783
- },
784
- {
785
- "epoch": 0.66,
786
- "learning_rate": 0.0004437918788528545,
787
- "loss": 0.0,
788
- "step": 60500
789
- },
790
- {
791
- "epoch": 0.67,
792
- "learning_rate": 0.0004432334941481283,
793
- "loss": 0.0,
794
- "step": 61000
795
- },
796
- {
797
- "epoch": 0.67,
798
- "learning_rate": 0.0004426751094434021,
799
- "loss": 0.0,
800
- "step": 61500
801
- },
802
- {
803
- "epoch": 0.68,
804
- "learning_rate": 0.00044211672473867597,
805
- "loss": 0.0,
806
- "step": 62000
807
- },
808
- {
809
- "epoch": 0.68,
810
- "learning_rate": 0.00044155834003394977,
811
- "loss": 0.0,
812
- "step": 62500
813
- },
814
- {
815
- "epoch": 0.69,
816
- "learning_rate": 0.00044099995532922363,
817
- "loss": 0.0,
818
- "step": 63000
819
- },
820
- {
821
- "epoch": 0.69,
822
- "learning_rate": 0.0004404415706244975,
823
- "loss": 0.0,
824
- "step": 63500
825
- },
826
- {
827
- "epoch": 0.7,
828
- "learning_rate": 0.0004398831859197713,
829
- "loss": 0.0,
830
- "step": 64000
831
- },
832
- {
833
- "epoch": 0.7,
834
- "learning_rate": 0.00043932480121504515,
835
- "loss": 0.0,
836
- "step": 64500
837
- },
838
- {
839
- "epoch": 0.71,
840
- "learning_rate": 0.00043876641651031896,
841
- "loss": 0.0,
842
- "step": 65000
843
- },
844
- {
845
- "epoch": 0.72,
846
- "learning_rate": 0.00043820803180559276,
847
- "loss": 0.0,
848
- "step": 65500
849
- },
850
- {
851
- "epoch": 0.72,
852
- "learning_rate": 0.0004376496471008666,
853
- "loss": 0.0,
854
- "step": 66000
855
- },
856
- {
857
- "epoch": 0.73,
858
- "learning_rate": 0.0004370912623961404,
859
- "loss": 0.0,
860
- "step": 66500
861
- },
862
- {
863
- "epoch": 0.73,
864
- "learning_rate": 0.0004365328776914143,
865
- "loss": 0.0,
866
- "step": 67000
867
- },
868
- {
869
- "epoch": 0.74,
870
- "learning_rate": 0.00043597449298668814,
871
- "loss": 0.0,
872
- "step": 67500
873
- },
874
- {
875
- "epoch": 0.74,
876
- "learning_rate": 0.00043541610828196195,
877
- "loss": 0.0,
878
- "step": 68000
879
- },
880
- {
881
- "epoch": 0.75,
882
- "learning_rate": 0.0004348577235772358,
883
- "loss": 0.0,
884
- "step": 68500
885
- },
886
- {
887
- "epoch": 0.75,
888
- "learning_rate": 0.0004342993388725096,
889
- "loss": 0.0,
890
- "step": 69000
891
- },
892
- {
893
- "epoch": 0.76,
894
- "learning_rate": 0.0004337409541677834,
895
- "loss": 0.0,
896
- "step": 69500
897
- },
898
- {
899
- "epoch": 0.76,
900
- "learning_rate": 0.0004331825694630573,
901
- "loss": 0.0,
902
- "step": 70000
903
- },
904
- {
905
- "epoch": 0.76,
906
- "eval_accuracy": 3.253472148144988e-05,
907
- "eval_loss": NaN,
908
- "eval_runtime": 1240.7983,
909
- "eval_samples_per_second": 248.514,
910
- "eval_steps_per_second": 3.884,
911
- "step": 70000
912
- },
913
- {
914
- "epoch": 0.77,
915
- "learning_rate": 0.0004326241847583311,
916
- "loss": 0.0,
917
- "step": 70500
918
- },
919
- {
920
- "epoch": 0.78,
921
- "learning_rate": 0.00043206580005360494,
922
- "loss": 0.0,
923
- "step": 71000
924
- },
925
- {
926
- "epoch": 0.78,
927
- "learning_rate": 0.0004315074153488788,
928
- "loss": 0.0,
929
- "step": 71500
930
- },
931
- {
932
- "epoch": 0.79,
933
- "learning_rate": 0.0004309490306441526,
934
- "loss": 0.0,
935
- "step": 72000
936
- },
937
- {
938
- "epoch": 0.79,
939
- "learning_rate": 0.00043039064593942646,
940
- "loss": 0.0,
941
- "step": 72500
942
- },
943
- {
944
- "epoch": 0.8,
945
- "learning_rate": 0.00042983226123470027,
946
- "loss": 0.0,
947
- "step": 73000
948
- },
949
- {
950
- "epoch": 0.8,
951
- "learning_rate": 0.00042927387652997407,
952
- "loss": 0.0,
953
- "step": 73500
954
- },
955
- {
956
- "epoch": 0.81,
957
- "learning_rate": 0.00042871549182524793,
958
- "loss": 0.0,
959
- "step": 74000
960
- },
961
- {
962
- "epoch": 0.81,
963
- "learning_rate": 0.00042815710712052174,
964
- "loss": 0.0,
965
- "step": 74500
966
- },
967
- {
968
- "epoch": 0.82,
969
- "learning_rate": 0.0004275987224157956,
970
- "loss": 0.0,
971
- "step": 75000
972
- },
973
- {
974
- "epoch": 0.82,
975
- "learning_rate": 0.0004270403377110694,
976
- "loss": 0.0,
977
- "step": 75500
978
- },
979
- {
980
- "epoch": 0.83,
981
- "learning_rate": 0.00042648195300634326,
982
- "loss": 0.0,
983
- "step": 76000
984
- },
985
- {
986
- "epoch": 0.84,
987
- "learning_rate": 0.0004259235683016171,
988
- "loss": 0.0,
989
- "step": 76500
990
- },
991
- {
992
- "epoch": 0.84,
993
- "learning_rate": 0.0004253651835968909,
994
- "loss": 0.0,
995
- "step": 77000
996
- },
997
- {
998
- "epoch": 0.85,
999
- "learning_rate": 0.0004248067988921648,
1000
- "loss": 0.0,
1001
- "step": 77500
1002
- },
1003
- {
1004
- "epoch": 0.85,
1005
- "learning_rate": 0.0004242484141874386,
1006
- "loss": 0.0,
1007
- "step": 78000
1008
- },
1009
- {
1010
- "epoch": 0.86,
1011
- "learning_rate": 0.0004236900294827124,
1012
- "loss": 0.0,
1013
- "step": 78500
1014
- },
1015
- {
1016
- "epoch": 0.86,
1017
- "learning_rate": 0.00042313164477798625,
1018
- "loss": 0.0,
1019
- "step": 79000
1020
- },
1021
- {
1022
- "epoch": 0.87,
1023
- "learning_rate": 0.00042257326007326005,
1024
- "loss": 0.0,
1025
- "step": 79500
1026
- },
1027
- {
1028
- "epoch": 0.87,
1029
- "learning_rate": 0.0004220148753685339,
1030
- "loss": 0.0,
1031
- "step": 80000
1032
- },
1033
- {
1034
- "epoch": 0.87,
1035
- "eval_accuracy": 2.7615362733125802e-05,
1036
- "eval_loss": NaN,
1037
- "eval_runtime": 1242.4281,
1038
- "eval_samples_per_second": 248.188,
1039
- "eval_steps_per_second": 3.879,
1040
- "step": 80000
1041
- },
1042
- {
1043
- "epoch": 0.88,
1044
- "learning_rate": 0.00042145649066380777,
1045
- "loss": 0.0,
1046
- "step": 80500
1047
- },
1048
- {
1049
- "epoch": 0.88,
1050
- "learning_rate": 0.0004208981059590816,
1051
- "loss": 0.0,
1052
- "step": 81000
1053
- },
1054
- {
1055
- "epoch": 0.89,
1056
- "learning_rate": 0.00042033972125435544,
1057
- "loss": 0.0,
1058
- "step": 81500
1059
- },
1060
- {
1061
- "epoch": 0.9,
1062
- "learning_rate": 0.00041978133654962924,
1063
- "loss": 0.0,
1064
- "step": 82000
1065
- },
1066
- {
1067
- "epoch": 0.9,
1068
- "learning_rate": 0.00041922295184490305,
1069
- "loss": 0.0,
1070
- "step": 82500
1071
- },
1072
- {
1073
- "epoch": 0.91,
1074
- "learning_rate": 0.0004186645671401769,
1075
- "loss": 0.0,
1076
- "step": 83000
1077
- },
1078
- {
1079
- "epoch": 0.91,
1080
- "learning_rate": 0.0004181061824354507,
1081
- "loss": 0.0,
1082
- "step": 83500
1083
- },
1084
- {
1085
- "epoch": 0.92,
1086
- "learning_rate": 0.00041754779773072457,
1087
- "loss": 0.0,
1088
- "step": 84000
1089
- },
1090
- {
1091
- "epoch": 0.92,
1092
- "learning_rate": 0.00041698941302599843,
1093
- "loss": 0.0,
1094
- "step": 84500
1095
- },
1096
- {
1097
- "epoch": 0.93,
1098
- "learning_rate": 0.00041643102832127223,
1099
- "loss": 0.0,
1100
- "step": 85000
1101
- },
1102
- {
1103
- "epoch": 0.93,
1104
- "learning_rate": 0.0004158726436165461,
1105
- "loss": 0.0,
1106
- "step": 85500
1107
- },
1108
- {
1109
- "epoch": 0.94,
1110
- "learning_rate": 0.0004153142589118199,
1111
- "loss": 0.0,
1112
- "step": 86000
1113
- },
1114
- {
1115
- "epoch": 0.94,
1116
- "learning_rate": 0.0004147558742070937,
1117
- "loss": 0.0,
1118
- "step": 86500
1119
- },
1120
- {
1121
- "epoch": 0.95,
1122
- "learning_rate": 0.00041419748950236756,
1123
- "loss": 0.0,
1124
- "step": 87000
1125
- },
1126
- {
1127
- "epoch": 0.96,
1128
- "learning_rate": 0.00041363910479764136,
1129
- "loss": 0.0,
1130
- "step": 87500
1131
- },
1132
- {
1133
- "epoch": 0.96,
1134
- "learning_rate": 0.0004130807200929152,
1135
- "loss": 0.0,
1136
- "step": 88000
1137
- },
1138
- {
1139
- "epoch": 0.97,
1140
- "learning_rate": 0.0004125223353881891,
1141
- "loss": 0.0,
1142
- "step": 88500
1143
- },
1144
- {
1145
- "epoch": 0.97,
1146
- "learning_rate": 0.0004119639506834629,
1147
- "loss": 0.0,
1148
- "step": 89000
1149
- },
1150
- {
1151
- "epoch": 0.98,
1152
- "learning_rate": 0.00041140556597873675,
1153
- "loss": 0.0,
1154
- "step": 89500
1155
- },
1156
- {
1157
- "epoch": 0.98,
1158
- "learning_rate": 0.00041084718127401055,
1159
- "loss": 0.0,
1160
- "step": 90000
1161
- },
1162
- {
1163
- "epoch": 0.98,
1164
- "eval_accuracy": 2.9214303322906894e-05,
1165
- "eval_loss": NaN,
1166
- "eval_runtime": 1238.8828,
1167
- "eval_samples_per_second": 248.898,
1168
- "eval_steps_per_second": 3.89,
1169
- "step": 90000
1170
- },
1171
- {
1172
- "epoch": 0.99,
1173
- "learning_rate": 0.00041028879656928436,
1174
- "loss": 0.0,
1175
- "step": 90500
1176
- },
1177
- {
1178
- "epoch": 0.99,
1179
- "learning_rate": 0.0004097304118645582,
1180
- "loss": 0.0,
1181
- "step": 91000
1182
- },
1183
- {
1184
- "epoch": 1.0,
1185
- "learning_rate": 0.000409172027159832,
1186
- "loss": 0.0,
1187
- "step": 91500
1188
- },
1189
- {
1190
- "epoch": 1.0,
1191
- "learning_rate": 0.0004086136424551059,
1192
- "loss": 0.0,
1193
- "step": 92000
1194
- },
1195
- {
1196
- "epoch": 1.01,
1197
- "learning_rate": 0.00040805525775037974,
1198
- "loss": 0.0,
1199
- "step": 92500
1200
- },
1201
- {
1202
- "epoch": 1.02,
1203
- "learning_rate": 0.00040749687304565354,
1204
- "loss": 0.0,
1205
- "step": 93000
1206
- },
1207
- {
1208
- "epoch": 1.02,
1209
- "learning_rate": 0.0004069384883409274,
1210
- "loss": 0.0,
1211
- "step": 93500
1212
- },
1213
- {
1214
- "epoch": 1.03,
1215
- "learning_rate": 0.0004063801036362012,
1216
- "loss": 0.0,
1217
- "step": 94000
1218
- },
1219
- {
1220
- "epoch": 1.03,
1221
- "learning_rate": 0.000405821718931475,
1222
- "loss": 0.0,
1223
- "step": 94500
1224
- },
1225
- {
1226
- "epoch": 1.04,
1227
- "learning_rate": 0.00040526333422674887,
1228
- "loss": 0.0,
1229
- "step": 95000
1230
- },
1231
- {
1232
- "epoch": 1.04,
1233
- "learning_rate": 0.0004047049495220227,
1234
- "loss": 0.0,
1235
- "step": 95500
1236
- },
1237
- {
1238
- "epoch": 1.05,
1239
- "learning_rate": 0.00040414656481729653,
1240
- "loss": 0.0,
1241
- "step": 96000
1242
- },
1243
- {
1244
- "epoch": 1.05,
1245
- "learning_rate": 0.0004035881801125704,
1246
- "loss": 0.0,
1247
- "step": 96500
1248
- },
1249
- {
1250
- "epoch": 1.06,
1251
- "learning_rate": 0.0004030297954078442,
1252
- "loss": 0.0,
1253
- "step": 97000
1254
- },
1255
- {
1256
- "epoch": 1.07,
1257
- "learning_rate": 0.00040247141070311806,
1258
- "loss": 0.0,
1259
- "step": 97500
1260
- },
1261
- {
1262
- "epoch": 1.07,
1263
- "learning_rate": 0.00040191302599839186,
1264
- "loss": 0.0,
1265
- "step": 98000
1266
- },
1267
- {
1268
- "epoch": 1.08,
1269
- "learning_rate": 0.00040135464129366567,
1270
- "loss": 0.0,
1271
- "step": 98500
1272
- },
1273
- {
1274
- "epoch": 1.08,
1275
- "learning_rate": 0.0004007962565889395,
1276
- "loss": 0.0,
1277
- "step": 99000
1278
- },
1279
- {
1280
- "epoch": 1.09,
1281
- "learning_rate": 0.00040023787188421333,
1282
- "loss": 0.0,
1283
- "step": 99500
1284
- },
1285
- {
1286
- "epoch": 1.09,
1287
- "learning_rate": 0.0003996794871794872,
1288
- "loss": 0.0,
1289
- "step": 100000
1290
- },
1291
- {
1292
- "epoch": 1.09,
1293
- "eval_accuracy": 3.071726548590269e-05,
1294
- "eval_loss": NaN,
1295
- "eval_runtime": 1240.4665,
1296
- "eval_samples_per_second": 248.581,
1297
- "eval_steps_per_second": 3.885,
1298
- "step": 100000
1299
- },
1300
- {
1301
- "epoch": 1.1,
1302
- "learning_rate": 0.00039912110247476105,
1303
- "loss": 0.0,
1304
- "step": 100500
1305
- },
1306
- {
1307
- "epoch": 1.1,
1308
- "learning_rate": 0.00039856271777003485,
1309
- "loss": 0.0,
1310
- "step": 101000
1311
- },
1312
- {
1313
- "epoch": 1.11,
1314
- "learning_rate": 0.0003980043330653087,
1315
- "loss": 0.0,
1316
- "step": 101500
1317
- },
1318
- {
1319
- "epoch": 1.11,
1320
- "learning_rate": 0.0003974459483605825,
1321
- "loss": 0.0,
1322
- "step": 102000
1323
- },
1324
- {
1325
- "epoch": 1.12,
1326
- "learning_rate": 0.0003968875636558563,
1327
- "loss": 0.0,
1328
- "step": 102500
1329
- },
1330
- {
1331
- "epoch": 1.13,
1332
- "learning_rate": 0.0003963291789511302,
1333
- "loss": 0.0,
1334
- "step": 103000
1335
- },
1336
- {
1337
- "epoch": 1.13,
1338
- "learning_rate": 0.000395770794246404,
1339
- "loss": 0.0,
1340
- "step": 103500
1341
- },
1342
- {
1343
- "epoch": 1.14,
1344
- "learning_rate": 0.00039521240954167784,
1345
- "loss": 0.0,
1346
- "step": 104000
1347
- },
1348
- {
1349
- "epoch": 1.14,
1350
- "learning_rate": 0.0003946540248369517,
1351
- "loss": 0.0,
1352
- "step": 104500
1353
- },
1354
- {
1355
- "epoch": 1.15,
1356
- "learning_rate": 0.0003940956401322255,
1357
- "loss": 0.0,
1358
- "step": 105000
1359
- },
1360
- {
1361
- "epoch": 1.15,
1362
- "learning_rate": 0.00039353725542749937,
1363
- "loss": 0.0,
1364
- "step": 105500
1365
- },
1366
- {
1367
- "epoch": 1.16,
1368
- "learning_rate": 0.00039297887072277317,
1369
- "loss": 0.0,
1370
- "step": 106000
1371
- },
1372
- {
1373
- "epoch": 1.16,
1374
- "learning_rate": 0.00039242048601804703,
1375
- "loss": 0.0,
1376
- "step": 106500
1377
- },
1378
- {
1379
- "epoch": 1.17,
1380
- "learning_rate": 0.00039186210131332083,
1381
- "loss": 0.0,
1382
- "step": 107000
1383
- },
1384
- {
1385
- "epoch": 1.17,
1386
- "learning_rate": 0.00039130371660859464,
1387
- "loss": 0.0,
1388
- "step": 107500
1389
- },
1390
- {
1391
- "epoch": 1.18,
1392
- "learning_rate": 0.0003907453319038685,
1393
- "loss": 0.0,
1394
- "step": 108000
1395
- },
1396
- {
1397
- "epoch": 1.19,
1398
- "learning_rate": 0.0003901869471991423,
1399
- "loss": 0.0,
1400
- "step": 108500
1401
- },
1402
- {
1403
- "epoch": 1.19,
1404
- "learning_rate": 0.00038962856249441616,
1405
- "loss": 0.0,
1406
- "step": 109000
1407
- },
1408
- {
1409
- "epoch": 1.2,
1410
- "learning_rate": 0.00038907017778969,
1411
- "loss": 0.0,
1412
- "step": 109500
1413
- },
1414
- {
1415
- "epoch": 1.2,
1416
- "learning_rate": 0.0003885117930849638,
1417
- "loss": 0.0,
1418
- "step": 110000
1419
- },
1420
- {
1421
- "epoch": 1.2,
1422
- "eval_accuracy": 3.188648588911819e-05,
1423
- "eval_loss": NaN,
1424
- "eval_runtime": 1241.2976,
1425
- "eval_samples_per_second": 248.414,
1426
- "eval_steps_per_second": 3.882,
1427
- "step": 110000
1428
- },
1429
- {
1430
- "epoch": 1.21,
1431
- "learning_rate": 0.0003879534083802377,
1432
- "loss": 0.0,
1433
- "step": 110500
1434
- },
1435
- {
1436
- "epoch": 1.21,
1437
- "learning_rate": 0.0003873950236755115,
1438
- "loss": 0.0,
1439
- "step": 111000
1440
- },
1441
- {
1442
- "epoch": 1.22,
1443
- "learning_rate": 0.0003868366389707853,
1444
- "loss": 0.0,
1445
- "step": 111500
1446
- },
1447
- {
1448
- "epoch": 1.22,
1449
- "learning_rate": 0.00038627825426605915,
1450
- "loss": 0.0,
1451
- "step": 112000
1452
- },
1453
- {
1454
- "epoch": 1.23,
1455
- "learning_rate": 0.00038571986956133296,
1456
- "loss": 0.0,
1457
- "step": 112500
1458
- },
1459
- {
1460
- "epoch": 1.23,
1461
- "learning_rate": 0.0003851614848566068,
1462
- "loss": 0.0,
1463
- "step": 113000
1464
- },
1465
- {
1466
- "epoch": 1.24,
1467
- "learning_rate": 0.0003846031001518807,
1468
- "loss": 0.0,
1469
- "step": 113500
1470
- },
1471
- {
1472
- "epoch": 1.25,
1473
- "learning_rate": 0.0003840447154471545,
1474
- "loss": 0.0,
1475
- "step": 114000
1476
- },
1477
- {
1478
- "epoch": 1.25,
1479
- "learning_rate": 0.00038348633074242834,
1480
- "loss": 0.0,
1481
- "step": 114500
1482
- },
1483
- {
1484
- "epoch": 1.26,
1485
- "learning_rate": 0.00038292794603770214,
1486
- "loss": 0.0,
1487
- "step": 115000
1488
- },
1489
- {
1490
- "epoch": 1.26,
1491
- "learning_rate": 0.00038236956133297595,
1492
- "loss": 0.0,
1493
- "step": 115500
1494
- },
1495
- {
1496
- "epoch": 1.27,
1497
- "learning_rate": 0.0003818111766282498,
1498
- "loss": 0.0,
1499
- "step": 116000
1500
- },
1501
- {
1502
- "epoch": 1.27,
1503
- "learning_rate": 0.0003812527919235236,
1504
- "loss": 0.0,
1505
- "step": 116500
1506
- },
1507
- {
1508
- "epoch": 1.28,
1509
- "learning_rate": 0.00038069440721879747,
1510
- "loss": 0.0,
1511
- "step": 117000
1512
- },
1513
- {
1514
- "epoch": 1.28,
1515
- "learning_rate": 0.00038013602251407133,
1516
- "loss": 0.0,
1517
- "step": 117500
1518
- },
1519
- {
1520
- "epoch": 1.29,
1521
- "learning_rate": 0.00037957763780934514,
1522
- "loss": 0.0,
1523
- "step": 118000
1524
- },
1525
- {
1526
- "epoch": 1.29,
1527
- "learning_rate": 0.000379019253104619,
1528
- "loss": 0.0,
1529
- "step": 118500
1530
- },
1531
- {
1532
- "epoch": 1.3,
1533
- "learning_rate": 0.0003784608683998928,
1534
- "loss": 0.0,
1535
- "step": 119000
1536
- },
1537
- {
1538
- "epoch": 1.31,
1539
- "learning_rate": 0.0003779024836951666,
1540
- "loss": 0.0,
1541
- "step": 119500
1542
- },
1543
- {
1544
- "epoch": 1.31,
1545
- "learning_rate": 0.00037734409899044046,
1546
- "loss": 0.0,
1547
- "step": 120000
1548
- },
1549
- {
1550
- "epoch": 1.31,
1551
- "eval_accuracy": 2.9570698381633475e-05,
1552
- "eval_loss": NaN,
1553
- "eval_runtime": 1238.6519,
1554
- "eval_samples_per_second": 248.945,
1555
- "eval_steps_per_second": 3.891,
1556
- "step": 120000
1557
- },
1558
- {
1559
- "epoch": 1.32,
1560
- "learning_rate": 0.00037678571428571427,
1561
- "loss": 0.0,
1562
- "step": 120500
1563
- },
1564
- {
1565
- "epoch": 1.32,
1566
- "learning_rate": 0.00037622732958098807,
1567
- "loss": 0.0,
1568
- "step": 121000
1569
- },
1570
- {
1571
- "epoch": 1.33,
1572
- "learning_rate": 0.000375668944876262,
1573
- "loss": 0.0,
1574
- "step": 121500
1575
- },
1576
- {
1577
- "epoch": 1.33,
1578
- "learning_rate": 0.0003751105601715358,
1579
- "loss": 0.0,
1580
- "step": 122000
1581
- },
1582
- {
1583
- "epoch": 1.34,
1584
- "learning_rate": 0.00037455217546680965,
1585
- "loss": 0.0,
1586
- "step": 122500
1587
- },
1588
- {
1589
- "epoch": 1.34,
1590
- "learning_rate": 0.00037399379076208345,
1591
- "loss": 0.0,
1592
- "step": 123000
1593
- },
1594
- {
1595
- "epoch": 1.35,
1596
- "learning_rate": 0.00037343540605735726,
1597
- "loss": 0.0,
1598
- "step": 123500
1599
- },
1600
- {
1601
- "epoch": 1.35,
1602
- "learning_rate": 0.0003728770213526311,
1603
- "loss": 0.0,
1604
- "step": 124000
1605
- },
1606
- {
1607
- "epoch": 1.36,
1608
- "learning_rate": 0.0003723186366479049,
1609
- "loss": 0.0,
1610
- "step": 124500
1611
- },
1612
- {
1613
- "epoch": 1.37,
1614
- "learning_rate": 0.0003717602519431788,
1615
- "loss": 0.0,
1616
- "step": 125000
1617
- },
1618
- {
1619
- "epoch": 1.37,
1620
- "learning_rate": 0.00037120186723845264,
1621
- "loss": 0.0,
1622
- "step": 125500
1623
- },
1624
- {
1625
- "epoch": 1.38,
1626
- "learning_rate": 0.00037064348253372644,
1627
- "loss": 0.0,
1628
- "step": 126000
1629
- },
1630
- {
1631
- "epoch": 1.38,
1632
- "learning_rate": 0.0003700850978290003,
1633
- "loss": 0.0,
1634
- "step": 126500
1635
- },
1636
- {
1637
- "epoch": 1.39,
1638
- "learning_rate": 0.0003695267131242741,
1639
- "loss": 0.0,
1640
- "step": 127000
1641
- },
1642
- {
1643
- "epoch": 1.39,
1644
- "learning_rate": 0.0003689683284195479,
1645
- "loss": 0.0,
1646
- "step": 127500
1647
- },
1648
- {
1649
- "epoch": 1.4,
1650
- "learning_rate": 0.00036840994371482177,
1651
- "loss": 0.0,
1652
- "step": 128000
1653
- },
1654
- {
1655
- "epoch": 1.4,
1656
- "learning_rate": 0.0003678515590100956,
1657
- "loss": 0.0,
1658
- "step": 128500
1659
- },
1660
- {
1661
- "epoch": 1.41,
1662
- "learning_rate": 0.00036729317430536944,
1663
- "loss": 0.0,
1664
- "step": 129000
1665
- },
1666
- {
1667
- "epoch": 1.41,
1668
- "learning_rate": 0.0003667347896006433,
1669
- "loss": 0.0,
1670
- "step": 129500
1671
- },
1672
- {
1673
- "epoch": 1.42,
1674
- "learning_rate": 0.0003661764048959171,
1675
- "loss": 0.0,
1676
- "step": 130000
1677
- },
1678
- {
1679
- "epoch": 1.42,
1680
- "eval_accuracy": 3.2269763129715425e-05,
1681
- "eval_loss": NaN,
1682
- "eval_runtime": 1246.6351,
1683
- "eval_samples_per_second": 247.351,
1684
- "eval_steps_per_second": 3.866,
1685
- "step": 130000
1686
- },
1687
- {
1688
- "epoch": 1.43,
1689
- "learning_rate": 0.00036561802019119096,
1690
- "loss": 0.0,
1691
- "step": 130500
1692
- },
1693
- {
1694
- "epoch": 1.43,
1695
- "learning_rate": 0.00036505963548646476,
1696
- "loss": 0.0,
1697
- "step": 131000
1698
- },
1699
- {
1700
- "epoch": 1.44,
1701
- "learning_rate": 0.00036450125078173857,
1702
- "loss": 0.0,
1703
- "step": 131500
1704
- },
1705
- {
1706
- "epoch": 1.44,
1707
- "learning_rate": 0.0003639428660770124,
1708
- "loss": 0.0,
1709
- "step": 132000
1710
- },
1711
- {
1712
- "epoch": 1.45,
1713
- "learning_rate": 0.00036338448137228623,
1714
- "loss": 0.0,
1715
- "step": 132500
1716
- },
1717
- {
1718
- "epoch": 1.45,
1719
- "learning_rate": 0.0003628260966675601,
1720
- "loss": 0.0,
1721
- "step": 133000
1722
- },
1723
- {
1724
- "epoch": 1.46,
1725
- "learning_rate": 0.00036226771196283395,
1726
- "loss": 0.0,
1727
- "step": 133500
1728
- },
1729
- {
1730
- "epoch": 1.46,
1731
- "learning_rate": 0.00036170932725810775,
1732
- "loss": 0.0,
1733
- "step": 134000
1734
- },
1735
- {
1736
- "epoch": 1.47,
1737
- "learning_rate": 0.0003611509425533816,
1738
- "loss": 0.0,
1739
- "step": 134500
1740
- },
1741
- {
1742
- "epoch": 1.47,
1743
- "learning_rate": 0.0003605925578486554,
1744
- "loss": 0.0,
1745
- "step": 135000
1746
- },
1747
- {
1748
- "epoch": 1.48,
1749
- "learning_rate": 0.0003600341731439292,
1750
- "loss": 0.0,
1751
- "step": 135500
1752
- },
1753
- {
1754
- "epoch": 1.49,
1755
- "learning_rate": 0.0003594757884392031,
1756
- "loss": 0.0,
1757
- "step": 136000
1758
- },
1759
- {
1760
- "epoch": 1.49,
1761
- "learning_rate": 0.0003589174037344769,
1762
- "loss": 0.0,
1763
- "step": 136500
1764
- },
1765
- {
1766
- "epoch": 1.5,
1767
- "learning_rate": 0.00035835901902975075,
1768
- "loss": 0.0,
1769
- "step": 137000
1770
- },
1771
- {
1772
- "epoch": 1.5,
1773
- "learning_rate": 0.00035780063432502455,
1774
- "loss": 0.0,
1775
- "step": 137500
1776
- },
1777
- {
1778
- "epoch": 1.51,
1779
- "learning_rate": 0.0003572422496202984,
1780
- "loss": 0.0,
1781
- "step": 138000
1782
- },
1783
- {
1784
- "epoch": 1.51,
1785
- "learning_rate": 0.00035668386491557227,
1786
- "loss": 0.0,
1787
- "step": 138500
1788
- },
1789
- {
1790
- "epoch": 1.52,
1791
- "learning_rate": 0.0003561254802108461,
1792
- "loss": 0.0,
1793
- "step": 139000
1794
- },
1795
- {
1796
- "epoch": 1.52,
1797
- "learning_rate": 0.00035556709550611993,
1798
- "loss": 0.0,
1799
- "step": 139500
1800
- },
1801
- {
1802
- "epoch": 1.53,
1803
- "learning_rate": 0.00035500871080139374,
1804
- "loss": 0.0,
1805
- "step": 140000
1806
- },
1807
- {
1808
- "epoch": 1.53,
1809
- "eval_accuracy": 3.290421786718595e-05,
1810
- "eval_loss": NaN,
1811
- "eval_runtime": 1247.689,
1812
- "eval_samples_per_second": 247.142,
1813
- "eval_steps_per_second": 3.862,
1814
- "step": 140000
1815
- },
1816
- {
1817
- "epoch": 1.53,
1818
- "step": 140001,
1819
- "total_flos": 2.560247267189588e+18,
1820
- "train_loss": 0.3459514281929236,
1821
- "train_runtime": 108836.6872,
1822
- "train_samples_per_second": 269.154,
1823
- "train_steps_per_second": 4.206
1824
- }
1825
- ],
1826
- "max_steps": 457720,
1827
- "num_train_epochs": 5,
1828
- "total_flos": 2.560247267189588e+18,
1829
- "trial_name": null,
1830
- "trial_params": null
1831
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf488b89e0da577c50b385814b810962da42a93bacc6011e5e0c6b3664d5c0da
3
+ size 12524111