Aamod37 commited on
Commit
cac96d9
1 Parent(s): d9abc1d

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -1572
trainer_state.json DELETED
@@ -1,1572 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.47914153807761095,
5
- "eval_steps": 3000,
6
- "global_step": 10800,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.002218247861470421,
13
- "grad_norm": 0.15810145437717438,
14
- "learning_rate": 2e-05,
15
- "loss": 2.8627,
16
- "step": 50
17
- },
18
- {
19
- "epoch": 0.004436495722940842,
20
- "grad_norm": 0.1590433567762375,
21
- "learning_rate": 4e-05,
22
- "loss": 2.8607,
23
- "step": 100
24
- },
25
- {
26
- "epoch": 0.006654743584411263,
27
- "grad_norm": 0.15798641741275787,
28
- "learning_rate": 6e-05,
29
- "loss": 2.8623,
30
- "step": 150
31
- },
32
- {
33
- "epoch": 0.008872991445881684,
34
- "grad_norm": 0.16127805411815643,
35
- "learning_rate": 8e-05,
36
- "loss": 2.8608,
37
- "step": 200
38
- },
39
- {
40
- "epoch": 0.011091239307352105,
41
- "grad_norm": 0.1587396264076233,
42
- "learning_rate": 0.0001,
43
- "loss": 2.8608,
44
- "step": 250
45
- },
46
- {
47
- "epoch": 0.013309487168822525,
48
- "grad_norm": 0.160736083984375,
49
- "learning_rate": 0.00012,
50
- "loss": 2.8563,
51
- "step": 300
52
- },
53
- {
54
- "epoch": 0.015527735030292948,
55
- "grad_norm": 0.16256989538669586,
56
- "learning_rate": 0.00014,
57
- "loss": 2.8549,
58
- "step": 350
59
- },
60
- {
61
- "epoch": 0.01774598289176337,
62
- "grad_norm": 0.16194568574428558,
63
- "learning_rate": 0.00016,
64
- "loss": 2.8557,
65
- "step": 400
66
- },
67
- {
68
- "epoch": 0.01996423075323379,
69
- "grad_norm": 0.15836463868618011,
70
- "learning_rate": 0.00018,
71
- "loss": 2.8545,
72
- "step": 450
73
- },
74
- {
75
- "epoch": 0.02218247861470421,
76
- "grad_norm": 0.16059577465057373,
77
- "learning_rate": 0.0002,
78
- "loss": 2.8522,
79
- "step": 500
80
- },
81
- {
82
- "epoch": 0.024400726476174632,
83
- "grad_norm": 0.16031378507614136,
84
- "learning_rate": 0.00022000000000000003,
85
- "loss": 2.8481,
86
- "step": 550
87
- },
88
- {
89
- "epoch": 0.02661897433764505,
90
- "grad_norm": 0.16000501811504364,
91
- "learning_rate": 0.00024,
92
- "loss": 2.8431,
93
- "step": 600
94
- },
95
- {
96
- "epoch": 0.028837222199115473,
97
- "grad_norm": 0.15952646732330322,
98
- "learning_rate": 0.00026000000000000003,
99
- "loss": 2.8475,
100
- "step": 650
101
- },
102
- {
103
- "epoch": 0.031055470060585896,
104
- "grad_norm": 0.16443726420402527,
105
- "learning_rate": 0.00028,
106
- "loss": 2.8452,
107
- "step": 700
108
- },
109
- {
110
- "epoch": 0.033273717922056315,
111
- "grad_norm": 0.1644088476896286,
112
- "learning_rate": 0.00030000000000000003,
113
- "loss": 2.8458,
114
- "step": 750
115
- },
116
- {
117
- "epoch": 0.03549196578352674,
118
- "grad_norm": 0.16272033751010895,
119
- "learning_rate": 0.00032,
120
- "loss": 2.8435,
121
- "step": 800
122
- },
123
- {
124
- "epoch": 0.03771021364499716,
125
- "grad_norm": 0.16485804319381714,
126
- "learning_rate": 0.00034,
127
- "loss": 2.8481,
128
- "step": 850
129
- },
130
- {
131
- "epoch": 0.03992846150646758,
132
- "grad_norm": 0.1669188290834427,
133
- "learning_rate": 0.00036,
134
- "loss": 2.8555,
135
- "step": 900
136
- },
137
- {
138
- "epoch": 0.042146709367938,
139
- "grad_norm": 0.16288943588733673,
140
- "learning_rate": 0.00038,
141
- "loss": 2.851,
142
- "step": 950
143
- },
144
- {
145
- "epoch": 0.04436495722940842,
146
- "grad_norm": 0.1651136726140976,
147
- "learning_rate": 0.0004,
148
- "loss": 2.8443,
149
- "step": 1000
150
- },
151
- {
152
- "epoch": 0.04658320509087884,
153
- "grad_norm": 0.16190673410892487,
154
- "learning_rate": 0.00039999468202328424,
155
- "loss": 2.8398,
156
- "step": 1050
157
- },
158
- {
159
- "epoch": 0.048801452952349264,
160
- "grad_norm": 0.1649934947490692,
161
- "learning_rate": 0.00039997872837594555,
162
- "loss": 2.8371,
163
- "step": 1100
164
- },
165
- {
166
- "epoch": 0.051019700813819686,
167
- "grad_norm": 0.16184477508068085,
168
- "learning_rate": 0.00039995213990639536,
169
- "loss": 2.8347,
170
- "step": 1150
171
- },
172
- {
173
- "epoch": 0.0532379486752901,
174
- "grad_norm": 0.1629864126443863,
175
- "learning_rate": 0.0003999149180286022,
176
- "loss": 2.834,
177
- "step": 1200
178
- },
179
- {
180
- "epoch": 0.055456196536760524,
181
- "grad_norm": 0.1627526730298996,
182
- "learning_rate": 0.00039986706472201685,
183
- "loss": 2.8309,
184
- "step": 1250
185
- },
186
- {
187
- "epoch": 0.05767444439823095,
188
- "grad_norm": 0.1642647087574005,
189
- "learning_rate": 0.000399808582531467,
190
- "loss": 2.8352,
191
- "step": 1300
192
- },
193
- {
194
- "epoch": 0.05989269225970137,
195
- "grad_norm": 0.16397783160209656,
196
- "learning_rate": 0.000399739474567022,
197
- "loss": 2.8317,
198
- "step": 1350
199
- },
200
- {
201
- "epoch": 0.06211094012117179,
202
- "grad_norm": 0.16319701075553894,
203
- "learning_rate": 0.00039965974450382726,
204
- "loss": 2.8322,
205
- "step": 1400
206
- },
207
- {
208
- "epoch": 0.06432918798264221,
209
- "grad_norm": 0.16067005693912506,
210
- "learning_rate": 0.000399569396581909,
211
- "loss": 2.8279,
212
- "step": 1450
213
- },
214
- {
215
- "epoch": 0.06654743584411263,
216
- "grad_norm": 0.16118553280830383,
217
- "learning_rate": 0.00039946843560594866,
218
- "loss": 2.8323,
219
- "step": 1500
220
- },
221
- {
222
- "epoch": 0.06876568370558306,
223
- "grad_norm": 0.16291728615760803,
224
- "learning_rate": 0.0003993568669450274,
225
- "loss": 2.8301,
226
- "step": 1550
227
- },
228
- {
229
- "epoch": 0.07098393156705347,
230
- "grad_norm": 0.1590035855770111,
231
- "learning_rate": 0.0003992346965323407,
232
- "loss": 2.8214,
233
- "step": 1600
234
- },
235
- {
236
- "epoch": 0.07320217942852389,
237
- "grad_norm": 0.16236472129821777,
238
- "learning_rate": 0.00039910193086488253,
239
- "loss": 2.8242,
240
- "step": 1650
241
- },
242
- {
243
- "epoch": 0.07542042728999432,
244
- "grad_norm": 0.1617489606142044,
245
- "learning_rate": 0.0003989585770031003,
246
- "loss": 2.8231,
247
- "step": 1700
248
- },
249
- {
250
- "epoch": 0.07763867515146473,
251
- "grad_norm": 0.15960238873958588,
252
- "learning_rate": 0.000398804642570519,
253
- "loss": 2.8248,
254
- "step": 1750
255
- },
256
- {
257
- "epoch": 0.07985692301293516,
258
- "grad_norm": 0.16391754150390625,
259
- "learning_rate": 0.0003986401357533358,
260
- "loss": 2.8222,
261
- "step": 1800
262
- },
263
- {
264
- "epoch": 0.08207517087440558,
265
- "grad_norm": 0.16161847114562988,
266
- "learning_rate": 0.000398465065299985,
267
- "loss": 2.8153,
268
- "step": 1850
269
- },
270
- {
271
- "epoch": 0.084293418735876,
272
- "grad_norm": 0.16447125375270844,
273
- "learning_rate": 0.00039827944052067265,
274
- "loss": 2.818,
275
- "step": 1900
276
- },
277
- {
278
- "epoch": 0.08651166659734642,
279
- "grad_norm": 0.16384591162204742,
280
- "learning_rate": 0.0003980832712868812,
281
- "loss": 2.8093,
282
- "step": 1950
283
- },
284
- {
285
- "epoch": 0.08872991445881684,
286
- "grad_norm": 0.16317427158355713,
287
- "learning_rate": 0.0003978765680308447,
288
- "loss": 2.8113,
289
- "step": 2000
290
- },
291
- {
292
- "epoch": 0.09094816232028727,
293
- "grad_norm": 0.16197824478149414,
294
- "learning_rate": 0.00039765934174499436,
295
- "loss": 2.8134,
296
- "step": 2050
297
- },
298
- {
299
- "epoch": 0.09316641018175768,
300
- "grad_norm": 0.16196754574775696,
301
- "learning_rate": 0.00039743160398137344,
302
- "loss": 2.8147,
303
- "step": 2100
304
- },
305
- {
306
- "epoch": 0.0953846580432281,
307
- "grad_norm": 0.16696424782276154,
308
- "learning_rate": 0.00039719336685102314,
309
- "loss": 2.811,
310
- "step": 2150
311
- },
312
- {
313
- "epoch": 0.09760290590469853,
314
- "grad_norm": 0.16266262531280518,
315
- "learning_rate": 0.0003969446430233386,
316
- "loss": 2.8103,
317
- "step": 2200
318
- },
319
- {
320
- "epoch": 0.09982115376616894,
321
- "grad_norm": 0.16161397099494934,
322
- "learning_rate": 0.0003966854457253951,
323
- "loss": 2.8017,
324
- "step": 2250
325
- },
326
- {
327
- "epoch": 0.10203940162763937,
328
- "grad_norm": 0.1631053388118744,
329
- "learning_rate": 0.0003964157887412445,
330
- "loss": 2.8034,
331
- "step": 2300
332
- },
333
- {
334
- "epoch": 0.10425764948910979,
335
- "grad_norm": 0.16185788810253143,
336
- "learning_rate": 0.00039613568641118255,
337
- "loss": 2.8027,
338
- "step": 2350
339
- },
340
- {
341
- "epoch": 0.1064758973505802,
342
- "grad_norm": 0.16428661346435547,
343
- "learning_rate": 0.00039584515363098584,
344
- "loss": 2.8031,
345
- "step": 2400
346
- },
347
- {
348
- "epoch": 0.10869414521205063,
349
- "grad_norm": 0.1625480055809021,
350
- "learning_rate": 0.00039554420585112,
351
- "loss": 2.7968,
352
- "step": 2450
353
- },
354
- {
355
- "epoch": 0.11091239307352105,
356
- "grad_norm": 0.1638619303703308,
357
- "learning_rate": 0.0003952328590759179,
358
- "loss": 2.8007,
359
- "step": 2500
360
- },
361
- {
362
- "epoch": 0.11313064093499148,
363
- "grad_norm": 0.16504357755184174,
364
- "learning_rate": 0.0003949111298627286,
365
- "loss": 2.7921,
366
- "step": 2550
367
- },
368
- {
369
- "epoch": 0.1153488887964619,
370
- "grad_norm": 0.16429375112056732,
371
- "learning_rate": 0.0003945790353210367,
372
- "loss": 2.7951,
373
- "step": 2600
374
- },
375
- {
376
- "epoch": 0.11756713665793232,
377
- "grad_norm": 0.166097030043602,
378
- "learning_rate": 0.0003942365931115526,
379
- "loss": 2.7948,
380
- "step": 2650
381
- },
382
- {
383
- "epoch": 0.11978538451940274,
384
- "grad_norm": 0.16275139153003693,
385
- "learning_rate": 0.0003938838214452733,
386
- "loss": 2.79,
387
- "step": 2700
388
- },
389
- {
390
- "epoch": 0.12200363238087315,
391
- "grad_norm": 0.16379590332508087,
392
- "learning_rate": 0.0003935207390825137,
393
- "loss": 2.7896,
394
- "step": 2750
395
- },
396
- {
397
- "epoch": 0.12422188024234358,
398
- "grad_norm": 0.16332408785820007,
399
- "learning_rate": 0.0003931473653319095,
400
- "loss": 2.7848,
401
- "step": 2800
402
- },
403
- {
404
- "epoch": 0.126440128103814,
405
- "grad_norm": 0.16235879063606262,
406
- "learning_rate": 0.00039276372004938987,
407
- "loss": 2.7836,
408
- "step": 2850
409
- },
410
- {
411
- "epoch": 0.12865837596528443,
412
- "grad_norm": 0.1654053032398224,
413
- "learning_rate": 0.00039236982363712145,
414
- "loss": 2.7845,
415
- "step": 2900
416
- },
417
- {
418
- "epoch": 0.13087662382675483,
419
- "grad_norm": 0.16393068432807922,
420
- "learning_rate": 0.00039196569704242376,
421
- "loss": 2.7796,
422
- "step": 2950
423
- },
424
- {
425
- "epoch": 0.13309487168822526,
426
- "grad_norm": 0.16517628729343414,
427
- "learning_rate": 0.0003915513617566551,
428
- "loss": 2.7738,
429
- "step": 3000
430
- },
431
- {
432
- "epoch": 0.13309487168822526,
433
- "eval_accuracy": 0.4247227650219834,
434
- "eval_loss": 2.8996665477752686,
435
- "eval_runtime": 243.2366,
436
- "eval_samples_per_second": 8.222,
437
- "eval_steps_per_second": 1.028,
438
- "step": 3000
439
- },
440
- {
441
- "epoch": 0.1353131195496957,
442
- "grad_norm": 0.16540038585662842,
443
- "learning_rate": 0.00039112683981406936,
444
- "loss": 2.7708,
445
- "step": 3050
446
- },
447
- {
448
- "epoch": 0.13753136741116612,
449
- "grad_norm": 0.16403205692768097,
450
- "learning_rate": 0.00039069215379064465,
451
- "loss": 2.7709,
452
- "step": 3100
453
- },
454
- {
455
- "epoch": 0.13974961527263652,
456
- "grad_norm": 0.16498889029026031,
457
- "learning_rate": 0.0003902473268028826,
458
- "loss": 2.7683,
459
- "step": 3150
460
- },
461
- {
462
- "epoch": 0.14196786313410695,
463
- "grad_norm": 0.16713927686214447,
464
- "learning_rate": 0.00038979238250657863,
465
- "loss": 2.7578,
466
- "step": 3200
467
- },
468
- {
469
- "epoch": 0.14418611099557738,
470
- "grad_norm": 0.16905058920383453,
471
- "learning_rate": 0.00038932734509556467,
472
- "loss": 2.7602,
473
- "step": 3250
474
- },
475
- {
476
- "epoch": 0.14640435885704778,
477
- "grad_norm": 0.16431044042110443,
478
- "learning_rate": 0.0003888522393004219,
479
- "loss": 2.7685,
480
- "step": 3300
481
- },
482
- {
483
- "epoch": 0.1486226067185182,
484
- "grad_norm": 0.163705512881279,
485
- "learning_rate": 0.00038836709038716583,
486
- "loss": 2.8434,
487
- "step": 3350
488
- },
489
- {
490
- "epoch": 0.15084085457998864,
491
- "grad_norm": 0.1622520387172699,
492
- "learning_rate": 0.0003878719241559027,
493
- "loss": 2.8349,
494
- "step": 3400
495
- },
496
- {
497
- "epoch": 0.15305910244145907,
498
- "grad_norm": 0.16072827577590942,
499
- "learning_rate": 0.00038736676693945746,
500
- "loss": 2.8369,
501
- "step": 3450
502
- },
503
- {
504
- "epoch": 0.15527735030292947,
505
- "grad_norm": 0.16206832230091095,
506
- "learning_rate": 0.0003868516456019733,
507
- "loss": 2.8404,
508
- "step": 3500
509
- },
510
- {
511
- "epoch": 0.1574955981643999,
512
- "grad_norm": 0.16148249804973602,
513
- "learning_rate": 0.0003863265875374829,
514
- "loss": 2.836,
515
- "step": 3550
516
- },
517
- {
518
- "epoch": 0.15971384602587033,
519
- "grad_norm": 0.16401226818561554,
520
- "learning_rate": 0.0003857916206684519,
521
- "loss": 2.8369,
522
- "step": 3600
523
- },
524
- {
525
- "epoch": 0.16193209388734073,
526
- "grad_norm": 0.15987250208854675,
527
- "learning_rate": 0.00038524677344429386,
528
- "loss": 2.8363,
529
- "step": 3650
530
- },
531
- {
532
- "epoch": 0.16415034174881116,
533
- "grad_norm": 0.16117645800113678,
534
- "learning_rate": 0.00038469207483985725,
535
- "loss": 2.8426,
536
- "step": 3700
537
- },
538
- {
539
- "epoch": 0.1663685896102816,
540
- "grad_norm": 0.16374363005161285,
541
- "learning_rate": 0.00038412755435388474,
542
- "loss": 2.8416,
543
- "step": 3750
544
- },
545
- {
546
- "epoch": 0.168586837471752,
547
- "grad_norm": 0.16495129466056824,
548
- "learning_rate": 0.0003835532420074444,
549
- "loss": 2.8396,
550
- "step": 3800
551
- },
552
- {
553
- "epoch": 0.17080508533322242,
554
- "grad_norm": 0.16315814852714539,
555
- "learning_rate": 0.0003829691683423329,
556
- "loss": 2.8358,
557
- "step": 3850
558
- },
559
- {
560
- "epoch": 0.17302333319469285,
561
- "grad_norm": 0.16098596155643463,
562
- "learning_rate": 0.00038237536441945193,
563
- "loss": 2.8354,
564
- "step": 3900
565
- },
566
- {
567
- "epoch": 0.17524158105616328,
568
- "grad_norm": 0.16226187348365784,
569
- "learning_rate": 0.00038177186181715577,
570
- "loss": 2.8352,
571
- "step": 3950
572
- },
573
- {
574
- "epoch": 0.17745982891763368,
575
- "grad_norm": 0.15939714014530182,
576
- "learning_rate": 0.00038115869262957233,
577
- "loss": 2.835,
578
- "step": 4000
579
- },
580
- {
581
- "epoch": 0.1796780767791041,
582
- "grad_norm": 0.1622256189584732,
583
- "learning_rate": 0.00038053588946489615,
584
- "loss": 2.8391,
585
- "step": 4050
586
- },
587
- {
588
- "epoch": 0.18189632464057454,
589
- "grad_norm": 0.16176052391529083,
590
- "learning_rate": 0.0003799034854436545,
591
- "loss": 2.8371,
592
- "step": 4100
593
- },
594
- {
595
- "epoch": 0.18411457250204494,
596
- "grad_norm": 0.16316720843315125,
597
- "learning_rate": 0.0003792615141969462,
598
- "loss": 2.8365,
599
- "step": 4150
600
- },
601
- {
602
- "epoch": 0.18633282036351537,
603
- "grad_norm": 0.16207610070705414,
604
- "learning_rate": 0.0003786100098646524,
605
- "loss": 2.8346,
606
- "step": 4200
607
- },
608
- {
609
- "epoch": 0.1885510682249858,
610
- "grad_norm": 0.1638234406709671,
611
- "learning_rate": 0.000377949007093622,
612
- "loss": 2.8319,
613
- "step": 4250
614
- },
615
- {
616
- "epoch": 0.1907693160864562,
617
- "grad_norm": 0.1628599315881729,
618
- "learning_rate": 0.0003772785410358283,
619
- "loss": 2.8369,
620
- "step": 4300
621
- },
622
- {
623
- "epoch": 0.19298756394792663,
624
- "grad_norm": 0.1653933823108673,
625
- "learning_rate": 0.00037659864734650026,
626
- "loss": 2.8304,
627
- "step": 4350
628
- },
629
- {
630
- "epoch": 0.19520581180939706,
631
- "grad_norm": 0.16370812058448792,
632
- "learning_rate": 0.0003759093621822259,
633
- "loss": 2.8369,
634
- "step": 4400
635
- },
636
- {
637
- "epoch": 0.19742405967086749,
638
- "grad_norm": 0.1629050225019455,
639
- "learning_rate": 0.0003752107221990298,
640
- "loss": 2.8339,
641
- "step": 4450
642
- },
643
- {
644
- "epoch": 0.1996423075323379,
645
- "grad_norm": 0.1610899269580841,
646
- "learning_rate": 0.00037450276455042354,
647
- "loss": 2.829,
648
- "step": 4500
649
- },
650
- {
651
- "epoch": 0.20186055539380832,
652
- "grad_norm": 0.1629941165447235,
653
- "learning_rate": 0.00037378552688543005,
654
- "loss": 2.8351,
655
- "step": 4550
656
- },
657
- {
658
- "epoch": 0.20407880325527875,
659
- "grad_norm": 0.16439735889434814,
660
- "learning_rate": 0.0003730590473465814,
661
- "loss": 2.8316,
662
- "step": 4600
663
- },
664
- {
665
- "epoch": 0.20629705111674915,
666
- "grad_norm": 0.16572241485118866,
667
- "learning_rate": 0.00037232336456789023,
668
- "loss": 2.8335,
669
- "step": 4650
670
- },
671
- {
672
- "epoch": 0.20851529897821958,
673
- "grad_norm": 0.16380038857460022,
674
- "learning_rate": 0.00037157851767279543,
675
- "loss": 2.8286,
676
- "step": 4700
677
- },
678
- {
679
- "epoch": 0.21073354683969,
680
- "grad_norm": 0.16284549236297607,
681
- "learning_rate": 0.00037082454627208156,
682
- "loss": 2.8301,
683
- "step": 4750
684
- },
685
- {
686
- "epoch": 0.2129517947011604,
687
- "grad_norm": 0.16597482562065125,
688
- "learning_rate": 0.0003700614904617721,
689
- "loss": 2.8323,
690
- "step": 4800
691
- },
692
- {
693
- "epoch": 0.21517004256263084,
694
- "grad_norm": 0.16378666460514069,
695
- "learning_rate": 0.0003692893908209973,
696
- "loss": 2.8299,
697
- "step": 4850
698
- },
699
- {
700
- "epoch": 0.21738829042410127,
701
- "grad_norm": 0.1630343496799469,
702
- "learning_rate": 0.0003685082884098363,
703
- "loss": 2.8333,
704
- "step": 4900
705
- },
706
- {
707
- "epoch": 0.2196065382855717,
708
- "grad_norm": 0.16490814089775085,
709
- "learning_rate": 0.00036771822476713346,
710
- "loss": 2.8307,
711
- "step": 4950
712
- },
713
- {
714
- "epoch": 0.2218247861470421,
715
- "grad_norm": 0.1655721366405487,
716
- "learning_rate": 0.00036691924190828935,
717
- "loss": 2.8301,
718
- "step": 5000
719
- },
720
- {
721
- "epoch": 0.22404303400851253,
722
- "grad_norm": 0.16776245832443237,
723
- "learning_rate": 0.0003661113823230264,
724
- "loss": 2.8228,
725
- "step": 5050
726
- },
727
- {
728
- "epoch": 0.22626128186998296,
729
- "grad_norm": 0.1626349687576294,
730
- "learning_rate": 0.00036529468897312926,
731
- "loss": 2.8262,
732
- "step": 5100
733
- },
734
- {
735
- "epoch": 0.22847952973145336,
736
- "grad_norm": 0.16331753134727478,
737
- "learning_rate": 0.00036446920529016,
738
- "loss": 2.8282,
739
- "step": 5150
740
- },
741
- {
742
- "epoch": 0.2306977775929238,
743
- "grad_norm": 0.1676001250743866,
744
- "learning_rate": 0.00036363497517314877,
745
- "loss": 2.8313,
746
- "step": 5200
747
- },
748
- {
749
- "epoch": 0.23291602545439422,
750
- "grad_norm": 0.16441357135772705,
751
- "learning_rate": 0.000362792042986259,
752
- "loss": 2.8278,
753
- "step": 5250
754
- },
755
- {
756
- "epoch": 0.23513427331586464,
757
- "grad_norm": 0.16601622104644775,
758
- "learning_rate": 0.000361940453556428,
759
- "loss": 2.8303,
760
- "step": 5300
761
- },
762
- {
763
- "epoch": 0.23735252117733505,
764
- "grad_norm": 0.1679011583328247,
765
- "learning_rate": 0.0003610802521709833,
766
- "loss": 2.8252,
767
- "step": 5350
768
- },
769
- {
770
- "epoch": 0.23957076903880548,
771
- "grad_norm": 0.1650955229997635,
772
- "learning_rate": 0.0003602114845752345,
773
- "loss": 2.8299,
774
- "step": 5400
775
- },
776
- {
777
- "epoch": 0.2417890169002759,
778
- "grad_norm": 0.16651777923107147,
779
- "learning_rate": 0.00035933419697004,
780
- "loss": 2.832,
781
- "step": 5450
782
- },
783
- {
784
- "epoch": 0.2440072647617463,
785
- "grad_norm": 0.166709303855896,
786
- "learning_rate": 0.00035844843600935024,
787
- "loss": 2.8262,
788
- "step": 5500
789
- },
790
- {
791
- "epoch": 0.24622551262321674,
792
- "grad_norm": 0.16586416959762573,
793
- "learning_rate": 0.000357554248797727,
794
- "loss": 2.8255,
795
- "step": 5550
796
- },
797
- {
798
- "epoch": 0.24844376048468717,
799
- "grad_norm": 0.1647614985704422,
800
- "learning_rate": 0.00035665168288783795,
801
- "loss": 2.8298,
802
- "step": 5600
803
- },
804
- {
805
- "epoch": 0.2506620083461576,
806
- "grad_norm": 0.16310204565525055,
807
- "learning_rate": 0.000355740786277928,
808
- "loss": 2.8273,
809
- "step": 5650
810
- },
811
- {
812
- "epoch": 0.252880256207628,
813
- "grad_norm": 0.1629767119884491,
814
- "learning_rate": 0.00035482160740926683,
815
- "loss": 2.8231,
816
- "step": 5700
817
- },
818
- {
819
- "epoch": 0.2550985040690984,
820
- "grad_norm": 0.16427451372146606,
821
- "learning_rate": 0.00035389419516357253,
822
- "loss": 2.8188,
823
- "step": 5750
824
- },
825
- {
826
- "epoch": 0.25731675193056885,
827
- "grad_norm": 0.1655891388654709,
828
- "learning_rate": 0.0003529585988604125,
829
- "loss": 2.8258,
830
- "step": 5800
831
- },
832
- {
833
- "epoch": 0.25953499979203926,
834
- "grad_norm": 0.16402335464954376,
835
- "learning_rate": 0.0003520148682545803,
836
- "loss": 2.8254,
837
- "step": 5850
838
- },
839
- {
840
- "epoch": 0.26175324765350966,
841
- "grad_norm": 0.1638861894607544,
842
- "learning_rate": 0.0003510630535334497,
843
- "loss": 2.8298,
844
- "step": 5900
845
- },
846
- {
847
- "epoch": 0.2639714955149801,
848
- "grad_norm": 0.16864845156669617,
849
- "learning_rate": 0.0003501032053143061,
850
- "loss": 2.8238,
851
- "step": 5950
852
- },
853
- {
854
- "epoch": 0.2661897433764505,
855
- "grad_norm": 0.16578635573387146,
856
- "learning_rate": 0.0003491353746416541,
857
- "loss": 2.8225,
858
- "step": 6000
859
- },
860
- {
861
- "epoch": 0.2661897433764505,
862
- "eval_accuracy": 0.4264626282364436,
863
- "eval_loss": 2.8843319416046143,
864
- "eval_runtime": 242.3694,
865
- "eval_samples_per_second": 8.252,
866
- "eval_steps_per_second": 1.031,
867
- "step": 6000
868
- },
869
- {
870
- "epoch": 0.268407991237921,
871
- "grad_norm": 0.16673897206783295,
872
- "learning_rate": 0.00034815961298450377,
873
- "loss": 2.823,
874
- "step": 6050
875
- },
876
- {
877
- "epoch": 0.2706262390993914,
878
- "grad_norm": 0.16588376462459564,
879
- "learning_rate": 0.0003471759722336326,
880
- "loss": 2.8193,
881
- "step": 6100
882
- },
883
- {
884
- "epoch": 0.2728444869608618,
885
- "grad_norm": 0.16813361644744873,
886
- "learning_rate": 0.00034618450469882687,
887
- "loss": 2.8267,
888
- "step": 6150
889
- },
890
- {
891
- "epoch": 0.27506273482233223,
892
- "grad_norm": 0.16656942665576935,
893
- "learning_rate": 0.0003451852631060991,
894
- "loss": 2.8219,
895
- "step": 6200
896
- },
897
- {
898
- "epoch": 0.27728098268380263,
899
- "grad_norm": 0.1666443794965744,
900
- "learning_rate": 0.0003441783005948846,
901
- "loss": 2.8233,
902
- "step": 6250
903
- },
904
- {
905
- "epoch": 0.27949923054527304,
906
- "grad_norm": 0.1673704832792282,
907
- "learning_rate": 0.0003431636707152152,
908
- "loss": 2.824,
909
- "step": 6300
910
- },
911
- {
912
- "epoch": 0.2817174784067435,
913
- "grad_norm": 0.16707104444503784,
914
- "learning_rate": 0.00034214142742487177,
915
- "loss": 2.8221,
916
- "step": 6350
917
- },
918
- {
919
- "epoch": 0.2839357262682139,
920
- "grad_norm": 0.16775397956371307,
921
- "learning_rate": 0.0003411116250865143,
922
- "loss": 2.8234,
923
- "step": 6400
924
- },
925
- {
926
- "epoch": 0.2861539741296843,
927
- "grad_norm": 0.16813720762729645,
928
- "learning_rate": 0.0003400743184647915,
929
- "loss": 2.8258,
930
- "step": 6450
931
- },
932
- {
933
- "epoch": 0.28837222199115475,
934
- "grad_norm": 0.16362161934375763,
935
- "learning_rate": 0.00033902956272342783,
936
- "loss": 2.8232,
937
- "step": 6500
938
- },
939
- {
940
- "epoch": 0.29059046985262516,
941
- "grad_norm": 0.16950780153274536,
942
- "learning_rate": 0.00033797741342229054,
943
- "loss": 2.821,
944
- "step": 6550
945
- },
946
- {
947
- "epoch": 0.29280871771409556,
948
- "grad_norm": 0.1657160073518753,
949
- "learning_rate": 0.00033691792651443435,
950
- "loss": 2.8181,
951
- "step": 6600
952
- },
953
- {
954
- "epoch": 0.295026965575566,
955
- "grad_norm": 0.1689310073852539,
956
- "learning_rate": 0.0003358511583431264,
957
- "loss": 2.8257,
958
- "step": 6650
959
- },
960
- {
961
- "epoch": 0.2972452134370364,
962
- "grad_norm": 0.16674135625362396,
963
- "learning_rate": 0.00033477716563884956,
964
- "loss": 2.8209,
965
- "step": 6700
966
- },
967
- {
968
- "epoch": 0.2994634612985068,
969
- "grad_norm": 0.16600748896598816,
970
- "learning_rate": 0.00033369600551628586,
971
- "loss": 2.8227,
972
- "step": 6750
973
- },
974
- {
975
- "epoch": 0.3016817091599773,
976
- "grad_norm": 0.16666853427886963,
977
- "learning_rate": 0.0003326077354712789,
978
- "loss": 2.8199,
979
- "step": 6800
980
- },
981
- {
982
- "epoch": 0.3038999570214477,
983
- "grad_norm": 0.1671936959028244,
984
- "learning_rate": 0.00033151241337777624,
985
- "loss": 2.82,
986
- "step": 6850
987
- },
988
- {
989
- "epoch": 0.30611820488291813,
990
- "grad_norm": 0.1675061583518982,
991
- "learning_rate": 0.00033041009748475166,
992
- "loss": 2.8246,
993
- "step": 6900
994
- },
995
- {
996
- "epoch": 0.30833645274438853,
997
- "grad_norm": 0.16512750089168549,
998
- "learning_rate": 0.0003293008464131079,
999
- "loss": 2.8178,
1000
- "step": 6950
1001
- },
1002
- {
1003
- "epoch": 0.31055470060585894,
1004
- "grad_norm": 0.1670486181974411,
1005
- "learning_rate": 0.0003281847191525585,
1006
- "loss": 2.8185,
1007
- "step": 7000
1008
- },
1009
- {
1010
- "epoch": 0.3127729484673294,
1011
- "grad_norm": 0.1692744940519333,
1012
- "learning_rate": 0.0003270617750584913,
1013
- "loss": 2.8184,
1014
- "step": 7050
1015
- },
1016
- {
1017
- "epoch": 0.3149911963287998,
1018
- "grad_norm": 0.16573506593704224,
1019
- "learning_rate": 0.0003259320738488119,
1020
- "loss": 2.823,
1021
- "step": 7100
1022
- },
1023
- {
1024
- "epoch": 0.3172094441902702,
1025
- "grad_norm": 0.17004618048667908,
1026
- "learning_rate": 0.00032479567560076745,
1027
- "loss": 2.8174,
1028
- "step": 7150
1029
- },
1030
- {
1031
- "epoch": 0.31942769205174065,
1032
- "grad_norm": 0.16867642104625702,
1033
- "learning_rate": 0.00032365264074775223,
1034
- "loss": 2.8183,
1035
- "step": 7200
1036
- },
1037
- {
1038
- "epoch": 0.32164593991321105,
1039
- "grad_norm": 0.16543437540531158,
1040
- "learning_rate": 0.00032250303007609366,
1041
- "loss": 2.8178,
1042
- "step": 7250
1043
- },
1044
- {
1045
- "epoch": 0.32386418777468146,
1046
- "grad_norm": 0.16606374084949493,
1047
- "learning_rate": 0.0003213469047218194,
1048
- "loss": 2.8182,
1049
- "step": 7300
1050
- },
1051
- {
1052
- "epoch": 0.3260824356361519,
1053
- "grad_norm": 0.1708928942680359,
1054
- "learning_rate": 0.0003201843261674067,
1055
- "loss": 2.8194,
1056
- "step": 7350
1057
- },
1058
- {
1059
- "epoch": 0.3283006834976223,
1060
- "grad_norm": 0.16661237180233002,
1061
- "learning_rate": 0.00031901535623851245,
1062
- "loss": 2.8226,
1063
- "step": 7400
1064
- },
1065
- {
1066
- "epoch": 0.3305189313590927,
1067
- "grad_norm": 0.16710756719112396,
1068
- "learning_rate": 0.0003178400571006852,
1069
- "loss": 2.8187,
1070
- "step": 7450
1071
- },
1072
- {
1073
- "epoch": 0.3327371792205632,
1074
- "grad_norm": 0.16679760813713074,
1075
- "learning_rate": 0.00031665849125605937,
1076
- "loss": 2.8163,
1077
- "step": 7500
1078
- },
1079
- {
1080
- "epoch": 0.3349554270820336,
1081
- "grad_norm": 0.16872857511043549,
1082
- "learning_rate": 0.00031547072154003154,
1083
- "loss": 2.8147,
1084
- "step": 7550
1085
- },
1086
- {
1087
- "epoch": 0.337173674943504,
1088
- "grad_norm": 0.1672954261302948,
1089
- "learning_rate": 0.0003142768111179187,
1090
- "loss": 2.8167,
1091
- "step": 7600
1092
- },
1093
- {
1094
- "epoch": 0.33939192280497443,
1095
- "grad_norm": 0.16654394567012787,
1096
- "learning_rate": 0.00031307682348159907,
1097
- "loss": 2.816,
1098
- "step": 7650
1099
- },
1100
- {
1101
- "epoch": 0.34161017066644483,
1102
- "grad_norm": 0.16810841858386993,
1103
- "learning_rate": 0.00031187082244613567,
1104
- "loss": 2.8139,
1105
- "step": 7700
1106
- },
1107
- {
1108
- "epoch": 0.34382841852791524,
1109
- "grad_norm": 0.1682497262954712,
1110
- "learning_rate": 0.00031065887214638284,
1111
- "loss": 2.8157,
1112
- "step": 7750
1113
- },
1114
- {
1115
- "epoch": 0.3460466663893857,
1116
- "grad_norm": 0.17154847085475922,
1117
- "learning_rate": 0.00030944103703357524,
1118
- "loss": 2.8143,
1119
- "step": 7800
1120
- },
1121
- {
1122
- "epoch": 0.3482649142508561,
1123
- "grad_norm": 0.16658836603164673,
1124
- "learning_rate": 0.00030821738187190075,
1125
- "loss": 2.8143,
1126
- "step": 7850
1127
- },
1128
- {
1129
- "epoch": 0.35048316211232655,
1130
- "grad_norm": 0.16820305585861206,
1131
- "learning_rate": 0.00030698797173505586,
1132
- "loss": 2.8157,
1133
- "step": 7900
1134
- },
1135
- {
1136
- "epoch": 0.35270140997379695,
1137
- "grad_norm": 0.16843385994434357,
1138
- "learning_rate": 0.0003057528720027853,
1139
- "loss": 2.8103,
1140
- "step": 7950
1141
- },
1142
- {
1143
- "epoch": 0.35491965783526735,
1144
- "grad_norm": 0.17145898938179016,
1145
- "learning_rate": 0.0003045121483574054,
1146
- "loss": 2.8161,
1147
- "step": 8000
1148
- },
1149
- {
1150
- "epoch": 0.3571379056967378,
1151
- "grad_norm": 0.1709701269865036,
1152
- "learning_rate": 0.00030326586678031066,
1153
- "loss": 2.8134,
1154
- "step": 8050
1155
- },
1156
- {
1157
- "epoch": 0.3593561535582082,
1158
- "grad_norm": 0.16859866678714752,
1159
- "learning_rate": 0.0003020140935484653,
1160
- "loss": 2.818,
1161
- "step": 8100
1162
- },
1163
- {
1164
- "epoch": 0.3615744014196786,
1165
- "grad_norm": 0.16738031804561615,
1166
- "learning_rate": 0.00030075689523087804,
1167
- "loss": 2.8128,
1168
- "step": 8150
1169
- },
1170
- {
1171
- "epoch": 0.36379264928114907,
1172
- "grad_norm": 0.1693500131368637,
1173
- "learning_rate": 0.00029949433868506293,
1174
- "loss": 2.8138,
1175
- "step": 8200
1176
- },
1177
- {
1178
- "epoch": 0.3660108971426195,
1179
- "grad_norm": 0.16915106773376465,
1180
- "learning_rate": 0.00029822649105348294,
1181
- "loss": 2.8209,
1182
- "step": 8250
1183
- },
1184
- {
1185
- "epoch": 0.3682291450040899,
1186
- "grad_norm": 0.17108069360256195,
1187
- "learning_rate": 0.00029695341975998006,
1188
- "loss": 2.8174,
1189
- "step": 8300
1190
- },
1191
- {
1192
- "epoch": 0.37044739286556033,
1193
- "grad_norm": 0.16659317910671234,
1194
- "learning_rate": 0.00029567519250618907,
1195
- "loss": 2.8153,
1196
- "step": 8350
1197
- },
1198
- {
1199
- "epoch": 0.37266564072703073,
1200
- "grad_norm": 0.16678906977176666,
1201
- "learning_rate": 0.0002943918772679379,
1202
- "loss": 2.8163,
1203
- "step": 8400
1204
- },
1205
- {
1206
- "epoch": 0.37488388858850114,
1207
- "grad_norm": 0.16928167641162872,
1208
- "learning_rate": 0.00029310354229163197,
1209
- "loss": 2.8165,
1210
- "step": 8450
1211
- },
1212
- {
1213
- "epoch": 0.3771021364499716,
1214
- "grad_norm": 0.1695391833782196,
1215
- "learning_rate": 0.0002918102560906254,
1216
- "loss": 2.8197,
1217
- "step": 8500
1218
- },
1219
- {
1220
- "epoch": 0.379320384311442,
1221
- "grad_norm": 0.17006346583366394,
1222
- "learning_rate": 0.0002905120874415772,
1223
- "loss": 2.8172,
1224
- "step": 8550
1225
- },
1226
- {
1227
- "epoch": 0.3815386321729124,
1228
- "grad_norm": 0.16821132600307465,
1229
- "learning_rate": 0.0002892091053807939,
1230
- "loss": 2.8137,
1231
- "step": 8600
1232
- },
1233
- {
1234
- "epoch": 0.38375688003438285,
1235
- "grad_norm": 0.17077401280403137,
1236
- "learning_rate": 0.000287901379200558,
1237
- "loss": 2.8174,
1238
- "step": 8650
1239
- },
1240
- {
1241
- "epoch": 0.38597512789585325,
1242
- "grad_norm": 0.17006562650203705,
1243
- "learning_rate": 0.0002865889784454435,
1244
- "loss": 2.813,
1245
- "step": 8700
1246
- },
1247
- {
1248
- "epoch": 0.3881933757573237,
1249
- "grad_norm": 0.16847462952136993,
1250
- "learning_rate": 0.0002852719729086167,
1251
- "loss": 2.8158,
1252
- "step": 8750
1253
- },
1254
- {
1255
- "epoch": 0.3904116236187941,
1256
- "grad_norm": 0.16790613532066345,
1257
- "learning_rate": 0.0002839504326281256,
1258
- "loss": 2.816,
1259
- "step": 8800
1260
- },
1261
- {
1262
- "epoch": 0.3926298714802645,
1263
- "grad_norm": 0.16898341476917267,
1264
- "learning_rate": 0.00028262442788317446,
1265
- "loss": 2.8143,
1266
- "step": 8850
1267
- },
1268
- {
1269
- "epoch": 0.39484811934173497,
1270
- "grad_norm": 0.17099575698375702,
1271
- "learning_rate": 0.00028129402919038695,
1272
- "loss": 2.812,
1273
- "step": 8900
1274
- },
1275
- {
1276
- "epoch": 0.3970663672032054,
1277
- "grad_norm": 0.17063932120800018,
1278
- "learning_rate": 0.00027995930730005577,
1279
- "loss": 2.815,
1280
- "step": 8950
1281
- },
1282
- {
1283
- "epoch": 0.3992846150646758,
1284
- "grad_norm": 0.1704034060239792,
1285
- "learning_rate": 0.00027862033319238025,
1286
- "loss": 2.8144,
1287
- "step": 9000
1288
- },
1289
- {
1290
- "epoch": 0.3992846150646758,
1291
- "eval_accuracy": 0.42786541279921836,
1292
- "eval_loss": 2.8759515285491943,
1293
- "eval_runtime": 250.6732,
1294
- "eval_samples_per_second": 7.979,
1295
- "eval_steps_per_second": 0.997,
1296
- "step": 9000
1297
- },
1298
- {
1299
- "epoch": 0.40150286292614623,
1300
- "grad_norm": 0.1675969511270523,
1301
- "learning_rate": 0.0002772771780736917,
1302
- "loss": 2.8128,
1303
- "step": 9050
1304
- },
1305
- {
1306
- "epoch": 0.40372111078761663,
1307
- "grad_norm": 0.1697956621646881,
1308
- "learning_rate": 0.0002759299133726665,
1309
- "loss": 2.8121,
1310
- "step": 9100
1311
- },
1312
- {
1313
- "epoch": 0.40593935864908703,
1314
- "grad_norm": 0.1710100620985031,
1315
- "learning_rate": 0.00027457861073652785,
1316
- "loss": 2.8156,
1317
- "step": 9150
1318
- },
1319
- {
1320
- "epoch": 0.4081576065105575,
1321
- "grad_norm": 0.16877809166908264,
1322
- "learning_rate": 0.00027322334202723527,
1323
- "loss": 2.815,
1324
- "step": 9200
1325
- },
1326
- {
1327
- "epoch": 0.4103758543720279,
1328
- "grad_norm": 0.17122440040111542,
1329
- "learning_rate": 0.0002718641793176631,
1330
- "loss": 2.8119,
1331
- "step": 9250
1332
- },
1333
- {
1334
- "epoch": 0.4125941022334983,
1335
- "grad_norm": 0.16771045327186584,
1336
- "learning_rate": 0.0002705011948877679,
1337
- "loss": 2.808,
1338
- "step": 9300
1339
- },
1340
- {
1341
- "epoch": 0.41481235009496875,
1342
- "grad_norm": 0.16941729187965393,
1343
- "learning_rate": 0.0002691344612207442,
1344
- "loss": 2.8121,
1345
- "step": 9350
1346
- },
1347
- {
1348
- "epoch": 0.41703059795643915,
1349
- "grad_norm": 0.1719992607831955,
1350
- "learning_rate": 0.00026776405099917014,
1351
- "loss": 2.8094,
1352
- "step": 9400
1353
- },
1354
- {
1355
- "epoch": 0.41924884581790955,
1356
- "grad_norm": 0.1693243533372879,
1357
- "learning_rate": 0.00026639003710114223,
1358
- "loss": 2.8103,
1359
- "step": 9450
1360
- },
1361
- {
1362
- "epoch": 0.42146709367938,
1363
- "grad_norm": 0.17014500498771667,
1364
- "learning_rate": 0.0002650124925963998,
1365
- "loss": 2.8129,
1366
- "step": 9500
1367
- },
1368
- {
1369
- "epoch": 0.4236853415408504,
1370
- "grad_norm": 0.1709510087966919,
1371
- "learning_rate": 0.00026363149074243867,
1372
- "loss": 2.8084,
1373
- "step": 9550
1374
- },
1375
- {
1376
- "epoch": 0.4259035894023208,
1377
- "grad_norm": 0.16937118768692017,
1378
- "learning_rate": 0.0002622471049806159,
1379
- "loss": 2.814,
1380
- "step": 9600
1381
- },
1382
- {
1383
- "epoch": 0.42812183726379127,
1384
- "grad_norm": 0.1713036149740219,
1385
- "learning_rate": 0.00026085940893224403,
1386
- "loss": 2.8162,
1387
- "step": 9650
1388
- },
1389
- {
1390
- "epoch": 0.4303400851252617,
1391
- "grad_norm": 0.17020347714424133,
1392
- "learning_rate": 0.0002594684763946758,
1393
- "loss": 2.8116,
1394
- "step": 9700
1395
- },
1396
- {
1397
- "epoch": 0.43255833298673213,
1398
- "grad_norm": 0.16786696016788483,
1399
- "learning_rate": 0.0002580743813373796,
1400
- "loss": 2.8111,
1401
- "step": 9750
1402
- },
1403
- {
1404
- "epoch": 0.43477658084820253,
1405
- "grad_norm": 0.17273075878620148,
1406
- "learning_rate": 0.00025667719789800606,
1407
- "loss": 2.8131,
1408
- "step": 9800
1409
- },
1410
- {
1411
- "epoch": 0.43699482870967293,
1412
- "grad_norm": 0.16986466944217682,
1413
- "learning_rate": 0.00025527700037844515,
1414
- "loss": 2.8139,
1415
- "step": 9850
1416
- },
1417
- {
1418
- "epoch": 0.4392130765711434,
1419
- "grad_norm": 0.17129731178283691,
1420
- "learning_rate": 0.00025387386324087494,
1421
- "loss": 2.8125,
1422
- "step": 9900
1423
- },
1424
- {
1425
- "epoch": 0.4414313244326138,
1426
- "grad_norm": 0.16890868544578552,
1427
- "learning_rate": 0.00025246786110380163,
1428
- "loss": 2.8142,
1429
- "step": 9950
1430
- },
1431
- {
1432
- "epoch": 0.4436495722940842,
1433
- "grad_norm": 0.17167522013187408,
1434
- "learning_rate": 0.00025105906873809154,
1435
- "loss": 2.8142,
1436
- "step": 10000
1437
- },
1438
- {
1439
- "epoch": 0.44586782015555465,
1440
- "grad_norm": 0.17136669158935547,
1441
- "learning_rate": 0.0002496475610629947,
1442
- "loss": 2.8112,
1443
- "step": 10050
1444
- },
1445
- {
1446
- "epoch": 0.44808606801702505,
1447
- "grad_norm": 0.16926760971546173,
1448
- "learning_rate": 0.00024823341314216056,
1449
- "loss": 2.8156,
1450
- "step": 10100
1451
- },
1452
- {
1453
- "epoch": 0.45030431587849545,
1454
- "grad_norm": 0.16898435354232788,
1455
- "learning_rate": 0.00024681670017964627,
1456
- "loss": 2.8079,
1457
- "step": 10150
1458
- },
1459
- {
1460
- "epoch": 0.4525225637399659,
1461
- "grad_norm": 0.17237040400505066,
1462
- "learning_rate": 0.0002453974975159173,
1463
- "loss": 2.813,
1464
- "step": 10200
1465
- },
1466
- {
1467
- "epoch": 0.4547408116014363,
1468
- "grad_norm": 0.16995486617088318,
1469
- "learning_rate": 0.00024397588062384095,
1470
- "loss": 2.8117,
1471
- "step": 10250
1472
- },
1473
- {
1474
- "epoch": 0.4569590594629067,
1475
- "grad_norm": 0.17290563881397247,
1476
- "learning_rate": 0.00024255192510467245,
1477
- "loss": 2.8121,
1478
- "step": 10300
1479
- },
1480
- {
1481
- "epoch": 0.45917730732437717,
1482
- "grad_norm": 0.17059782147407532,
1483
- "learning_rate": 0.00024112570668403472,
1484
- "loss": 2.8138,
1485
- "step": 10350
1486
- },
1487
- {
1488
- "epoch": 0.4613955551858476,
1489
- "grad_norm": 0.17196382582187653,
1490
- "learning_rate": 0.00023969730120789132,
1491
- "loss": 2.8095,
1492
- "step": 10400
1493
- },
1494
- {
1495
- "epoch": 0.463613803047318,
1496
- "grad_norm": 0.16942380368709564,
1497
- "learning_rate": 0.00023826678463851285,
1498
- "loss": 2.8124,
1499
- "step": 10450
1500
- },
1501
- {
1502
- "epoch": 0.46583205090878843,
1503
- "grad_norm": 0.17288681864738464,
1504
- "learning_rate": 0.00023683423305043749,
1505
- "loss": 2.813,
1506
- "step": 10500
1507
- },
1508
- {
1509
- "epoch": 0.46805029877025883,
1510
- "grad_norm": 0.17040428519248962,
1511
- "learning_rate": 0.00023539972262642502,
1512
- "loss": 2.8141,
1513
- "step": 10550
1514
- },
1515
- {
1516
- "epoch": 0.4702685466317293,
1517
- "grad_norm": 0.17321184277534485,
1518
- "learning_rate": 0.00023396332965340585,
1519
- "loss": 2.8146,
1520
- "step": 10600
1521
- },
1522
- {
1523
- "epoch": 0.4724867944931997,
1524
- "grad_norm": 0.17026926577091217,
1525
- "learning_rate": 0.00023252513051842373,
1526
- "loss": 2.8086,
1527
- "step": 10650
1528
- },
1529
- {
1530
- "epoch": 0.4747050423546701,
1531
- "grad_norm": 0.1710352748632431,
1532
- "learning_rate": 0.00023108520170457398,
1533
- "loss": 2.8099,
1534
- "step": 10700
1535
- },
1536
- {
1537
- "epoch": 0.47692329021614055,
1538
- "grad_norm": 0.17067080736160278,
1539
- "learning_rate": 0.00022964361978693542,
1540
- "loss": 2.8099,
1541
- "step": 10750
1542
- },
1543
- {
1544
- "epoch": 0.47914153807761095,
1545
- "grad_norm": 0.17244164645671844,
1546
- "learning_rate": 0.0002282004614284989,
1547
- "loss": 2.8054,
1548
- "step": 10800
1549
- }
1550
- ],
1551
- "logging_steps": 50,
1552
- "max_steps": 22540,
1553
- "num_input_tokens_seen": 0,
1554
- "num_train_epochs": 1,
1555
- "save_steps": 1200,
1556
- "stateful_callbacks": {
1557
- "TrainerControl": {
1558
- "args": {
1559
- "should_epoch_stop": false,
1560
- "should_evaluate": false,
1561
- "should_log": false,
1562
- "should_save": true,
1563
- "should_training_stop": false
1564
- },
1565
- "attributes": {}
1566
- }
1567
- },
1568
- "total_flos": 1.5864483085358727e+19,
1569
- "train_batch_size": 1,
1570
- "trial_name": null,
1571
- "trial_params": null
1572
- }