vivit-b-16x2-collected-dataset / trainer_state.json
yehiawp4's picture
End of training
06b52b9 verified
raw
history blame contribute delete
No virus
230 kB
{
"best_metric": 0.9669172932330827,
"best_model_checkpoint": "YAHIA/vivit-b-16x2-collected-dataset\\checkpoint-8418",
"epoch": 9.099358059914408,
"eval_steps": 500,
"global_step": 14020,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 16.260046005249023,
"learning_rate": 3.566333808844508e-07,
"loss": 1.7843,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 24.135656356811523,
"learning_rate": 7.132667617689016e-07,
"loss": 1.8164,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 22.565906524658203,
"learning_rate": 1.0699001426533523e-06,
"loss": 1.9396,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 20.68889045715332,
"learning_rate": 1.4265335235378032e-06,
"loss": 1.9576,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 22.999574661254883,
"learning_rate": 1.7831669044222541e-06,
"loss": 1.9828,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 27.69795036315918,
"learning_rate": 2.1398002853067046e-06,
"loss": 1.9381,
"step": 60
},
{
"epoch": 0.0,
"grad_norm": 25.143293380737305,
"learning_rate": 2.4964336661911553e-06,
"loss": 1.8222,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 20.670278549194336,
"learning_rate": 2.8530670470756064e-06,
"loss": 1.7593,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 20.401081085205078,
"learning_rate": 3.209700427960057e-06,
"loss": 1.7611,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 21.053512573242188,
"learning_rate": 3.5663338088445082e-06,
"loss": 1.7861,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 21.759618759155273,
"learning_rate": 3.922967189728959e-06,
"loss": 1.7698,
"step": 110
},
{
"epoch": 0.01,
"grad_norm": 22.194372177124023,
"learning_rate": 4.279600570613409e-06,
"loss": 1.7558,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 20.19968605041504,
"learning_rate": 4.63623395149786e-06,
"loss": 1.8624,
"step": 130
},
{
"epoch": 0.01,
"grad_norm": 19.7205753326416,
"learning_rate": 4.992867332382311e-06,
"loss": 1.6795,
"step": 140
},
{
"epoch": 0.01,
"grad_norm": 19.424144744873047,
"learning_rate": 5.349500713266762e-06,
"loss": 1.7443,
"step": 150
},
{
"epoch": 0.01,
"grad_norm": 19.568191528320312,
"learning_rate": 5.706134094151213e-06,
"loss": 1.7468,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 22.05777931213379,
"learning_rate": 6.062767475035663e-06,
"loss": 1.7362,
"step": 170
},
{
"epoch": 0.01,
"grad_norm": 17.77819061279297,
"learning_rate": 6.419400855920114e-06,
"loss": 1.527,
"step": 180
},
{
"epoch": 0.01,
"grad_norm": 19.465238571166992,
"learning_rate": 6.776034236804565e-06,
"loss": 1.5923,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 20.62281036376953,
"learning_rate": 7.1326676176890165e-06,
"loss": 1.697,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 16.7163028717041,
"learning_rate": 7.489300998573468e-06,
"loss": 1.4694,
"step": 210
},
{
"epoch": 0.02,
"grad_norm": 20.071901321411133,
"learning_rate": 7.845934379457918e-06,
"loss": 1.4549,
"step": 220
},
{
"epoch": 0.02,
"grad_norm": 20.55426597595215,
"learning_rate": 8.202567760342367e-06,
"loss": 1.3167,
"step": 230
},
{
"epoch": 0.02,
"grad_norm": 26.36579704284668,
"learning_rate": 8.559201141226818e-06,
"loss": 1.6743,
"step": 240
},
{
"epoch": 0.02,
"grad_norm": 17.331533432006836,
"learning_rate": 8.91583452211127e-06,
"loss": 1.4754,
"step": 250
},
{
"epoch": 0.02,
"grad_norm": 19.567764282226562,
"learning_rate": 9.27246790299572e-06,
"loss": 1.45,
"step": 260
},
{
"epoch": 0.02,
"grad_norm": 16.322946548461914,
"learning_rate": 9.629101283880172e-06,
"loss": 1.3971,
"step": 270
},
{
"epoch": 0.02,
"grad_norm": 18.62678337097168,
"learning_rate": 9.985734664764621e-06,
"loss": 1.4368,
"step": 280
},
{
"epoch": 0.02,
"grad_norm": 20.327966690063477,
"learning_rate": 1.0342368045649072e-05,
"loss": 1.5098,
"step": 290
},
{
"epoch": 0.02,
"grad_norm": 18.31135368347168,
"learning_rate": 1.0699001426533523e-05,
"loss": 1.1699,
"step": 300
},
{
"epoch": 0.02,
"grad_norm": 28.94702911376953,
"learning_rate": 1.1055634807417975e-05,
"loss": 1.5048,
"step": 310
},
{
"epoch": 0.02,
"grad_norm": 21.377225875854492,
"learning_rate": 1.1412268188302426e-05,
"loss": 1.4112,
"step": 320
},
{
"epoch": 0.02,
"grad_norm": 15.965813636779785,
"learning_rate": 1.1768901569186877e-05,
"loss": 1.5097,
"step": 330
},
{
"epoch": 0.02,
"grad_norm": 19.742080688476562,
"learning_rate": 1.2125534950071326e-05,
"loss": 1.2703,
"step": 340
},
{
"epoch": 0.02,
"grad_norm": 18.924072265625,
"learning_rate": 1.2482168330955777e-05,
"loss": 1.2194,
"step": 350
},
{
"epoch": 0.03,
"grad_norm": 18.15528106689453,
"learning_rate": 1.2838801711840228e-05,
"loss": 1.0668,
"step": 360
},
{
"epoch": 0.03,
"grad_norm": 21.82122802734375,
"learning_rate": 1.3195435092724678e-05,
"loss": 0.9053,
"step": 370
},
{
"epoch": 0.03,
"grad_norm": 20.609405517578125,
"learning_rate": 1.355206847360913e-05,
"loss": 1.2574,
"step": 380
},
{
"epoch": 0.03,
"grad_norm": 25.153718948364258,
"learning_rate": 1.390870185449358e-05,
"loss": 1.1619,
"step": 390
},
{
"epoch": 0.03,
"grad_norm": 12.118425369262695,
"learning_rate": 1.4265335235378033e-05,
"loss": 1.1514,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 25.673738479614258,
"learning_rate": 1.4621968616262482e-05,
"loss": 1.2191,
"step": 410
},
{
"epoch": 0.03,
"grad_norm": 23.109697341918945,
"learning_rate": 1.4978601997146935e-05,
"loss": 0.8535,
"step": 420
},
{
"epoch": 0.03,
"grad_norm": 15.181422233581543,
"learning_rate": 1.5335235378031385e-05,
"loss": 0.7464,
"step": 430
},
{
"epoch": 0.03,
"grad_norm": 31.820419311523438,
"learning_rate": 1.5691868758915836e-05,
"loss": 1.2202,
"step": 440
},
{
"epoch": 0.03,
"grad_norm": 24.667930603027344,
"learning_rate": 1.6048502139800287e-05,
"loss": 1.0542,
"step": 450
},
{
"epoch": 0.03,
"grad_norm": 16.041976928710938,
"learning_rate": 1.6405135520684735e-05,
"loss": 0.9541,
"step": 460
},
{
"epoch": 0.03,
"grad_norm": 9.076415061950684,
"learning_rate": 1.676176890156919e-05,
"loss": 1.0625,
"step": 470
},
{
"epoch": 0.03,
"grad_norm": 9.516477584838867,
"learning_rate": 1.7118402282453637e-05,
"loss": 1.1725,
"step": 480
},
{
"epoch": 0.03,
"grad_norm": 29.433717727661133,
"learning_rate": 1.7475035663338088e-05,
"loss": 0.9226,
"step": 490
},
{
"epoch": 0.04,
"grad_norm": 14.57030200958252,
"learning_rate": 1.783166904422254e-05,
"loss": 0.7345,
"step": 500
},
{
"epoch": 0.04,
"grad_norm": 19.724756240844727,
"learning_rate": 1.818830242510699e-05,
"loss": 1.1076,
"step": 510
},
{
"epoch": 0.04,
"grad_norm": 17.7041072845459,
"learning_rate": 1.854493580599144e-05,
"loss": 1.1412,
"step": 520
},
{
"epoch": 0.04,
"grad_norm": 4.248980522155762,
"learning_rate": 1.8901569186875892e-05,
"loss": 0.675,
"step": 530
},
{
"epoch": 0.04,
"grad_norm": 6.876579284667969,
"learning_rate": 1.9258202567760344e-05,
"loss": 0.703,
"step": 540
},
{
"epoch": 0.04,
"grad_norm": 15.930359840393066,
"learning_rate": 1.9614835948644795e-05,
"loss": 0.671,
"step": 550
},
{
"epoch": 0.04,
"grad_norm": 7.9089226722717285,
"learning_rate": 1.9971469329529242e-05,
"loss": 0.7656,
"step": 560
},
{
"epoch": 0.04,
"grad_norm": 20.674118041992188,
"learning_rate": 2.0328102710413697e-05,
"loss": 1.4598,
"step": 570
},
{
"epoch": 0.04,
"grad_norm": 33.44108581542969,
"learning_rate": 2.0684736091298145e-05,
"loss": 0.9271,
"step": 580
},
{
"epoch": 0.04,
"grad_norm": 9.660829544067383,
"learning_rate": 2.10413694721826e-05,
"loss": 0.9135,
"step": 590
},
{
"epoch": 0.04,
"grad_norm": 3.2455947399139404,
"learning_rate": 2.1398002853067047e-05,
"loss": 0.9244,
"step": 600
},
{
"epoch": 0.04,
"grad_norm": 16.9035587310791,
"learning_rate": 2.1754636233951498e-05,
"loss": 1.2397,
"step": 610
},
{
"epoch": 0.04,
"grad_norm": 12.139324188232422,
"learning_rate": 2.211126961483595e-05,
"loss": 0.9137,
"step": 620
},
{
"epoch": 0.04,
"grad_norm": 13.0936861038208,
"learning_rate": 2.24679029957204e-05,
"loss": 0.891,
"step": 630
},
{
"epoch": 0.05,
"grad_norm": 8.328577995300293,
"learning_rate": 2.282453637660485e-05,
"loss": 0.8631,
"step": 640
},
{
"epoch": 0.05,
"grad_norm": 24.814929962158203,
"learning_rate": 2.3181169757489303e-05,
"loss": 0.5948,
"step": 650
},
{
"epoch": 0.05,
"grad_norm": 15.284310340881348,
"learning_rate": 2.3537803138373754e-05,
"loss": 1.1153,
"step": 660
},
{
"epoch": 0.05,
"grad_norm": 2.1705708503723145,
"learning_rate": 2.3894436519258205e-05,
"loss": 0.6161,
"step": 670
},
{
"epoch": 0.05,
"grad_norm": 15.621281623840332,
"learning_rate": 2.4251069900142652e-05,
"loss": 0.8466,
"step": 680
},
{
"epoch": 0.05,
"grad_norm": 37.767173767089844,
"learning_rate": 2.4607703281027107e-05,
"loss": 0.7471,
"step": 690
},
{
"epoch": 0.05,
"grad_norm": 5.153799533843994,
"learning_rate": 2.4964336661911555e-05,
"loss": 0.5421,
"step": 700
},
{
"epoch": 0.05,
"grad_norm": 3.665609359741211,
"learning_rate": 2.5320970042796006e-05,
"loss": 0.4251,
"step": 710
},
{
"epoch": 0.05,
"grad_norm": 21.673925399780273,
"learning_rate": 2.5677603423680457e-05,
"loss": 0.8117,
"step": 720
},
{
"epoch": 0.05,
"grad_norm": 23.484006881713867,
"learning_rate": 2.603423680456491e-05,
"loss": 0.4761,
"step": 730
},
{
"epoch": 0.05,
"grad_norm": 24.750452041625977,
"learning_rate": 2.6390870185449356e-05,
"loss": 0.95,
"step": 740
},
{
"epoch": 0.05,
"grad_norm": 6.027065277099609,
"learning_rate": 2.674750356633381e-05,
"loss": 0.9197,
"step": 750
},
{
"epoch": 0.05,
"grad_norm": 33.312313079833984,
"learning_rate": 2.710413694721826e-05,
"loss": 1.033,
"step": 760
},
{
"epoch": 0.05,
"grad_norm": 15.621706008911133,
"learning_rate": 2.7460770328102713e-05,
"loss": 0.2779,
"step": 770
},
{
"epoch": 0.06,
"grad_norm": 10.880739212036133,
"learning_rate": 2.781740370898716e-05,
"loss": 0.5387,
"step": 780
},
{
"epoch": 0.06,
"grad_norm": 11.6985445022583,
"learning_rate": 2.8174037089871615e-05,
"loss": 0.7687,
"step": 790
},
{
"epoch": 0.06,
"grad_norm": 25.108810424804688,
"learning_rate": 2.8530670470756066e-05,
"loss": 0.4862,
"step": 800
},
{
"epoch": 0.06,
"grad_norm": 23.200624465942383,
"learning_rate": 2.8887303851640514e-05,
"loss": 0.9553,
"step": 810
},
{
"epoch": 0.06,
"grad_norm": 30.682540893554688,
"learning_rate": 2.9243937232524965e-05,
"loss": 0.8558,
"step": 820
},
{
"epoch": 0.06,
"grad_norm": 12.823701858520508,
"learning_rate": 2.9600570613409416e-05,
"loss": 0.6195,
"step": 830
},
{
"epoch": 0.06,
"grad_norm": 11.762367248535156,
"learning_rate": 2.995720399429387e-05,
"loss": 0.758,
"step": 840
},
{
"epoch": 0.06,
"grad_norm": 1.1662691831588745,
"learning_rate": 3.0313837375178318e-05,
"loss": 0.354,
"step": 850
},
{
"epoch": 0.06,
"grad_norm": 23.4963436126709,
"learning_rate": 3.067047075606277e-05,
"loss": 0.8267,
"step": 860
},
{
"epoch": 0.06,
"grad_norm": 0.13900019228458405,
"learning_rate": 3.102710413694722e-05,
"loss": 0.6618,
"step": 870
},
{
"epoch": 0.06,
"grad_norm": 7.843920707702637,
"learning_rate": 3.138373751783167e-05,
"loss": 0.7689,
"step": 880
},
{
"epoch": 0.06,
"grad_norm": 31.13179588317871,
"learning_rate": 3.174037089871612e-05,
"loss": 0.7772,
"step": 890
},
{
"epoch": 0.06,
"grad_norm": 44.58312225341797,
"learning_rate": 3.2097004279600574e-05,
"loss": 1.1062,
"step": 900
},
{
"epoch": 0.06,
"grad_norm": 18.089794158935547,
"learning_rate": 3.2453637660485025e-05,
"loss": 0.7678,
"step": 910
},
{
"epoch": 0.07,
"grad_norm": 33.472625732421875,
"learning_rate": 3.281027104136947e-05,
"loss": 1.6911,
"step": 920
},
{
"epoch": 0.07,
"grad_norm": 33.618831634521484,
"learning_rate": 3.316690442225393e-05,
"loss": 0.8881,
"step": 930
},
{
"epoch": 0.07,
"grad_norm": 1.6782217025756836,
"learning_rate": 3.352353780313838e-05,
"loss": 0.7327,
"step": 940
},
{
"epoch": 0.07,
"grad_norm": 2.1791036128997803,
"learning_rate": 3.388017118402282e-05,
"loss": 0.8054,
"step": 950
},
{
"epoch": 0.07,
"grad_norm": 6.972609043121338,
"learning_rate": 3.4236804564907274e-05,
"loss": 0.2614,
"step": 960
},
{
"epoch": 0.07,
"grad_norm": 24.085866928100586,
"learning_rate": 3.459343794579173e-05,
"loss": 0.4054,
"step": 970
},
{
"epoch": 0.07,
"grad_norm": 1.9996914863586426,
"learning_rate": 3.4950071326676176e-05,
"loss": 0.5344,
"step": 980
},
{
"epoch": 0.07,
"grad_norm": 0.13388022780418396,
"learning_rate": 3.530670470756063e-05,
"loss": 0.7224,
"step": 990
},
{
"epoch": 0.07,
"grad_norm": 30.018585205078125,
"learning_rate": 3.566333808844508e-05,
"loss": 0.6226,
"step": 1000
},
{
"epoch": 0.07,
"grad_norm": 14.195096015930176,
"learning_rate": 3.6019971469329536e-05,
"loss": 0.7356,
"step": 1010
},
{
"epoch": 0.07,
"grad_norm": 25.853748321533203,
"learning_rate": 3.637660485021398e-05,
"loss": 0.7235,
"step": 1020
},
{
"epoch": 0.07,
"grad_norm": 29.89474868774414,
"learning_rate": 3.673323823109843e-05,
"loss": 1.1801,
"step": 1030
},
{
"epoch": 0.07,
"grad_norm": 12.760407447814941,
"learning_rate": 3.708987161198288e-05,
"loss": 0.4307,
"step": 1040
},
{
"epoch": 0.07,
"grad_norm": 29.496700286865234,
"learning_rate": 3.7446504992867334e-05,
"loss": 0.9473,
"step": 1050
},
{
"epoch": 0.08,
"grad_norm": 1.0010541677474976,
"learning_rate": 3.7803138373751785e-05,
"loss": 0.5983,
"step": 1060
},
{
"epoch": 0.08,
"grad_norm": 2.141446352005005,
"learning_rate": 3.8159771754636236e-05,
"loss": 0.4888,
"step": 1070
},
{
"epoch": 0.08,
"grad_norm": 1.1996098756790161,
"learning_rate": 3.851640513552069e-05,
"loss": 0.5292,
"step": 1080
},
{
"epoch": 0.08,
"grad_norm": 20.964256286621094,
"learning_rate": 3.887303851640514e-05,
"loss": 0.6905,
"step": 1090
},
{
"epoch": 0.08,
"grad_norm": 0.3161448538303375,
"learning_rate": 3.922967189728959e-05,
"loss": 0.7078,
"step": 1100
},
{
"epoch": 0.08,
"grad_norm": 13.272440910339355,
"learning_rate": 3.958630527817404e-05,
"loss": 0.9984,
"step": 1110
},
{
"epoch": 0.08,
"grad_norm": 0.3290501832962036,
"learning_rate": 3.9942938659058485e-05,
"loss": 0.4982,
"step": 1120
},
{
"epoch": 0.08,
"grad_norm": 0.7225183248519897,
"learning_rate": 4.029957203994294e-05,
"loss": 0.4532,
"step": 1130
},
{
"epoch": 0.08,
"grad_norm": 24.277801513671875,
"learning_rate": 4.0656205420827394e-05,
"loss": 0.5147,
"step": 1140
},
{
"epoch": 0.08,
"grad_norm": 9.140922546386719,
"learning_rate": 4.1012838801711845e-05,
"loss": 0.7049,
"step": 1150
},
{
"epoch": 0.08,
"grad_norm": 4.139643669128418,
"learning_rate": 4.136947218259629e-05,
"loss": 1.3454,
"step": 1160
},
{
"epoch": 0.08,
"grad_norm": 24.44458770751953,
"learning_rate": 4.172610556348075e-05,
"loss": 0.6409,
"step": 1170
},
{
"epoch": 0.08,
"grad_norm": 53.11198425292969,
"learning_rate": 4.20827389443652e-05,
"loss": 0.7063,
"step": 1180
},
{
"epoch": 0.08,
"grad_norm": 15.888784408569336,
"learning_rate": 4.243937232524964e-05,
"loss": 0.686,
"step": 1190
},
{
"epoch": 0.09,
"grad_norm": 29.689838409423828,
"learning_rate": 4.2796005706134094e-05,
"loss": 0.6301,
"step": 1200
},
{
"epoch": 0.09,
"grad_norm": 37.24555206298828,
"learning_rate": 4.3152639087018545e-05,
"loss": 0.939,
"step": 1210
},
{
"epoch": 0.09,
"grad_norm": 39.6224479675293,
"learning_rate": 4.3509272467902996e-05,
"loss": 0.9322,
"step": 1220
},
{
"epoch": 0.09,
"grad_norm": 28.799930572509766,
"learning_rate": 4.386590584878745e-05,
"loss": 1.1431,
"step": 1230
},
{
"epoch": 0.09,
"grad_norm": 0.9416821002960205,
"learning_rate": 4.42225392296719e-05,
"loss": 0.7622,
"step": 1240
},
{
"epoch": 0.09,
"grad_norm": 11.397088050842285,
"learning_rate": 4.457917261055635e-05,
"loss": 0.5302,
"step": 1250
},
{
"epoch": 0.09,
"grad_norm": 0.13693714141845703,
"learning_rate": 4.49358059914408e-05,
"loss": 0.9497,
"step": 1260
},
{
"epoch": 0.09,
"grad_norm": 22.64994239807129,
"learning_rate": 4.529243937232525e-05,
"loss": 1.4811,
"step": 1270
},
{
"epoch": 0.09,
"grad_norm": 63.26667404174805,
"learning_rate": 4.56490727532097e-05,
"loss": 0.7023,
"step": 1280
},
{
"epoch": 0.09,
"grad_norm": 24.035776138305664,
"learning_rate": 4.6005706134094154e-05,
"loss": 0.3478,
"step": 1290
},
{
"epoch": 0.09,
"grad_norm": 0.0889860987663269,
"learning_rate": 4.6362339514978605e-05,
"loss": 0.3317,
"step": 1300
},
{
"epoch": 0.09,
"grad_norm": 14.644208908081055,
"learning_rate": 4.6718972895863056e-05,
"loss": 0.6627,
"step": 1310
},
{
"epoch": 0.09,
"grad_norm": 16.509044647216797,
"learning_rate": 4.707560627674751e-05,
"loss": 1.3097,
"step": 1320
},
{
"epoch": 0.09,
"grad_norm": 23.583152770996094,
"learning_rate": 4.743223965763195e-05,
"loss": 0.9481,
"step": 1330
},
{
"epoch": 0.1,
"grad_norm": 50.59526443481445,
"learning_rate": 4.778887303851641e-05,
"loss": 0.7222,
"step": 1340
},
{
"epoch": 0.1,
"grad_norm": 18.746498107910156,
"learning_rate": 4.814550641940086e-05,
"loss": 0.7993,
"step": 1350
},
{
"epoch": 0.1,
"grad_norm": 14.619526863098145,
"learning_rate": 4.8502139800285305e-05,
"loss": 0.8045,
"step": 1360
},
{
"epoch": 0.1,
"grad_norm": 0.3897199332714081,
"learning_rate": 4.8858773181169756e-05,
"loss": 0.7668,
"step": 1370
},
{
"epoch": 0.1,
"grad_norm": 0.14925141632556915,
"learning_rate": 4.9215406562054214e-05,
"loss": 0.2882,
"step": 1380
},
{
"epoch": 0.1,
"grad_norm": 38.2923469543457,
"learning_rate": 4.9572039942938665e-05,
"loss": 0.8372,
"step": 1390
},
{
"epoch": 0.1,
"grad_norm": 0.04119894281029701,
"learning_rate": 4.992867332382311e-05,
"loss": 0.1001,
"step": 1400
},
{
"epoch": 0.1,
"eval_accuracy": 0.7789473684210526,
"eval_loss": 0.898942768573761,
"eval_runtime": 2157.0352,
"eval_samples_per_second": 0.308,
"eval_steps_per_second": 0.154,
"step": 1403
},
{
"epoch": 1.0,
"grad_norm": 4.562422752380371,
"learning_rate": 4.99682992550325e-05,
"loss": 0.6604,
"step": 1410
},
{
"epoch": 1.0,
"grad_norm": 0.05182512477040291,
"learning_rate": 4.992867332382311e-05,
"loss": 0.7648,
"step": 1420
},
{
"epoch": 1.0,
"grad_norm": 10.46511459350586,
"learning_rate": 4.988904739261373e-05,
"loss": 0.7306,
"step": 1430
},
{
"epoch": 1.0,
"grad_norm": 26.981674194335938,
"learning_rate": 4.984942146140435e-05,
"loss": 0.3427,
"step": 1440
},
{
"epoch": 1.0,
"grad_norm": 38.77156066894531,
"learning_rate": 4.9809795530194966e-05,
"loss": 0.4313,
"step": 1450
},
{
"epoch": 1.0,
"grad_norm": 0.08875282108783722,
"learning_rate": 4.977016959898558e-05,
"loss": 0.1047,
"step": 1460
},
{
"epoch": 1.0,
"grad_norm": 7.9550042152404785,
"learning_rate": 4.97305436677762e-05,
"loss": 0.8428,
"step": 1470
},
{
"epoch": 1.01,
"grad_norm": 3.2877941131591797,
"learning_rate": 4.969091773656681e-05,
"loss": 0.5087,
"step": 1480
},
{
"epoch": 1.01,
"grad_norm": 27.133525848388672,
"learning_rate": 4.965129180535743e-05,
"loss": 0.4621,
"step": 1490
},
{
"epoch": 1.01,
"grad_norm": 24.116609573364258,
"learning_rate": 4.9611665874148046e-05,
"loss": 0.8207,
"step": 1500
},
{
"epoch": 1.01,
"grad_norm": 48.552242279052734,
"learning_rate": 4.9572039942938665e-05,
"loss": 1.3676,
"step": 1510
},
{
"epoch": 1.01,
"grad_norm": 0.1313333660364151,
"learning_rate": 4.953241401172928e-05,
"loss": 0.4705,
"step": 1520
},
{
"epoch": 1.01,
"grad_norm": 14.919997215270996,
"learning_rate": 4.9492788080519896e-05,
"loss": 1.6541,
"step": 1530
},
{
"epoch": 1.01,
"grad_norm": 2.8064146041870117,
"learning_rate": 4.945316214931051e-05,
"loss": 0.6358,
"step": 1540
},
{
"epoch": 1.01,
"grad_norm": 33.633766174316406,
"learning_rate": 4.941353621810113e-05,
"loss": 0.3344,
"step": 1550
},
{
"epoch": 1.01,
"grad_norm": 25.58049774169922,
"learning_rate": 4.9373910286891746e-05,
"loss": 0.3765,
"step": 1560
},
{
"epoch": 1.01,
"grad_norm": 0.5938677191734314,
"learning_rate": 4.933428435568236e-05,
"loss": 0.7599,
"step": 1570
},
{
"epoch": 1.01,
"grad_norm": 0.17297320067882538,
"learning_rate": 4.9294658424472976e-05,
"loss": 0.1381,
"step": 1580
},
{
"epoch": 1.01,
"grad_norm": 1.1137043237686157,
"learning_rate": 4.9255032493263595e-05,
"loss": 0.5216,
"step": 1590
},
{
"epoch": 1.01,
"grad_norm": 11.281981468200684,
"learning_rate": 4.9215406562054214e-05,
"loss": 0.9504,
"step": 1600
},
{
"epoch": 1.01,
"grad_norm": 35.159671783447266,
"learning_rate": 4.9175780630844826e-05,
"loss": 0.6236,
"step": 1610
},
{
"epoch": 1.02,
"grad_norm": 15.732198715209961,
"learning_rate": 4.9136154699635445e-05,
"loss": 0.3037,
"step": 1620
},
{
"epoch": 1.02,
"grad_norm": 4.352818965911865,
"learning_rate": 4.909652876842606e-05,
"loss": 0.5294,
"step": 1630
},
{
"epoch": 1.02,
"grad_norm": 27.470956802368164,
"learning_rate": 4.9056902837216676e-05,
"loss": 0.5626,
"step": 1640
},
{
"epoch": 1.02,
"grad_norm": 33.91129684448242,
"learning_rate": 4.9017276906007294e-05,
"loss": 0.2882,
"step": 1650
},
{
"epoch": 1.02,
"grad_norm": 0.26898714900016785,
"learning_rate": 4.897765097479791e-05,
"loss": 0.4265,
"step": 1660
},
{
"epoch": 1.02,
"grad_norm": 0.8199774622917175,
"learning_rate": 4.8938025043588525e-05,
"loss": 0.5277,
"step": 1670
},
{
"epoch": 1.02,
"grad_norm": 0.020548412576317787,
"learning_rate": 4.8898399112379144e-05,
"loss": 0.889,
"step": 1680
},
{
"epoch": 1.02,
"grad_norm": 0.048078641295433044,
"learning_rate": 4.8858773181169756e-05,
"loss": 0.8639,
"step": 1690
},
{
"epoch": 1.02,
"grad_norm": 20.957611083984375,
"learning_rate": 4.881914724996038e-05,
"loss": 0.3244,
"step": 1700
},
{
"epoch": 1.02,
"grad_norm": 0.15246962010860443,
"learning_rate": 4.8779521318750994e-05,
"loss": 0.258,
"step": 1710
},
{
"epoch": 1.02,
"grad_norm": 0.09393693506717682,
"learning_rate": 4.873989538754161e-05,
"loss": 0.9169,
"step": 1720
},
{
"epoch": 1.02,
"grad_norm": 0.9115855097770691,
"learning_rate": 4.8700269456332225e-05,
"loss": 0.5618,
"step": 1730
},
{
"epoch": 1.02,
"grad_norm": 6.85861873626709,
"learning_rate": 4.866064352512284e-05,
"loss": 0.6588,
"step": 1740
},
{
"epoch": 1.02,
"grad_norm": 33.108909606933594,
"learning_rate": 4.862101759391346e-05,
"loss": 0.3317,
"step": 1750
},
{
"epoch": 1.03,
"grad_norm": 2.785113573074341,
"learning_rate": 4.8581391662704074e-05,
"loss": 0.1886,
"step": 1760
},
{
"epoch": 1.03,
"grad_norm": 0.07260994613170624,
"learning_rate": 4.854176573149469e-05,
"loss": 0.4813,
"step": 1770
},
{
"epoch": 1.03,
"grad_norm": 1.9168213605880737,
"learning_rate": 4.8502139800285305e-05,
"loss": 0.6163,
"step": 1780
},
{
"epoch": 1.03,
"grad_norm": 32.30327224731445,
"learning_rate": 4.8462513869075924e-05,
"loss": 0.5272,
"step": 1790
},
{
"epoch": 1.03,
"grad_norm": 0.09055186808109283,
"learning_rate": 4.842288793786654e-05,
"loss": 0.58,
"step": 1800
},
{
"epoch": 1.03,
"grad_norm": 0.15506546199321747,
"learning_rate": 4.838326200665716e-05,
"loss": 0.1171,
"step": 1810
},
{
"epoch": 1.03,
"grad_norm": 32.89198303222656,
"learning_rate": 4.8343636075447773e-05,
"loss": 0.4713,
"step": 1820
},
{
"epoch": 1.03,
"grad_norm": 0.009175814688205719,
"learning_rate": 4.830401014423839e-05,
"loss": 0.6139,
"step": 1830
},
{
"epoch": 1.03,
"grad_norm": 32.81629943847656,
"learning_rate": 4.8264384213029004e-05,
"loss": 0.6131,
"step": 1840
},
{
"epoch": 1.03,
"grad_norm": 62.49550247192383,
"learning_rate": 4.822475828181962e-05,
"loss": 0.5677,
"step": 1850
},
{
"epoch": 1.03,
"grad_norm": 2.451925754547119,
"learning_rate": 4.818513235061024e-05,
"loss": 1.4171,
"step": 1860
},
{
"epoch": 1.03,
"grad_norm": 0.2953392267227173,
"learning_rate": 4.814550641940086e-05,
"loss": 0.3838,
"step": 1870
},
{
"epoch": 1.03,
"grad_norm": 53.240325927734375,
"learning_rate": 4.810588048819147e-05,
"loss": 0.1432,
"step": 1880
},
{
"epoch": 1.03,
"grad_norm": 0.1574406921863556,
"learning_rate": 4.806625455698209e-05,
"loss": 0.318,
"step": 1890
},
{
"epoch": 1.04,
"grad_norm": 35.6072998046875,
"learning_rate": 4.802662862577271e-05,
"loss": 0.6746,
"step": 1900
},
{
"epoch": 1.04,
"grad_norm": 0.012536413036286831,
"learning_rate": 4.798700269456333e-05,
"loss": 0.7181,
"step": 1910
},
{
"epoch": 1.04,
"grad_norm": 0.05652592331171036,
"learning_rate": 4.794737676335394e-05,
"loss": 0.0939,
"step": 1920
},
{
"epoch": 1.04,
"grad_norm": 2.5210182666778564,
"learning_rate": 4.790775083214456e-05,
"loss": 1.1188,
"step": 1930
},
{
"epoch": 1.04,
"grad_norm": 0.16478443145751953,
"learning_rate": 4.786812490093517e-05,
"loss": 0.4504,
"step": 1940
},
{
"epoch": 1.04,
"grad_norm": 26.002525329589844,
"learning_rate": 4.782849896972579e-05,
"loss": 0.8049,
"step": 1950
},
{
"epoch": 1.04,
"grad_norm": 37.67827606201172,
"learning_rate": 4.778887303851641e-05,
"loss": 0.4756,
"step": 1960
},
{
"epoch": 1.04,
"grad_norm": 36.84476852416992,
"learning_rate": 4.774924710730702e-05,
"loss": 0.9565,
"step": 1970
},
{
"epoch": 1.04,
"grad_norm": 0.18539421260356903,
"learning_rate": 4.770962117609764e-05,
"loss": 0.3279,
"step": 1980
},
{
"epoch": 1.04,
"grad_norm": 3.1605958938598633,
"learning_rate": 4.766999524488825e-05,
"loss": 0.1506,
"step": 1990
},
{
"epoch": 1.04,
"grad_norm": 0.08869732171297073,
"learning_rate": 4.763036931367887e-05,
"loss": 0.5895,
"step": 2000
},
{
"epoch": 1.04,
"grad_norm": 1.4320793151855469,
"learning_rate": 4.759074338246949e-05,
"loss": 0.3709,
"step": 2010
},
{
"epoch": 1.04,
"grad_norm": 0.013628893531858921,
"learning_rate": 4.755111745126011e-05,
"loss": 0.4708,
"step": 2020
},
{
"epoch": 1.04,
"grad_norm": 0.008771849796175957,
"learning_rate": 4.751149152005072e-05,
"loss": 0.2291,
"step": 2030
},
{
"epoch": 1.05,
"grad_norm": 0.034172721207141876,
"learning_rate": 4.747186558884134e-05,
"loss": 0.3005,
"step": 2040
},
{
"epoch": 1.05,
"grad_norm": 2.04589581489563,
"learning_rate": 4.743223965763195e-05,
"loss": 0.7397,
"step": 2050
},
{
"epoch": 1.05,
"grad_norm": 7.416354656219482,
"learning_rate": 4.739261372642258e-05,
"loss": 0.5153,
"step": 2060
},
{
"epoch": 1.05,
"grad_norm": 59.661014556884766,
"learning_rate": 4.735298779521319e-05,
"loss": 0.7417,
"step": 2070
},
{
"epoch": 1.05,
"grad_norm": 38.264408111572266,
"learning_rate": 4.731336186400381e-05,
"loss": 0.4673,
"step": 2080
},
{
"epoch": 1.05,
"grad_norm": 0.010330034419894218,
"learning_rate": 4.727373593279442e-05,
"loss": 0.7802,
"step": 2090
},
{
"epoch": 1.05,
"grad_norm": 0.009081050753593445,
"learning_rate": 4.723411000158504e-05,
"loss": 0.5108,
"step": 2100
},
{
"epoch": 1.05,
"grad_norm": 0.9804019331932068,
"learning_rate": 4.719448407037566e-05,
"loss": 0.1208,
"step": 2110
},
{
"epoch": 1.05,
"grad_norm": 17.980236053466797,
"learning_rate": 4.7154858139166276e-05,
"loss": 0.3317,
"step": 2120
},
{
"epoch": 1.05,
"grad_norm": 0.03598076477646828,
"learning_rate": 4.711523220795689e-05,
"loss": 0.4675,
"step": 2130
},
{
"epoch": 1.05,
"grad_norm": 28.66923713684082,
"learning_rate": 4.707560627674751e-05,
"loss": 0.7232,
"step": 2140
},
{
"epoch": 1.05,
"grad_norm": 0.05090579390525818,
"learning_rate": 4.703598034553812e-05,
"loss": 0.2404,
"step": 2150
},
{
"epoch": 1.05,
"grad_norm": 39.81483840942383,
"learning_rate": 4.6996354414328745e-05,
"loss": 0.382,
"step": 2160
},
{
"epoch": 1.05,
"grad_norm": 0.010164987295866013,
"learning_rate": 4.695672848311936e-05,
"loss": 0.4326,
"step": 2170
},
{
"epoch": 1.06,
"grad_norm": 0.007712522987276316,
"learning_rate": 4.691710255190997e-05,
"loss": 0.1419,
"step": 2180
},
{
"epoch": 1.06,
"grad_norm": 0.03637077286839485,
"learning_rate": 4.687747662070059e-05,
"loss": 0.0087,
"step": 2190
},
{
"epoch": 1.06,
"grad_norm": 0.2942463755607605,
"learning_rate": 4.68378506894912e-05,
"loss": 0.5895,
"step": 2200
},
{
"epoch": 1.06,
"grad_norm": 11.139754295349121,
"learning_rate": 4.6798224758281825e-05,
"loss": 0.2236,
"step": 2210
},
{
"epoch": 1.06,
"grad_norm": 0.1554485708475113,
"learning_rate": 4.675859882707244e-05,
"loss": 0.3646,
"step": 2220
},
{
"epoch": 1.06,
"grad_norm": 0.10144095867872238,
"learning_rate": 4.6718972895863056e-05,
"loss": 0.2301,
"step": 2230
},
{
"epoch": 1.06,
"grad_norm": 34.84081268310547,
"learning_rate": 4.667934696465367e-05,
"loss": 0.7788,
"step": 2240
},
{
"epoch": 1.06,
"grad_norm": 0.31722915172576904,
"learning_rate": 4.663972103344429e-05,
"loss": 0.1405,
"step": 2250
},
{
"epoch": 1.06,
"grad_norm": 32.6774787902832,
"learning_rate": 4.6600095102234906e-05,
"loss": 0.4079,
"step": 2260
},
{
"epoch": 1.06,
"grad_norm": 0.04206797853112221,
"learning_rate": 4.6560469171025525e-05,
"loss": 0.8566,
"step": 2270
},
{
"epoch": 1.06,
"grad_norm": 0.007674456108361483,
"learning_rate": 4.6520843239816137e-05,
"loss": 0.6745,
"step": 2280
},
{
"epoch": 1.06,
"grad_norm": 4.521772384643555,
"learning_rate": 4.6481217308606755e-05,
"loss": 0.2311,
"step": 2290
},
{
"epoch": 1.06,
"grad_norm": 0.43009287118911743,
"learning_rate": 4.644159137739737e-05,
"loss": 0.9246,
"step": 2300
},
{
"epoch": 1.06,
"grad_norm": 0.108181893825531,
"learning_rate": 4.6401965446187986e-05,
"loss": 1.0581,
"step": 2310
},
{
"epoch": 1.07,
"grad_norm": 0.0881645604968071,
"learning_rate": 4.6362339514978605e-05,
"loss": 0.431,
"step": 2320
},
{
"epoch": 1.07,
"grad_norm": 0.03945665806531906,
"learning_rate": 4.6322713583769224e-05,
"loss": 0.3784,
"step": 2330
},
{
"epoch": 1.07,
"grad_norm": 0.01008934061974287,
"learning_rate": 4.6283087652559836e-05,
"loss": 0.4657,
"step": 2340
},
{
"epoch": 1.07,
"grad_norm": 1.8987274169921875,
"learning_rate": 4.6243461721350455e-05,
"loss": 0.3025,
"step": 2350
},
{
"epoch": 1.07,
"grad_norm": 52.36662292480469,
"learning_rate": 4.6203835790141073e-05,
"loss": 0.1047,
"step": 2360
},
{
"epoch": 1.07,
"grad_norm": 30.82796287536621,
"learning_rate": 4.616420985893169e-05,
"loss": 0.3864,
"step": 2370
},
{
"epoch": 1.07,
"grad_norm": 28.43499755859375,
"learning_rate": 4.6124583927722304e-05,
"loss": 0.5029,
"step": 2380
},
{
"epoch": 1.07,
"grad_norm": 36.37118911743164,
"learning_rate": 4.608495799651292e-05,
"loss": 0.3412,
"step": 2390
},
{
"epoch": 1.07,
"grad_norm": 0.03059449978172779,
"learning_rate": 4.6045332065303535e-05,
"loss": 1.6759,
"step": 2400
},
{
"epoch": 1.07,
"grad_norm": 0.028554683551192284,
"learning_rate": 4.6005706134094154e-05,
"loss": 0.4232,
"step": 2410
},
{
"epoch": 1.07,
"grad_norm": 0.3136725425720215,
"learning_rate": 4.596608020288477e-05,
"loss": 0.4531,
"step": 2420
},
{
"epoch": 1.07,
"grad_norm": 9.164505004882812,
"learning_rate": 4.5926454271675385e-05,
"loss": 1.4434,
"step": 2430
},
{
"epoch": 1.07,
"grad_norm": 26.755535125732422,
"learning_rate": 4.5886828340466004e-05,
"loss": 0.5678,
"step": 2440
},
{
"epoch": 1.07,
"grad_norm": 0.027405157685279846,
"learning_rate": 4.5847202409256616e-05,
"loss": 0.1547,
"step": 2450
},
{
"epoch": 1.08,
"grad_norm": 7.302061080932617,
"learning_rate": 4.5807576478047234e-05,
"loss": 0.1004,
"step": 2460
},
{
"epoch": 1.08,
"grad_norm": 36.72040557861328,
"learning_rate": 4.576795054683785e-05,
"loss": 0.4913,
"step": 2470
},
{
"epoch": 1.08,
"grad_norm": 0.10275045782327652,
"learning_rate": 4.572832461562847e-05,
"loss": 0.0076,
"step": 2480
},
{
"epoch": 1.08,
"grad_norm": 0.008994188159704208,
"learning_rate": 4.5688698684419084e-05,
"loss": 0.0018,
"step": 2490
},
{
"epoch": 1.08,
"grad_norm": 0.02831762284040451,
"learning_rate": 4.56490727532097e-05,
"loss": 0.086,
"step": 2500
},
{
"epoch": 1.08,
"grad_norm": 0.6608620285987854,
"learning_rate": 4.5609446822000315e-05,
"loss": 0.0874,
"step": 2510
},
{
"epoch": 1.08,
"grad_norm": 1.8176270723342896,
"learning_rate": 4.556982089079094e-05,
"loss": 0.459,
"step": 2520
},
{
"epoch": 1.08,
"grad_norm": 28.674335479736328,
"learning_rate": 4.553019495958155e-05,
"loss": 0.4304,
"step": 2530
},
{
"epoch": 1.08,
"grad_norm": 0.06465455144643784,
"learning_rate": 4.549056902837217e-05,
"loss": 0.6094,
"step": 2540
},
{
"epoch": 1.08,
"grad_norm": 0.011676542460918427,
"learning_rate": 4.545094309716278e-05,
"loss": 0.333,
"step": 2550
},
{
"epoch": 1.08,
"grad_norm": 4.420731544494629,
"learning_rate": 4.54113171659534e-05,
"loss": 0.1013,
"step": 2560
},
{
"epoch": 1.08,
"grad_norm": 0.1971130520105362,
"learning_rate": 4.537169123474402e-05,
"loss": 0.0122,
"step": 2570
},
{
"epoch": 1.08,
"grad_norm": 2.309307813644409,
"learning_rate": 4.533206530353464e-05,
"loss": 0.0247,
"step": 2580
},
{
"epoch": 1.08,
"grad_norm": 0.010364987887442112,
"learning_rate": 4.529243937232525e-05,
"loss": 0.0751,
"step": 2590
},
{
"epoch": 1.09,
"grad_norm": 30.956512451171875,
"learning_rate": 4.525281344111587e-05,
"loss": 0.7706,
"step": 2600
},
{
"epoch": 1.09,
"grad_norm": 21.555742263793945,
"learning_rate": 4.521318750990648e-05,
"loss": 0.0501,
"step": 2610
},
{
"epoch": 1.09,
"grad_norm": 0.02271176129579544,
"learning_rate": 4.51735615786971e-05,
"loss": 0.219,
"step": 2620
},
{
"epoch": 1.09,
"grad_norm": 0.10638172179460526,
"learning_rate": 4.513393564748772e-05,
"loss": 0.4009,
"step": 2630
},
{
"epoch": 1.09,
"grad_norm": 0.0012674570316448808,
"learning_rate": 4.509430971627833e-05,
"loss": 0.0365,
"step": 2640
},
{
"epoch": 1.09,
"grad_norm": 0.01860959082841873,
"learning_rate": 4.505468378506895e-05,
"loss": 0.064,
"step": 2650
},
{
"epoch": 1.09,
"grad_norm": 0.029620472341775894,
"learning_rate": 4.501505785385956e-05,
"loss": 0.5788,
"step": 2660
},
{
"epoch": 1.09,
"grad_norm": 69.32429504394531,
"learning_rate": 4.497543192265019e-05,
"loss": 0.5635,
"step": 2670
},
{
"epoch": 1.09,
"grad_norm": 0.004012781195342541,
"learning_rate": 4.49358059914408e-05,
"loss": 0.8204,
"step": 2680
},
{
"epoch": 1.09,
"grad_norm": 0.007074211724102497,
"learning_rate": 4.489618006023142e-05,
"loss": 0.2872,
"step": 2690
},
{
"epoch": 1.09,
"grad_norm": 2.3659746646881104,
"learning_rate": 4.485655412902203e-05,
"loss": 0.6915,
"step": 2700
},
{
"epoch": 1.09,
"grad_norm": 0.2181590050458908,
"learning_rate": 4.481692819781265e-05,
"loss": 0.0489,
"step": 2710
},
{
"epoch": 1.09,
"grad_norm": 0.00418996112421155,
"learning_rate": 4.477730226660327e-05,
"loss": 0.4435,
"step": 2720
},
{
"epoch": 1.09,
"grad_norm": 0.011325598694384098,
"learning_rate": 4.473767633539389e-05,
"loss": 0.0595,
"step": 2730
},
{
"epoch": 1.1,
"grad_norm": 6.933524131774902,
"learning_rate": 4.46980504041845e-05,
"loss": 0.6284,
"step": 2740
},
{
"epoch": 1.1,
"grad_norm": 0.04216031730175018,
"learning_rate": 4.465842447297512e-05,
"loss": 0.1847,
"step": 2750
},
{
"epoch": 1.1,
"grad_norm": 49.92095184326172,
"learning_rate": 4.461879854176573e-05,
"loss": 0.1075,
"step": 2760
},
{
"epoch": 1.1,
"grad_norm": 0.0068773203529417515,
"learning_rate": 4.457917261055635e-05,
"loss": 0.7253,
"step": 2770
},
{
"epoch": 1.1,
"grad_norm": 49.53322219848633,
"learning_rate": 4.453954667934697e-05,
"loss": 0.3937,
"step": 2780
},
{
"epoch": 1.1,
"grad_norm": 0.0059694708324968815,
"learning_rate": 4.449992074813759e-05,
"loss": 0.3752,
"step": 2790
},
{
"epoch": 1.1,
"grad_norm": 0.006113003473728895,
"learning_rate": 4.44602948169282e-05,
"loss": 0.2646,
"step": 2800
},
{
"epoch": 1.1,
"eval_accuracy": 0.8857142857142857,
"eval_loss": 0.5655186772346497,
"eval_runtime": 2204.4149,
"eval_samples_per_second": 0.302,
"eval_steps_per_second": 0.151,
"step": 2806
},
{
"epoch": 2.0,
"grad_norm": 0.030710767954587936,
"learning_rate": 4.442066888571882e-05,
"loss": 0.2584,
"step": 2810
},
{
"epoch": 2.0,
"grad_norm": 0.961925208568573,
"learning_rate": 4.438104295450943e-05,
"loss": 0.0208,
"step": 2820
},
{
"epoch": 2.0,
"grad_norm": 0.034155942499637604,
"learning_rate": 4.434141702330005e-05,
"loss": 0.0855,
"step": 2830
},
{
"epoch": 2.0,
"grad_norm": 0.023880697786808014,
"learning_rate": 4.430179109209067e-05,
"loss": 0.0589,
"step": 2840
},
{
"epoch": 2.0,
"grad_norm": 0.4049380421638489,
"learning_rate": 4.426216516088128e-05,
"loss": 0.0529,
"step": 2850
},
{
"epoch": 2.0,
"grad_norm": 0.3483090102672577,
"learning_rate": 4.42225392296719e-05,
"loss": 0.6943,
"step": 2860
},
{
"epoch": 2.0,
"grad_norm": 24.336814880371094,
"learning_rate": 4.418291329846252e-05,
"loss": 0.5243,
"step": 2870
},
{
"epoch": 2.01,
"grad_norm": 0.0032091333996504545,
"learning_rate": 4.4143287367253136e-05,
"loss": 0.1202,
"step": 2880
},
{
"epoch": 2.01,
"grad_norm": 0.047493454068899155,
"learning_rate": 4.410366143604375e-05,
"loss": 0.1007,
"step": 2890
},
{
"epoch": 2.01,
"grad_norm": 0.017678333446383476,
"learning_rate": 4.406403550483437e-05,
"loss": 0.0008,
"step": 2900
},
{
"epoch": 2.01,
"grad_norm": 0.9050219058990479,
"learning_rate": 4.402440957362498e-05,
"loss": 0.3121,
"step": 2910
},
{
"epoch": 2.01,
"grad_norm": 0.03626730665564537,
"learning_rate": 4.39847836424156e-05,
"loss": 0.1179,
"step": 2920
},
{
"epoch": 2.01,
"grad_norm": 0.011950280517339706,
"learning_rate": 4.3945157711206216e-05,
"loss": 0.4693,
"step": 2930
},
{
"epoch": 2.01,
"grad_norm": 0.02412373013794422,
"learning_rate": 4.3905531779996835e-05,
"loss": 0.8616,
"step": 2940
},
{
"epoch": 2.01,
"grad_norm": 30.13632583618164,
"learning_rate": 4.386590584878745e-05,
"loss": 0.3553,
"step": 2950
},
{
"epoch": 2.01,
"grad_norm": 0.0061193606816232204,
"learning_rate": 4.3826279917578066e-05,
"loss": 0.0383,
"step": 2960
},
{
"epoch": 2.01,
"grad_norm": 0.2041795253753662,
"learning_rate": 4.378665398636868e-05,
"loss": 0.6114,
"step": 2970
},
{
"epoch": 2.01,
"grad_norm": 0.26872244477272034,
"learning_rate": 4.3747028055159304e-05,
"loss": 0.5494,
"step": 2980
},
{
"epoch": 2.01,
"grad_norm": 0.06702332943677902,
"learning_rate": 4.3707402123949916e-05,
"loss": 0.0144,
"step": 2990
},
{
"epoch": 2.01,
"grad_norm": 0.0052034310065209866,
"learning_rate": 4.3667776192740534e-05,
"loss": 0.0128,
"step": 3000
},
{
"epoch": 2.01,
"grad_norm": 2.874134063720703,
"learning_rate": 4.3628150261531146e-05,
"loss": 0.6844,
"step": 3010
},
{
"epoch": 2.02,
"grad_norm": 1.1829482316970825,
"learning_rate": 4.3588524330321765e-05,
"loss": 0.1866,
"step": 3020
},
{
"epoch": 2.02,
"grad_norm": 41.475399017333984,
"learning_rate": 4.3548898399112384e-05,
"loss": 0.156,
"step": 3030
},
{
"epoch": 2.02,
"grad_norm": 0.07054935395717621,
"learning_rate": 4.3509272467902996e-05,
"loss": 0.3379,
"step": 3040
},
{
"epoch": 2.02,
"grad_norm": 0.44977086782455444,
"learning_rate": 4.3469646536693615e-05,
"loss": 0.5629,
"step": 3050
},
{
"epoch": 2.02,
"grad_norm": 0.06215721368789673,
"learning_rate": 4.343002060548423e-05,
"loss": 0.0019,
"step": 3060
},
{
"epoch": 2.02,
"grad_norm": 43.73810958862305,
"learning_rate": 4.3390394674274846e-05,
"loss": 0.3984,
"step": 3070
},
{
"epoch": 2.02,
"grad_norm": 0.9523270130157471,
"learning_rate": 4.3350768743065464e-05,
"loss": 0.0888,
"step": 3080
},
{
"epoch": 2.02,
"grad_norm": 0.005942572373896837,
"learning_rate": 4.331114281185608e-05,
"loss": 0.1961,
"step": 3090
},
{
"epoch": 2.02,
"grad_norm": 0.022418642416596413,
"learning_rate": 4.3271516880646695e-05,
"loss": 0.0597,
"step": 3100
},
{
"epoch": 2.02,
"grad_norm": 0.04196101427078247,
"learning_rate": 4.3231890949437314e-05,
"loss": 0.1147,
"step": 3110
},
{
"epoch": 2.02,
"grad_norm": 0.003765852889046073,
"learning_rate": 4.3192265018227926e-05,
"loss": 0.0416,
"step": 3120
},
{
"epoch": 2.02,
"grad_norm": 0.053471703082323074,
"learning_rate": 4.3152639087018545e-05,
"loss": 0.3866,
"step": 3130
},
{
"epoch": 2.02,
"grad_norm": 51.969120025634766,
"learning_rate": 4.3113013155809164e-05,
"loss": 0.0616,
"step": 3140
},
{
"epoch": 2.02,
"grad_norm": 0.005074084736406803,
"learning_rate": 4.307338722459978e-05,
"loss": 0.1907,
"step": 3150
},
{
"epoch": 2.03,
"grad_norm": 0.0045975870452821255,
"learning_rate": 4.3033761293390395e-05,
"loss": 0.0072,
"step": 3160
},
{
"epoch": 2.03,
"grad_norm": 0.0040842327289283276,
"learning_rate": 4.299413536218101e-05,
"loss": 0.2306,
"step": 3170
},
{
"epoch": 2.03,
"grad_norm": 0.008049121126532555,
"learning_rate": 4.295450943097163e-05,
"loss": 0.0058,
"step": 3180
},
{
"epoch": 2.03,
"grad_norm": 21.22314453125,
"learning_rate": 4.291488349976225e-05,
"loss": 0.5502,
"step": 3190
},
{
"epoch": 2.03,
"grad_norm": 0.036627013236284256,
"learning_rate": 4.287525756855286e-05,
"loss": 0.1419,
"step": 3200
},
{
"epoch": 2.03,
"grad_norm": 2.3564202785491943,
"learning_rate": 4.283563163734348e-05,
"loss": 0.0279,
"step": 3210
},
{
"epoch": 2.03,
"grad_norm": 0.0108193913474679,
"learning_rate": 4.2796005706134094e-05,
"loss": 0.0004,
"step": 3220
},
{
"epoch": 2.03,
"grad_norm": 0.0201814454048872,
"learning_rate": 4.275637977492471e-05,
"loss": 0.3249,
"step": 3230
},
{
"epoch": 2.03,
"grad_norm": 0.03389296308159828,
"learning_rate": 4.271675384371533e-05,
"loss": 0.3386,
"step": 3240
},
{
"epoch": 2.03,
"grad_norm": 0.01544855535030365,
"learning_rate": 4.267712791250595e-05,
"loss": 0.3866,
"step": 3250
},
{
"epoch": 2.03,
"grad_norm": 0.025016358122229576,
"learning_rate": 4.263750198129656e-05,
"loss": 0.0013,
"step": 3260
},
{
"epoch": 2.03,
"grad_norm": 0.0923624038696289,
"learning_rate": 4.2597876050087174e-05,
"loss": 0.2532,
"step": 3270
},
{
"epoch": 2.03,
"grad_norm": 23.150659561157227,
"learning_rate": 4.255825011887779e-05,
"loss": 0.3962,
"step": 3280
},
{
"epoch": 2.03,
"grad_norm": 0.015515293926000595,
"learning_rate": 4.251862418766841e-05,
"loss": 0.0004,
"step": 3290
},
{
"epoch": 2.04,
"grad_norm": 0.003917529247701168,
"learning_rate": 4.247899825645903e-05,
"loss": 0.2484,
"step": 3300
},
{
"epoch": 2.04,
"grad_norm": 28.370773315429688,
"learning_rate": 4.243937232524964e-05,
"loss": 0.0367,
"step": 3310
},
{
"epoch": 2.04,
"grad_norm": 0.026205556467175484,
"learning_rate": 4.239974639404026e-05,
"loss": 0.3111,
"step": 3320
},
{
"epoch": 2.04,
"grad_norm": 0.01336923148483038,
"learning_rate": 4.2360120462830874e-05,
"loss": 0.0047,
"step": 3330
},
{
"epoch": 2.04,
"grad_norm": 0.011190090328454971,
"learning_rate": 4.23204945316215e-05,
"loss": 0.3007,
"step": 3340
},
{
"epoch": 2.04,
"grad_norm": 0.003655917476862669,
"learning_rate": 4.228086860041211e-05,
"loss": 0.5659,
"step": 3350
},
{
"epoch": 2.04,
"grad_norm": 0.017216026782989502,
"learning_rate": 4.224124266920273e-05,
"loss": 0.0933,
"step": 3360
},
{
"epoch": 2.04,
"grad_norm": 0.007373865228146315,
"learning_rate": 4.220161673799334e-05,
"loss": 0.0298,
"step": 3370
},
{
"epoch": 2.04,
"grad_norm": 0.035991325974464417,
"learning_rate": 4.216199080678396e-05,
"loss": 0.0916,
"step": 3380
},
{
"epoch": 2.04,
"grad_norm": 0.007277372293174267,
"learning_rate": 4.212236487557458e-05,
"loss": 0.0008,
"step": 3390
},
{
"epoch": 2.04,
"grad_norm": 0.0012711473973467946,
"learning_rate": 4.20827389443652e-05,
"loss": 0.049,
"step": 3400
},
{
"epoch": 2.04,
"grad_norm": 0.004262813366949558,
"learning_rate": 4.204311301315581e-05,
"loss": 0.3255,
"step": 3410
},
{
"epoch": 2.04,
"grad_norm": 0.6016376614570618,
"learning_rate": 4.200348708194643e-05,
"loss": 0.0016,
"step": 3420
},
{
"epoch": 2.04,
"grad_norm": 0.027856985107064247,
"learning_rate": 4.196386115073704e-05,
"loss": 0.1706,
"step": 3430
},
{
"epoch": 2.05,
"grad_norm": 36.658660888671875,
"learning_rate": 4.192423521952766e-05,
"loss": 0.393,
"step": 3440
},
{
"epoch": 2.05,
"grad_norm": 4.459847927093506,
"learning_rate": 4.188460928831828e-05,
"loss": 0.2113,
"step": 3450
},
{
"epoch": 2.05,
"grad_norm": 0.003763306187465787,
"learning_rate": 4.18449833571089e-05,
"loss": 0.0946,
"step": 3460
},
{
"epoch": 2.05,
"grad_norm": 0.9358043670654297,
"learning_rate": 4.180535742589951e-05,
"loss": 0.1248,
"step": 3470
},
{
"epoch": 2.05,
"grad_norm": 5.325794219970703,
"learning_rate": 4.176573149469012e-05,
"loss": 0.1882,
"step": 3480
},
{
"epoch": 2.05,
"grad_norm": 0.01291597355157137,
"learning_rate": 4.172610556348075e-05,
"loss": 0.5989,
"step": 3490
},
{
"epoch": 2.05,
"grad_norm": 0.05552150309085846,
"learning_rate": 4.168647963227136e-05,
"loss": 0.1317,
"step": 3500
},
{
"epoch": 2.05,
"grad_norm": 0.0046797278337180614,
"learning_rate": 4.164685370106198e-05,
"loss": 0.9324,
"step": 3510
},
{
"epoch": 2.05,
"grad_norm": 0.19884918630123138,
"learning_rate": 4.160722776985259e-05,
"loss": 0.0027,
"step": 3520
},
{
"epoch": 2.05,
"grad_norm": 0.06361120939254761,
"learning_rate": 4.156760183864321e-05,
"loss": 0.1294,
"step": 3530
},
{
"epoch": 2.05,
"grad_norm": 0.025872783735394478,
"learning_rate": 4.152797590743383e-05,
"loss": 0.3453,
"step": 3540
},
{
"epoch": 2.05,
"grad_norm": 0.44598618149757385,
"learning_rate": 4.1488349976224446e-05,
"loss": 0.0445,
"step": 3550
},
{
"epoch": 2.05,
"grad_norm": 0.00139313330873847,
"learning_rate": 4.144872404501506e-05,
"loss": 0.4126,
"step": 3560
},
{
"epoch": 2.05,
"grad_norm": 0.004861112684011459,
"learning_rate": 4.140909811380568e-05,
"loss": 0.3162,
"step": 3570
},
{
"epoch": 2.06,
"grad_norm": 37.97075653076172,
"learning_rate": 4.136947218259629e-05,
"loss": 0.0275,
"step": 3580
},
{
"epoch": 2.06,
"grad_norm": 0.006260779220610857,
"learning_rate": 4.132984625138691e-05,
"loss": 0.5518,
"step": 3590
},
{
"epoch": 2.06,
"grad_norm": 10.439234733581543,
"learning_rate": 4.129022032017753e-05,
"loss": 0.0489,
"step": 3600
},
{
"epoch": 2.06,
"grad_norm": 0.009267416782677174,
"learning_rate": 4.1250594388968146e-05,
"loss": 0.2203,
"step": 3610
},
{
"epoch": 2.06,
"grad_norm": 0.003436572849750519,
"learning_rate": 4.121096845775876e-05,
"loss": 0.0489,
"step": 3620
},
{
"epoch": 2.06,
"grad_norm": 0.02378927730023861,
"learning_rate": 4.1171342526549377e-05,
"loss": 0.3647,
"step": 3630
},
{
"epoch": 2.06,
"grad_norm": 0.04053608328104019,
"learning_rate": 4.113171659533999e-05,
"loss": 0.4505,
"step": 3640
},
{
"epoch": 2.06,
"grad_norm": 0.8833039402961731,
"learning_rate": 4.1092090664130614e-05,
"loss": 0.0626,
"step": 3650
},
{
"epoch": 2.06,
"grad_norm": 11.919655799865723,
"learning_rate": 4.1052464732921226e-05,
"loss": 0.0989,
"step": 3660
},
{
"epoch": 2.06,
"grad_norm": 0.03586142137646675,
"learning_rate": 4.1012838801711845e-05,
"loss": 0.2583,
"step": 3670
},
{
"epoch": 2.06,
"grad_norm": 0.1854490488767624,
"learning_rate": 4.097321287050246e-05,
"loss": 0.1368,
"step": 3680
},
{
"epoch": 2.06,
"grad_norm": 0.057375673204660416,
"learning_rate": 4.0933586939293076e-05,
"loss": 0.345,
"step": 3690
},
{
"epoch": 2.06,
"grad_norm": 0.015717756003141403,
"learning_rate": 4.0893961008083695e-05,
"loss": 0.0065,
"step": 3700
},
{
"epoch": 2.06,
"grad_norm": 0.02194334752857685,
"learning_rate": 4.085433507687431e-05,
"loss": 0.0021,
"step": 3710
},
{
"epoch": 2.07,
"grad_norm": 16.584745407104492,
"learning_rate": 4.0814709145664925e-05,
"loss": 0.0147,
"step": 3720
},
{
"epoch": 2.07,
"grad_norm": 0.0053609260357916355,
"learning_rate": 4.077508321445554e-05,
"loss": 0.3077,
"step": 3730
},
{
"epoch": 2.07,
"grad_norm": 0.002716219983994961,
"learning_rate": 4.0735457283246156e-05,
"loss": 0.4135,
"step": 3740
},
{
"epoch": 2.07,
"grad_norm": 0.8324286937713623,
"learning_rate": 4.0695831352036775e-05,
"loss": 0.3889,
"step": 3750
},
{
"epoch": 2.07,
"grad_norm": 0.004210811574012041,
"learning_rate": 4.0656205420827394e-05,
"loss": 0.0015,
"step": 3760
},
{
"epoch": 2.07,
"grad_norm": 0.0026693022809922695,
"learning_rate": 4.0616579489618006e-05,
"loss": 0.566,
"step": 3770
},
{
"epoch": 2.07,
"grad_norm": 0.02392963133752346,
"learning_rate": 4.0576953558408625e-05,
"loss": 0.0124,
"step": 3780
},
{
"epoch": 2.07,
"grad_norm": 0.020079661160707474,
"learning_rate": 4.053732762719924e-05,
"loss": 0.0152,
"step": 3790
},
{
"epoch": 2.07,
"grad_norm": 0.0014172615483403206,
"learning_rate": 4.049770169598986e-05,
"loss": 0.0932,
"step": 3800
},
{
"epoch": 2.07,
"grad_norm": 5.266864776611328,
"learning_rate": 4.0458075764780474e-05,
"loss": 0.5559,
"step": 3810
},
{
"epoch": 2.07,
"grad_norm": 0.009865287691354752,
"learning_rate": 4.041844983357109e-05,
"loss": 0.2131,
"step": 3820
},
{
"epoch": 2.07,
"grad_norm": 0.0027454691007733345,
"learning_rate": 4.0378823902361705e-05,
"loss": 0.0006,
"step": 3830
},
{
"epoch": 2.07,
"grad_norm": 0.15348024666309357,
"learning_rate": 4.0339197971152324e-05,
"loss": 1.0736,
"step": 3840
},
{
"epoch": 2.07,
"grad_norm": 0.002026822417974472,
"learning_rate": 4.029957203994294e-05,
"loss": 0.4398,
"step": 3850
},
{
"epoch": 2.08,
"grad_norm": 0.023032035678625107,
"learning_rate": 4.025994610873356e-05,
"loss": 0.3724,
"step": 3860
},
{
"epoch": 2.08,
"grad_norm": 110.459716796875,
"learning_rate": 4.0220320177524174e-05,
"loss": 0.4695,
"step": 3870
},
{
"epoch": 2.08,
"grad_norm": 0.026824643835425377,
"learning_rate": 4.018069424631479e-05,
"loss": 0.4355,
"step": 3880
},
{
"epoch": 2.08,
"grad_norm": 0.007477205712348223,
"learning_rate": 4.0141068315105404e-05,
"loss": 0.3392,
"step": 3890
},
{
"epoch": 2.08,
"grad_norm": 0.0020925672724843025,
"learning_rate": 4.010144238389602e-05,
"loss": 0.228,
"step": 3900
},
{
"epoch": 2.08,
"grad_norm": 0.003810058580711484,
"learning_rate": 4.006181645268664e-05,
"loss": 0.2477,
"step": 3910
},
{
"epoch": 2.08,
"grad_norm": 0.0076815299689769745,
"learning_rate": 4.0022190521477254e-05,
"loss": 0.4539,
"step": 3920
},
{
"epoch": 2.08,
"grad_norm": 0.005379770882427692,
"learning_rate": 3.998256459026787e-05,
"loss": 0.0341,
"step": 3930
},
{
"epoch": 2.08,
"grad_norm": 0.003831785172224045,
"learning_rate": 3.9942938659058485e-05,
"loss": 0.5954,
"step": 3940
},
{
"epoch": 2.08,
"grad_norm": 0.7539482116699219,
"learning_rate": 3.9903312727849104e-05,
"loss": 0.3885,
"step": 3950
},
{
"epoch": 2.08,
"grad_norm": 0.005533235147595406,
"learning_rate": 3.986368679663972e-05,
"loss": 0.0053,
"step": 3960
},
{
"epoch": 2.08,
"grad_norm": 0.008866420015692711,
"learning_rate": 3.982406086543034e-05,
"loss": 0.0453,
"step": 3970
},
{
"epoch": 2.08,
"grad_norm": 0.014108781702816486,
"learning_rate": 3.978443493422095e-05,
"loss": 0.2954,
"step": 3980
},
{
"epoch": 2.08,
"grad_norm": 0.016585228964686394,
"learning_rate": 3.974480900301157e-05,
"loss": 0.0076,
"step": 3990
},
{
"epoch": 2.09,
"grad_norm": 3.2773778438568115,
"learning_rate": 3.970518307180219e-05,
"loss": 0.1556,
"step": 4000
},
{
"epoch": 2.09,
"grad_norm": 7.254385948181152,
"learning_rate": 3.966555714059281e-05,
"loss": 0.1696,
"step": 4010
},
{
"epoch": 2.09,
"grad_norm": 0.0035600659903138876,
"learning_rate": 3.962593120938342e-05,
"loss": 0.0074,
"step": 4020
},
{
"epoch": 2.09,
"grad_norm": 8.71975040435791,
"learning_rate": 3.958630527817404e-05,
"loss": 0.3048,
"step": 4030
},
{
"epoch": 2.09,
"grad_norm": 0.0020627696067094803,
"learning_rate": 3.954667934696465e-05,
"loss": 0.7165,
"step": 4040
},
{
"epoch": 2.09,
"grad_norm": 0.007494701538234949,
"learning_rate": 3.950705341575527e-05,
"loss": 0.5529,
"step": 4050
},
{
"epoch": 2.09,
"grad_norm": 0.016065679490566254,
"learning_rate": 3.946742748454589e-05,
"loss": 0.0128,
"step": 4060
},
{
"epoch": 2.09,
"grad_norm": 0.322768896818161,
"learning_rate": 3.942780155333651e-05,
"loss": 0.0628,
"step": 4070
},
{
"epoch": 2.09,
"grad_norm": 0.023394137620925903,
"learning_rate": 3.938817562212712e-05,
"loss": 0.4743,
"step": 4080
},
{
"epoch": 2.09,
"grad_norm": 0.0229184590280056,
"learning_rate": 3.934854969091774e-05,
"loss": 0.2818,
"step": 4090
},
{
"epoch": 2.09,
"grad_norm": 0.006755081005394459,
"learning_rate": 3.930892375970835e-05,
"loss": 0.3635,
"step": 4100
},
{
"epoch": 2.09,
"grad_norm": 0.004403649363666773,
"learning_rate": 3.926929782849898e-05,
"loss": 0.0568,
"step": 4110
},
{
"epoch": 2.09,
"grad_norm": 0.0034377514384686947,
"learning_rate": 3.922967189728959e-05,
"loss": 0.1624,
"step": 4120
},
{
"epoch": 2.09,
"grad_norm": 0.005851461086422205,
"learning_rate": 3.91900459660802e-05,
"loss": 0.6674,
"step": 4130
},
{
"epoch": 2.1,
"grad_norm": 0.004862835630774498,
"learning_rate": 3.915042003487082e-05,
"loss": 0.5013,
"step": 4140
},
{
"epoch": 2.1,
"grad_norm": 42.758365631103516,
"learning_rate": 3.911079410366143e-05,
"loss": 0.4173,
"step": 4150
},
{
"epoch": 2.1,
"grad_norm": 0.004606719594448805,
"learning_rate": 3.907116817245206e-05,
"loss": 0.5328,
"step": 4160
},
{
"epoch": 2.1,
"grad_norm": 78.40693664550781,
"learning_rate": 3.903154224124267e-05,
"loss": 0.4105,
"step": 4170
},
{
"epoch": 2.1,
"grad_norm": 23.919864654541016,
"learning_rate": 3.899191631003329e-05,
"loss": 1.4324,
"step": 4180
},
{
"epoch": 2.1,
"grad_norm": 0.00816379301249981,
"learning_rate": 3.89522903788239e-05,
"loss": 0.3002,
"step": 4190
},
{
"epoch": 2.1,
"grad_norm": 0.022155677899718285,
"learning_rate": 3.891266444761452e-05,
"loss": 0.0785,
"step": 4200
},
{
"epoch": 2.1,
"eval_accuracy": 0.9052631578947369,
"eval_loss": 0.4806475341320038,
"eval_runtime": 2299.8623,
"eval_samples_per_second": 0.289,
"eval_steps_per_second": 0.145,
"step": 4209
},
{
"epoch": 3.0,
"grad_norm": 0.0499810166656971,
"learning_rate": 3.887303851640514e-05,
"loss": 0.1101,
"step": 4210
},
{
"epoch": 3.0,
"grad_norm": 0.00219921232201159,
"learning_rate": 3.883341258519576e-05,
"loss": 0.0019,
"step": 4220
},
{
"epoch": 3.0,
"grad_norm": 0.05688053369522095,
"learning_rate": 3.879378665398637e-05,
"loss": 0.4249,
"step": 4230
},
{
"epoch": 3.0,
"grad_norm": 0.004060626961290836,
"learning_rate": 3.875416072277699e-05,
"loss": 0.329,
"step": 4240
},
{
"epoch": 3.0,
"grad_norm": 0.4057186245918274,
"learning_rate": 3.87145347915676e-05,
"loss": 0.0089,
"step": 4250
},
{
"epoch": 3.0,
"grad_norm": 0.0447358600795269,
"learning_rate": 3.8674908860358226e-05,
"loss": 0.0034,
"step": 4260
},
{
"epoch": 3.0,
"grad_norm": 0.003750765696167946,
"learning_rate": 3.863528292914884e-05,
"loss": 0.0953,
"step": 4270
},
{
"epoch": 3.01,
"grad_norm": 6.533902168273926,
"learning_rate": 3.8595656997939456e-05,
"loss": 0.0106,
"step": 4280
},
{
"epoch": 3.01,
"grad_norm": 0.001664067734964192,
"learning_rate": 3.855603106673007e-05,
"loss": 0.0162,
"step": 4290
},
{
"epoch": 3.01,
"grad_norm": 0.8516010046005249,
"learning_rate": 3.851640513552069e-05,
"loss": 0.4751,
"step": 4300
},
{
"epoch": 3.01,
"grad_norm": 0.03567550331354141,
"learning_rate": 3.8476779204311306e-05,
"loss": 0.161,
"step": 4310
},
{
"epoch": 3.01,
"grad_norm": 0.0029626258183270693,
"learning_rate": 3.8437153273101925e-05,
"loss": 0.1448,
"step": 4320
},
{
"epoch": 3.01,
"grad_norm": 0.017234837636351585,
"learning_rate": 3.839752734189254e-05,
"loss": 0.0052,
"step": 4330
},
{
"epoch": 3.01,
"grad_norm": 0.14725999534130096,
"learning_rate": 3.835790141068315e-05,
"loss": 0.0645,
"step": 4340
},
{
"epoch": 3.01,
"grad_norm": 0.002782195108011365,
"learning_rate": 3.831827547947377e-05,
"loss": 0.0004,
"step": 4350
},
{
"epoch": 3.01,
"grad_norm": 34.547061920166016,
"learning_rate": 3.8278649548264386e-05,
"loss": 0.4315,
"step": 4360
},
{
"epoch": 3.01,
"grad_norm": 0.0030270384158939123,
"learning_rate": 3.8239023617055005e-05,
"loss": 0.3151,
"step": 4370
},
{
"epoch": 3.01,
"grad_norm": 0.008927990682423115,
"learning_rate": 3.819939768584562e-05,
"loss": 0.0002,
"step": 4380
},
{
"epoch": 3.01,
"grad_norm": 0.11368348449468613,
"learning_rate": 3.8159771754636236e-05,
"loss": 0.0077,
"step": 4390
},
{
"epoch": 3.01,
"grad_norm": 0.10815131664276123,
"learning_rate": 3.812014582342685e-05,
"loss": 0.0182,
"step": 4400
},
{
"epoch": 3.01,
"grad_norm": 0.020075034350156784,
"learning_rate": 3.808051989221747e-05,
"loss": 0.0007,
"step": 4410
},
{
"epoch": 3.02,
"grad_norm": 0.001029517618007958,
"learning_rate": 3.8040893961008086e-05,
"loss": 0.0397,
"step": 4420
},
{
"epoch": 3.02,
"grad_norm": 0.003121725283563137,
"learning_rate": 3.8001268029798704e-05,
"loss": 0.0001,
"step": 4430
},
{
"epoch": 3.02,
"grad_norm": 5.35357141494751,
"learning_rate": 3.7961642098589316e-05,
"loss": 0.0069,
"step": 4440
},
{
"epoch": 3.02,
"grad_norm": 0.013706800527870655,
"learning_rate": 3.7922016167379935e-05,
"loss": 0.0007,
"step": 4450
},
{
"epoch": 3.02,
"grad_norm": 0.09196832776069641,
"learning_rate": 3.7882390236170554e-05,
"loss": 0.0003,
"step": 4460
},
{
"epoch": 3.02,
"grad_norm": 0.003602321958169341,
"learning_rate": 3.784276430496117e-05,
"loss": 0.2969,
"step": 4470
},
{
"epoch": 3.02,
"grad_norm": 20.944992065429688,
"learning_rate": 3.7803138373751785e-05,
"loss": 0.0272,
"step": 4480
},
{
"epoch": 3.02,
"grad_norm": 0.002105366438627243,
"learning_rate": 3.7763512442542404e-05,
"loss": 0.0002,
"step": 4490
},
{
"epoch": 3.02,
"grad_norm": 0.004411764442920685,
"learning_rate": 3.7723886511333016e-05,
"loss": 0.0076,
"step": 4500
},
{
"epoch": 3.02,
"grad_norm": 0.005865162704139948,
"learning_rate": 3.7684260580123635e-05,
"loss": 0.0059,
"step": 4510
},
{
"epoch": 3.02,
"grad_norm": 0.011046779341995716,
"learning_rate": 3.764463464891425e-05,
"loss": 0.0043,
"step": 4520
},
{
"epoch": 3.02,
"grad_norm": 0.023666031658649445,
"learning_rate": 3.760500871770487e-05,
"loss": 0.0009,
"step": 4530
},
{
"epoch": 3.02,
"grad_norm": 53.04268264770508,
"learning_rate": 3.7565382786495484e-05,
"loss": 0.3129,
"step": 4540
},
{
"epoch": 3.02,
"grad_norm": 16.536462783813477,
"learning_rate": 3.7525756855286096e-05,
"loss": 0.0079,
"step": 4550
},
{
"epoch": 3.03,
"grad_norm": 0.004224766045808792,
"learning_rate": 3.7486130924076715e-05,
"loss": 0.073,
"step": 4560
},
{
"epoch": 3.03,
"grad_norm": 0.005598429590463638,
"learning_rate": 3.7446504992867334e-05,
"loss": 0.2742,
"step": 4570
},
{
"epoch": 3.03,
"grad_norm": 0.0030881077982485294,
"learning_rate": 3.740687906165795e-05,
"loss": 0.2948,
"step": 4580
},
{
"epoch": 3.03,
"grad_norm": 0.019152648746967316,
"learning_rate": 3.7367253130448565e-05,
"loss": 0.0047,
"step": 4590
},
{
"epoch": 3.03,
"grad_norm": 0.001949524856172502,
"learning_rate": 3.7327627199239183e-05,
"loss": 0.0003,
"step": 4600
},
{
"epoch": 3.03,
"grad_norm": 0.001139726140536368,
"learning_rate": 3.7288001268029795e-05,
"loss": 0.0033,
"step": 4610
},
{
"epoch": 3.03,
"grad_norm": 0.0009636884205974638,
"learning_rate": 3.724837533682042e-05,
"loss": 0.5236,
"step": 4620
},
{
"epoch": 3.03,
"grad_norm": 0.0024904939346015453,
"learning_rate": 3.720874940561103e-05,
"loss": 0.0005,
"step": 4630
},
{
"epoch": 3.03,
"grad_norm": 0.004561484791338444,
"learning_rate": 3.716912347440165e-05,
"loss": 0.4394,
"step": 4640
},
{
"epoch": 3.03,
"grad_norm": 21.228055953979492,
"learning_rate": 3.7129497543192264e-05,
"loss": 0.5905,
"step": 4650
},
{
"epoch": 3.03,
"grad_norm": 38.67287063598633,
"learning_rate": 3.708987161198288e-05,
"loss": 0.0304,
"step": 4660
},
{
"epoch": 3.03,
"grad_norm": 0.002863664412871003,
"learning_rate": 3.70502456807735e-05,
"loss": 0.4688,
"step": 4670
},
{
"epoch": 3.03,
"grad_norm": 0.0070022111758589745,
"learning_rate": 3.701061974956412e-05,
"loss": 0.0044,
"step": 4680
},
{
"epoch": 3.03,
"grad_norm": 98.50983428955078,
"learning_rate": 3.697099381835473e-05,
"loss": 0.2539,
"step": 4690
},
{
"epoch": 3.04,
"grad_norm": 0.044561292976140976,
"learning_rate": 3.693136788714535e-05,
"loss": 0.0002,
"step": 4700
},
{
"epoch": 3.04,
"grad_norm": 2.370043992996216,
"learning_rate": 3.689174195593596e-05,
"loss": 0.0055,
"step": 4710
},
{
"epoch": 3.04,
"grad_norm": 12.61652660369873,
"learning_rate": 3.685211602472658e-05,
"loss": 0.456,
"step": 4720
},
{
"epoch": 3.04,
"grad_norm": 0.020174263045191765,
"learning_rate": 3.68124900935172e-05,
"loss": 0.0023,
"step": 4730
},
{
"epoch": 3.04,
"grad_norm": 0.032532501965761185,
"learning_rate": 3.677286416230782e-05,
"loss": 0.0004,
"step": 4740
},
{
"epoch": 3.04,
"grad_norm": 39.96610641479492,
"learning_rate": 3.673323823109843e-05,
"loss": 0.5033,
"step": 4750
},
{
"epoch": 3.04,
"grad_norm": 0.006895292084664106,
"learning_rate": 3.669361229988905e-05,
"loss": 0.2369,
"step": 4760
},
{
"epoch": 3.04,
"grad_norm": 0.0018528720829635859,
"learning_rate": 3.665398636867967e-05,
"loss": 0.0005,
"step": 4770
},
{
"epoch": 3.04,
"grad_norm": 57.440799713134766,
"learning_rate": 3.661436043747028e-05,
"loss": 0.7416,
"step": 4780
},
{
"epoch": 3.04,
"grad_norm": 0.15606503188610077,
"learning_rate": 3.65747345062609e-05,
"loss": 0.0005,
"step": 4790
},
{
"epoch": 3.04,
"grad_norm": 0.06342484056949615,
"learning_rate": 3.653510857505151e-05,
"loss": 0.0008,
"step": 4800
},
{
"epoch": 3.04,
"grad_norm": 0.0007686218596063554,
"learning_rate": 3.649548264384213e-05,
"loss": 0.0083,
"step": 4810
},
{
"epoch": 3.04,
"grad_norm": 0.007868933491408825,
"learning_rate": 3.645585671263275e-05,
"loss": 0.0002,
"step": 4820
},
{
"epoch": 3.04,
"grad_norm": 0.0038664869498461485,
"learning_rate": 3.641623078142337e-05,
"loss": 0.5171,
"step": 4830
},
{
"epoch": 3.05,
"grad_norm": 139.34559631347656,
"learning_rate": 3.637660485021398e-05,
"loss": 0.1279,
"step": 4840
},
{
"epoch": 3.05,
"grad_norm": 0.01549526583403349,
"learning_rate": 3.63369789190046e-05,
"loss": 0.0024,
"step": 4850
},
{
"epoch": 3.05,
"grad_norm": 0.009506451897323132,
"learning_rate": 3.629735298779521e-05,
"loss": 0.0322,
"step": 4860
},
{
"epoch": 3.05,
"grad_norm": 0.01199623104184866,
"learning_rate": 3.625772705658583e-05,
"loss": 0.5853,
"step": 4870
},
{
"epoch": 3.05,
"grad_norm": 0.023425359278917313,
"learning_rate": 3.621810112537645e-05,
"loss": 0.0074,
"step": 4880
},
{
"epoch": 3.05,
"grad_norm": 0.0029667699709534645,
"learning_rate": 3.617847519416707e-05,
"loss": 0.0003,
"step": 4890
},
{
"epoch": 3.05,
"grad_norm": 0.0053655593656003475,
"learning_rate": 3.613884926295768e-05,
"loss": 0.0022,
"step": 4900
},
{
"epoch": 3.05,
"grad_norm": 0.002650972455739975,
"learning_rate": 3.60992233317483e-05,
"loss": 0.0286,
"step": 4910
},
{
"epoch": 3.05,
"grad_norm": 0.029404861852526665,
"learning_rate": 3.605959740053891e-05,
"loss": 0.355,
"step": 4920
},
{
"epoch": 3.05,
"grad_norm": 0.0018920317525044084,
"learning_rate": 3.6019971469329536e-05,
"loss": 0.0105,
"step": 4930
},
{
"epoch": 3.05,
"grad_norm": 0.2511395812034607,
"learning_rate": 3.598034553812015e-05,
"loss": 0.0721,
"step": 4940
},
{
"epoch": 3.05,
"grad_norm": 0.0022750215139240026,
"learning_rate": 3.594071960691077e-05,
"loss": 0.0011,
"step": 4950
},
{
"epoch": 3.05,
"grad_norm": 0.0004970223526470363,
"learning_rate": 3.590109367570138e-05,
"loss": 0.0493,
"step": 4960
},
{
"epoch": 3.05,
"grad_norm": 0.0018257640767842531,
"learning_rate": 3.5861467744492e-05,
"loss": 0.0041,
"step": 4970
},
{
"epoch": 3.05,
"grad_norm": 0.002615696983411908,
"learning_rate": 3.5821841813282617e-05,
"loss": 0.0002,
"step": 4980
},
{
"epoch": 3.06,
"grad_norm": 0.0015312007162719965,
"learning_rate": 3.578221588207323e-05,
"loss": 0.0048,
"step": 4990
},
{
"epoch": 3.06,
"grad_norm": 0.003842801321297884,
"learning_rate": 3.574258995086385e-05,
"loss": 0.0001,
"step": 5000
},
{
"epoch": 3.06,
"grad_norm": 0.003976788371801376,
"learning_rate": 3.570296401965446e-05,
"loss": 0.0002,
"step": 5010
},
{
"epoch": 3.06,
"grad_norm": 0.0027057684492319822,
"learning_rate": 3.566333808844508e-05,
"loss": 0.0001,
"step": 5020
},
{
"epoch": 3.06,
"grad_norm": 0.0013581090606749058,
"learning_rate": 3.56237121572357e-05,
"loss": 0.0643,
"step": 5030
},
{
"epoch": 3.06,
"grad_norm": 73.48147583007812,
"learning_rate": 3.5584086226026316e-05,
"loss": 0.1412,
"step": 5040
},
{
"epoch": 3.06,
"grad_norm": 0.05521896854043007,
"learning_rate": 3.554446029481693e-05,
"loss": 0.8526,
"step": 5050
},
{
"epoch": 3.06,
"grad_norm": 0.01980687491595745,
"learning_rate": 3.550483436360755e-05,
"loss": 0.3712,
"step": 5060
},
{
"epoch": 3.06,
"grad_norm": 0.0016443756176158786,
"learning_rate": 3.546520843239816e-05,
"loss": 0.0004,
"step": 5070
},
{
"epoch": 3.06,
"grad_norm": 2.7786030769348145,
"learning_rate": 3.5425582501188784e-05,
"loss": 0.0024,
"step": 5080
},
{
"epoch": 3.06,
"grad_norm": 0.002752843778580427,
"learning_rate": 3.5385956569979396e-05,
"loss": 0.0989,
"step": 5090
},
{
"epoch": 3.06,
"grad_norm": 0.07084832340478897,
"learning_rate": 3.5346330638770015e-05,
"loss": 0.006,
"step": 5100
},
{
"epoch": 3.06,
"grad_norm": 0.0023438192438334227,
"learning_rate": 3.530670470756063e-05,
"loss": 0.0631,
"step": 5110
},
{
"epoch": 3.06,
"grad_norm": 0.023146087303757668,
"learning_rate": 3.5267078776351246e-05,
"loss": 0.3281,
"step": 5120
},
{
"epoch": 3.07,
"grad_norm": 0.0026536276564002037,
"learning_rate": 3.5227452845141865e-05,
"loss": 0.2622,
"step": 5130
},
{
"epoch": 3.07,
"grad_norm": 25.2746639251709,
"learning_rate": 3.5187826913932483e-05,
"loss": 0.9971,
"step": 5140
},
{
"epoch": 3.07,
"grad_norm": 2.3518447875976562,
"learning_rate": 3.5148200982723095e-05,
"loss": 0.0101,
"step": 5150
},
{
"epoch": 3.07,
"grad_norm": 0.004120847675949335,
"learning_rate": 3.5108575051513714e-05,
"loss": 0.2435,
"step": 5160
},
{
"epoch": 3.07,
"grad_norm": 0.0018114675767719746,
"learning_rate": 3.5068949120304326e-05,
"loss": 0.0002,
"step": 5170
},
{
"epoch": 3.07,
"grad_norm": 0.003768153488636017,
"learning_rate": 3.5029323189094945e-05,
"loss": 0.0041,
"step": 5180
},
{
"epoch": 3.07,
"grad_norm": 0.07695072889328003,
"learning_rate": 3.4989697257885564e-05,
"loss": 0.0025,
"step": 5190
},
{
"epoch": 3.07,
"grad_norm": 57.52178955078125,
"learning_rate": 3.4950071326676176e-05,
"loss": 0.0468,
"step": 5200
},
{
"epoch": 3.07,
"grad_norm": 0.001811747089959681,
"learning_rate": 3.4910445395466795e-05,
"loss": 0.0007,
"step": 5210
},
{
"epoch": 3.07,
"grad_norm": 0.0032631447538733482,
"learning_rate": 3.487081946425741e-05,
"loss": 0.0962,
"step": 5220
},
{
"epoch": 3.07,
"grad_norm": 0.0040063695050776005,
"learning_rate": 3.4831193533048026e-05,
"loss": 0.0718,
"step": 5230
},
{
"epoch": 3.07,
"grad_norm": 0.042804840952157974,
"learning_rate": 3.4791567601838644e-05,
"loss": 0.0038,
"step": 5240
},
{
"epoch": 3.07,
"grad_norm": 0.0023616242688149214,
"learning_rate": 3.475194167062926e-05,
"loss": 0.0002,
"step": 5250
},
{
"epoch": 3.07,
"grad_norm": 0.002275130245834589,
"learning_rate": 3.4712315739419875e-05,
"loss": 0.6309,
"step": 5260
},
{
"epoch": 3.08,
"grad_norm": 0.011256784200668335,
"learning_rate": 3.4672689808210494e-05,
"loss": 0.0002,
"step": 5270
},
{
"epoch": 3.08,
"grad_norm": 0.0045999023132026196,
"learning_rate": 3.463306387700111e-05,
"loss": 0.1079,
"step": 5280
},
{
"epoch": 3.08,
"grad_norm": 0.001873884117230773,
"learning_rate": 3.459343794579173e-05,
"loss": 0.0004,
"step": 5290
},
{
"epoch": 3.08,
"grad_norm": 0.003349520266056061,
"learning_rate": 3.4553812014582344e-05,
"loss": 0.0392,
"step": 5300
},
{
"epoch": 3.08,
"grad_norm": 0.07745254039764404,
"learning_rate": 3.451418608337296e-05,
"loss": 0.1259,
"step": 5310
},
{
"epoch": 3.08,
"grad_norm": 0.001707065268419683,
"learning_rate": 3.4474560152163574e-05,
"loss": 0.2405,
"step": 5320
},
{
"epoch": 3.08,
"grad_norm": 0.00230118609033525,
"learning_rate": 3.443493422095419e-05,
"loss": 0.0026,
"step": 5330
},
{
"epoch": 3.08,
"grad_norm": 0.03412836417555809,
"learning_rate": 3.439530828974481e-05,
"loss": 0.1592,
"step": 5340
},
{
"epoch": 3.08,
"grad_norm": 0.006646712776273489,
"learning_rate": 3.435568235853543e-05,
"loss": 0.0253,
"step": 5350
},
{
"epoch": 3.08,
"grad_norm": 0.10694713890552521,
"learning_rate": 3.431605642732604e-05,
"loss": 0.3725,
"step": 5360
},
{
"epoch": 3.08,
"grad_norm": 0.14875371754169464,
"learning_rate": 3.427643049611666e-05,
"loss": 0.7354,
"step": 5370
},
{
"epoch": 3.08,
"grad_norm": 0.06602335721254349,
"learning_rate": 3.4236804564907274e-05,
"loss": 0.3752,
"step": 5380
},
{
"epoch": 3.08,
"grad_norm": 0.002122233621776104,
"learning_rate": 3.41971786336979e-05,
"loss": 0.0014,
"step": 5390
},
{
"epoch": 3.08,
"grad_norm": 0.020870821550488472,
"learning_rate": 3.415755270248851e-05,
"loss": 0.001,
"step": 5400
},
{
"epoch": 3.09,
"grad_norm": 51.36176681518555,
"learning_rate": 3.411792677127912e-05,
"loss": 0.0521,
"step": 5410
},
{
"epoch": 3.09,
"grad_norm": 0.002612057374790311,
"learning_rate": 3.407830084006974e-05,
"loss": 0.3004,
"step": 5420
},
{
"epoch": 3.09,
"grad_norm": 0.006323930341750383,
"learning_rate": 3.4038674908860354e-05,
"loss": 0.0007,
"step": 5430
},
{
"epoch": 3.09,
"grad_norm": 0.010717890225350857,
"learning_rate": 3.399904897765098e-05,
"loss": 0.0003,
"step": 5440
},
{
"epoch": 3.09,
"grad_norm": 0.20171226561069489,
"learning_rate": 3.395942304644159e-05,
"loss": 0.0007,
"step": 5450
},
{
"epoch": 3.09,
"grad_norm": 0.0015600892947986722,
"learning_rate": 3.391979711523221e-05,
"loss": 0.0011,
"step": 5460
},
{
"epoch": 3.09,
"grad_norm": 1.7858773469924927,
"learning_rate": 3.388017118402282e-05,
"loss": 0.0032,
"step": 5470
},
{
"epoch": 3.09,
"grad_norm": 0.0012454432435333729,
"learning_rate": 3.384054525281344e-05,
"loss": 0.0002,
"step": 5480
},
{
"epoch": 3.09,
"grad_norm": 0.0015952313551679254,
"learning_rate": 3.380091932160406e-05,
"loss": 0.001,
"step": 5490
},
{
"epoch": 3.09,
"grad_norm": 0.002770837862044573,
"learning_rate": 3.376129339039468e-05,
"loss": 0.0008,
"step": 5500
},
{
"epoch": 3.09,
"grad_norm": 1.3940935134887695,
"learning_rate": 3.372166745918529e-05,
"loss": 0.0037,
"step": 5510
},
{
"epoch": 3.09,
"grad_norm": 0.1070881336927414,
"learning_rate": 3.368204152797591e-05,
"loss": 0.0003,
"step": 5520
},
{
"epoch": 3.09,
"grad_norm": 60.142276763916016,
"learning_rate": 3.364241559676652e-05,
"loss": 0.4676,
"step": 5530
},
{
"epoch": 3.09,
"grad_norm": 0.0014883485855534673,
"learning_rate": 3.360278966555714e-05,
"loss": 0.0003,
"step": 5540
},
{
"epoch": 3.1,
"grad_norm": 0.002981774276122451,
"learning_rate": 3.356316373434776e-05,
"loss": 0.0002,
"step": 5550
},
{
"epoch": 3.1,
"grad_norm": 0.000889226037543267,
"learning_rate": 3.352353780313838e-05,
"loss": 0.0001,
"step": 5560
},
{
"epoch": 3.1,
"grad_norm": 0.006324201822280884,
"learning_rate": 3.348391187192899e-05,
"loss": 0.0002,
"step": 5570
},
{
"epoch": 3.1,
"grad_norm": 0.013741032220423222,
"learning_rate": 3.344428594071961e-05,
"loss": 0.0038,
"step": 5580
},
{
"epoch": 3.1,
"grad_norm": 0.0982193648815155,
"learning_rate": 3.340466000951023e-05,
"loss": 0.0002,
"step": 5590
},
{
"epoch": 3.1,
"grad_norm": 0.0023921611718833447,
"learning_rate": 3.336503407830085e-05,
"loss": 0.0001,
"step": 5600
},
{
"epoch": 3.1,
"grad_norm": 0.004157126881182194,
"learning_rate": 3.332540814709146e-05,
"loss": 0.0001,
"step": 5610
},
{
"epoch": 3.1,
"eval_accuracy": 0.9398496240601504,
"eval_loss": 0.3705739974975586,
"eval_runtime": 2358.2538,
"eval_samples_per_second": 0.282,
"eval_steps_per_second": 0.141,
"step": 5612
},
{
"epoch": 4.0,
"grad_norm": 0.0009104391792789102,
"learning_rate": 3.328578221588208e-05,
"loss": 0.3095,
"step": 5620
},
{
"epoch": 4.0,
"grad_norm": 0.002629748312756419,
"learning_rate": 3.324615628467269e-05,
"loss": 0.1543,
"step": 5630
},
{
"epoch": 4.0,
"grad_norm": 0.005159564781934023,
"learning_rate": 3.320653035346331e-05,
"loss": 0.0003,
"step": 5640
},
{
"epoch": 4.0,
"grad_norm": 0.000841008557472378,
"learning_rate": 3.316690442225393e-05,
"loss": 0.0004,
"step": 5650
},
{
"epoch": 4.0,
"grad_norm": 0.004792694002389908,
"learning_rate": 3.312727849104454e-05,
"loss": 0.0017,
"step": 5660
},
{
"epoch": 4.0,
"grad_norm": 0.0014269945677369833,
"learning_rate": 3.308765255983516e-05,
"loss": 0.0002,
"step": 5670
},
{
"epoch": 4.0,
"grad_norm": 0.0021025442983955145,
"learning_rate": 3.304802662862577e-05,
"loss": 0.115,
"step": 5680
},
{
"epoch": 4.01,
"grad_norm": 0.0010108908172696829,
"learning_rate": 3.300840069741639e-05,
"loss": 0.0034,
"step": 5690
},
{
"epoch": 4.01,
"grad_norm": 0.012116851285099983,
"learning_rate": 3.296877476620701e-05,
"loss": 0.001,
"step": 5700
},
{
"epoch": 4.01,
"grad_norm": 28.641616821289062,
"learning_rate": 3.2929148834997626e-05,
"loss": 0.5096,
"step": 5710
},
{
"epoch": 4.01,
"grad_norm": 21.132633209228516,
"learning_rate": 3.288952290378824e-05,
"loss": 0.0347,
"step": 5720
},
{
"epoch": 4.01,
"grad_norm": 0.0044413842260837555,
"learning_rate": 3.284989697257886e-05,
"loss": 0.0145,
"step": 5730
},
{
"epoch": 4.01,
"grad_norm": 48.15180206298828,
"learning_rate": 3.281027104136947e-05,
"loss": 0.0224,
"step": 5740
},
{
"epoch": 4.01,
"grad_norm": 0.003202601335942745,
"learning_rate": 3.2770645110160095e-05,
"loss": 0.1978,
"step": 5750
},
{
"epoch": 4.01,
"grad_norm": 20.207809448242188,
"learning_rate": 3.273101917895071e-05,
"loss": 0.0856,
"step": 5760
},
{
"epoch": 4.01,
"grad_norm": 0.0013485507806763053,
"learning_rate": 3.2691393247741326e-05,
"loss": 0.0013,
"step": 5770
},
{
"epoch": 4.01,
"grad_norm": 0.0005685106734745204,
"learning_rate": 3.265176731653194e-05,
"loss": 0.4924,
"step": 5780
},
{
"epoch": 4.01,
"grad_norm": 0.0010126458946615458,
"learning_rate": 3.2612141385322556e-05,
"loss": 0.0096,
"step": 5790
},
{
"epoch": 4.01,
"grad_norm": 95.05339813232422,
"learning_rate": 3.2572515454113175e-05,
"loss": 0.1921,
"step": 5800
},
{
"epoch": 4.01,
"grad_norm": 0.002471966203302145,
"learning_rate": 3.2532889522903794e-05,
"loss": 0.0008,
"step": 5810
},
{
"epoch": 4.01,
"grad_norm": 0.001297266804613173,
"learning_rate": 3.2493263591694406e-05,
"loss": 0.0002,
"step": 5820
},
{
"epoch": 4.02,
"grad_norm": 19.474943161010742,
"learning_rate": 3.2453637660485025e-05,
"loss": 0.3726,
"step": 5830
},
{
"epoch": 4.02,
"grad_norm": 0.015567510388791561,
"learning_rate": 3.241401172927564e-05,
"loss": 0.0003,
"step": 5840
},
{
"epoch": 4.02,
"grad_norm": 0.002833213657140732,
"learning_rate": 3.2374385798066256e-05,
"loss": 0.6715,
"step": 5850
},
{
"epoch": 4.02,
"grad_norm": 1.4238035678863525,
"learning_rate": 3.2334759866856875e-05,
"loss": 0.0008,
"step": 5860
},
{
"epoch": 4.02,
"grad_norm": 0.0025125148240476847,
"learning_rate": 3.2295133935647487e-05,
"loss": 0.055,
"step": 5870
},
{
"epoch": 4.02,
"grad_norm": 0.001612589810974896,
"learning_rate": 3.2255508004438105e-05,
"loss": 0.1114,
"step": 5880
},
{
"epoch": 4.02,
"grad_norm": 0.0115219596773386,
"learning_rate": 3.221588207322872e-05,
"loss": 0.0002,
"step": 5890
},
{
"epoch": 4.02,
"grad_norm": 0.002122466452419758,
"learning_rate": 3.217625614201934e-05,
"loss": 0.0463,
"step": 5900
},
{
"epoch": 4.02,
"grad_norm": 0.012210741639137268,
"learning_rate": 3.2136630210809955e-05,
"loss": 0.0038,
"step": 5910
},
{
"epoch": 4.02,
"grad_norm": 0.006696117110550404,
"learning_rate": 3.2097004279600574e-05,
"loss": 0.0006,
"step": 5920
},
{
"epoch": 4.02,
"grad_norm": 0.003299353178590536,
"learning_rate": 3.2057378348391186e-05,
"loss": 0.4329,
"step": 5930
},
{
"epoch": 4.02,
"grad_norm": 0.017031671479344368,
"learning_rate": 3.2017752417181805e-05,
"loss": 0.0014,
"step": 5940
},
{
"epoch": 4.02,
"grad_norm": 0.008915259502828121,
"learning_rate": 3.197812648597242e-05,
"loss": 0.0003,
"step": 5950
},
{
"epoch": 4.02,
"grad_norm": 0.0033541598822921515,
"learning_rate": 3.193850055476304e-05,
"loss": 0.0261,
"step": 5960
},
{
"epoch": 4.03,
"grad_norm": 0.0039758519269526005,
"learning_rate": 3.1898874623553654e-05,
"loss": 0.0701,
"step": 5970
},
{
"epoch": 4.03,
"grad_norm": 0.0859561562538147,
"learning_rate": 3.185924869234427e-05,
"loss": 0.0012,
"step": 5980
},
{
"epoch": 4.03,
"grad_norm": 0.0011740931076928973,
"learning_rate": 3.1819622761134885e-05,
"loss": 0.0009,
"step": 5990
},
{
"epoch": 4.03,
"grad_norm": 0.0011881589889526367,
"learning_rate": 3.1779996829925504e-05,
"loss": 0.1377,
"step": 6000
},
{
"epoch": 4.03,
"grad_norm": 0.3393727242946625,
"learning_rate": 3.174037089871612e-05,
"loss": 0.0085,
"step": 6010
},
{
"epoch": 4.03,
"grad_norm": 0.26653632521629333,
"learning_rate": 3.170074496750674e-05,
"loss": 0.5985,
"step": 6020
},
{
"epoch": 4.03,
"grad_norm": 71.80652618408203,
"learning_rate": 3.1661119036297353e-05,
"loss": 0.1081,
"step": 6030
},
{
"epoch": 4.03,
"grad_norm": 0.3816182017326355,
"learning_rate": 3.162149310508797e-05,
"loss": 0.0015,
"step": 6040
},
{
"epoch": 4.03,
"grad_norm": 3.339017629623413,
"learning_rate": 3.1581867173878584e-05,
"loss": 0.0737,
"step": 6050
},
{
"epoch": 4.03,
"grad_norm": 0.0013679158873856068,
"learning_rate": 3.15422412426692e-05,
"loss": 0.2465,
"step": 6060
},
{
"epoch": 4.03,
"grad_norm": 0.003948609344661236,
"learning_rate": 3.150261531145982e-05,
"loss": 0.2111,
"step": 6070
},
{
"epoch": 4.03,
"grad_norm": 0.005721264984458685,
"learning_rate": 3.1462989380250434e-05,
"loss": 0.0024,
"step": 6080
},
{
"epoch": 4.03,
"grad_norm": 0.0018883657176047564,
"learning_rate": 3.142336344904105e-05,
"loss": 0.2687,
"step": 6090
},
{
"epoch": 4.03,
"grad_norm": 0.008159175515174866,
"learning_rate": 3.138373751783167e-05,
"loss": 0.1406,
"step": 6100
},
{
"epoch": 4.04,
"grad_norm": 0.009790794923901558,
"learning_rate": 3.134411158662229e-05,
"loss": 0.2122,
"step": 6110
},
{
"epoch": 4.04,
"grad_norm": 0.645839273929596,
"learning_rate": 3.13044856554129e-05,
"loss": 0.0223,
"step": 6120
},
{
"epoch": 4.04,
"grad_norm": 0.0012109485687687993,
"learning_rate": 3.126485972420352e-05,
"loss": 0.2131,
"step": 6130
},
{
"epoch": 4.04,
"grad_norm": 0.005074062384665012,
"learning_rate": 3.122523379299413e-05,
"loss": 0.4669,
"step": 6140
},
{
"epoch": 4.04,
"grad_norm": 0.04010836407542229,
"learning_rate": 3.118560786178475e-05,
"loss": 0.012,
"step": 6150
},
{
"epoch": 4.04,
"grad_norm": 0.018426967784762383,
"learning_rate": 3.114598193057537e-05,
"loss": 0.0008,
"step": 6160
},
{
"epoch": 4.04,
"grad_norm": 0.0035447929985821247,
"learning_rate": 3.110635599936599e-05,
"loss": 0.1271,
"step": 6170
},
{
"epoch": 4.04,
"grad_norm": 0.012344791553914547,
"learning_rate": 3.10667300681566e-05,
"loss": 0.0002,
"step": 6180
},
{
"epoch": 4.04,
"grad_norm": 0.0015085155609995127,
"learning_rate": 3.102710413694722e-05,
"loss": 0.0064,
"step": 6190
},
{
"epoch": 4.04,
"grad_norm": 0.0013396035647019744,
"learning_rate": 3.098747820573783e-05,
"loss": 0.0003,
"step": 6200
},
{
"epoch": 4.04,
"grad_norm": 0.007324972189962864,
"learning_rate": 3.094785227452846e-05,
"loss": 0.0001,
"step": 6210
},
{
"epoch": 4.04,
"grad_norm": 0.029165761545300484,
"learning_rate": 3.090822634331907e-05,
"loss": 0.0002,
"step": 6220
},
{
"epoch": 4.04,
"grad_norm": 0.006251147948205471,
"learning_rate": 3.086860041210969e-05,
"loss": 0.0001,
"step": 6230
},
{
"epoch": 4.04,
"grad_norm": 0.0033136485144495964,
"learning_rate": 3.08289744809003e-05,
"loss": 0.1959,
"step": 6240
},
{
"epoch": 4.05,
"grad_norm": 15.712539672851562,
"learning_rate": 3.078934854969092e-05,
"loss": 0.0053,
"step": 6250
},
{
"epoch": 4.05,
"grad_norm": 0.004770079627633095,
"learning_rate": 3.074972261848154e-05,
"loss": 0.2429,
"step": 6260
},
{
"epoch": 4.05,
"grad_norm": 0.001170918345451355,
"learning_rate": 3.071009668727215e-05,
"loss": 0.4537,
"step": 6270
},
{
"epoch": 4.05,
"grad_norm": 0.003140375716611743,
"learning_rate": 3.067047075606277e-05,
"loss": 0.0003,
"step": 6280
},
{
"epoch": 4.05,
"grad_norm": 0.005154268350452185,
"learning_rate": 3.063084482485338e-05,
"loss": 0.0002,
"step": 6290
},
{
"epoch": 4.05,
"grad_norm": 0.718346357345581,
"learning_rate": 3.0591218893644e-05,
"loss": 0.0039,
"step": 6300
},
{
"epoch": 4.05,
"grad_norm": 0.29760679602622986,
"learning_rate": 3.055159296243462e-05,
"loss": 0.0325,
"step": 6310
},
{
"epoch": 4.05,
"grad_norm": 0.0015770556638017297,
"learning_rate": 3.0511967031225234e-05,
"loss": 0.1031,
"step": 6320
},
{
"epoch": 4.05,
"grad_norm": 14.039325714111328,
"learning_rate": 3.047234110001585e-05,
"loss": 0.0254,
"step": 6330
},
{
"epoch": 4.05,
"grad_norm": 12.89113998413086,
"learning_rate": 3.043271516880647e-05,
"loss": 0.0182,
"step": 6340
},
{
"epoch": 4.05,
"grad_norm": 0.0020349326077848673,
"learning_rate": 3.0393089237597084e-05,
"loss": 0.0047,
"step": 6350
},
{
"epoch": 4.05,
"grad_norm": 0.0006648111157119274,
"learning_rate": 3.0353463306387703e-05,
"loss": 0.0111,
"step": 6360
},
{
"epoch": 4.05,
"grad_norm": 0.00324794533662498,
"learning_rate": 3.0313837375178318e-05,
"loss": 0.001,
"step": 6370
},
{
"epoch": 4.05,
"grad_norm": 0.002352567156776786,
"learning_rate": 3.0274211443968937e-05,
"loss": 0.5155,
"step": 6380
},
{
"epoch": 4.06,
"grad_norm": 0.0007183744455687702,
"learning_rate": 3.023458551275955e-05,
"loss": 0.1828,
"step": 6390
},
{
"epoch": 4.06,
"grad_norm": 0.0010205942671746016,
"learning_rate": 3.019495958155017e-05,
"loss": 0.0004,
"step": 6400
},
{
"epoch": 4.06,
"grad_norm": 0.0007507322006858885,
"learning_rate": 3.0155333650340783e-05,
"loss": 0.0078,
"step": 6410
},
{
"epoch": 4.06,
"grad_norm": 0.0010719618294388056,
"learning_rate": 3.0115707719131402e-05,
"loss": 0.0024,
"step": 6420
},
{
"epoch": 4.06,
"grad_norm": 0.004630456678569317,
"learning_rate": 3.0076081787922017e-05,
"loss": 0.0001,
"step": 6430
},
{
"epoch": 4.06,
"grad_norm": 110.9379653930664,
"learning_rate": 3.0036455856712636e-05,
"loss": 0.2711,
"step": 6440
},
{
"epoch": 4.06,
"grad_norm": 0.0028752069920301437,
"learning_rate": 2.999682992550325e-05,
"loss": 0.1684,
"step": 6450
},
{
"epoch": 4.06,
"grad_norm": 0.00176974234636873,
"learning_rate": 2.995720399429387e-05,
"loss": 0.1832,
"step": 6460
},
{
"epoch": 4.06,
"grad_norm": 0.0004082988016307354,
"learning_rate": 2.9917578063084482e-05,
"loss": 0.1391,
"step": 6470
},
{
"epoch": 4.06,
"grad_norm": 91.996337890625,
"learning_rate": 2.9877952131875105e-05,
"loss": 0.0717,
"step": 6480
},
{
"epoch": 4.06,
"grad_norm": 0.3914591372013092,
"learning_rate": 2.9838326200665717e-05,
"loss": 0.0006,
"step": 6490
},
{
"epoch": 4.06,
"grad_norm": 0.0014606121694669127,
"learning_rate": 2.9798700269456332e-05,
"loss": 0.0002,
"step": 6500
},
{
"epoch": 4.06,
"grad_norm": 0.002047004410997033,
"learning_rate": 2.975907433824695e-05,
"loss": 0.243,
"step": 6510
},
{
"epoch": 4.06,
"grad_norm": 0.0009985043434426188,
"learning_rate": 2.9719448407037563e-05,
"loss": 0.0339,
"step": 6520
},
{
"epoch": 4.07,
"grad_norm": 0.0007074224413372576,
"learning_rate": 2.9679822475828185e-05,
"loss": 0.0015,
"step": 6530
},
{
"epoch": 4.07,
"grad_norm": 0.004130239132791758,
"learning_rate": 2.9640196544618797e-05,
"loss": 0.0001,
"step": 6540
},
{
"epoch": 4.07,
"grad_norm": 0.004487643018364906,
"learning_rate": 2.9600570613409416e-05,
"loss": 0.0002,
"step": 6550
},
{
"epoch": 4.07,
"grad_norm": 0.001936771790497005,
"learning_rate": 2.956094468220003e-05,
"loss": 0.0001,
"step": 6560
},
{
"epoch": 4.07,
"grad_norm": 0.004075042437762022,
"learning_rate": 2.952131875099065e-05,
"loss": 0.3415,
"step": 6570
},
{
"epoch": 4.07,
"grad_norm": 0.05164702981710434,
"learning_rate": 2.9481692819781266e-05,
"loss": 0.0001,
"step": 6580
},
{
"epoch": 4.07,
"grad_norm": 0.0014617941342294216,
"learning_rate": 2.9442066888571884e-05,
"loss": 0.0001,
"step": 6590
},
{
"epoch": 4.07,
"grad_norm": 0.0017148368060588837,
"learning_rate": 2.94024409573625e-05,
"loss": 0.0753,
"step": 6600
},
{
"epoch": 4.07,
"grad_norm": 0.003370764898136258,
"learning_rate": 2.936281502615312e-05,
"loss": 0.0072,
"step": 6610
},
{
"epoch": 4.07,
"grad_norm": 0.003846656298264861,
"learning_rate": 2.932318909494373e-05,
"loss": 0.0001,
"step": 6620
},
{
"epoch": 4.07,
"grad_norm": 0.002365513239055872,
"learning_rate": 2.928356316373435e-05,
"loss": 0.0008,
"step": 6630
},
{
"epoch": 4.07,
"grad_norm": 0.0008402117528021336,
"learning_rate": 2.9243937232524965e-05,
"loss": 0.0001,
"step": 6640
},
{
"epoch": 4.07,
"grad_norm": 0.004054752178490162,
"learning_rate": 2.9204311301315584e-05,
"loss": 0.0548,
"step": 6650
},
{
"epoch": 4.07,
"grad_norm": 0.0017859063809737563,
"learning_rate": 2.91646853701062e-05,
"loss": 0.0001,
"step": 6660
},
{
"epoch": 4.08,
"grad_norm": 3.045167922973633,
"learning_rate": 2.9125059438896818e-05,
"loss": 0.0106,
"step": 6670
},
{
"epoch": 4.08,
"grad_norm": 0.034478865563869476,
"learning_rate": 2.9085433507687433e-05,
"loss": 0.001,
"step": 6680
},
{
"epoch": 4.08,
"grad_norm": 0.0020598298870027065,
"learning_rate": 2.9045807576478052e-05,
"loss": 0.9616,
"step": 6690
},
{
"epoch": 4.08,
"grad_norm": 0.009513617493212223,
"learning_rate": 2.9006181645268664e-05,
"loss": 0.0601,
"step": 6700
},
{
"epoch": 4.08,
"grad_norm": 0.10755365341901779,
"learning_rate": 2.896655571405928e-05,
"loss": 0.4313,
"step": 6710
},
{
"epoch": 4.08,
"grad_norm": 89.41072845458984,
"learning_rate": 2.8926929782849898e-05,
"loss": 0.7753,
"step": 6720
},
{
"epoch": 4.08,
"grad_norm": 0.005557952914386988,
"learning_rate": 2.8887303851640514e-05,
"loss": 0.022,
"step": 6730
},
{
"epoch": 4.08,
"grad_norm": 0.07544991374015808,
"learning_rate": 2.8847677920431132e-05,
"loss": 0.1043,
"step": 6740
},
{
"epoch": 4.08,
"grad_norm": 0.004230760037899017,
"learning_rate": 2.8808051989221744e-05,
"loss": 0.2305,
"step": 6750
},
{
"epoch": 4.08,
"grad_norm": 0.0005384175456129014,
"learning_rate": 2.8768426058012367e-05,
"loss": 0.0003,
"step": 6760
},
{
"epoch": 4.08,
"grad_norm": 0.0020217718556523323,
"learning_rate": 2.872880012680298e-05,
"loss": 0.0008,
"step": 6770
},
{
"epoch": 4.08,
"grad_norm": 0.001009553438052535,
"learning_rate": 2.8689174195593598e-05,
"loss": 0.3165,
"step": 6780
},
{
"epoch": 4.08,
"grad_norm": 0.002491355175152421,
"learning_rate": 2.8649548264384213e-05,
"loss": 0.5646,
"step": 6790
},
{
"epoch": 4.08,
"grad_norm": 0.002977263880893588,
"learning_rate": 2.8609922333174832e-05,
"loss": 0.0001,
"step": 6800
},
{
"epoch": 4.09,
"grad_norm": 0.0012742202961817384,
"learning_rate": 2.8570296401965447e-05,
"loss": 0.2827,
"step": 6810
},
{
"epoch": 4.09,
"grad_norm": 0.0030132278334349394,
"learning_rate": 2.8530670470756066e-05,
"loss": 0.0006,
"step": 6820
},
{
"epoch": 4.09,
"grad_norm": 0.06876442581415176,
"learning_rate": 2.8491044539546678e-05,
"loss": 0.0062,
"step": 6830
},
{
"epoch": 4.09,
"grad_norm": 0.008195163682103157,
"learning_rate": 2.84514186083373e-05,
"loss": 0.0145,
"step": 6840
},
{
"epoch": 4.09,
"grad_norm": 0.0023913795594125986,
"learning_rate": 2.8411792677127912e-05,
"loss": 0.022,
"step": 6850
},
{
"epoch": 4.09,
"grad_norm": 0.0004799796442966908,
"learning_rate": 2.837216674591853e-05,
"loss": 0.2252,
"step": 6860
},
{
"epoch": 4.09,
"grad_norm": 0.11730991303920746,
"learning_rate": 2.8332540814709146e-05,
"loss": 0.0033,
"step": 6870
},
{
"epoch": 4.09,
"grad_norm": 0.01119227148592472,
"learning_rate": 2.8292914883499765e-05,
"loss": 0.0764,
"step": 6880
},
{
"epoch": 4.09,
"grad_norm": 1.4117075204849243,
"learning_rate": 2.825328895229038e-05,
"loss": 0.0313,
"step": 6890
},
{
"epoch": 4.09,
"grad_norm": 0.9569471478462219,
"learning_rate": 2.8213663021081e-05,
"loss": 0.0037,
"step": 6900
},
{
"epoch": 4.09,
"grad_norm": 0.001442433800548315,
"learning_rate": 2.8174037089871615e-05,
"loss": 0.0001,
"step": 6910
},
{
"epoch": 4.09,
"grad_norm": 0.0015686535043641925,
"learning_rate": 2.8134411158662227e-05,
"loss": 0.2835,
"step": 6920
},
{
"epoch": 4.09,
"grad_norm": 0.00151319510769099,
"learning_rate": 2.8094785227452846e-05,
"loss": 0.0004,
"step": 6930
},
{
"epoch": 4.09,
"grad_norm": 0.00654405914247036,
"learning_rate": 2.805515929624346e-05,
"loss": 0.0011,
"step": 6940
},
{
"epoch": 4.1,
"grad_norm": 0.0008709866087883711,
"learning_rate": 2.801553336503408e-05,
"loss": 0.0128,
"step": 6950
},
{
"epoch": 4.1,
"grad_norm": 0.00043904109043069184,
"learning_rate": 2.7975907433824695e-05,
"loss": 0.8107,
"step": 6960
},
{
"epoch": 4.1,
"grad_norm": 0.00742549542337656,
"learning_rate": 2.7936281502615314e-05,
"loss": 0.3098,
"step": 6970
},
{
"epoch": 4.1,
"grad_norm": 0.0007969782454892993,
"learning_rate": 2.7896655571405926e-05,
"loss": 0.0001,
"step": 6980
},
{
"epoch": 4.1,
"grad_norm": 52.47648620605469,
"learning_rate": 2.7857029640196548e-05,
"loss": 0.4694,
"step": 6990
},
{
"epoch": 4.1,
"grad_norm": 0.0088576078414917,
"learning_rate": 2.781740370898716e-05,
"loss": 0.0016,
"step": 7000
},
{
"epoch": 4.1,
"grad_norm": 3.6093878746032715,
"learning_rate": 2.777777777777778e-05,
"loss": 0.054,
"step": 7010
},
{
"epoch": 4.1,
"eval_accuracy": 0.9368421052631579,
"eval_loss": 0.4006503224372864,
"eval_runtime": 2328.3346,
"eval_samples_per_second": 0.286,
"eval_steps_per_second": 0.143,
"step": 7015
},
{
"epoch": 5.0,
"grad_norm": 0.0007481848588213325,
"learning_rate": 2.7738151846568395e-05,
"loss": 0.0007,
"step": 7020
},
{
"epoch": 5.0,
"grad_norm": 162.9553680419922,
"learning_rate": 2.7698525915359013e-05,
"loss": 0.4298,
"step": 7030
},
{
"epoch": 5.0,
"grad_norm": 0.0006780726835131645,
"learning_rate": 2.765889998414963e-05,
"loss": 0.4696,
"step": 7040
},
{
"epoch": 5.0,
"grad_norm": 0.0014015401247888803,
"learning_rate": 2.7619274052940248e-05,
"loss": 0.028,
"step": 7050
},
{
"epoch": 5.0,
"grad_norm": 0.00443660095334053,
"learning_rate": 2.757964812173086e-05,
"loss": 0.0078,
"step": 7060
},
{
"epoch": 5.0,
"grad_norm": 4.740740776062012,
"learning_rate": 2.7540022190521482e-05,
"loss": 0.1712,
"step": 7070
},
{
"epoch": 5.0,
"grad_norm": 0.0052452534437179565,
"learning_rate": 2.7500396259312094e-05,
"loss": 0.0001,
"step": 7080
},
{
"epoch": 5.01,
"grad_norm": 0.0006377240642905235,
"learning_rate": 2.7460770328102713e-05,
"loss": 0.045,
"step": 7090
},
{
"epoch": 5.01,
"grad_norm": 0.0011151348007842898,
"learning_rate": 2.7421144396893328e-05,
"loss": 0.0001,
"step": 7100
},
{
"epoch": 5.01,
"grad_norm": 0.0006300232489593327,
"learning_rate": 2.7381518465683947e-05,
"loss": 0.0004,
"step": 7110
},
{
"epoch": 5.01,
"grad_norm": 0.008152210153639317,
"learning_rate": 2.7341892534474562e-05,
"loss": 0.0002,
"step": 7120
},
{
"epoch": 5.01,
"grad_norm": 0.0016102171503007412,
"learning_rate": 2.7302266603265174e-05,
"loss": 0.0302,
"step": 7130
},
{
"epoch": 5.01,
"grad_norm": 0.0014644188340753317,
"learning_rate": 2.7262640672055796e-05,
"loss": 0.0,
"step": 7140
},
{
"epoch": 5.01,
"grad_norm": 0.0012343927519395947,
"learning_rate": 2.722301474084641e-05,
"loss": 0.001,
"step": 7150
},
{
"epoch": 5.01,
"grad_norm": 0.002109797904267907,
"learning_rate": 2.7183388809637027e-05,
"loss": 0.0003,
"step": 7160
},
{
"epoch": 5.01,
"grad_norm": 0.0012583807110786438,
"learning_rate": 2.7143762878427643e-05,
"loss": 0.0001,
"step": 7170
},
{
"epoch": 5.01,
"grad_norm": 0.0009702452807687223,
"learning_rate": 2.710413694721826e-05,
"loss": 0.1802,
"step": 7180
},
{
"epoch": 5.01,
"grad_norm": 0.004518999718129635,
"learning_rate": 2.7064511016008877e-05,
"loss": 0.0001,
"step": 7190
},
{
"epoch": 5.01,
"grad_norm": 0.0008531950297765434,
"learning_rate": 2.7024885084799496e-05,
"loss": 0.0001,
"step": 7200
},
{
"epoch": 5.01,
"grad_norm": 0.003954921383410692,
"learning_rate": 2.6985259153590108e-05,
"loss": 0.0001,
"step": 7210
},
{
"epoch": 5.01,
"grad_norm": 0.0006554504507221282,
"learning_rate": 2.694563322238073e-05,
"loss": 0.0002,
"step": 7220
},
{
"epoch": 5.02,
"grad_norm": 0.0011577644618228078,
"learning_rate": 2.6906007291171342e-05,
"loss": 0.006,
"step": 7230
},
{
"epoch": 5.02,
"grad_norm": 0.0004994067130610347,
"learning_rate": 2.686638135996196e-05,
"loss": 0.0001,
"step": 7240
},
{
"epoch": 5.02,
"grad_norm": 0.006224981974810362,
"learning_rate": 2.6826755428752576e-05,
"loss": 0.4425,
"step": 7250
},
{
"epoch": 5.02,
"grad_norm": 0.00843863096088171,
"learning_rate": 2.6787129497543195e-05,
"loss": 0.1975,
"step": 7260
},
{
"epoch": 5.02,
"grad_norm": 0.0011182260932400823,
"learning_rate": 2.674750356633381e-05,
"loss": 0.0002,
"step": 7270
},
{
"epoch": 5.02,
"grad_norm": 0.0012028939090669155,
"learning_rate": 2.670787763512443e-05,
"loss": 0.0001,
"step": 7280
},
{
"epoch": 5.02,
"grad_norm": 0.0064741140231490135,
"learning_rate": 2.666825170391504e-05,
"loss": 0.0066,
"step": 7290
},
{
"epoch": 5.02,
"grad_norm": 0.0013653126079589128,
"learning_rate": 2.6628625772705663e-05,
"loss": 0.0802,
"step": 7300
},
{
"epoch": 5.02,
"grad_norm": 0.0032840375788509846,
"learning_rate": 2.6588999841496275e-05,
"loss": 0.0495,
"step": 7310
},
{
"epoch": 5.02,
"grad_norm": 0.006207801401615143,
"learning_rate": 2.6549373910286894e-05,
"loss": 0.0001,
"step": 7320
},
{
"epoch": 5.02,
"grad_norm": 0.0015818944666534662,
"learning_rate": 2.650974797907751e-05,
"loss": 0.0935,
"step": 7330
},
{
"epoch": 5.02,
"grad_norm": 0.0013846838846802711,
"learning_rate": 2.647012204786813e-05,
"loss": 0.0101,
"step": 7340
},
{
"epoch": 5.02,
"grad_norm": 0.0015213302103802562,
"learning_rate": 2.6430496116658744e-05,
"loss": 0.0001,
"step": 7350
},
{
"epoch": 5.02,
"grad_norm": 0.0016765915788710117,
"learning_rate": 2.6390870185449356e-05,
"loss": 0.0008,
"step": 7360
},
{
"epoch": 5.03,
"grad_norm": 0.0029850786086171865,
"learning_rate": 2.6351244254239975e-05,
"loss": 0.5417,
"step": 7370
},
{
"epoch": 5.03,
"grad_norm": 0.0028296930249780416,
"learning_rate": 2.631161832303059e-05,
"loss": 0.0029,
"step": 7380
},
{
"epoch": 5.03,
"grad_norm": 0.19774562120437622,
"learning_rate": 2.627199239182121e-05,
"loss": 0.0424,
"step": 7390
},
{
"epoch": 5.03,
"grad_norm": 0.20521485805511475,
"learning_rate": 2.6232366460611824e-05,
"loss": 0.0003,
"step": 7400
},
{
"epoch": 5.03,
"grad_norm": 3.243302822113037,
"learning_rate": 2.6192740529402443e-05,
"loss": 0.0033,
"step": 7410
},
{
"epoch": 5.03,
"grad_norm": 0.002176284557208419,
"learning_rate": 2.615311459819306e-05,
"loss": 0.0905,
"step": 7420
},
{
"epoch": 5.03,
"grad_norm": 0.00346784177236259,
"learning_rate": 2.6113488666983677e-05,
"loss": 0.0058,
"step": 7430
},
{
"epoch": 5.03,
"grad_norm": 0.0022136277984827757,
"learning_rate": 2.607386273577429e-05,
"loss": 0.0145,
"step": 7440
},
{
"epoch": 5.03,
"grad_norm": 0.0054547772742807865,
"learning_rate": 2.603423680456491e-05,
"loss": 0.0001,
"step": 7450
},
{
"epoch": 5.03,
"grad_norm": 0.0017041038954630494,
"learning_rate": 2.5994610873355524e-05,
"loss": 0.0043,
"step": 7460
},
{
"epoch": 5.03,
"grad_norm": 0.00526059465482831,
"learning_rate": 2.5954984942146142e-05,
"loss": 0.0001,
"step": 7470
},
{
"epoch": 5.03,
"grad_norm": 0.0015646722167730331,
"learning_rate": 2.5915359010936758e-05,
"loss": 0.0001,
"step": 7480
},
{
"epoch": 5.03,
"grad_norm": 0.0014299266040325165,
"learning_rate": 2.5875733079727377e-05,
"loss": 0.0001,
"step": 7490
},
{
"epoch": 5.03,
"grad_norm": 0.857555627822876,
"learning_rate": 2.5836107148517992e-05,
"loss": 0.0494,
"step": 7500
},
{
"epoch": 5.04,
"grad_norm": 0.0019163636025041342,
"learning_rate": 2.579648121730861e-05,
"loss": 0.0001,
"step": 7510
},
{
"epoch": 5.04,
"grad_norm": 0.001081604859791696,
"learning_rate": 2.5756855286099223e-05,
"loss": 0.0001,
"step": 7520
},
{
"epoch": 5.04,
"grad_norm": 0.002402815269306302,
"learning_rate": 2.5717229354889845e-05,
"loss": 0.0001,
"step": 7530
},
{
"epoch": 5.04,
"grad_norm": 0.0032065189443528652,
"learning_rate": 2.5677603423680457e-05,
"loss": 0.5271,
"step": 7540
},
{
"epoch": 5.04,
"grad_norm": 0.0037377572152763605,
"learning_rate": 2.5637977492471076e-05,
"loss": 0.0001,
"step": 7550
},
{
"epoch": 5.04,
"grad_norm": 0.0010730663780122995,
"learning_rate": 2.559835156126169e-05,
"loss": 0.0001,
"step": 7560
},
{
"epoch": 5.04,
"grad_norm": 0.018039198592305183,
"learning_rate": 2.5558725630052303e-05,
"loss": 0.1574,
"step": 7570
},
{
"epoch": 5.04,
"grad_norm": 0.0008627079077996314,
"learning_rate": 2.5519099698842925e-05,
"loss": 0.0004,
"step": 7580
},
{
"epoch": 5.04,
"grad_norm": 0.00304847932420671,
"learning_rate": 2.5479473767633537e-05,
"loss": 0.0002,
"step": 7590
},
{
"epoch": 5.04,
"grad_norm": 56.73731231689453,
"learning_rate": 2.5439847836424156e-05,
"loss": 0.2908,
"step": 7600
},
{
"epoch": 5.04,
"grad_norm": 0.0014052072074264288,
"learning_rate": 2.540022190521477e-05,
"loss": 0.0001,
"step": 7610
},
{
"epoch": 5.04,
"grad_norm": 0.0024271756410598755,
"learning_rate": 2.536059597400539e-05,
"loss": 0.0363,
"step": 7620
},
{
"epoch": 5.04,
"grad_norm": 0.0011607712367549539,
"learning_rate": 2.5320970042796006e-05,
"loss": 0.0703,
"step": 7630
},
{
"epoch": 5.04,
"grad_norm": 0.0010089229326695204,
"learning_rate": 2.5281344111586625e-05,
"loss": 0.0001,
"step": 7640
},
{
"epoch": 5.05,
"grad_norm": 0.0012477770214900374,
"learning_rate": 2.524171818037724e-05,
"loss": 0.471,
"step": 7650
},
{
"epoch": 5.05,
"grad_norm": 0.0015396666713058949,
"learning_rate": 2.520209224916786e-05,
"loss": 0.2129,
"step": 7660
},
{
"epoch": 5.05,
"grad_norm": 0.000801810878328979,
"learning_rate": 2.516246631795847e-05,
"loss": 0.0314,
"step": 7670
},
{
"epoch": 5.05,
"grad_norm": 0.0009846306638792157,
"learning_rate": 2.512284038674909e-05,
"loss": 0.0003,
"step": 7680
},
{
"epoch": 5.05,
"grad_norm": 0.03625110909342766,
"learning_rate": 2.5083214455539705e-05,
"loss": 0.0016,
"step": 7690
},
{
"epoch": 5.05,
"grad_norm": 0.14931851625442505,
"learning_rate": 2.5043588524330324e-05,
"loss": 0.4488,
"step": 7700
},
{
"epoch": 5.05,
"grad_norm": 0.007826775312423706,
"learning_rate": 2.500396259312094e-05,
"loss": 0.0002,
"step": 7710
},
{
"epoch": 5.05,
"grad_norm": 0.00988730974495411,
"learning_rate": 2.4964336661911555e-05,
"loss": 0.0001,
"step": 7720
},
{
"epoch": 5.05,
"grad_norm": 0.0005387517157942057,
"learning_rate": 2.4924710730702174e-05,
"loss": 0.7611,
"step": 7730
},
{
"epoch": 5.05,
"grad_norm": 0.0011877217330038548,
"learning_rate": 2.488508479949279e-05,
"loss": 0.0001,
"step": 7740
},
{
"epoch": 5.05,
"grad_norm": 0.019128194078803062,
"learning_rate": 2.4845458868283404e-05,
"loss": 0.517,
"step": 7750
},
{
"epoch": 5.05,
"grad_norm": 1.5278313159942627,
"learning_rate": 2.4805832937074023e-05,
"loss": 0.0012,
"step": 7760
},
{
"epoch": 5.05,
"grad_norm": 0.0027036985848098993,
"learning_rate": 2.476620700586464e-05,
"loss": 0.5882,
"step": 7770
},
{
"epoch": 5.05,
"grad_norm": 0.002757065463811159,
"learning_rate": 2.4726581074655254e-05,
"loss": 0.0155,
"step": 7780
},
{
"epoch": 5.06,
"grad_norm": 0.004905847366899252,
"learning_rate": 2.4686955143445873e-05,
"loss": 0.0102,
"step": 7790
},
{
"epoch": 5.06,
"grad_norm": 0.0014356361934915185,
"learning_rate": 2.4647329212236488e-05,
"loss": 0.0001,
"step": 7800
},
{
"epoch": 5.06,
"grad_norm": 3.6968801021575928,
"learning_rate": 2.4607703281027107e-05,
"loss": 0.2234,
"step": 7810
},
{
"epoch": 5.06,
"grad_norm": 66.777099609375,
"learning_rate": 2.4568077349817722e-05,
"loss": 0.3216,
"step": 7820
},
{
"epoch": 5.06,
"grad_norm": 0.001242569531314075,
"learning_rate": 2.4528451418608338e-05,
"loss": 0.0003,
"step": 7830
},
{
"epoch": 5.06,
"grad_norm": 0.0016161628300324082,
"learning_rate": 2.4488825487398957e-05,
"loss": 0.024,
"step": 7840
},
{
"epoch": 5.06,
"grad_norm": 0.06756754219532013,
"learning_rate": 2.4449199556189572e-05,
"loss": 0.0614,
"step": 7850
},
{
"epoch": 5.06,
"grad_norm": 0.0006389593472704291,
"learning_rate": 2.440957362498019e-05,
"loss": 0.1259,
"step": 7860
},
{
"epoch": 5.06,
"grad_norm": 0.004206878133118153,
"learning_rate": 2.4369947693770806e-05,
"loss": 0.2293,
"step": 7870
},
{
"epoch": 5.06,
"grad_norm": 0.0025491828564554453,
"learning_rate": 2.433032176256142e-05,
"loss": 0.0004,
"step": 7880
},
{
"epoch": 5.06,
"grad_norm": 0.0015132069820538163,
"learning_rate": 2.4290695831352037e-05,
"loss": 0.0453,
"step": 7890
},
{
"epoch": 5.06,
"grad_norm": 0.0013023455394431949,
"learning_rate": 2.4251069900142652e-05,
"loss": 0.515,
"step": 7900
},
{
"epoch": 5.06,
"grad_norm": 0.0006147758103907108,
"learning_rate": 2.421144396893327e-05,
"loss": 0.0004,
"step": 7910
},
{
"epoch": 5.06,
"grad_norm": 0.0013257160317152739,
"learning_rate": 2.4171818037723887e-05,
"loss": 0.0004,
"step": 7920
},
{
"epoch": 5.07,
"grad_norm": 0.0010351515375077724,
"learning_rate": 2.4132192106514502e-05,
"loss": 0.2861,
"step": 7930
},
{
"epoch": 5.07,
"grad_norm": 0.004010920412838459,
"learning_rate": 2.409256617530512e-05,
"loss": 0.144,
"step": 7940
},
{
"epoch": 5.07,
"grad_norm": 0.002655152464285493,
"learning_rate": 2.4052940244095736e-05,
"loss": 0.5804,
"step": 7950
},
{
"epoch": 5.07,
"grad_norm": 0.009208135306835175,
"learning_rate": 2.4013314312886355e-05,
"loss": 0.0008,
"step": 7960
},
{
"epoch": 5.07,
"grad_norm": 124.37940979003906,
"learning_rate": 2.397368838167697e-05,
"loss": 0.1359,
"step": 7970
},
{
"epoch": 5.07,
"grad_norm": 0.0007841124897822738,
"learning_rate": 2.3934062450467586e-05,
"loss": 0.0073,
"step": 7980
},
{
"epoch": 5.07,
"grad_norm": 14.345431327819824,
"learning_rate": 2.3894436519258205e-05,
"loss": 0.009,
"step": 7990
},
{
"epoch": 5.07,
"grad_norm": 0.0012639712076634169,
"learning_rate": 2.385481058804882e-05,
"loss": 0.0001,
"step": 8000
},
{
"epoch": 5.07,
"grad_norm": 0.004882665816694498,
"learning_rate": 2.3815184656839436e-05,
"loss": 0.765,
"step": 8010
},
{
"epoch": 5.07,
"grad_norm": 1.992924690246582,
"learning_rate": 2.3775558725630054e-05,
"loss": 0.0199,
"step": 8020
},
{
"epoch": 5.07,
"grad_norm": 0.008574814535677433,
"learning_rate": 2.373593279442067e-05,
"loss": 0.0121,
"step": 8030
},
{
"epoch": 5.07,
"grad_norm": 0.0031569607090204954,
"learning_rate": 2.369630686321129e-05,
"loss": 0.0105,
"step": 8040
},
{
"epoch": 5.07,
"grad_norm": 0.005381352733820677,
"learning_rate": 2.3656680932001904e-05,
"loss": 0.0002,
"step": 8050
},
{
"epoch": 5.07,
"grad_norm": 0.0014025687705725431,
"learning_rate": 2.361705500079252e-05,
"loss": 0.1309,
"step": 8060
},
{
"epoch": 5.08,
"grad_norm": 0.00232652947306633,
"learning_rate": 2.3577429069583138e-05,
"loss": 0.0253,
"step": 8070
},
{
"epoch": 5.08,
"grad_norm": 0.004494811408221722,
"learning_rate": 2.3537803138373754e-05,
"loss": 0.0004,
"step": 8080
},
{
"epoch": 5.08,
"grad_norm": 0.007132168859243393,
"learning_rate": 2.3498177207164372e-05,
"loss": 0.0002,
"step": 8090
},
{
"epoch": 5.08,
"grad_norm": 0.002315562916919589,
"learning_rate": 2.3458551275954984e-05,
"loss": 0.0048,
"step": 8100
},
{
"epoch": 5.08,
"grad_norm": 0.0011102244025096297,
"learning_rate": 2.34189253447456e-05,
"loss": 0.1166,
"step": 8110
},
{
"epoch": 5.08,
"grad_norm": 0.0011376317124813795,
"learning_rate": 2.337929941353622e-05,
"loss": 0.0001,
"step": 8120
},
{
"epoch": 5.08,
"grad_norm": 0.009772238321602345,
"learning_rate": 2.3339673482326834e-05,
"loss": 0.1212,
"step": 8130
},
{
"epoch": 5.08,
"grad_norm": 0.0009250590810552239,
"learning_rate": 2.3300047551117453e-05,
"loss": 0.0077,
"step": 8140
},
{
"epoch": 5.08,
"grad_norm": 0.0008343447698280215,
"learning_rate": 2.3260421619908068e-05,
"loss": 0.0001,
"step": 8150
},
{
"epoch": 5.08,
"grad_norm": 0.005889697000384331,
"learning_rate": 2.3220795688698684e-05,
"loss": 0.2522,
"step": 8160
},
{
"epoch": 5.08,
"grad_norm": 0.004577580373734236,
"learning_rate": 2.3181169757489303e-05,
"loss": 0.0055,
"step": 8170
},
{
"epoch": 5.08,
"grad_norm": 0.0006038689170964062,
"learning_rate": 2.3141543826279918e-05,
"loss": 0.22,
"step": 8180
},
{
"epoch": 5.08,
"grad_norm": 119.69172668457031,
"learning_rate": 2.3101917895070537e-05,
"loss": 0.2874,
"step": 8190
},
{
"epoch": 5.08,
"grad_norm": 0.01207007933408022,
"learning_rate": 2.3062291963861152e-05,
"loss": 0.0003,
"step": 8200
},
{
"epoch": 5.09,
"grad_norm": 0.005133229307830334,
"learning_rate": 2.3022666032651768e-05,
"loss": 0.0002,
"step": 8210
},
{
"epoch": 5.09,
"grad_norm": 0.0014045186107978225,
"learning_rate": 2.2983040101442386e-05,
"loss": 0.0003,
"step": 8220
},
{
"epoch": 5.09,
"grad_norm": 0.005631518550217152,
"learning_rate": 2.2943414170233002e-05,
"loss": 0.0002,
"step": 8230
},
{
"epoch": 5.09,
"grad_norm": 0.0011396125191822648,
"learning_rate": 2.2903788239023617e-05,
"loss": 0.0004,
"step": 8240
},
{
"epoch": 5.09,
"grad_norm": 0.16508010029792786,
"learning_rate": 2.2864162307814236e-05,
"loss": 0.0002,
"step": 8250
},
{
"epoch": 5.09,
"grad_norm": 0.005040541756898165,
"learning_rate": 2.282453637660485e-05,
"loss": 0.016,
"step": 8260
},
{
"epoch": 5.09,
"grad_norm": 0.0026673241518437862,
"learning_rate": 2.278491044539547e-05,
"loss": 0.0024,
"step": 8270
},
{
"epoch": 5.09,
"grad_norm": 0.0025323168374598026,
"learning_rate": 2.2745284514186086e-05,
"loss": 0.0001,
"step": 8280
},
{
"epoch": 5.09,
"grad_norm": 0.002470273757353425,
"learning_rate": 2.27056585829767e-05,
"loss": 0.0001,
"step": 8290
},
{
"epoch": 5.09,
"grad_norm": 0.0011150416685268283,
"learning_rate": 2.266603265176732e-05,
"loss": 0.0027,
"step": 8300
},
{
"epoch": 5.09,
"grad_norm": 0.0062728519551455975,
"learning_rate": 2.2626406720557935e-05,
"loss": 0.0006,
"step": 8310
},
{
"epoch": 5.09,
"grad_norm": 0.001863997895270586,
"learning_rate": 2.258678078934855e-05,
"loss": 0.0001,
"step": 8320
},
{
"epoch": 5.09,
"grad_norm": 0.0009478493593633175,
"learning_rate": 2.2547154858139166e-05,
"loss": 0.0179,
"step": 8330
},
{
"epoch": 5.09,
"grad_norm": 0.0012072144309058785,
"learning_rate": 2.250752892692978e-05,
"loss": 0.2482,
"step": 8340
},
{
"epoch": 5.1,
"grad_norm": 0.0013612033799290657,
"learning_rate": 2.24679029957204e-05,
"loss": 0.0001,
"step": 8350
},
{
"epoch": 5.1,
"grad_norm": 0.001653852523304522,
"learning_rate": 2.2428277064511016e-05,
"loss": 0.0001,
"step": 8360
},
{
"epoch": 5.1,
"grad_norm": 0.004468216095119715,
"learning_rate": 2.2388651133301634e-05,
"loss": 0.0003,
"step": 8370
},
{
"epoch": 5.1,
"grad_norm": 0.21759329736232758,
"learning_rate": 2.234902520209225e-05,
"loss": 0.0004,
"step": 8380
},
{
"epoch": 5.1,
"grad_norm": 0.002769963815808296,
"learning_rate": 2.2309399270882865e-05,
"loss": 0.0002,
"step": 8390
},
{
"epoch": 5.1,
"grad_norm": 0.0010608519660308957,
"learning_rate": 2.2269773339673484e-05,
"loss": 0.1718,
"step": 8400
},
{
"epoch": 5.1,
"grad_norm": 0.0008747797110117972,
"learning_rate": 2.22301474084641e-05,
"loss": 0.0003,
"step": 8410
},
{
"epoch": 5.1,
"eval_accuracy": 0.9669172932330827,
"eval_loss": 0.23544873297214508,
"eval_runtime": 2342.8874,
"eval_samples_per_second": 0.284,
"eval_steps_per_second": 0.142,
"step": 8418
},
{
"epoch": 6.0,
"grad_norm": 0.0007297981064766645,
"learning_rate": 2.2190521477254715e-05,
"loss": 0.0001,
"step": 8420
},
{
"epoch": 6.0,
"grad_norm": 0.007375821936875582,
"learning_rate": 2.2150895546045334e-05,
"loss": 0.0001,
"step": 8430
},
{
"epoch": 6.0,
"grad_norm": 0.0019510581623762846,
"learning_rate": 2.211126961483595e-05,
"loss": 0.0001,
"step": 8440
},
{
"epoch": 6.0,
"grad_norm": 0.009307813830673695,
"learning_rate": 2.2071643683626568e-05,
"loss": 0.0002,
"step": 8450
},
{
"epoch": 6.0,
"grad_norm": 0.07272663712501526,
"learning_rate": 2.2032017752417183e-05,
"loss": 0.0002,
"step": 8460
},
{
"epoch": 6.0,
"grad_norm": 0.004176029469817877,
"learning_rate": 2.19923918212078e-05,
"loss": 0.058,
"step": 8470
},
{
"epoch": 6.0,
"grad_norm": 0.0019298582337796688,
"learning_rate": 2.1952765889998418e-05,
"loss": 0.0001,
"step": 8480
},
{
"epoch": 6.01,
"grad_norm": 137.64112854003906,
"learning_rate": 2.1913139958789033e-05,
"loss": 0.144,
"step": 8490
},
{
"epoch": 6.01,
"grad_norm": 0.0035788225941359997,
"learning_rate": 2.1873514027579652e-05,
"loss": 0.1903,
"step": 8500
},
{
"epoch": 6.01,
"grad_norm": 0.00900218915194273,
"learning_rate": 2.1833888096370267e-05,
"loss": 0.3162,
"step": 8510
},
{
"epoch": 6.01,
"grad_norm": 0.006812531501054764,
"learning_rate": 2.1794262165160883e-05,
"loss": 0.0001,
"step": 8520
},
{
"epoch": 6.01,
"grad_norm": 0.011043643578886986,
"learning_rate": 2.1754636233951498e-05,
"loss": 0.0001,
"step": 8530
},
{
"epoch": 6.01,
"grad_norm": 0.0009386019664816558,
"learning_rate": 2.1715010302742113e-05,
"loss": 0.0132,
"step": 8540
},
{
"epoch": 6.01,
"grad_norm": 0.0009653670713305473,
"learning_rate": 2.1675384371532732e-05,
"loss": 0.0001,
"step": 8550
},
{
"epoch": 6.01,
"grad_norm": 0.000631912553217262,
"learning_rate": 2.1635758440323348e-05,
"loss": 0.0046,
"step": 8560
},
{
"epoch": 6.01,
"grad_norm": 0.005377355497330427,
"learning_rate": 2.1596132509113963e-05,
"loss": 0.0001,
"step": 8570
},
{
"epoch": 6.01,
"grad_norm": 0.0015233962330967188,
"learning_rate": 2.1556506577904582e-05,
"loss": 0.26,
"step": 8580
},
{
"epoch": 6.01,
"grad_norm": 0.003712683217599988,
"learning_rate": 2.1516880646695197e-05,
"loss": 0.0103,
"step": 8590
},
{
"epoch": 6.01,
"grad_norm": 0.002746036509051919,
"learning_rate": 2.1477254715485816e-05,
"loss": 0.0001,
"step": 8600
},
{
"epoch": 6.01,
"grad_norm": 0.001353266416117549,
"learning_rate": 2.143762878427643e-05,
"loss": 0.0001,
"step": 8610
},
{
"epoch": 6.01,
"grad_norm": 0.05317896232008934,
"learning_rate": 2.1398002853067047e-05,
"loss": 0.0002,
"step": 8620
},
{
"epoch": 6.02,
"grad_norm": 0.002108694287016988,
"learning_rate": 2.1358376921857666e-05,
"loss": 0.0001,
"step": 8630
},
{
"epoch": 6.02,
"grad_norm": 0.0015535557176917791,
"learning_rate": 2.131875099064828e-05,
"loss": 0.2856,
"step": 8640
},
{
"epoch": 6.02,
"grad_norm": 0.0007479583146050572,
"learning_rate": 2.1279125059438897e-05,
"loss": 0.0001,
"step": 8650
},
{
"epoch": 6.02,
"grad_norm": 0.0013678164687007666,
"learning_rate": 2.1239499128229515e-05,
"loss": 0.0001,
"step": 8660
},
{
"epoch": 6.02,
"grad_norm": 0.0011460609966889024,
"learning_rate": 2.119987319702013e-05,
"loss": 0.1748,
"step": 8670
},
{
"epoch": 6.02,
"grad_norm": 0.005598797462880611,
"learning_rate": 2.116024726581075e-05,
"loss": 0.0005,
"step": 8680
},
{
"epoch": 6.02,
"grad_norm": 0.0058416505344212055,
"learning_rate": 2.1120621334601365e-05,
"loss": 0.003,
"step": 8690
},
{
"epoch": 6.02,
"grad_norm": 0.0018327133730053902,
"learning_rate": 2.108099540339198e-05,
"loss": 0.0001,
"step": 8700
},
{
"epoch": 6.02,
"grad_norm": 0.0008349318522959948,
"learning_rate": 2.10413694721826e-05,
"loss": 0.0014,
"step": 8710
},
{
"epoch": 6.02,
"grad_norm": 0.0007587561849504709,
"learning_rate": 2.1001743540973215e-05,
"loss": 0.0012,
"step": 8720
},
{
"epoch": 6.02,
"grad_norm": 0.003939191345125437,
"learning_rate": 2.096211760976383e-05,
"loss": 0.0001,
"step": 8730
},
{
"epoch": 6.02,
"grad_norm": 0.007147368974983692,
"learning_rate": 2.092249167855445e-05,
"loss": 0.0003,
"step": 8740
},
{
"epoch": 6.02,
"grad_norm": 0.0007460744236595929,
"learning_rate": 2.088286574734506e-05,
"loss": 0.0,
"step": 8750
},
{
"epoch": 6.02,
"grad_norm": 0.005187608767300844,
"learning_rate": 2.084323981613568e-05,
"loss": 0.0001,
"step": 8760
},
{
"epoch": 6.03,
"grad_norm": 0.0012044048635289073,
"learning_rate": 2.0803613884926295e-05,
"loss": 0.0003,
"step": 8770
},
{
"epoch": 6.03,
"grad_norm": 0.005269182845950127,
"learning_rate": 2.0763987953716914e-05,
"loss": 0.1424,
"step": 8780
},
{
"epoch": 6.03,
"grad_norm": 0.0014458984369412065,
"learning_rate": 2.072436202250753e-05,
"loss": 0.1836,
"step": 8790
},
{
"epoch": 6.03,
"grad_norm": 0.003018228802829981,
"learning_rate": 2.0684736091298145e-05,
"loss": 0.0002,
"step": 8800
},
{
"epoch": 6.03,
"grad_norm": 0.0005208718357607722,
"learning_rate": 2.0645110160088763e-05,
"loss": 0.276,
"step": 8810
},
{
"epoch": 6.03,
"grad_norm": 0.0005419257213361561,
"learning_rate": 2.060548422887938e-05,
"loss": 0.0,
"step": 8820
},
{
"epoch": 6.03,
"grad_norm": 0.0056818630546331406,
"learning_rate": 2.0565858297669994e-05,
"loss": 0.0003,
"step": 8830
},
{
"epoch": 6.03,
"grad_norm": 0.0021387594752013683,
"learning_rate": 2.0526232366460613e-05,
"loss": 0.0001,
"step": 8840
},
{
"epoch": 6.03,
"grad_norm": 0.0017361573409289122,
"learning_rate": 2.048660643525123e-05,
"loss": 0.0235,
"step": 8850
},
{
"epoch": 6.03,
"grad_norm": 0.0031765319872647524,
"learning_rate": 2.0446980504041847e-05,
"loss": 0.0002,
"step": 8860
},
{
"epoch": 6.03,
"grad_norm": 0.0006492682150565088,
"learning_rate": 2.0407354572832463e-05,
"loss": 0.0,
"step": 8870
},
{
"epoch": 6.03,
"grad_norm": 0.009603900834918022,
"learning_rate": 2.0367728641623078e-05,
"loss": 0.0116,
"step": 8880
},
{
"epoch": 6.03,
"grad_norm": 0.0014260296011343598,
"learning_rate": 2.0328102710413697e-05,
"loss": 0.0272,
"step": 8890
},
{
"epoch": 6.03,
"grad_norm": 0.001238304190337658,
"learning_rate": 2.0288476779204312e-05,
"loss": 0.0001,
"step": 8900
},
{
"epoch": 6.04,
"grad_norm": 0.004389143083244562,
"learning_rate": 2.024885084799493e-05,
"loss": 0.0001,
"step": 8910
},
{
"epoch": 6.04,
"grad_norm": 0.0006919855368323624,
"learning_rate": 2.0209224916785547e-05,
"loss": 0.0015,
"step": 8920
},
{
"epoch": 6.04,
"grad_norm": 0.0013250050833448768,
"learning_rate": 2.0169598985576162e-05,
"loss": 0.0,
"step": 8930
},
{
"epoch": 6.04,
"grad_norm": 0.0006862548179924488,
"learning_rate": 2.012997305436678e-05,
"loss": 0.005,
"step": 8940
},
{
"epoch": 6.04,
"grad_norm": 0.0006481676246039569,
"learning_rate": 2.0090347123157396e-05,
"loss": 0.0002,
"step": 8950
},
{
"epoch": 6.04,
"grad_norm": 0.0009765150607563555,
"learning_rate": 2.005072119194801e-05,
"loss": 0.3095,
"step": 8960
},
{
"epoch": 6.04,
"grad_norm": 0.0008786149555817246,
"learning_rate": 2.0011095260738627e-05,
"loss": 0.1903,
"step": 8970
},
{
"epoch": 6.04,
"grad_norm": 0.00043602605001069605,
"learning_rate": 1.9971469329529242e-05,
"loss": 0.0002,
"step": 8980
},
{
"epoch": 6.04,
"grad_norm": 0.0006052827229723334,
"learning_rate": 1.993184339831986e-05,
"loss": 0.0028,
"step": 8990
},
{
"epoch": 6.04,
"grad_norm": 0.0027263278607279062,
"learning_rate": 1.9892217467110477e-05,
"loss": 0.0988,
"step": 9000
},
{
"epoch": 6.04,
"grad_norm": 0.0004901738138869405,
"learning_rate": 1.9852591535901095e-05,
"loss": 0.0008,
"step": 9010
},
{
"epoch": 6.04,
"grad_norm": 0.004134719260036945,
"learning_rate": 1.981296560469171e-05,
"loss": 0.0001,
"step": 9020
},
{
"epoch": 6.04,
"grad_norm": 6.425068378448486,
"learning_rate": 1.9773339673482326e-05,
"loss": 0.0009,
"step": 9030
},
{
"epoch": 6.04,
"grad_norm": 0.0021010099444538355,
"learning_rate": 1.9733713742272945e-05,
"loss": 0.0139,
"step": 9040
},
{
"epoch": 6.05,
"grad_norm": 0.0003429889620747417,
"learning_rate": 1.969408781106356e-05,
"loss": 0.0001,
"step": 9050
},
{
"epoch": 6.05,
"grad_norm": 0.00465469341725111,
"learning_rate": 1.9654461879854176e-05,
"loss": 0.0048,
"step": 9060
},
{
"epoch": 6.05,
"grad_norm": 0.0007626991719007492,
"learning_rate": 1.9614835948644795e-05,
"loss": 0.0694,
"step": 9070
},
{
"epoch": 6.05,
"grad_norm": 0.0005379422218538821,
"learning_rate": 1.957521001743541e-05,
"loss": 0.0001,
"step": 9080
},
{
"epoch": 6.05,
"grad_norm": 0.0018008677288889885,
"learning_rate": 1.953558408622603e-05,
"loss": 0.1537,
"step": 9090
},
{
"epoch": 6.05,
"grad_norm": 0.005486232694238424,
"learning_rate": 1.9495958155016644e-05,
"loss": 0.0001,
"step": 9100
},
{
"epoch": 6.05,
"grad_norm": 0.0016153625911101699,
"learning_rate": 1.945633222380726e-05,
"loss": 0.1517,
"step": 9110
},
{
"epoch": 6.05,
"grad_norm": 0.00048393840552307665,
"learning_rate": 1.941670629259788e-05,
"loss": 0.0515,
"step": 9120
},
{
"epoch": 6.05,
"grad_norm": 0.00044351426186040044,
"learning_rate": 1.9377080361388494e-05,
"loss": 0.0,
"step": 9130
},
{
"epoch": 6.05,
"grad_norm": 0.003928069956600666,
"learning_rate": 1.9337454430179113e-05,
"loss": 0.0001,
"step": 9140
},
{
"epoch": 6.05,
"grad_norm": 0.0009555955766700208,
"learning_rate": 1.9297828498969728e-05,
"loss": 0.0001,
"step": 9150
},
{
"epoch": 6.05,
"grad_norm": 0.003042226191610098,
"learning_rate": 1.9258202567760344e-05,
"loss": 0.0,
"step": 9160
},
{
"epoch": 6.05,
"grad_norm": 0.0003893129760399461,
"learning_rate": 1.9218576636550962e-05,
"loss": 0.0,
"step": 9170
},
{
"epoch": 6.05,
"grad_norm": 0.0008289095130749047,
"learning_rate": 1.9178950705341574e-05,
"loss": 0.0,
"step": 9180
},
{
"epoch": 6.06,
"grad_norm": 0.0010318297427147627,
"learning_rate": 1.9139324774132193e-05,
"loss": 0.0,
"step": 9190
},
{
"epoch": 6.06,
"grad_norm": 0.0007037441828288138,
"learning_rate": 1.909969884292281e-05,
"loss": 0.0837,
"step": 9200
},
{
"epoch": 6.06,
"grad_norm": 44.11083221435547,
"learning_rate": 1.9060072911713424e-05,
"loss": 0.0226,
"step": 9210
},
{
"epoch": 6.06,
"grad_norm": 0.0010193975176662207,
"learning_rate": 1.9020446980504043e-05,
"loss": 0.0018,
"step": 9220
},
{
"epoch": 6.06,
"grad_norm": 0.0026808753609657288,
"learning_rate": 1.8980821049294658e-05,
"loss": 0.0001,
"step": 9230
},
{
"epoch": 6.06,
"grad_norm": 0.0013365477789193392,
"learning_rate": 1.8941195118085277e-05,
"loss": 0.0,
"step": 9240
},
{
"epoch": 6.06,
"grad_norm": 33.180870056152344,
"learning_rate": 1.8901569186875892e-05,
"loss": 0.3046,
"step": 9250
},
{
"epoch": 6.06,
"grad_norm": 0.001624317723326385,
"learning_rate": 1.8861943255666508e-05,
"loss": 0.2476,
"step": 9260
},
{
"epoch": 6.06,
"grad_norm": 0.002660261234268546,
"learning_rate": 1.8822317324457127e-05,
"loss": 0.329,
"step": 9270
},
{
"epoch": 6.06,
"grad_norm": 0.001928847748786211,
"learning_rate": 1.8782691393247742e-05,
"loss": 0.0116,
"step": 9280
},
{
"epoch": 6.06,
"grad_norm": 0.0004771009262185544,
"learning_rate": 1.8743065462038357e-05,
"loss": 0.0,
"step": 9290
},
{
"epoch": 6.06,
"grad_norm": 0.0006694819312542677,
"learning_rate": 1.8703439530828976e-05,
"loss": 0.0743,
"step": 9300
},
{
"epoch": 6.06,
"grad_norm": 0.010220357216894627,
"learning_rate": 1.8663813599619592e-05,
"loss": 0.0001,
"step": 9310
},
{
"epoch": 6.06,
"grad_norm": 0.0014199281577020884,
"learning_rate": 1.862418766841021e-05,
"loss": 0.0103,
"step": 9320
},
{
"epoch": 6.07,
"grad_norm": 0.001806290470995009,
"learning_rate": 1.8584561737200826e-05,
"loss": 0.0006,
"step": 9330
},
{
"epoch": 6.07,
"grad_norm": 0.0005750704440288246,
"learning_rate": 1.854493580599144e-05,
"loss": 0.0003,
"step": 9340
},
{
"epoch": 6.07,
"grad_norm": 0.0009846296161413193,
"learning_rate": 1.850530987478206e-05,
"loss": 0.0013,
"step": 9350
},
{
"epoch": 6.07,
"grad_norm": 0.0016641179099678993,
"learning_rate": 1.8465683943572676e-05,
"loss": 0.06,
"step": 9360
},
{
"epoch": 6.07,
"grad_norm": 0.0014823460951447487,
"learning_rate": 1.842605801236329e-05,
"loss": 0.0008,
"step": 9370
},
{
"epoch": 6.07,
"grad_norm": 0.0026860409416258335,
"learning_rate": 1.838643208115391e-05,
"loss": 0.0005,
"step": 9380
},
{
"epoch": 6.07,
"grad_norm": 0.0014451199676841497,
"learning_rate": 1.8346806149944525e-05,
"loss": 0.0001,
"step": 9390
},
{
"epoch": 6.07,
"grad_norm": 0.004795750603079796,
"learning_rate": 1.830718021873514e-05,
"loss": 0.0003,
"step": 9400
},
{
"epoch": 6.07,
"grad_norm": 0.0025767534971237183,
"learning_rate": 1.8267554287525756e-05,
"loss": 0.0002,
"step": 9410
},
{
"epoch": 6.07,
"grad_norm": 0.0006194358575157821,
"learning_rate": 1.8227928356316375e-05,
"loss": 0.0738,
"step": 9420
},
{
"epoch": 6.07,
"grad_norm": 0.007454677484929562,
"learning_rate": 1.818830242510699e-05,
"loss": 0.0048,
"step": 9430
},
{
"epoch": 6.07,
"grad_norm": 0.0012314959894865751,
"learning_rate": 1.8148676493897606e-05,
"loss": 0.0045,
"step": 9440
},
{
"epoch": 6.07,
"grad_norm": 0.0007009029504843056,
"learning_rate": 1.8109050562688224e-05,
"loss": 0.2806,
"step": 9450
},
{
"epoch": 6.07,
"grad_norm": 0.0005554054514504969,
"learning_rate": 1.806942463147884e-05,
"loss": 0.0001,
"step": 9460
},
{
"epoch": 6.08,
"grad_norm": 0.00048346296534873545,
"learning_rate": 1.8029798700269455e-05,
"loss": 0.0458,
"step": 9470
},
{
"epoch": 6.08,
"grad_norm": 0.0011084218276664615,
"learning_rate": 1.7990172769060074e-05,
"loss": 0.0,
"step": 9480
},
{
"epoch": 6.08,
"grad_norm": 0.0003880435542669147,
"learning_rate": 1.795054683785069e-05,
"loss": 0.0142,
"step": 9490
},
{
"epoch": 6.08,
"grad_norm": 0.0006134477443993092,
"learning_rate": 1.7910920906641308e-05,
"loss": 0.0,
"step": 9500
},
{
"epoch": 6.08,
"grad_norm": 0.0005665639764629304,
"learning_rate": 1.7871294975431924e-05,
"loss": 0.0,
"step": 9510
},
{
"epoch": 6.08,
"grad_norm": 0.0003921152965631336,
"learning_rate": 1.783166904422254e-05,
"loss": 0.0,
"step": 9520
},
{
"epoch": 6.08,
"grad_norm": 0.001750220195390284,
"learning_rate": 1.7792043113013158e-05,
"loss": 0.0437,
"step": 9530
},
{
"epoch": 6.08,
"grad_norm": 0.0012650929857045412,
"learning_rate": 1.7752417181803773e-05,
"loss": 0.3903,
"step": 9540
},
{
"epoch": 6.08,
"grad_norm": 57.58509063720703,
"learning_rate": 1.7712791250594392e-05,
"loss": 0.0546,
"step": 9550
},
{
"epoch": 6.08,
"grad_norm": 0.00026887169224210083,
"learning_rate": 1.7673165319385008e-05,
"loss": 0.0001,
"step": 9560
},
{
"epoch": 6.08,
"grad_norm": 0.0019770157523453236,
"learning_rate": 1.7633539388175623e-05,
"loss": 0.1362,
"step": 9570
},
{
"epoch": 6.08,
"grad_norm": 0.0007267651380971074,
"learning_rate": 1.7593913456966242e-05,
"loss": 0.0006,
"step": 9580
},
{
"epoch": 6.08,
"grad_norm": 0.001434961101040244,
"learning_rate": 1.7554287525756857e-05,
"loss": 0.0263,
"step": 9590
},
{
"epoch": 6.08,
"grad_norm": 0.00044755820999853313,
"learning_rate": 1.7514661594547473e-05,
"loss": 0.0,
"step": 9600
},
{
"epoch": 6.09,
"grad_norm": 0.000376471463823691,
"learning_rate": 1.7475035663338088e-05,
"loss": 0.0207,
"step": 9610
},
{
"epoch": 6.09,
"grad_norm": 0.014877337031066418,
"learning_rate": 1.7435409732128703e-05,
"loss": 0.0,
"step": 9620
},
{
"epoch": 6.09,
"grad_norm": 0.0012328572338446975,
"learning_rate": 1.7395783800919322e-05,
"loss": 0.0259,
"step": 9630
},
{
"epoch": 6.09,
"grad_norm": 0.0011149095371365547,
"learning_rate": 1.7356157869709938e-05,
"loss": 0.0,
"step": 9640
},
{
"epoch": 6.09,
"grad_norm": 0.000868526753038168,
"learning_rate": 1.7316531938500556e-05,
"loss": 0.0107,
"step": 9650
},
{
"epoch": 6.09,
"grad_norm": 0.0003520081809256226,
"learning_rate": 1.7276906007291172e-05,
"loss": 0.0,
"step": 9660
},
{
"epoch": 6.09,
"grad_norm": 0.00045317449257709086,
"learning_rate": 1.7237280076081787e-05,
"loss": 0.1845,
"step": 9670
},
{
"epoch": 6.09,
"grad_norm": 0.00035488023422658443,
"learning_rate": 1.7197654144872406e-05,
"loss": 0.0,
"step": 9680
},
{
"epoch": 6.09,
"grad_norm": 0.0007327714120037854,
"learning_rate": 1.715802821366302e-05,
"loss": 0.0001,
"step": 9690
},
{
"epoch": 6.09,
"grad_norm": 0.0025048046372830868,
"learning_rate": 1.7118402282453637e-05,
"loss": 0.0001,
"step": 9700
},
{
"epoch": 6.09,
"grad_norm": 0.00043331715278327465,
"learning_rate": 1.7078776351244256e-05,
"loss": 0.0167,
"step": 9710
},
{
"epoch": 6.09,
"grad_norm": 0.0004680192796513438,
"learning_rate": 1.703915042003487e-05,
"loss": 0.0,
"step": 9720
},
{
"epoch": 6.09,
"grad_norm": 0.0005406651180237532,
"learning_rate": 1.699952448882549e-05,
"loss": 0.0,
"step": 9730
},
{
"epoch": 6.09,
"grad_norm": 1.6138055324554443,
"learning_rate": 1.6959898557616105e-05,
"loss": 0.0005,
"step": 9740
},
{
"epoch": 6.1,
"grad_norm": 0.0005159827414900064,
"learning_rate": 1.692027262640672e-05,
"loss": 0.0001,
"step": 9750
},
{
"epoch": 6.1,
"grad_norm": 0.001156438491307199,
"learning_rate": 1.688064669519734e-05,
"loss": 0.0,
"step": 9760
},
{
"epoch": 6.1,
"grad_norm": 0.00034518956090323627,
"learning_rate": 1.6841020763987955e-05,
"loss": 0.0316,
"step": 9770
},
{
"epoch": 6.1,
"grad_norm": 0.0007839056779630482,
"learning_rate": 1.680139483277857e-05,
"loss": 0.0016,
"step": 9780
},
{
"epoch": 6.1,
"grad_norm": 0.000456125068012625,
"learning_rate": 1.676176890156919e-05,
"loss": 0.0,
"step": 9790
},
{
"epoch": 6.1,
"grad_norm": 0.0007673576474189758,
"learning_rate": 1.6722142970359805e-05,
"loss": 0.0,
"step": 9800
},
{
"epoch": 6.1,
"grad_norm": 0.000683379708789289,
"learning_rate": 1.6682517039150423e-05,
"loss": 0.0,
"step": 9810
},
{
"epoch": 6.1,
"grad_norm": 0.0009253775351680815,
"learning_rate": 1.664289110794104e-05,
"loss": 0.0001,
"step": 9820
},
{
"epoch": 6.1,
"eval_accuracy": 0.9473684210526315,
"eval_loss": 0.3900492191314697,
"eval_runtime": 2421.9145,
"eval_samples_per_second": 0.275,
"eval_steps_per_second": 0.137,
"step": 9821
},
{
"epoch": 7.0,
"grad_norm": 0.0004204540455248207,
"learning_rate": 1.6603265176731654e-05,
"loss": 0.5604,
"step": 9830
},
{
"epoch": 7.0,
"grad_norm": 0.00036831918987445533,
"learning_rate": 1.656363924552227e-05,
"loss": 0.0001,
"step": 9840
},
{
"epoch": 7.0,
"grad_norm": 0.00044371382682584226,
"learning_rate": 1.6524013314312885e-05,
"loss": 0.0,
"step": 9850
},
{
"epoch": 7.0,
"grad_norm": 0.0005366410478018224,
"learning_rate": 1.6484387383103504e-05,
"loss": 0.0,
"step": 9860
},
{
"epoch": 7.0,
"grad_norm": 0.0006946607609279454,
"learning_rate": 1.644476145189412e-05,
"loss": 0.0,
"step": 9870
},
{
"epoch": 7.0,
"grad_norm": 0.00034042325569316745,
"learning_rate": 1.6405135520684735e-05,
"loss": 0.0001,
"step": 9880
},
{
"epoch": 7.0,
"grad_norm": 0.00025543957599438727,
"learning_rate": 1.6365509589475353e-05,
"loss": 0.0001,
"step": 9890
},
{
"epoch": 7.01,
"grad_norm": 0.0005577169358730316,
"learning_rate": 1.632588365826597e-05,
"loss": 0.0001,
"step": 9900
},
{
"epoch": 7.01,
"grad_norm": 0.0007238492253236473,
"learning_rate": 1.6286257727056588e-05,
"loss": 0.0001,
"step": 9910
},
{
"epoch": 7.01,
"grad_norm": 0.00047818326856940985,
"learning_rate": 1.6246631795847203e-05,
"loss": 0.2815,
"step": 9920
},
{
"epoch": 7.01,
"grad_norm": 0.004355975892394781,
"learning_rate": 1.620700586463782e-05,
"loss": 0.0,
"step": 9930
},
{
"epoch": 7.01,
"grad_norm": 0.0002552367513999343,
"learning_rate": 1.6167379933428437e-05,
"loss": 0.0,
"step": 9940
},
{
"epoch": 7.01,
"grad_norm": 0.0011531308991834521,
"learning_rate": 1.6127754002219053e-05,
"loss": 0.0,
"step": 9950
},
{
"epoch": 7.01,
"grad_norm": 0.0009820818668231368,
"learning_rate": 1.608812807100967e-05,
"loss": 0.0,
"step": 9960
},
{
"epoch": 7.01,
"grad_norm": 0.0006331288604997098,
"learning_rate": 1.6048502139800287e-05,
"loss": 0.0001,
"step": 9970
},
{
"epoch": 7.01,
"grad_norm": 23.247167587280273,
"learning_rate": 1.6008876208590902e-05,
"loss": 0.4045,
"step": 9980
},
{
"epoch": 7.01,
"grad_norm": 0.0005796991754323244,
"learning_rate": 1.596925027738152e-05,
"loss": 0.2264,
"step": 9990
},
{
"epoch": 7.01,
"grad_norm": 0.0002432822366245091,
"learning_rate": 1.5929624346172137e-05,
"loss": 0.0,
"step": 10000
},
{
"epoch": 7.01,
"grad_norm": 0.00044970333692617714,
"learning_rate": 1.5889998414962752e-05,
"loss": 0.5798,
"step": 10010
},
{
"epoch": 7.01,
"grad_norm": 0.00043129053665325046,
"learning_rate": 1.585037248375337e-05,
"loss": 0.0,
"step": 10020
},
{
"epoch": 7.01,
"grad_norm": 0.0009400318958796561,
"learning_rate": 1.5810746552543986e-05,
"loss": 0.0001,
"step": 10030
},
{
"epoch": 7.02,
"grad_norm": 0.001612617983482778,
"learning_rate": 1.57711206213346e-05,
"loss": 0.184,
"step": 10040
},
{
"epoch": 7.02,
"grad_norm": 0.003397996537387371,
"learning_rate": 1.5731494690125217e-05,
"loss": 0.0001,
"step": 10050
},
{
"epoch": 7.02,
"grad_norm": 0.003118938999250531,
"learning_rate": 1.5691868758915836e-05,
"loss": 0.0118,
"step": 10060
},
{
"epoch": 7.02,
"grad_norm": 0.0016245280858129263,
"learning_rate": 1.565224282770645e-05,
"loss": 0.0002,
"step": 10070
},
{
"epoch": 7.02,
"grad_norm": 0.003330792533233762,
"learning_rate": 1.5612616896497067e-05,
"loss": 0.0001,
"step": 10080
},
{
"epoch": 7.02,
"grad_norm": 0.01675890013575554,
"learning_rate": 1.5572990965287685e-05,
"loss": 0.1361,
"step": 10090
},
{
"epoch": 7.02,
"grad_norm": 0.0016782371094450355,
"learning_rate": 1.55333650340783e-05,
"loss": 0.0001,
"step": 10100
},
{
"epoch": 7.02,
"grad_norm": 0.0006982790655456483,
"learning_rate": 1.5493739102868916e-05,
"loss": 0.0,
"step": 10110
},
{
"epoch": 7.02,
"grad_norm": 0.004016962368041277,
"learning_rate": 1.5454113171659535e-05,
"loss": 0.0002,
"step": 10120
},
{
"epoch": 7.02,
"grad_norm": 0.0016343995230272412,
"learning_rate": 1.541448724045015e-05,
"loss": 0.0001,
"step": 10130
},
{
"epoch": 7.02,
"grad_norm": 0.0003891861706506461,
"learning_rate": 1.537486130924077e-05,
"loss": 0.0002,
"step": 10140
},
{
"epoch": 7.02,
"grad_norm": 0.0005568304331973195,
"learning_rate": 1.5335235378031385e-05,
"loss": 0.0,
"step": 10150
},
{
"epoch": 7.02,
"grad_norm": 0.0009192074066959321,
"learning_rate": 1.5295609446822e-05,
"loss": 0.0001,
"step": 10160
},
{
"epoch": 7.02,
"grad_norm": 0.00041831223643384874,
"learning_rate": 1.5255983515612617e-05,
"loss": 0.0,
"step": 10170
},
{
"epoch": 7.03,
"grad_norm": 0.002276873914524913,
"learning_rate": 1.5216357584403234e-05,
"loss": 0.0001,
"step": 10180
},
{
"epoch": 7.03,
"grad_norm": 0.0021974798291921616,
"learning_rate": 1.5176731653193851e-05,
"loss": 0.0,
"step": 10190
},
{
"epoch": 7.03,
"grad_norm": 0.03672347217798233,
"learning_rate": 1.5137105721984468e-05,
"loss": 0.0001,
"step": 10200
},
{
"epoch": 7.03,
"grad_norm": 0.0004960880614817142,
"learning_rate": 1.5097479790775086e-05,
"loss": 0.0001,
"step": 10210
},
{
"epoch": 7.03,
"grad_norm": 0.0003698187356349081,
"learning_rate": 1.5057853859565701e-05,
"loss": 0.0,
"step": 10220
},
{
"epoch": 7.03,
"grad_norm": 30.79059600830078,
"learning_rate": 1.5018227928356318e-05,
"loss": 0.013,
"step": 10230
},
{
"epoch": 7.03,
"grad_norm": 0.00040281921974383295,
"learning_rate": 1.4978601997146935e-05,
"loss": 0.0001,
"step": 10240
},
{
"epoch": 7.03,
"grad_norm": 0.001930213999003172,
"learning_rate": 1.4938976065937552e-05,
"loss": 0.0005,
"step": 10250
},
{
"epoch": 7.03,
"grad_norm": 0.0016294418601319194,
"learning_rate": 1.4899350134728166e-05,
"loss": 0.0001,
"step": 10260
},
{
"epoch": 7.03,
"grad_norm": 0.0029418901540338993,
"learning_rate": 1.4859724203518781e-05,
"loss": 0.0001,
"step": 10270
},
{
"epoch": 7.03,
"grad_norm": 0.0005179463187232614,
"learning_rate": 1.4820098272309399e-05,
"loss": 0.0,
"step": 10280
},
{
"epoch": 7.03,
"grad_norm": 117.77789306640625,
"learning_rate": 1.4780472341100016e-05,
"loss": 0.0485,
"step": 10290
},
{
"epoch": 7.03,
"grad_norm": 0.0006345610017888248,
"learning_rate": 1.4740846409890633e-05,
"loss": 0.0072,
"step": 10300
},
{
"epoch": 7.03,
"grad_norm": 0.004750640131533146,
"learning_rate": 1.470122047868125e-05,
"loss": 0.0001,
"step": 10310
},
{
"epoch": 7.04,
"grad_norm": 0.0016635819338262081,
"learning_rate": 1.4661594547471865e-05,
"loss": 0.5727,
"step": 10320
},
{
"epoch": 7.04,
"grad_norm": 0.0009257107740268111,
"learning_rate": 1.4621968616262482e-05,
"loss": 0.019,
"step": 10330
},
{
"epoch": 7.04,
"grad_norm": 0.0004995065974071622,
"learning_rate": 1.45823426850531e-05,
"loss": 0.0,
"step": 10340
},
{
"epoch": 7.04,
"grad_norm": 0.003641214920207858,
"learning_rate": 1.4542716753843717e-05,
"loss": 0.3737,
"step": 10350
},
{
"epoch": 7.04,
"grad_norm": 0.0005538859404623508,
"learning_rate": 1.4503090822634332e-05,
"loss": 0.1959,
"step": 10360
},
{
"epoch": 7.04,
"grad_norm": 0.0024865760933607817,
"learning_rate": 1.4463464891424949e-05,
"loss": 0.0,
"step": 10370
},
{
"epoch": 7.04,
"grad_norm": 0.0012498443247750401,
"learning_rate": 1.4423838960215566e-05,
"loss": 0.0003,
"step": 10380
},
{
"epoch": 7.04,
"grad_norm": 0.003093864070251584,
"learning_rate": 1.4384213029006183e-05,
"loss": 0.0001,
"step": 10390
},
{
"epoch": 7.04,
"grad_norm": 0.0016697756946086884,
"learning_rate": 1.4344587097796799e-05,
"loss": 0.0001,
"step": 10400
},
{
"epoch": 7.04,
"grad_norm": 0.0024545500054955482,
"learning_rate": 1.4304961166587416e-05,
"loss": 0.0,
"step": 10410
},
{
"epoch": 7.04,
"grad_norm": 0.0010031814454123378,
"learning_rate": 1.4265335235378033e-05,
"loss": 0.0,
"step": 10420
},
{
"epoch": 7.04,
"grad_norm": 0.000834242207929492,
"learning_rate": 1.422570930416865e-05,
"loss": 0.214,
"step": 10430
},
{
"epoch": 7.04,
"grad_norm": 0.0008862247341312468,
"learning_rate": 1.4186083372959265e-05,
"loss": 0.0001,
"step": 10440
},
{
"epoch": 7.04,
"grad_norm": 0.0010633807396516204,
"learning_rate": 1.4146457441749883e-05,
"loss": 0.0104,
"step": 10450
},
{
"epoch": 7.05,
"grad_norm": 24.041336059570312,
"learning_rate": 1.41068315105405e-05,
"loss": 0.0049,
"step": 10460
},
{
"epoch": 7.05,
"grad_norm": 0.0011859643273055553,
"learning_rate": 1.4067205579331113e-05,
"loss": 0.0001,
"step": 10470
},
{
"epoch": 7.05,
"grad_norm": 0.0006510709063149989,
"learning_rate": 1.402757964812173e-05,
"loss": 0.001,
"step": 10480
},
{
"epoch": 7.05,
"grad_norm": 0.000353335402905941,
"learning_rate": 1.3987953716912348e-05,
"loss": 0.0002,
"step": 10490
},
{
"epoch": 7.05,
"grad_norm": 0.0005472557386383414,
"learning_rate": 1.3948327785702963e-05,
"loss": 0.0001,
"step": 10500
},
{
"epoch": 7.05,
"grad_norm": 0.0006235065520741045,
"learning_rate": 1.390870185449358e-05,
"loss": 0.0004,
"step": 10510
},
{
"epoch": 7.05,
"grad_norm": 0.00039498330443166196,
"learning_rate": 1.3869075923284197e-05,
"loss": 0.0,
"step": 10520
},
{
"epoch": 7.05,
"grad_norm": 0.0009459428838454187,
"learning_rate": 1.3829449992074814e-05,
"loss": 0.125,
"step": 10530
},
{
"epoch": 7.05,
"grad_norm": 0.000288288458250463,
"learning_rate": 1.378982406086543e-05,
"loss": 0.0005,
"step": 10540
},
{
"epoch": 7.05,
"grad_norm": 0.0010236125672236085,
"learning_rate": 1.3750198129656047e-05,
"loss": 0.0001,
"step": 10550
},
{
"epoch": 7.05,
"grad_norm": 0.0005923935095779598,
"learning_rate": 1.3710572198446664e-05,
"loss": 0.0,
"step": 10560
},
{
"epoch": 7.05,
"grad_norm": 0.001925037824548781,
"learning_rate": 1.3670946267237281e-05,
"loss": 0.0,
"step": 10570
},
{
"epoch": 7.05,
"grad_norm": 0.0010172594338655472,
"learning_rate": 1.3631320336027898e-05,
"loss": 0.0,
"step": 10580
},
{
"epoch": 7.05,
"grad_norm": 0.00022477912716567516,
"learning_rate": 1.3591694404818514e-05,
"loss": 0.4117,
"step": 10590
},
{
"epoch": 7.06,
"grad_norm": 0.001964542781934142,
"learning_rate": 1.355206847360913e-05,
"loss": 0.0,
"step": 10600
},
{
"epoch": 7.06,
"grad_norm": 0.0008729117107577622,
"learning_rate": 1.3512442542399748e-05,
"loss": 0.005,
"step": 10610
},
{
"epoch": 7.06,
"grad_norm": 0.0013933833688497543,
"learning_rate": 1.3472816611190365e-05,
"loss": 0.0,
"step": 10620
},
{
"epoch": 7.06,
"grad_norm": 0.720376193523407,
"learning_rate": 1.343319067998098e-05,
"loss": 0.0005,
"step": 10630
},
{
"epoch": 7.06,
"grad_norm": 0.0024294324684888124,
"learning_rate": 1.3393564748771597e-05,
"loss": 0.5075,
"step": 10640
},
{
"epoch": 7.06,
"grad_norm": 0.00034565231180749834,
"learning_rate": 1.3353938817562215e-05,
"loss": 0.0017,
"step": 10650
},
{
"epoch": 7.06,
"grad_norm": 0.0005883481935597956,
"learning_rate": 1.3314312886352832e-05,
"loss": 0.0,
"step": 10660
},
{
"epoch": 7.06,
"grad_norm": 0.001018638489767909,
"learning_rate": 1.3274686955143447e-05,
"loss": 0.0005,
"step": 10670
},
{
"epoch": 7.06,
"grad_norm": 0.000567563867662102,
"learning_rate": 1.3235061023934064e-05,
"loss": 0.0071,
"step": 10680
},
{
"epoch": 7.06,
"grad_norm": 0.0006969044334255159,
"learning_rate": 1.3195435092724678e-05,
"loss": 0.2151,
"step": 10690
},
{
"epoch": 7.06,
"grad_norm": 0.000248556025326252,
"learning_rate": 1.3155809161515295e-05,
"loss": 0.0,
"step": 10700
},
{
"epoch": 7.06,
"grad_norm": 0.0008631858509033918,
"learning_rate": 1.3116183230305912e-05,
"loss": 0.0001,
"step": 10710
},
{
"epoch": 7.06,
"grad_norm": 0.001508180401287973,
"learning_rate": 1.307655729909653e-05,
"loss": 0.0001,
"step": 10720
},
{
"epoch": 7.06,
"grad_norm": 0.0005554750678129494,
"learning_rate": 1.3036931367887145e-05,
"loss": 0.0,
"step": 10730
},
{
"epoch": 7.07,
"grad_norm": 0.0003934628330171108,
"learning_rate": 1.2997305436677762e-05,
"loss": 0.0,
"step": 10740
},
{
"epoch": 7.07,
"grad_norm": 0.001727793482132256,
"learning_rate": 1.2957679505468379e-05,
"loss": 0.0,
"step": 10750
},
{
"epoch": 7.07,
"grad_norm": 0.002404275583103299,
"learning_rate": 1.2918053574258996e-05,
"loss": 0.0,
"step": 10760
},
{
"epoch": 7.07,
"grad_norm": 0.0008175792172551155,
"learning_rate": 1.2878427643049611e-05,
"loss": 0.0,
"step": 10770
},
{
"epoch": 7.07,
"grad_norm": 0.0022247559390962124,
"learning_rate": 1.2838801711840228e-05,
"loss": 0.0001,
"step": 10780
},
{
"epoch": 7.07,
"grad_norm": 0.0014646403724327683,
"learning_rate": 1.2799175780630846e-05,
"loss": 0.001,
"step": 10790
},
{
"epoch": 7.07,
"grad_norm": 0.0020718346349895,
"learning_rate": 1.2759549849421463e-05,
"loss": 0.0001,
"step": 10800
},
{
"epoch": 7.07,
"grad_norm": 0.0008824109099805355,
"learning_rate": 1.2719923918212078e-05,
"loss": 0.5802,
"step": 10810
},
{
"epoch": 7.07,
"grad_norm": 0.0007525623659603298,
"learning_rate": 1.2680297987002695e-05,
"loss": 0.0,
"step": 10820
},
{
"epoch": 7.07,
"grad_norm": 64.19054412841797,
"learning_rate": 1.2640672055793312e-05,
"loss": 0.4105,
"step": 10830
},
{
"epoch": 7.07,
"grad_norm": 0.0019008672097697854,
"learning_rate": 1.260104612458393e-05,
"loss": 0.0005,
"step": 10840
},
{
"epoch": 7.07,
"grad_norm": 0.0010036254534497857,
"learning_rate": 1.2561420193374545e-05,
"loss": 0.0006,
"step": 10850
},
{
"epoch": 7.07,
"grad_norm": 0.0006118649616837502,
"learning_rate": 1.2521794262165162e-05,
"loss": 0.0,
"step": 10860
},
{
"epoch": 7.07,
"grad_norm": 0.003166553797200322,
"learning_rate": 1.2482168330955777e-05,
"loss": 0.0001,
"step": 10870
},
{
"epoch": 7.08,
"grad_norm": 0.001118882093578577,
"learning_rate": 1.2442542399746394e-05,
"loss": 0.0,
"step": 10880
},
{
"epoch": 7.08,
"grad_norm": 0.0010632872581481934,
"learning_rate": 1.2402916468537012e-05,
"loss": 0.0001,
"step": 10890
},
{
"epoch": 7.08,
"grad_norm": 0.001252860063686967,
"learning_rate": 1.2363290537327627e-05,
"loss": 0.0001,
"step": 10900
},
{
"epoch": 7.08,
"grad_norm": 0.003005104372277856,
"learning_rate": 1.2323664606118244e-05,
"loss": 0.0001,
"step": 10910
},
{
"epoch": 7.08,
"grad_norm": 0.004219905007630587,
"learning_rate": 1.2284038674908861e-05,
"loss": 0.0006,
"step": 10920
},
{
"epoch": 7.08,
"grad_norm": 0.0003512962721288204,
"learning_rate": 1.2244412743699478e-05,
"loss": 0.0,
"step": 10930
},
{
"epoch": 7.08,
"grad_norm": 0.0026769828982651234,
"learning_rate": 1.2204786812490095e-05,
"loss": 0.0001,
"step": 10940
},
{
"epoch": 7.08,
"grad_norm": 0.0003416830440983176,
"learning_rate": 1.216516088128071e-05,
"loss": 0.0,
"step": 10950
},
{
"epoch": 7.08,
"grad_norm": 0.0010573529871180654,
"learning_rate": 1.2125534950071326e-05,
"loss": 0.0,
"step": 10960
},
{
"epoch": 7.08,
"grad_norm": 0.0013822006294503808,
"learning_rate": 1.2085909018861943e-05,
"loss": 0.2014,
"step": 10970
},
{
"epoch": 7.08,
"grad_norm": 0.0003184191882610321,
"learning_rate": 1.204628308765256e-05,
"loss": 0.0,
"step": 10980
},
{
"epoch": 7.08,
"grad_norm": 0.004402919672429562,
"learning_rate": 1.2006657156443178e-05,
"loss": 0.006,
"step": 10990
},
{
"epoch": 7.08,
"grad_norm": 4.32048225402832,
"learning_rate": 1.1967031225233793e-05,
"loss": 0.0019,
"step": 11000
},
{
"epoch": 7.08,
"grad_norm": 0.0006507772486656904,
"learning_rate": 1.192740529402441e-05,
"loss": 0.0,
"step": 11010
},
{
"epoch": 7.09,
"grad_norm": 0.0004223829018883407,
"learning_rate": 1.1887779362815027e-05,
"loss": 0.0001,
"step": 11020
},
{
"epoch": 7.09,
"grad_norm": 0.0006153634749352932,
"learning_rate": 1.1848153431605644e-05,
"loss": 0.0001,
"step": 11030
},
{
"epoch": 7.09,
"grad_norm": 0.0007072246517054737,
"learning_rate": 1.180852750039626e-05,
"loss": 0.0,
"step": 11040
},
{
"epoch": 7.09,
"grad_norm": 0.0007087733829393983,
"learning_rate": 1.1768901569186877e-05,
"loss": 0.0591,
"step": 11050
},
{
"epoch": 7.09,
"grad_norm": 0.00040967803215608,
"learning_rate": 1.1729275637977492e-05,
"loss": 0.0,
"step": 11060
},
{
"epoch": 7.09,
"grad_norm": 0.0018612256972119212,
"learning_rate": 1.168964970676811e-05,
"loss": 0.0001,
"step": 11070
},
{
"epoch": 7.09,
"grad_norm": 0.0016640513204038143,
"learning_rate": 1.1650023775558726e-05,
"loss": 0.0001,
"step": 11080
},
{
"epoch": 7.09,
"grad_norm": 0.004190579988062382,
"learning_rate": 1.1610397844349342e-05,
"loss": 0.0,
"step": 11090
},
{
"epoch": 7.09,
"grad_norm": 0.001836647279560566,
"learning_rate": 1.1570771913139959e-05,
"loss": 0.0,
"step": 11100
},
{
"epoch": 7.09,
"grad_norm": 0.0005556776304729283,
"learning_rate": 1.1531145981930576e-05,
"loss": 0.0001,
"step": 11110
},
{
"epoch": 7.09,
"grad_norm": 0.0008808193379081786,
"learning_rate": 1.1491520050721193e-05,
"loss": 0.0001,
"step": 11120
},
{
"epoch": 7.09,
"grad_norm": 0.0001681848953012377,
"learning_rate": 1.1451894119511809e-05,
"loss": 0.2524,
"step": 11130
},
{
"epoch": 7.09,
"grad_norm": 0.0008354588062502444,
"learning_rate": 1.1412268188302426e-05,
"loss": 0.0,
"step": 11140
},
{
"epoch": 7.09,
"grad_norm": 0.00051628437358886,
"learning_rate": 1.1372642257093043e-05,
"loss": 0.0,
"step": 11150
},
{
"epoch": 7.1,
"grad_norm": 0.0008145067258737981,
"learning_rate": 1.133301632588366e-05,
"loss": 0.0001,
"step": 11160
},
{
"epoch": 7.1,
"grad_norm": 0.0004701870202552527,
"learning_rate": 1.1293390394674275e-05,
"loss": 0.1907,
"step": 11170
},
{
"epoch": 7.1,
"grad_norm": 0.0011952131753787398,
"learning_rate": 1.125376446346489e-05,
"loss": 0.0,
"step": 11180
},
{
"epoch": 7.1,
"grad_norm": 0.00032050846493802965,
"learning_rate": 1.1214138532255508e-05,
"loss": 0.0,
"step": 11190
},
{
"epoch": 7.1,
"grad_norm": 0.0006612459546886384,
"learning_rate": 1.1174512601046125e-05,
"loss": 0.0001,
"step": 11200
},
{
"epoch": 7.1,
"grad_norm": 0.0030058922711759806,
"learning_rate": 1.1134886669836742e-05,
"loss": 0.0,
"step": 11210
},
{
"epoch": 7.1,
"grad_norm": 0.0034754828084260225,
"learning_rate": 1.1095260738627357e-05,
"loss": 0.0003,
"step": 11220
},
{
"epoch": 7.1,
"eval_accuracy": 0.9578947368421052,
"eval_loss": 0.2666740119457245,
"eval_runtime": 2322.4119,
"eval_samples_per_second": 0.286,
"eval_steps_per_second": 0.143,
"step": 11224
},
{
"epoch": 8.0,
"grad_norm": 0.004194905515760183,
"learning_rate": 1.1055634807417975e-05,
"loss": 0.0001,
"step": 11230
},
{
"epoch": 8.0,
"grad_norm": 0.0024937952402979136,
"learning_rate": 1.1016008876208592e-05,
"loss": 0.0,
"step": 11240
},
{
"epoch": 8.0,
"grad_norm": 0.00039031429332681,
"learning_rate": 1.0976382944999209e-05,
"loss": 0.0,
"step": 11250
},
{
"epoch": 8.0,
"grad_norm": 0.005691041238605976,
"learning_rate": 1.0936757013789826e-05,
"loss": 0.0001,
"step": 11260
},
{
"epoch": 8.0,
"grad_norm": 0.00017179737915284932,
"learning_rate": 1.0897131082580441e-05,
"loss": 0.0001,
"step": 11270
},
{
"epoch": 8.0,
"grad_norm": 0.000949267705436796,
"learning_rate": 1.0857505151371057e-05,
"loss": 0.0001,
"step": 11280
},
{
"epoch": 8.0,
"grad_norm": 0.0003036385169252753,
"learning_rate": 1.0817879220161674e-05,
"loss": 0.0001,
"step": 11290
},
{
"epoch": 8.01,
"grad_norm": 0.004243906121701002,
"learning_rate": 1.0778253288952291e-05,
"loss": 0.0005,
"step": 11300
},
{
"epoch": 8.01,
"grad_norm": 0.0010142240207642317,
"learning_rate": 1.0738627357742908e-05,
"loss": 0.0,
"step": 11310
},
{
"epoch": 8.01,
"grad_norm": 0.0010380720486864448,
"learning_rate": 1.0699001426533523e-05,
"loss": 0.0001,
"step": 11320
},
{
"epoch": 8.01,
"grad_norm": 0.0005737761966884136,
"learning_rate": 1.065937549532414e-05,
"loss": 0.0001,
"step": 11330
},
{
"epoch": 8.01,
"grad_norm": 0.001465731067582965,
"learning_rate": 1.0619749564114758e-05,
"loss": 0.0,
"step": 11340
},
{
"epoch": 8.01,
"grad_norm": 0.002500841859728098,
"learning_rate": 1.0580123632905375e-05,
"loss": 0.0,
"step": 11350
},
{
"epoch": 8.01,
"grad_norm": 0.00024287942505907267,
"learning_rate": 1.054049770169599e-05,
"loss": 0.0,
"step": 11360
},
{
"epoch": 8.01,
"grad_norm": 0.0006320082466118038,
"learning_rate": 1.0500871770486607e-05,
"loss": 0.0001,
"step": 11370
},
{
"epoch": 8.01,
"grad_norm": 0.00030024844454601407,
"learning_rate": 1.0461245839277224e-05,
"loss": 0.0097,
"step": 11380
},
{
"epoch": 8.01,
"grad_norm": 0.00043432554230093956,
"learning_rate": 1.042161990806784e-05,
"loss": 0.0421,
"step": 11390
},
{
"epoch": 8.01,
"grad_norm": 0.002737953094765544,
"learning_rate": 1.0381993976858457e-05,
"loss": 0.0,
"step": 11400
},
{
"epoch": 8.01,
"grad_norm": 0.000816858431790024,
"learning_rate": 1.0342368045649072e-05,
"loss": 0.0,
"step": 11410
},
{
"epoch": 8.01,
"grad_norm": 0.00036986047052778304,
"learning_rate": 1.030274211443969e-05,
"loss": 0.0,
"step": 11420
},
{
"epoch": 8.01,
"grad_norm": 0.0004323932225815952,
"learning_rate": 1.0263116183230307e-05,
"loss": 0.0002,
"step": 11430
},
{
"epoch": 8.02,
"grad_norm": 0.0004024511144962162,
"learning_rate": 1.0223490252020924e-05,
"loss": 0.421,
"step": 11440
},
{
"epoch": 8.02,
"grad_norm": 0.0024430027697235346,
"learning_rate": 1.0183864320811539e-05,
"loss": 0.0,
"step": 11450
},
{
"epoch": 8.02,
"grad_norm": 0.001345345051959157,
"learning_rate": 1.0144238389602156e-05,
"loss": 0.0,
"step": 11460
},
{
"epoch": 8.02,
"grad_norm": 0.0006153620779514313,
"learning_rate": 1.0104612458392773e-05,
"loss": 0.0001,
"step": 11470
},
{
"epoch": 8.02,
"grad_norm": 0.0015972702531144023,
"learning_rate": 1.006498652718339e-05,
"loss": 0.0,
"step": 11480
},
{
"epoch": 8.02,
"grad_norm": 0.0008706132066436112,
"learning_rate": 1.0025360595974006e-05,
"loss": 0.0001,
"step": 11490
},
{
"epoch": 8.02,
"grad_norm": 0.001384895178489387,
"learning_rate": 9.985734664764621e-06,
"loss": 0.0001,
"step": 11500
},
{
"epoch": 8.02,
"grad_norm": 0.0010631100740283728,
"learning_rate": 9.946108733555238e-06,
"loss": 0.0002,
"step": 11510
},
{
"epoch": 8.02,
"grad_norm": 0.0007243629661388695,
"learning_rate": 9.906482802345855e-06,
"loss": 0.0001,
"step": 11520
},
{
"epoch": 8.02,
"grad_norm": 74.74536895751953,
"learning_rate": 9.866856871136473e-06,
"loss": 0.0797,
"step": 11530
},
{
"epoch": 8.02,
"grad_norm": 0.005114846862852573,
"learning_rate": 9.827230939927088e-06,
"loss": 0.0001,
"step": 11540
},
{
"epoch": 8.02,
"grad_norm": 0.0024818070232868195,
"learning_rate": 9.787605008717705e-06,
"loss": 0.0001,
"step": 11550
},
{
"epoch": 8.02,
"grad_norm": 0.00041646783938631415,
"learning_rate": 9.747979077508322e-06,
"loss": 0.0,
"step": 11560
},
{
"epoch": 8.02,
"grad_norm": 0.0007332797977142036,
"learning_rate": 9.70835314629894e-06,
"loss": 0.0116,
"step": 11570
},
{
"epoch": 8.03,
"grad_norm": 0.0007879806798882782,
"learning_rate": 9.668727215089556e-06,
"loss": 0.0,
"step": 11580
},
{
"epoch": 8.03,
"grad_norm": 0.0009714935440570116,
"learning_rate": 9.629101283880172e-06,
"loss": 0.0001,
"step": 11590
},
{
"epoch": 8.03,
"grad_norm": 0.0009343309211544693,
"learning_rate": 9.589475352670787e-06,
"loss": 0.5311,
"step": 11600
},
{
"epoch": 8.03,
"grad_norm": 0.00037891563260927796,
"learning_rate": 9.549849421461404e-06,
"loss": 0.002,
"step": 11610
},
{
"epoch": 8.03,
"grad_norm": 0.001986218150705099,
"learning_rate": 9.510223490252021e-06,
"loss": 0.0111,
"step": 11620
},
{
"epoch": 8.03,
"grad_norm": 0.0015318701043725014,
"learning_rate": 9.470597559042639e-06,
"loss": 0.0001,
"step": 11630
},
{
"epoch": 8.03,
"grad_norm": 0.0006765589932911098,
"learning_rate": 9.430971627833254e-06,
"loss": 0.0,
"step": 11640
},
{
"epoch": 8.03,
"grad_norm": 0.0005000099190510809,
"learning_rate": 9.391345696623871e-06,
"loss": 0.0,
"step": 11650
},
{
"epoch": 8.03,
"grad_norm": 0.0017080691177397966,
"learning_rate": 9.351719765414488e-06,
"loss": 0.0002,
"step": 11660
},
{
"epoch": 8.03,
"grad_norm": 0.0017356324242427945,
"learning_rate": 9.312093834205105e-06,
"loss": 0.0001,
"step": 11670
},
{
"epoch": 8.03,
"grad_norm": 0.0010568661382421851,
"learning_rate": 9.27246790299572e-06,
"loss": 0.0,
"step": 11680
},
{
"epoch": 8.03,
"grad_norm": 0.0014095234218984842,
"learning_rate": 9.232841971786338e-06,
"loss": 0.0021,
"step": 11690
},
{
"epoch": 8.03,
"grad_norm": 0.00167833489831537,
"learning_rate": 9.193216040576955e-06,
"loss": 0.0001,
"step": 11700
},
{
"epoch": 8.03,
"grad_norm": 0.0016795884585008025,
"learning_rate": 9.15359010936757e-06,
"loss": 0.0002,
"step": 11710
},
{
"epoch": 8.04,
"grad_norm": 0.0003502909676171839,
"learning_rate": 9.113964178158187e-06,
"loss": 0.0,
"step": 11720
},
{
"epoch": 8.04,
"grad_norm": 0.10941363871097565,
"learning_rate": 9.074338246948803e-06,
"loss": 0.0001,
"step": 11730
},
{
"epoch": 8.04,
"grad_norm": 0.0014083647402003407,
"learning_rate": 9.03471231573942e-06,
"loss": 0.3081,
"step": 11740
},
{
"epoch": 8.04,
"grad_norm": 0.0014537267852574587,
"learning_rate": 8.995086384530037e-06,
"loss": 0.0,
"step": 11750
},
{
"epoch": 8.04,
"grad_norm": 0.0005781695363111794,
"learning_rate": 8.955460453320654e-06,
"loss": 0.0,
"step": 11760
},
{
"epoch": 8.04,
"grad_norm": 0.0007176825893111527,
"learning_rate": 8.91583452211127e-06,
"loss": 0.0,
"step": 11770
},
{
"epoch": 8.04,
"grad_norm": 0.000545515853445977,
"learning_rate": 8.876208590901887e-06,
"loss": 0.0,
"step": 11780
},
{
"epoch": 8.04,
"grad_norm": 0.0025596795603632927,
"learning_rate": 8.836582659692504e-06,
"loss": 0.0002,
"step": 11790
},
{
"epoch": 8.04,
"grad_norm": 0.030005350708961487,
"learning_rate": 8.796956728483121e-06,
"loss": 0.0001,
"step": 11800
},
{
"epoch": 8.04,
"grad_norm": 0.00035480278893373907,
"learning_rate": 8.757330797273736e-06,
"loss": 0.0018,
"step": 11810
},
{
"epoch": 8.04,
"grad_norm": 0.004515402484685183,
"learning_rate": 8.717704866064352e-06,
"loss": 0.0,
"step": 11820
},
{
"epoch": 8.04,
"grad_norm": 0.0032044288236647844,
"learning_rate": 8.678078934854969e-06,
"loss": 0.0036,
"step": 11830
},
{
"epoch": 8.04,
"grad_norm": 0.0009629257838241756,
"learning_rate": 8.638453003645586e-06,
"loss": 0.149,
"step": 11840
},
{
"epoch": 8.04,
"grad_norm": 0.0024080132134258747,
"learning_rate": 8.598827072436203e-06,
"loss": 0.0003,
"step": 11850
},
{
"epoch": 8.05,
"grad_norm": 0.0015089749358594418,
"learning_rate": 8.559201141226818e-06,
"loss": 0.0,
"step": 11860
},
{
"epoch": 8.05,
"grad_norm": 0.0019321951549500227,
"learning_rate": 8.519575210017436e-06,
"loss": 0.0,
"step": 11870
},
{
"epoch": 8.05,
"grad_norm": 0.005924368277192116,
"learning_rate": 8.479949278808053e-06,
"loss": 0.0,
"step": 11880
},
{
"epoch": 8.05,
"grad_norm": 0.0007942487136460841,
"learning_rate": 8.44032334759867e-06,
"loss": 0.0,
"step": 11890
},
{
"epoch": 8.05,
"grad_norm": 0.0022497123572975397,
"learning_rate": 8.400697416389285e-06,
"loss": 0.1055,
"step": 11900
},
{
"epoch": 8.05,
"grad_norm": 0.0006818937254138291,
"learning_rate": 8.361071485179902e-06,
"loss": 0.0001,
"step": 11910
},
{
"epoch": 8.05,
"grad_norm": 0.0004379069432616234,
"learning_rate": 8.32144555397052e-06,
"loss": 0.0,
"step": 11920
},
{
"epoch": 8.05,
"grad_norm": 0.00047276023542508483,
"learning_rate": 8.281819622761135e-06,
"loss": 0.0,
"step": 11930
},
{
"epoch": 8.05,
"grad_norm": 0.0004771367821376771,
"learning_rate": 8.242193691551752e-06,
"loss": 0.0,
"step": 11940
},
{
"epoch": 8.05,
"grad_norm": 0.0005501986015588045,
"learning_rate": 8.202567760342367e-06,
"loss": 0.0001,
"step": 11950
},
{
"epoch": 8.05,
"grad_norm": 0.0011177220148965716,
"learning_rate": 8.162941829132984e-06,
"loss": 0.0703,
"step": 11960
},
{
"epoch": 8.05,
"grad_norm": 0.0004951581358909607,
"learning_rate": 8.123315897923602e-06,
"loss": 0.0,
"step": 11970
},
{
"epoch": 8.05,
"grad_norm": 0.0008309069671668112,
"learning_rate": 8.083689966714219e-06,
"loss": 0.0,
"step": 11980
},
{
"epoch": 8.05,
"grad_norm": 0.000472767511382699,
"learning_rate": 8.044064035504836e-06,
"loss": 0.5261,
"step": 11990
},
{
"epoch": 8.06,
"grad_norm": 0.00044904148671776056,
"learning_rate": 8.004438104295451e-06,
"loss": 0.0,
"step": 12000
},
{
"epoch": 8.06,
"grad_norm": 0.0004107660206500441,
"learning_rate": 7.964812173086068e-06,
"loss": 0.0,
"step": 12010
},
{
"epoch": 8.06,
"grad_norm": 0.00042746157851070166,
"learning_rate": 7.925186241876685e-06,
"loss": 0.0,
"step": 12020
},
{
"epoch": 8.06,
"grad_norm": 0.0007110532023943961,
"learning_rate": 7.8855603106673e-06,
"loss": 0.0,
"step": 12030
},
{
"epoch": 8.06,
"grad_norm": 0.0007705994066782296,
"learning_rate": 7.845934379457918e-06,
"loss": 0.0,
"step": 12040
},
{
"epoch": 8.06,
"grad_norm": 0.0006966418586671352,
"learning_rate": 7.806308448248533e-06,
"loss": 0.0,
"step": 12050
},
{
"epoch": 8.06,
"grad_norm": 0.020446307957172394,
"learning_rate": 7.76668251703915e-06,
"loss": 0.0001,
"step": 12060
},
{
"epoch": 8.06,
"grad_norm": 0.0004377638688310981,
"learning_rate": 7.727056585829768e-06,
"loss": 0.0,
"step": 12070
},
{
"epoch": 8.06,
"grad_norm": 0.00036184967029839754,
"learning_rate": 7.687430654620385e-06,
"loss": 0.0002,
"step": 12080
},
{
"epoch": 8.06,
"grad_norm": 0.00029569625621661544,
"learning_rate": 7.647804723411e-06,
"loss": 0.0001,
"step": 12090
},
{
"epoch": 8.06,
"grad_norm": 0.0003205812827218324,
"learning_rate": 7.608178792201617e-06,
"loss": 0.0236,
"step": 12100
},
{
"epoch": 8.06,
"grad_norm": 0.00043995011947117746,
"learning_rate": 7.568552860992234e-06,
"loss": 0.0001,
"step": 12110
},
{
"epoch": 8.06,
"grad_norm": 0.0021792801562696695,
"learning_rate": 7.5289269297828505e-06,
"loss": 0.0,
"step": 12120
},
{
"epoch": 8.06,
"grad_norm": 0.003733986523002386,
"learning_rate": 7.489300998573468e-06,
"loss": 0.0,
"step": 12130
},
{
"epoch": 8.07,
"grad_norm": 0.001138021470978856,
"learning_rate": 7.449675067364083e-06,
"loss": 0.0001,
"step": 12140
},
{
"epoch": 8.07,
"grad_norm": 0.0003544053470250219,
"learning_rate": 7.410049136154699e-06,
"loss": 0.0,
"step": 12150
},
{
"epoch": 8.07,
"grad_norm": 0.0007718518027104437,
"learning_rate": 7.370423204945316e-06,
"loss": 0.0,
"step": 12160
},
{
"epoch": 8.07,
"grad_norm": 0.000794577703345567,
"learning_rate": 7.330797273735933e-06,
"loss": 0.0,
"step": 12170
},
{
"epoch": 8.07,
"grad_norm": 0.0007835258147679269,
"learning_rate": 7.29117134252655e-06,
"loss": 0.0,
"step": 12180
},
{
"epoch": 8.07,
"grad_norm": 0.0008351559517905116,
"learning_rate": 7.251545411317166e-06,
"loss": 0.0,
"step": 12190
},
{
"epoch": 8.07,
"grad_norm": 0.001067393459379673,
"learning_rate": 7.211919480107783e-06,
"loss": 0.0,
"step": 12200
},
{
"epoch": 8.07,
"grad_norm": 0.0005535806412808597,
"learning_rate": 7.172293548898399e-06,
"loss": 0.0,
"step": 12210
},
{
"epoch": 8.07,
"grad_norm": 0.0013392162509262562,
"learning_rate": 7.1326676176890165e-06,
"loss": 0.0,
"step": 12220
},
{
"epoch": 8.07,
"grad_norm": 0.011801800690591335,
"learning_rate": 7.093041686479633e-06,
"loss": 0.0001,
"step": 12230
},
{
"epoch": 8.07,
"grad_norm": 0.0003349117760080844,
"learning_rate": 7.05341575527025e-06,
"loss": 0.0,
"step": 12240
},
{
"epoch": 8.07,
"grad_norm": 0.0009791525080800056,
"learning_rate": 7.013789824060865e-06,
"loss": 0.0,
"step": 12250
},
{
"epoch": 8.07,
"grad_norm": 0.0003134564030915499,
"learning_rate": 6.9741638928514815e-06,
"loss": 0.0,
"step": 12260
},
{
"epoch": 8.07,
"grad_norm": 0.0011281302431598306,
"learning_rate": 6.934537961642099e-06,
"loss": 0.0003,
"step": 12270
},
{
"epoch": 8.08,
"grad_norm": 0.0004596656945068389,
"learning_rate": 6.894912030432715e-06,
"loss": 0.0,
"step": 12280
},
{
"epoch": 8.08,
"grad_norm": 0.017007848247885704,
"learning_rate": 6.855286099223332e-06,
"loss": 0.1894,
"step": 12290
},
{
"epoch": 8.08,
"grad_norm": 0.0009561624028719962,
"learning_rate": 6.815660168013949e-06,
"loss": 0.0001,
"step": 12300
},
{
"epoch": 8.08,
"grad_norm": 0.0006208363920450211,
"learning_rate": 6.776034236804565e-06,
"loss": 0.0,
"step": 12310
},
{
"epoch": 8.08,
"grad_norm": 0.00040551909478381276,
"learning_rate": 6.7364083055951825e-06,
"loss": 0.0,
"step": 12320
},
{
"epoch": 8.08,
"grad_norm": 0.0010045063681900501,
"learning_rate": 6.696782374385799e-06,
"loss": 0.0001,
"step": 12330
},
{
"epoch": 8.08,
"grad_norm": 0.001559635391458869,
"learning_rate": 6.657156443176416e-06,
"loss": 0.1317,
"step": 12340
},
{
"epoch": 8.08,
"grad_norm": 0.00036661443300545216,
"learning_rate": 6.617530511967032e-06,
"loss": 0.0,
"step": 12350
},
{
"epoch": 8.08,
"grad_norm": 0.0022761470172554255,
"learning_rate": 6.5779045807576475e-06,
"loss": 0.0,
"step": 12360
},
{
"epoch": 8.08,
"grad_norm": 0.0002771701547317207,
"learning_rate": 6.538278649548265e-06,
"loss": 0.0,
"step": 12370
},
{
"epoch": 8.08,
"grad_norm": 0.0009405760793015361,
"learning_rate": 6.498652718338881e-06,
"loss": 0.0,
"step": 12380
},
{
"epoch": 8.08,
"grad_norm": 0.0011777085019275546,
"learning_rate": 6.459026787129498e-06,
"loss": 0.0,
"step": 12390
},
{
"epoch": 8.08,
"grad_norm": 0.0007950080907903612,
"learning_rate": 6.419400855920114e-06,
"loss": 0.0023,
"step": 12400
},
{
"epoch": 8.08,
"grad_norm": 0.000328573863953352,
"learning_rate": 6.379774924710731e-06,
"loss": 0.0001,
"step": 12410
},
{
"epoch": 8.09,
"grad_norm": 0.000489677709992975,
"learning_rate": 6.340148993501348e-06,
"loss": 0.0,
"step": 12420
},
{
"epoch": 8.09,
"grad_norm": 19.678516387939453,
"learning_rate": 6.300523062291965e-06,
"loss": 0.2121,
"step": 12430
},
{
"epoch": 8.09,
"grad_norm": 0.001576061244122684,
"learning_rate": 6.260897131082581e-06,
"loss": 0.2006,
"step": 12440
},
{
"epoch": 8.09,
"grad_norm": 0.0010969837894663215,
"learning_rate": 6.221271199873197e-06,
"loss": 0.0089,
"step": 12450
},
{
"epoch": 8.09,
"grad_norm": 0.0006820796988904476,
"learning_rate": 6.1816452686638135e-06,
"loss": 0.0001,
"step": 12460
},
{
"epoch": 8.09,
"grad_norm": 0.0039375657215714455,
"learning_rate": 6.142019337454431e-06,
"loss": 0.0,
"step": 12470
},
{
"epoch": 8.09,
"grad_norm": 0.00018676265608519316,
"learning_rate": 6.102393406245048e-06,
"loss": 0.0002,
"step": 12480
},
{
"epoch": 8.09,
"grad_norm": 0.0015864548040553927,
"learning_rate": 6.062767475035663e-06,
"loss": 0.0,
"step": 12490
},
{
"epoch": 8.09,
"grad_norm": 0.0005812132731080055,
"learning_rate": 6.02314154382628e-06,
"loss": 0.0001,
"step": 12500
},
{
"epoch": 8.09,
"grad_norm": 0.0015394919319078326,
"learning_rate": 5.9835156126168965e-06,
"loss": 0.0,
"step": 12510
},
{
"epoch": 8.09,
"grad_norm": 0.5876509547233582,
"learning_rate": 5.943889681407514e-06,
"loss": 0.0002,
"step": 12520
},
{
"epoch": 8.09,
"grad_norm": 0.001257477910257876,
"learning_rate": 5.90426375019813e-06,
"loss": 0.0,
"step": 12530
},
{
"epoch": 8.09,
"grad_norm": 0.007748996838927269,
"learning_rate": 5.864637818988746e-06,
"loss": 0.0002,
"step": 12540
},
{
"epoch": 8.09,
"grad_norm": 0.0004220679693389684,
"learning_rate": 5.825011887779363e-06,
"loss": 0.0001,
"step": 12550
},
{
"epoch": 8.1,
"grad_norm": 0.0003514468262437731,
"learning_rate": 5.7853859565699795e-06,
"loss": 0.0062,
"step": 12560
},
{
"epoch": 8.1,
"grad_norm": 0.0004685299936681986,
"learning_rate": 5.745760025360597e-06,
"loss": 0.0016,
"step": 12570
},
{
"epoch": 8.1,
"grad_norm": 0.0002851441968232393,
"learning_rate": 5.706134094151213e-06,
"loss": 0.0004,
"step": 12580
},
{
"epoch": 8.1,
"grad_norm": 0.0006324647110886872,
"learning_rate": 5.66650816294183e-06,
"loss": 0.0,
"step": 12590
},
{
"epoch": 8.1,
"grad_norm": 0.000717841787263751,
"learning_rate": 5.626882231732445e-06,
"loss": 0.0,
"step": 12600
},
{
"epoch": 8.1,
"grad_norm": 0.001114896615035832,
"learning_rate": 5.5872563005230625e-06,
"loss": 0.0,
"step": 12610
},
{
"epoch": 8.1,
"grad_norm": 0.0011514411307871342,
"learning_rate": 5.547630369313679e-06,
"loss": 0.0001,
"step": 12620
},
{
"epoch": 8.1,
"eval_accuracy": 0.9654135338345865,
"eval_loss": 0.2435862421989441,
"eval_runtime": 2357.1776,
"eval_samples_per_second": 0.282,
"eval_steps_per_second": 0.141,
"step": 12627
},
{
"epoch": 9.0,
"grad_norm": 0.00044704281026497483,
"learning_rate": 5.508004438104296e-06,
"loss": 0.0,
"step": 12630
},
{
"epoch": 9.0,
"grad_norm": 0.00041269470239058137,
"learning_rate": 5.468378506894913e-06,
"loss": 0.0,
"step": 12640
},
{
"epoch": 9.0,
"grad_norm": 0.0003670216246973723,
"learning_rate": 5.428752575685528e-06,
"loss": 0.0,
"step": 12650
},
{
"epoch": 9.0,
"grad_norm": 0.003106119344010949,
"learning_rate": 5.3891266444761455e-06,
"loss": 0.0,
"step": 12660
},
{
"epoch": 9.0,
"grad_norm": 0.00040537622408010066,
"learning_rate": 5.349500713266762e-06,
"loss": 0.0,
"step": 12670
},
{
"epoch": 9.0,
"grad_norm": 0.00037262984551489353,
"learning_rate": 5.309874782057379e-06,
"loss": 0.0,
"step": 12680
},
{
"epoch": 9.0,
"grad_norm": 0.000418797048041597,
"learning_rate": 5.270248850847995e-06,
"loss": 0.0,
"step": 12690
},
{
"epoch": 9.01,
"grad_norm": 0.0015914670657366514,
"learning_rate": 5.230622919638612e-06,
"loss": 0.0,
"step": 12700
},
{
"epoch": 9.01,
"grad_norm": 0.005690779071301222,
"learning_rate": 5.1909969884292285e-06,
"loss": 0.0976,
"step": 12710
},
{
"epoch": 9.01,
"grad_norm": 0.001181070227175951,
"learning_rate": 5.151371057219845e-06,
"loss": 0.0,
"step": 12720
},
{
"epoch": 9.01,
"grad_norm": 0.0007823907653801143,
"learning_rate": 5.111745126010462e-06,
"loss": 0.0,
"step": 12730
},
{
"epoch": 9.01,
"grad_norm": 0.0010620895773172379,
"learning_rate": 5.072119194801078e-06,
"loss": 0.0,
"step": 12740
},
{
"epoch": 9.01,
"grad_norm": 0.00028126072720624506,
"learning_rate": 5.032493263591695e-06,
"loss": 0.0052,
"step": 12750
},
{
"epoch": 9.01,
"grad_norm": 0.0005754511221311986,
"learning_rate": 4.992867332382311e-06,
"loss": 0.0,
"step": 12760
},
{
"epoch": 9.01,
"grad_norm": 0.000247256743023172,
"learning_rate": 4.953241401172928e-06,
"loss": 0.0,
"step": 12770
},
{
"epoch": 9.01,
"grad_norm": 0.0017203809693455696,
"learning_rate": 4.913615469963544e-06,
"loss": 0.0001,
"step": 12780
},
{
"epoch": 9.01,
"grad_norm": 0.0005222621257416904,
"learning_rate": 4.873989538754161e-06,
"loss": 0.0,
"step": 12790
},
{
"epoch": 9.01,
"grad_norm": 0.00047639888362027705,
"learning_rate": 4.834363607544778e-06,
"loss": 0.0001,
"step": 12800
},
{
"epoch": 9.01,
"grad_norm": 0.0015658453339710832,
"learning_rate": 4.794737676335394e-06,
"loss": 0.0,
"step": 12810
},
{
"epoch": 9.01,
"grad_norm": 0.0002700120967347175,
"learning_rate": 4.755111745126011e-06,
"loss": 0.0,
"step": 12820
},
{
"epoch": 9.01,
"grad_norm": 0.00036174681736156344,
"learning_rate": 4.715485813916627e-06,
"loss": 0.0,
"step": 12830
},
{
"epoch": 9.02,
"grad_norm": 0.00048193742986768484,
"learning_rate": 4.675859882707244e-06,
"loss": 0.0001,
"step": 12840
},
{
"epoch": 9.02,
"grad_norm": 0.00021181856573093683,
"learning_rate": 4.63623395149786e-06,
"loss": 0.0,
"step": 12850
},
{
"epoch": 9.02,
"grad_norm": 0.0007221151608973742,
"learning_rate": 4.5966080202884774e-06,
"loss": 0.0001,
"step": 12860
},
{
"epoch": 9.02,
"grad_norm": 0.0008499003597535193,
"learning_rate": 4.556982089079094e-06,
"loss": 0.0,
"step": 12870
},
{
"epoch": 9.02,
"grad_norm": 0.00024478594423271716,
"learning_rate": 4.51735615786971e-06,
"loss": 0.0,
"step": 12880
},
{
"epoch": 9.02,
"grad_norm": 0.000799850036855787,
"learning_rate": 4.477730226660327e-06,
"loss": 0.0,
"step": 12890
},
{
"epoch": 9.02,
"grad_norm": 0.0012479170691221952,
"learning_rate": 4.438104295450943e-06,
"loss": 0.0007,
"step": 12900
},
{
"epoch": 9.02,
"grad_norm": 0.0008572249207645655,
"learning_rate": 4.3984783642415604e-06,
"loss": 0.0,
"step": 12910
},
{
"epoch": 9.02,
"grad_norm": 0.00028230881434865296,
"learning_rate": 4.358852433032176e-06,
"loss": 0.0773,
"step": 12920
},
{
"epoch": 9.02,
"grad_norm": 0.0003641119983512908,
"learning_rate": 4.319226501822793e-06,
"loss": 0.0,
"step": 12930
},
{
"epoch": 9.02,
"grad_norm": 0.0009531981777399778,
"learning_rate": 4.279600570613409e-06,
"loss": 0.0,
"step": 12940
},
{
"epoch": 9.02,
"grad_norm": 0.00067020149435848,
"learning_rate": 4.239974639404026e-06,
"loss": 0.0,
"step": 12950
},
{
"epoch": 9.02,
"grad_norm": 0.0001659138360992074,
"learning_rate": 4.200348708194643e-06,
"loss": 0.0,
"step": 12960
},
{
"epoch": 9.02,
"grad_norm": 0.0005148449563421309,
"learning_rate": 4.16072277698526e-06,
"loss": 0.0107,
"step": 12970
},
{
"epoch": 9.03,
"grad_norm": 0.000638917728792876,
"learning_rate": 4.121096845775876e-06,
"loss": 0.0,
"step": 12980
},
{
"epoch": 9.03,
"grad_norm": 0.00047383896890096366,
"learning_rate": 4.081470914566492e-06,
"loss": 0.0,
"step": 12990
},
{
"epoch": 9.03,
"grad_norm": 0.0007675238302908838,
"learning_rate": 4.041844983357109e-06,
"loss": 0.0,
"step": 13000
},
{
"epoch": 9.03,
"grad_norm": 0.001697351224720478,
"learning_rate": 4.0022190521477256e-06,
"loss": 0.0,
"step": 13010
},
{
"epoch": 9.03,
"grad_norm": 0.00020665867486968637,
"learning_rate": 3.962593120938343e-06,
"loss": 0.0,
"step": 13020
},
{
"epoch": 9.03,
"grad_norm": 0.001027750549837947,
"learning_rate": 3.922967189728959e-06,
"loss": 0.2632,
"step": 13030
},
{
"epoch": 9.03,
"grad_norm": 0.003146632807329297,
"learning_rate": 3.883341258519575e-06,
"loss": 0.0,
"step": 13040
},
{
"epoch": 9.03,
"grad_norm": 0.0007864089566282928,
"learning_rate": 3.843715327310192e-06,
"loss": 0.0044,
"step": 13050
},
{
"epoch": 9.03,
"grad_norm": 0.00022077991161495447,
"learning_rate": 3.8040893961008086e-06,
"loss": 0.0,
"step": 13060
},
{
"epoch": 9.03,
"grad_norm": 0.0005595972179435194,
"learning_rate": 3.7644634648914252e-06,
"loss": 0.0,
"step": 13070
},
{
"epoch": 9.03,
"grad_norm": 0.0005725977243855596,
"learning_rate": 3.7248375336820415e-06,
"loss": 0.0,
"step": 13080
},
{
"epoch": 9.03,
"grad_norm": 0.0011127095203846693,
"learning_rate": 3.685211602472658e-06,
"loss": 0.0,
"step": 13090
},
{
"epoch": 9.03,
"grad_norm": 0.001887647551484406,
"learning_rate": 3.645585671263275e-06,
"loss": 0.0,
"step": 13100
},
{
"epoch": 9.03,
"grad_norm": 0.0005976618267595768,
"learning_rate": 3.6059597400538916e-06,
"loss": 0.0003,
"step": 13110
},
{
"epoch": 9.04,
"grad_norm": 0.0006656855694018304,
"learning_rate": 3.5663338088445082e-06,
"loss": 0.0,
"step": 13120
},
{
"epoch": 9.04,
"grad_norm": 0.003439901163801551,
"learning_rate": 3.526707877635125e-06,
"loss": 0.0,
"step": 13130
},
{
"epoch": 9.04,
"grad_norm": 0.00043997442116960883,
"learning_rate": 3.4870819464257408e-06,
"loss": 0.0,
"step": 13140
},
{
"epoch": 9.04,
"grad_norm": 0.0005484743160195649,
"learning_rate": 3.4474560152163574e-06,
"loss": 0.0,
"step": 13150
},
{
"epoch": 9.04,
"grad_norm": 0.00040827819611877203,
"learning_rate": 3.4078300840069746e-06,
"loss": 0.0,
"step": 13160
},
{
"epoch": 9.04,
"grad_norm": 0.005499335937201977,
"learning_rate": 3.3682041527975912e-06,
"loss": 0.0001,
"step": 13170
},
{
"epoch": 9.04,
"grad_norm": 0.001736334292218089,
"learning_rate": 3.328578221588208e-06,
"loss": 0.0,
"step": 13180
},
{
"epoch": 9.04,
"grad_norm": 0.0006113905692473054,
"learning_rate": 3.2889522903788238e-06,
"loss": 0.0,
"step": 13190
},
{
"epoch": 9.04,
"grad_norm": 0.001001613331027329,
"learning_rate": 3.2493263591694404e-06,
"loss": 0.1631,
"step": 13200
},
{
"epoch": 9.04,
"grad_norm": 0.0003023295139428228,
"learning_rate": 3.209700427960057e-06,
"loss": 0.0,
"step": 13210
},
{
"epoch": 9.04,
"grad_norm": 0.0009469907963648438,
"learning_rate": 3.170074496750674e-06,
"loss": 0.0,
"step": 13220
},
{
"epoch": 9.04,
"grad_norm": 0.0007909215637482703,
"learning_rate": 3.1304485655412905e-06,
"loss": 0.0,
"step": 13230
},
{
"epoch": 9.04,
"grad_norm": 0.001787104643881321,
"learning_rate": 3.0908226343319067e-06,
"loss": 0.0,
"step": 13240
},
{
"epoch": 9.04,
"grad_norm": 0.0008837388013489544,
"learning_rate": 3.051196703122524e-06,
"loss": 0.0,
"step": 13250
},
{
"epoch": 9.05,
"grad_norm": 0.0007934242021292448,
"learning_rate": 3.01157077191314e-06,
"loss": 0.0004,
"step": 13260
},
{
"epoch": 9.05,
"grad_norm": 0.0011570610804483294,
"learning_rate": 2.971944840703757e-06,
"loss": 0.0001,
"step": 13270
},
{
"epoch": 9.05,
"grad_norm": 0.00029090800671838224,
"learning_rate": 2.932318909494373e-06,
"loss": 0.0,
"step": 13280
},
{
"epoch": 9.05,
"grad_norm": 0.0010709144407883286,
"learning_rate": 2.8926929782849897e-06,
"loss": 0.0,
"step": 13290
},
{
"epoch": 9.05,
"grad_norm": 0.001289168605580926,
"learning_rate": 2.8530670470756064e-06,
"loss": 0.0,
"step": 13300
},
{
"epoch": 9.05,
"grad_norm": 0.002187453443184495,
"learning_rate": 2.8134411158662227e-06,
"loss": 0.0,
"step": 13310
},
{
"epoch": 9.05,
"grad_norm": 0.0007116499473340809,
"learning_rate": 2.7738151846568394e-06,
"loss": 0.0,
"step": 13320
},
{
"epoch": 9.05,
"grad_norm": 0.000514859682880342,
"learning_rate": 2.7341892534474565e-06,
"loss": 0.0001,
"step": 13330
},
{
"epoch": 9.05,
"grad_norm": 0.0007328620995394886,
"learning_rate": 2.6945633222380727e-06,
"loss": 0.0691,
"step": 13340
},
{
"epoch": 9.05,
"grad_norm": 0.0007036814349703491,
"learning_rate": 2.6549373910286894e-06,
"loss": 0.0,
"step": 13350
},
{
"epoch": 9.05,
"grad_norm": 0.001070524798706174,
"learning_rate": 2.615311459819306e-06,
"loss": 0.0,
"step": 13360
},
{
"epoch": 9.05,
"grad_norm": 0.0008939993567764759,
"learning_rate": 2.5756855286099224e-06,
"loss": 0.0001,
"step": 13370
},
{
"epoch": 9.05,
"grad_norm": 0.0004034818266518414,
"learning_rate": 2.536059597400539e-06,
"loss": 0.0206,
"step": 13380
},
{
"epoch": 9.05,
"grad_norm": 4.411261081695557,
"learning_rate": 2.4964336661911553e-06,
"loss": 0.0124,
"step": 13390
},
{
"epoch": 9.06,
"grad_norm": 0.0006528793601319194,
"learning_rate": 2.456807734981772e-06,
"loss": 0.0,
"step": 13400
},
{
"epoch": 9.06,
"grad_norm": 0.0003673941537272185,
"learning_rate": 2.417181803772389e-06,
"loss": 0.0,
"step": 13410
},
{
"epoch": 9.06,
"grad_norm": 0.001056182780303061,
"learning_rate": 2.3775558725630054e-06,
"loss": 0.0,
"step": 13420
},
{
"epoch": 9.06,
"grad_norm": 0.0012370526092126966,
"learning_rate": 2.337929941353622e-06,
"loss": 0.0001,
"step": 13430
},
{
"epoch": 9.06,
"grad_norm": 0.0015783560229465365,
"learning_rate": 2.2983040101442387e-06,
"loss": 0.0,
"step": 13440
},
{
"epoch": 9.06,
"grad_norm": 0.0001985041017178446,
"learning_rate": 2.258678078934855e-06,
"loss": 0.0016,
"step": 13450
},
{
"epoch": 9.06,
"grad_norm": 0.0010269788326695561,
"learning_rate": 2.2190521477254717e-06,
"loss": 0.1057,
"step": 13460
},
{
"epoch": 9.06,
"grad_norm": 0.04036625847220421,
"learning_rate": 2.179426216516088e-06,
"loss": 0.2287,
"step": 13470
},
{
"epoch": 9.06,
"grad_norm": 0.000473200052510947,
"learning_rate": 2.1398002853067046e-06,
"loss": 0.0001,
"step": 13480
},
{
"epoch": 9.06,
"grad_norm": 0.0003723807749338448,
"learning_rate": 2.1001743540973213e-06,
"loss": 0.0,
"step": 13490
},
{
"epoch": 9.06,
"grad_norm": 0.0007169354357756674,
"learning_rate": 2.060548422887938e-06,
"loss": 0.0008,
"step": 13500
},
{
"epoch": 9.06,
"grad_norm": 0.00031334979576058686,
"learning_rate": 2.0209224916785547e-06,
"loss": 0.0,
"step": 13510
},
{
"epoch": 9.06,
"grad_norm": 0.000616435194388032,
"learning_rate": 1.9812965604691713e-06,
"loss": 0.0,
"step": 13520
},
{
"epoch": 9.06,
"grad_norm": 0.0008787320111878216,
"learning_rate": 1.9416706292597876e-06,
"loss": 0.0,
"step": 13530
},
{
"epoch": 9.07,
"grad_norm": 0.0002825538394972682,
"learning_rate": 1.9020446980504043e-06,
"loss": 0.0021,
"step": 13540
},
{
"epoch": 9.07,
"grad_norm": 0.002063804306089878,
"learning_rate": 1.8624187668410208e-06,
"loss": 0.0004,
"step": 13550
},
{
"epoch": 9.07,
"grad_norm": 0.000512151513248682,
"learning_rate": 1.8227928356316374e-06,
"loss": 0.0,
"step": 13560
},
{
"epoch": 9.07,
"grad_norm": 0.0006224968819878995,
"learning_rate": 1.7831669044222541e-06,
"loss": 0.0,
"step": 13570
},
{
"epoch": 9.07,
"grad_norm": 0.00019008757953997701,
"learning_rate": 1.7435409732128704e-06,
"loss": 0.0,
"step": 13580
},
{
"epoch": 9.07,
"grad_norm": 0.0002794242464005947,
"learning_rate": 1.7039150420034873e-06,
"loss": 0.0,
"step": 13590
},
{
"epoch": 9.07,
"grad_norm": 0.0009566029766574502,
"learning_rate": 1.664289110794104e-06,
"loss": 0.0001,
"step": 13600
},
{
"epoch": 9.07,
"grad_norm": 0.0003199617494828999,
"learning_rate": 1.6246631795847202e-06,
"loss": 0.0001,
"step": 13610
},
{
"epoch": 9.07,
"grad_norm": 0.00032697312417440116,
"learning_rate": 1.585037248375337e-06,
"loss": 0.0,
"step": 13620
},
{
"epoch": 9.07,
"grad_norm": 0.002565112430602312,
"learning_rate": 1.5454113171659534e-06,
"loss": 0.0,
"step": 13630
},
{
"epoch": 9.07,
"grad_norm": 237.59519958496094,
"learning_rate": 1.50578538595657e-06,
"loss": 0.1313,
"step": 13640
},
{
"epoch": 9.07,
"grad_norm": 0.0006662964588031173,
"learning_rate": 1.4661594547471865e-06,
"loss": 0.0,
"step": 13650
},
{
"epoch": 9.07,
"grad_norm": 0.0011941486736759543,
"learning_rate": 1.4265335235378032e-06,
"loss": 0.0,
"step": 13660
},
{
"epoch": 9.07,
"grad_norm": 0.0028123382944613695,
"learning_rate": 1.3869075923284197e-06,
"loss": 0.0,
"step": 13670
},
{
"epoch": 9.08,
"grad_norm": 0.0008815588662400842,
"learning_rate": 1.3472816611190364e-06,
"loss": 0.0277,
"step": 13680
},
{
"epoch": 9.08,
"grad_norm": 0.00045147593482397497,
"learning_rate": 1.307655729909653e-06,
"loss": 0.0,
"step": 13690
},
{
"epoch": 9.08,
"grad_norm": 0.00011046286817872897,
"learning_rate": 1.2680297987002695e-06,
"loss": 0.1484,
"step": 13700
},
{
"epoch": 9.08,
"grad_norm": 0.0018034332897514105,
"learning_rate": 1.228403867490886e-06,
"loss": 0.0027,
"step": 13710
},
{
"epoch": 9.08,
"grad_norm": 0.000713842804543674,
"learning_rate": 1.1887779362815027e-06,
"loss": 0.0,
"step": 13720
},
{
"epoch": 9.08,
"grad_norm": 0.0010389587841928005,
"learning_rate": 1.1491520050721194e-06,
"loss": 0.0,
"step": 13730
},
{
"epoch": 9.08,
"grad_norm": 0.0003368295438122004,
"learning_rate": 1.1095260738627358e-06,
"loss": 0.0,
"step": 13740
},
{
"epoch": 9.08,
"grad_norm": 0.000346412300132215,
"learning_rate": 1.0699001426533523e-06,
"loss": 0.0,
"step": 13750
},
{
"epoch": 9.08,
"grad_norm": 0.0004677934921346605,
"learning_rate": 1.030274211443969e-06,
"loss": 0.0,
"step": 13760
},
{
"epoch": 9.08,
"grad_norm": 0.0008401199011132121,
"learning_rate": 9.906482802345857e-07,
"loss": 0.0,
"step": 13770
},
{
"epoch": 9.08,
"grad_norm": 0.0003339408722240478,
"learning_rate": 9.510223490252021e-07,
"loss": 0.0002,
"step": 13780
},
{
"epoch": 9.08,
"grad_norm": 0.0004967558197677135,
"learning_rate": 9.113964178158187e-07,
"loss": 0.0,
"step": 13790
},
{
"epoch": 9.08,
"grad_norm": 0.002963978098705411,
"learning_rate": 8.717704866064352e-07,
"loss": 0.0,
"step": 13800
},
{
"epoch": 9.08,
"grad_norm": 0.001155543839558959,
"learning_rate": 8.32144555397052e-07,
"loss": 0.0,
"step": 13810
},
{
"epoch": 9.09,
"grad_norm": 0.000786484801210463,
"learning_rate": 7.925186241876685e-07,
"loss": 0.1625,
"step": 13820
},
{
"epoch": 9.09,
"grad_norm": 0.0002841146197170019,
"learning_rate": 7.52892692978285e-07,
"loss": 0.0,
"step": 13830
},
{
"epoch": 9.09,
"grad_norm": 0.00030605948995798826,
"learning_rate": 7.132667617689016e-07,
"loss": 0.0,
"step": 13840
},
{
"epoch": 9.09,
"grad_norm": 0.001265210215933621,
"learning_rate": 6.736408305595182e-07,
"loss": 0.0,
"step": 13850
},
{
"epoch": 9.09,
"grad_norm": 0.00038683577440679073,
"learning_rate": 6.340148993501348e-07,
"loss": 0.0,
"step": 13860
},
{
"epoch": 9.09,
"grad_norm": 0.0005034942296333611,
"learning_rate": 5.943889681407513e-07,
"loss": 0.0,
"step": 13870
},
{
"epoch": 9.09,
"grad_norm": 0.0011582579463720322,
"learning_rate": 5.547630369313679e-07,
"loss": 0.0,
"step": 13880
},
{
"epoch": 9.09,
"grad_norm": 0.0016904632793739438,
"learning_rate": 5.151371057219845e-07,
"loss": 0.0,
"step": 13890
},
{
"epoch": 9.09,
"grad_norm": 0.00032329061650671065,
"learning_rate": 4.7551117451260107e-07,
"loss": 0.0,
"step": 13900
},
{
"epoch": 9.09,
"grad_norm": 0.0003388900659047067,
"learning_rate": 4.358852433032176e-07,
"loss": 0.0,
"step": 13910
},
{
"epoch": 9.09,
"grad_norm": 0.0003800652630161494,
"learning_rate": 3.962593120938342e-07,
"loss": 0.0,
"step": 13920
},
{
"epoch": 9.09,
"grad_norm": 0.0009641946526244283,
"learning_rate": 3.566333808844508e-07,
"loss": 0.0,
"step": 13930
},
{
"epoch": 9.09,
"grad_norm": 0.0005723941139876842,
"learning_rate": 3.170074496750674e-07,
"loss": 0.0001,
"step": 13940
},
{
"epoch": 9.09,
"grad_norm": 0.0005183956818655133,
"learning_rate": 2.7738151846568396e-07,
"loss": 0.0,
"step": 13950
},
{
"epoch": 9.1,
"grad_norm": 0.009076601825654507,
"learning_rate": 2.3775558725630054e-07,
"loss": 0.0,
"step": 13960
},
{
"epoch": 9.1,
"grad_norm": 0.0007901808712631464,
"learning_rate": 1.981296560469171e-07,
"loss": 0.0,
"step": 13970
},
{
"epoch": 9.1,
"grad_norm": 0.0005284142098389566,
"learning_rate": 1.585037248375337e-07,
"loss": 0.0,
"step": 13980
},
{
"epoch": 9.1,
"grad_norm": 0.0006428571650758386,
"learning_rate": 1.1887779362815027e-07,
"loss": 0.0,
"step": 13990
},
{
"epoch": 9.1,
"grad_norm": 0.0012319569941610098,
"learning_rate": 7.925186241876685e-08,
"loss": 0.0001,
"step": 14000
},
{
"epoch": 9.1,
"grad_norm": 0.000267757655819878,
"learning_rate": 3.962593120938342e-08,
"loss": 0.0,
"step": 14010
},
{
"epoch": 9.1,
"grad_norm": 0.0010718012927100062,
"learning_rate": 0.0,
"loss": 0.0,
"step": 14020
},
{
"epoch": 9.1,
"eval_accuracy": 0.9654135338345865,
"eval_loss": 0.24323464930057526,
"eval_runtime": 2339.7693,
"eval_samples_per_second": 0.284,
"eval_steps_per_second": 0.142,
"step": 14020
},
{
"epoch": 9.1,
"step": 14020,
"total_flos": 7.1819242300007645e+19,
"train_loss": 0.21922695452951727,
"train_runtime": 145352.8053,
"train_samples_per_second": 0.193,
"train_steps_per_second": 0.096
},
{
"epoch": 9.1,
"eval_accuracy": 0.960960960960961,
"eval_loss": 0.25779759883880615,
"eval_runtime": 1196.8626,
"eval_samples_per_second": 0.278,
"eval_steps_per_second": 0.14,
"step": 14020
},
{
"epoch": 9.1,
"eval_accuracy": 0.960960960960961,
"eval_loss": 0.25779759883880615,
"eval_runtime": 1193.7233,
"eval_samples_per_second": 0.279,
"eval_steps_per_second": 0.14,
"step": 14020
}
],
"logging_steps": 10,
"max_steps": 14020,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"total_flos": 7.1819242300007645e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}