{ "best_metric": 0.9669172932330827, "best_model_checkpoint": "YAHIA/vivit-b-16x2-collected-dataset\\checkpoint-8418", "epoch": 9.099358059914408, "eval_steps": 500, "global_step": 14020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 16.260046005249023, "learning_rate": 3.566333808844508e-07, "loss": 1.7843, "step": 10 }, { "epoch": 0.0, "grad_norm": 24.135656356811523, "learning_rate": 7.132667617689016e-07, "loss": 1.8164, "step": 20 }, { "epoch": 0.0, "grad_norm": 22.565906524658203, "learning_rate": 1.0699001426533523e-06, "loss": 1.9396, "step": 30 }, { "epoch": 0.0, "grad_norm": 20.68889045715332, "learning_rate": 1.4265335235378032e-06, "loss": 1.9576, "step": 40 }, { "epoch": 0.0, "grad_norm": 22.999574661254883, "learning_rate": 1.7831669044222541e-06, "loss": 1.9828, "step": 50 }, { "epoch": 0.0, "grad_norm": 27.69795036315918, "learning_rate": 2.1398002853067046e-06, "loss": 1.9381, "step": 60 }, { "epoch": 0.0, "grad_norm": 25.143293380737305, "learning_rate": 2.4964336661911553e-06, "loss": 1.8222, "step": 70 }, { "epoch": 0.01, "grad_norm": 20.670278549194336, "learning_rate": 2.8530670470756064e-06, "loss": 1.7593, "step": 80 }, { "epoch": 0.01, "grad_norm": 20.401081085205078, "learning_rate": 3.209700427960057e-06, "loss": 1.7611, "step": 90 }, { "epoch": 0.01, "grad_norm": 21.053512573242188, "learning_rate": 3.5663338088445082e-06, "loss": 1.7861, "step": 100 }, { "epoch": 0.01, "grad_norm": 21.759618759155273, "learning_rate": 3.922967189728959e-06, "loss": 1.7698, "step": 110 }, { "epoch": 0.01, "grad_norm": 22.194372177124023, "learning_rate": 4.279600570613409e-06, "loss": 1.7558, "step": 120 }, { "epoch": 0.01, "grad_norm": 20.19968605041504, "learning_rate": 4.63623395149786e-06, "loss": 1.8624, "step": 130 }, { "epoch": 0.01, "grad_norm": 19.7205753326416, "learning_rate": 4.992867332382311e-06, "loss": 1.6795, "step": 140 }, { "epoch": 0.01, "grad_norm": 19.424144744873047, "learning_rate": 5.349500713266762e-06, "loss": 1.7443, "step": 150 }, { "epoch": 0.01, "grad_norm": 19.568191528320312, "learning_rate": 5.706134094151213e-06, "loss": 1.7468, "step": 160 }, { "epoch": 0.01, "grad_norm": 22.05777931213379, "learning_rate": 6.062767475035663e-06, "loss": 1.7362, "step": 170 }, { "epoch": 0.01, "grad_norm": 17.77819061279297, "learning_rate": 6.419400855920114e-06, "loss": 1.527, "step": 180 }, { "epoch": 0.01, "grad_norm": 19.465238571166992, "learning_rate": 6.776034236804565e-06, "loss": 1.5923, "step": 190 }, { "epoch": 0.01, "grad_norm": 20.62281036376953, "learning_rate": 7.1326676176890165e-06, "loss": 1.697, "step": 200 }, { "epoch": 0.01, "grad_norm": 16.7163028717041, "learning_rate": 7.489300998573468e-06, "loss": 1.4694, "step": 210 }, { "epoch": 0.02, "grad_norm": 20.071901321411133, "learning_rate": 7.845934379457918e-06, "loss": 1.4549, "step": 220 }, { "epoch": 0.02, "grad_norm": 20.55426597595215, "learning_rate": 8.202567760342367e-06, "loss": 1.3167, "step": 230 }, { "epoch": 0.02, "grad_norm": 26.36579704284668, "learning_rate": 8.559201141226818e-06, "loss": 1.6743, "step": 240 }, { "epoch": 0.02, "grad_norm": 17.331533432006836, "learning_rate": 8.91583452211127e-06, "loss": 1.4754, "step": 250 }, { "epoch": 0.02, "grad_norm": 19.567764282226562, "learning_rate": 9.27246790299572e-06, "loss": 1.45, "step": 260 }, { "epoch": 0.02, "grad_norm": 16.322946548461914, "learning_rate": 9.629101283880172e-06, "loss": 1.3971, "step": 270 }, { "epoch": 0.02, "grad_norm": 18.62678337097168, "learning_rate": 9.985734664764621e-06, "loss": 1.4368, "step": 280 }, { "epoch": 0.02, "grad_norm": 20.327966690063477, "learning_rate": 1.0342368045649072e-05, "loss": 1.5098, "step": 290 }, { "epoch": 0.02, "grad_norm": 18.31135368347168, "learning_rate": 1.0699001426533523e-05, "loss": 1.1699, "step": 300 }, { "epoch": 0.02, "grad_norm": 28.94702911376953, "learning_rate": 1.1055634807417975e-05, "loss": 1.5048, "step": 310 }, { "epoch": 0.02, "grad_norm": 21.377225875854492, "learning_rate": 1.1412268188302426e-05, "loss": 1.4112, "step": 320 }, { "epoch": 0.02, "grad_norm": 15.965813636779785, "learning_rate": 1.1768901569186877e-05, "loss": 1.5097, "step": 330 }, { "epoch": 0.02, "grad_norm": 19.742080688476562, "learning_rate": 1.2125534950071326e-05, "loss": 1.2703, "step": 340 }, { "epoch": 0.02, "grad_norm": 18.924072265625, "learning_rate": 1.2482168330955777e-05, "loss": 1.2194, "step": 350 }, { "epoch": 0.03, "grad_norm": 18.15528106689453, "learning_rate": 1.2838801711840228e-05, "loss": 1.0668, "step": 360 }, { "epoch": 0.03, "grad_norm": 21.82122802734375, "learning_rate": 1.3195435092724678e-05, "loss": 0.9053, "step": 370 }, { "epoch": 0.03, "grad_norm": 20.609405517578125, "learning_rate": 1.355206847360913e-05, "loss": 1.2574, "step": 380 }, { "epoch": 0.03, "grad_norm": 25.153718948364258, "learning_rate": 1.390870185449358e-05, "loss": 1.1619, "step": 390 }, { "epoch": 0.03, "grad_norm": 12.118425369262695, "learning_rate": 1.4265335235378033e-05, "loss": 1.1514, "step": 400 }, { "epoch": 0.03, "grad_norm": 25.673738479614258, "learning_rate": 1.4621968616262482e-05, "loss": 1.2191, "step": 410 }, { "epoch": 0.03, "grad_norm": 23.109697341918945, "learning_rate": 1.4978601997146935e-05, "loss": 0.8535, "step": 420 }, { "epoch": 0.03, "grad_norm": 15.181422233581543, "learning_rate": 1.5335235378031385e-05, "loss": 0.7464, "step": 430 }, { "epoch": 0.03, "grad_norm": 31.820419311523438, "learning_rate": 1.5691868758915836e-05, "loss": 1.2202, "step": 440 }, { "epoch": 0.03, "grad_norm": 24.667930603027344, "learning_rate": 1.6048502139800287e-05, "loss": 1.0542, "step": 450 }, { "epoch": 0.03, "grad_norm": 16.041976928710938, "learning_rate": 1.6405135520684735e-05, "loss": 0.9541, "step": 460 }, { "epoch": 0.03, "grad_norm": 9.076415061950684, "learning_rate": 1.676176890156919e-05, "loss": 1.0625, "step": 470 }, { "epoch": 0.03, "grad_norm": 9.516477584838867, "learning_rate": 1.7118402282453637e-05, "loss": 1.1725, "step": 480 }, { "epoch": 0.03, "grad_norm": 29.433717727661133, "learning_rate": 1.7475035663338088e-05, "loss": 0.9226, "step": 490 }, { "epoch": 0.04, "grad_norm": 14.57030200958252, "learning_rate": 1.783166904422254e-05, "loss": 0.7345, "step": 500 }, { "epoch": 0.04, "grad_norm": 19.724756240844727, "learning_rate": 1.818830242510699e-05, "loss": 1.1076, "step": 510 }, { "epoch": 0.04, "grad_norm": 17.7041072845459, "learning_rate": 1.854493580599144e-05, "loss": 1.1412, "step": 520 }, { "epoch": 0.04, "grad_norm": 4.248980522155762, "learning_rate": 1.8901569186875892e-05, "loss": 0.675, "step": 530 }, { "epoch": 0.04, "grad_norm": 6.876579284667969, "learning_rate": 1.9258202567760344e-05, "loss": 0.703, "step": 540 }, { "epoch": 0.04, "grad_norm": 15.930359840393066, "learning_rate": 1.9614835948644795e-05, "loss": 0.671, "step": 550 }, { "epoch": 0.04, "grad_norm": 7.9089226722717285, "learning_rate": 1.9971469329529242e-05, "loss": 0.7656, "step": 560 }, { "epoch": 0.04, "grad_norm": 20.674118041992188, "learning_rate": 2.0328102710413697e-05, "loss": 1.4598, "step": 570 }, { "epoch": 0.04, "grad_norm": 33.44108581542969, "learning_rate": 2.0684736091298145e-05, "loss": 0.9271, "step": 580 }, { "epoch": 0.04, "grad_norm": 9.660829544067383, "learning_rate": 2.10413694721826e-05, "loss": 0.9135, "step": 590 }, { "epoch": 0.04, "grad_norm": 3.2455947399139404, "learning_rate": 2.1398002853067047e-05, "loss": 0.9244, "step": 600 }, { "epoch": 0.04, "grad_norm": 16.9035587310791, "learning_rate": 2.1754636233951498e-05, "loss": 1.2397, "step": 610 }, { "epoch": 0.04, "grad_norm": 12.139324188232422, "learning_rate": 2.211126961483595e-05, "loss": 0.9137, "step": 620 }, { "epoch": 0.04, "grad_norm": 13.0936861038208, "learning_rate": 2.24679029957204e-05, "loss": 0.891, "step": 630 }, { "epoch": 0.05, "grad_norm": 8.328577995300293, "learning_rate": 2.282453637660485e-05, "loss": 0.8631, "step": 640 }, { "epoch": 0.05, "grad_norm": 24.814929962158203, "learning_rate": 2.3181169757489303e-05, "loss": 0.5948, "step": 650 }, { "epoch": 0.05, "grad_norm": 15.284310340881348, "learning_rate": 2.3537803138373754e-05, "loss": 1.1153, "step": 660 }, { "epoch": 0.05, "grad_norm": 2.1705708503723145, "learning_rate": 2.3894436519258205e-05, "loss": 0.6161, "step": 670 }, { "epoch": 0.05, "grad_norm": 15.621281623840332, "learning_rate": 2.4251069900142652e-05, "loss": 0.8466, "step": 680 }, { "epoch": 0.05, "grad_norm": 37.767173767089844, "learning_rate": 2.4607703281027107e-05, "loss": 0.7471, "step": 690 }, { "epoch": 0.05, "grad_norm": 5.153799533843994, "learning_rate": 2.4964336661911555e-05, "loss": 0.5421, "step": 700 }, { "epoch": 0.05, "grad_norm": 3.665609359741211, "learning_rate": 2.5320970042796006e-05, "loss": 0.4251, "step": 710 }, { "epoch": 0.05, "grad_norm": 21.673925399780273, "learning_rate": 2.5677603423680457e-05, "loss": 0.8117, "step": 720 }, { "epoch": 0.05, "grad_norm": 23.484006881713867, "learning_rate": 2.603423680456491e-05, "loss": 0.4761, "step": 730 }, { "epoch": 0.05, "grad_norm": 24.750452041625977, "learning_rate": 2.6390870185449356e-05, "loss": 0.95, "step": 740 }, { "epoch": 0.05, "grad_norm": 6.027065277099609, "learning_rate": 2.674750356633381e-05, "loss": 0.9197, "step": 750 }, { "epoch": 0.05, "grad_norm": 33.312313079833984, "learning_rate": 2.710413694721826e-05, "loss": 1.033, "step": 760 }, { "epoch": 0.05, "grad_norm": 15.621706008911133, "learning_rate": 2.7460770328102713e-05, "loss": 0.2779, "step": 770 }, { "epoch": 0.06, "grad_norm": 10.880739212036133, "learning_rate": 2.781740370898716e-05, "loss": 0.5387, "step": 780 }, { "epoch": 0.06, "grad_norm": 11.6985445022583, "learning_rate": 2.8174037089871615e-05, "loss": 0.7687, "step": 790 }, { "epoch": 0.06, "grad_norm": 25.108810424804688, "learning_rate": 2.8530670470756066e-05, "loss": 0.4862, "step": 800 }, { "epoch": 0.06, "grad_norm": 23.200624465942383, "learning_rate": 2.8887303851640514e-05, "loss": 0.9553, "step": 810 }, { "epoch": 0.06, "grad_norm": 30.682540893554688, "learning_rate": 2.9243937232524965e-05, "loss": 0.8558, "step": 820 }, { "epoch": 0.06, "grad_norm": 12.823701858520508, "learning_rate": 2.9600570613409416e-05, "loss": 0.6195, "step": 830 }, { "epoch": 0.06, "grad_norm": 11.762367248535156, "learning_rate": 2.995720399429387e-05, "loss": 0.758, "step": 840 }, { "epoch": 0.06, "grad_norm": 1.1662691831588745, "learning_rate": 3.0313837375178318e-05, "loss": 0.354, "step": 850 }, { "epoch": 0.06, "grad_norm": 23.4963436126709, "learning_rate": 3.067047075606277e-05, "loss": 0.8267, "step": 860 }, { "epoch": 0.06, "grad_norm": 0.13900019228458405, "learning_rate": 3.102710413694722e-05, "loss": 0.6618, "step": 870 }, { "epoch": 0.06, "grad_norm": 7.843920707702637, "learning_rate": 3.138373751783167e-05, "loss": 0.7689, "step": 880 }, { "epoch": 0.06, "grad_norm": 31.13179588317871, "learning_rate": 3.174037089871612e-05, "loss": 0.7772, "step": 890 }, { "epoch": 0.06, "grad_norm": 44.58312225341797, "learning_rate": 3.2097004279600574e-05, "loss": 1.1062, "step": 900 }, { "epoch": 0.06, "grad_norm": 18.089794158935547, "learning_rate": 3.2453637660485025e-05, "loss": 0.7678, "step": 910 }, { "epoch": 0.07, "grad_norm": 33.472625732421875, "learning_rate": 3.281027104136947e-05, "loss": 1.6911, "step": 920 }, { "epoch": 0.07, "grad_norm": 33.618831634521484, "learning_rate": 3.316690442225393e-05, "loss": 0.8881, "step": 930 }, { "epoch": 0.07, "grad_norm": 1.6782217025756836, "learning_rate": 3.352353780313838e-05, "loss": 0.7327, "step": 940 }, { "epoch": 0.07, "grad_norm": 2.1791036128997803, "learning_rate": 3.388017118402282e-05, "loss": 0.8054, "step": 950 }, { "epoch": 0.07, "grad_norm": 6.972609043121338, "learning_rate": 3.4236804564907274e-05, "loss": 0.2614, "step": 960 }, { "epoch": 0.07, "grad_norm": 24.085866928100586, "learning_rate": 3.459343794579173e-05, "loss": 0.4054, "step": 970 }, { "epoch": 0.07, "grad_norm": 1.9996914863586426, "learning_rate": 3.4950071326676176e-05, "loss": 0.5344, "step": 980 }, { "epoch": 0.07, "grad_norm": 0.13388022780418396, "learning_rate": 3.530670470756063e-05, "loss": 0.7224, "step": 990 }, { "epoch": 0.07, "grad_norm": 30.018585205078125, "learning_rate": 3.566333808844508e-05, "loss": 0.6226, "step": 1000 }, { "epoch": 0.07, "grad_norm": 14.195096015930176, "learning_rate": 3.6019971469329536e-05, "loss": 0.7356, "step": 1010 }, { "epoch": 0.07, "grad_norm": 25.853748321533203, "learning_rate": 3.637660485021398e-05, "loss": 0.7235, "step": 1020 }, { "epoch": 0.07, "grad_norm": 29.89474868774414, "learning_rate": 3.673323823109843e-05, "loss": 1.1801, "step": 1030 }, { "epoch": 0.07, "grad_norm": 12.760407447814941, "learning_rate": 3.708987161198288e-05, "loss": 0.4307, "step": 1040 }, { "epoch": 0.07, "grad_norm": 29.496700286865234, "learning_rate": 3.7446504992867334e-05, "loss": 0.9473, "step": 1050 }, { "epoch": 0.08, "grad_norm": 1.0010541677474976, "learning_rate": 3.7803138373751785e-05, "loss": 0.5983, "step": 1060 }, { "epoch": 0.08, "grad_norm": 2.141446352005005, "learning_rate": 3.8159771754636236e-05, "loss": 0.4888, "step": 1070 }, { "epoch": 0.08, "grad_norm": 1.1996098756790161, "learning_rate": 3.851640513552069e-05, "loss": 0.5292, "step": 1080 }, { "epoch": 0.08, "grad_norm": 20.964256286621094, "learning_rate": 3.887303851640514e-05, "loss": 0.6905, "step": 1090 }, { "epoch": 0.08, "grad_norm": 0.3161448538303375, "learning_rate": 3.922967189728959e-05, "loss": 0.7078, "step": 1100 }, { "epoch": 0.08, "grad_norm": 13.272440910339355, "learning_rate": 3.958630527817404e-05, "loss": 0.9984, "step": 1110 }, { "epoch": 0.08, "grad_norm": 0.3290501832962036, "learning_rate": 3.9942938659058485e-05, "loss": 0.4982, "step": 1120 }, { "epoch": 0.08, "grad_norm": 0.7225183248519897, "learning_rate": 4.029957203994294e-05, "loss": 0.4532, "step": 1130 }, { "epoch": 0.08, "grad_norm": 24.277801513671875, "learning_rate": 4.0656205420827394e-05, "loss": 0.5147, "step": 1140 }, { "epoch": 0.08, "grad_norm": 9.140922546386719, "learning_rate": 4.1012838801711845e-05, "loss": 0.7049, "step": 1150 }, { "epoch": 0.08, "grad_norm": 4.139643669128418, "learning_rate": 4.136947218259629e-05, "loss": 1.3454, "step": 1160 }, { "epoch": 0.08, "grad_norm": 24.44458770751953, "learning_rate": 4.172610556348075e-05, "loss": 0.6409, "step": 1170 }, { "epoch": 0.08, "grad_norm": 53.11198425292969, "learning_rate": 4.20827389443652e-05, "loss": 0.7063, "step": 1180 }, { "epoch": 0.08, "grad_norm": 15.888784408569336, "learning_rate": 4.243937232524964e-05, "loss": 0.686, "step": 1190 }, { "epoch": 0.09, "grad_norm": 29.689838409423828, "learning_rate": 4.2796005706134094e-05, "loss": 0.6301, "step": 1200 }, { "epoch": 0.09, "grad_norm": 37.24555206298828, "learning_rate": 4.3152639087018545e-05, "loss": 0.939, "step": 1210 }, { "epoch": 0.09, "grad_norm": 39.6224479675293, "learning_rate": 4.3509272467902996e-05, "loss": 0.9322, "step": 1220 }, { "epoch": 0.09, "grad_norm": 28.799930572509766, "learning_rate": 4.386590584878745e-05, "loss": 1.1431, "step": 1230 }, { "epoch": 0.09, "grad_norm": 0.9416821002960205, "learning_rate": 4.42225392296719e-05, "loss": 0.7622, "step": 1240 }, { "epoch": 0.09, "grad_norm": 11.397088050842285, "learning_rate": 4.457917261055635e-05, "loss": 0.5302, "step": 1250 }, { "epoch": 0.09, "grad_norm": 0.13693714141845703, "learning_rate": 4.49358059914408e-05, "loss": 0.9497, "step": 1260 }, { "epoch": 0.09, "grad_norm": 22.64994239807129, "learning_rate": 4.529243937232525e-05, "loss": 1.4811, "step": 1270 }, { "epoch": 0.09, "grad_norm": 63.26667404174805, "learning_rate": 4.56490727532097e-05, "loss": 0.7023, "step": 1280 }, { "epoch": 0.09, "grad_norm": 24.035776138305664, "learning_rate": 4.6005706134094154e-05, "loss": 0.3478, "step": 1290 }, { "epoch": 0.09, "grad_norm": 0.0889860987663269, "learning_rate": 4.6362339514978605e-05, "loss": 0.3317, "step": 1300 }, { "epoch": 0.09, "grad_norm": 14.644208908081055, "learning_rate": 4.6718972895863056e-05, "loss": 0.6627, "step": 1310 }, { "epoch": 0.09, "grad_norm": 16.509044647216797, "learning_rate": 4.707560627674751e-05, "loss": 1.3097, "step": 1320 }, { "epoch": 0.09, "grad_norm": 23.583152770996094, "learning_rate": 4.743223965763195e-05, "loss": 0.9481, "step": 1330 }, { "epoch": 0.1, "grad_norm": 50.59526443481445, "learning_rate": 4.778887303851641e-05, "loss": 0.7222, "step": 1340 }, { "epoch": 0.1, "grad_norm": 18.746498107910156, "learning_rate": 4.814550641940086e-05, "loss": 0.7993, "step": 1350 }, { "epoch": 0.1, "grad_norm": 14.619526863098145, "learning_rate": 4.8502139800285305e-05, "loss": 0.8045, "step": 1360 }, { "epoch": 0.1, "grad_norm": 0.3897199332714081, "learning_rate": 4.8858773181169756e-05, "loss": 0.7668, "step": 1370 }, { "epoch": 0.1, "grad_norm": 0.14925141632556915, "learning_rate": 4.9215406562054214e-05, "loss": 0.2882, "step": 1380 }, { "epoch": 0.1, "grad_norm": 38.2923469543457, "learning_rate": 4.9572039942938665e-05, "loss": 0.8372, "step": 1390 }, { "epoch": 0.1, "grad_norm": 0.04119894281029701, "learning_rate": 4.992867332382311e-05, "loss": 0.1001, "step": 1400 }, { "epoch": 0.1, "eval_accuracy": 0.7789473684210526, "eval_loss": 0.898942768573761, "eval_runtime": 2157.0352, "eval_samples_per_second": 0.308, "eval_steps_per_second": 0.154, "step": 1403 }, { "epoch": 1.0, "grad_norm": 4.562422752380371, "learning_rate": 4.99682992550325e-05, "loss": 0.6604, "step": 1410 }, { "epoch": 1.0, "grad_norm": 0.05182512477040291, "learning_rate": 4.992867332382311e-05, "loss": 0.7648, "step": 1420 }, { "epoch": 1.0, "grad_norm": 10.46511459350586, "learning_rate": 4.988904739261373e-05, "loss": 0.7306, "step": 1430 }, { "epoch": 1.0, "grad_norm": 26.981674194335938, "learning_rate": 4.984942146140435e-05, "loss": 0.3427, "step": 1440 }, { "epoch": 1.0, "grad_norm": 38.77156066894531, "learning_rate": 4.9809795530194966e-05, "loss": 0.4313, "step": 1450 }, { "epoch": 1.0, "grad_norm": 0.08875282108783722, "learning_rate": 4.977016959898558e-05, "loss": 0.1047, "step": 1460 }, { "epoch": 1.0, "grad_norm": 7.9550042152404785, "learning_rate": 4.97305436677762e-05, "loss": 0.8428, "step": 1470 }, { "epoch": 1.01, "grad_norm": 3.2877941131591797, "learning_rate": 4.969091773656681e-05, "loss": 0.5087, "step": 1480 }, { "epoch": 1.01, "grad_norm": 27.133525848388672, "learning_rate": 4.965129180535743e-05, "loss": 0.4621, "step": 1490 }, { "epoch": 1.01, "grad_norm": 24.116609573364258, "learning_rate": 4.9611665874148046e-05, "loss": 0.8207, "step": 1500 }, { "epoch": 1.01, "grad_norm": 48.552242279052734, "learning_rate": 4.9572039942938665e-05, "loss": 1.3676, "step": 1510 }, { "epoch": 1.01, "grad_norm": 0.1313333660364151, "learning_rate": 4.953241401172928e-05, "loss": 0.4705, "step": 1520 }, { "epoch": 1.01, "grad_norm": 14.919997215270996, "learning_rate": 4.9492788080519896e-05, "loss": 1.6541, "step": 1530 }, { "epoch": 1.01, "grad_norm": 2.8064146041870117, "learning_rate": 4.945316214931051e-05, "loss": 0.6358, "step": 1540 }, { "epoch": 1.01, "grad_norm": 33.633766174316406, "learning_rate": 4.941353621810113e-05, "loss": 0.3344, "step": 1550 }, { "epoch": 1.01, "grad_norm": 25.58049774169922, "learning_rate": 4.9373910286891746e-05, "loss": 0.3765, "step": 1560 }, { "epoch": 1.01, "grad_norm": 0.5938677191734314, "learning_rate": 4.933428435568236e-05, "loss": 0.7599, "step": 1570 }, { "epoch": 1.01, "grad_norm": 0.17297320067882538, "learning_rate": 4.9294658424472976e-05, "loss": 0.1381, "step": 1580 }, { "epoch": 1.01, "grad_norm": 1.1137043237686157, "learning_rate": 4.9255032493263595e-05, "loss": 0.5216, "step": 1590 }, { "epoch": 1.01, "grad_norm": 11.281981468200684, "learning_rate": 4.9215406562054214e-05, "loss": 0.9504, "step": 1600 }, { "epoch": 1.01, "grad_norm": 35.159671783447266, "learning_rate": 4.9175780630844826e-05, "loss": 0.6236, "step": 1610 }, { "epoch": 1.02, "grad_norm": 15.732198715209961, "learning_rate": 4.9136154699635445e-05, "loss": 0.3037, "step": 1620 }, { "epoch": 1.02, "grad_norm": 4.352818965911865, "learning_rate": 4.909652876842606e-05, "loss": 0.5294, "step": 1630 }, { "epoch": 1.02, "grad_norm": 27.470956802368164, "learning_rate": 4.9056902837216676e-05, "loss": 0.5626, "step": 1640 }, { "epoch": 1.02, "grad_norm": 33.91129684448242, "learning_rate": 4.9017276906007294e-05, "loss": 0.2882, "step": 1650 }, { "epoch": 1.02, "grad_norm": 0.26898714900016785, "learning_rate": 4.897765097479791e-05, "loss": 0.4265, "step": 1660 }, { "epoch": 1.02, "grad_norm": 0.8199774622917175, "learning_rate": 4.8938025043588525e-05, "loss": 0.5277, "step": 1670 }, { "epoch": 1.02, "grad_norm": 0.020548412576317787, "learning_rate": 4.8898399112379144e-05, "loss": 0.889, "step": 1680 }, { "epoch": 1.02, "grad_norm": 0.048078641295433044, "learning_rate": 4.8858773181169756e-05, "loss": 0.8639, "step": 1690 }, { "epoch": 1.02, "grad_norm": 20.957611083984375, "learning_rate": 4.881914724996038e-05, "loss": 0.3244, "step": 1700 }, { "epoch": 1.02, "grad_norm": 0.15246962010860443, "learning_rate": 4.8779521318750994e-05, "loss": 0.258, "step": 1710 }, { "epoch": 1.02, "grad_norm": 0.09393693506717682, "learning_rate": 4.873989538754161e-05, "loss": 0.9169, "step": 1720 }, { "epoch": 1.02, "grad_norm": 0.9115855097770691, "learning_rate": 4.8700269456332225e-05, "loss": 0.5618, "step": 1730 }, { "epoch": 1.02, "grad_norm": 6.85861873626709, "learning_rate": 4.866064352512284e-05, "loss": 0.6588, "step": 1740 }, { "epoch": 1.02, "grad_norm": 33.108909606933594, "learning_rate": 4.862101759391346e-05, "loss": 0.3317, "step": 1750 }, { "epoch": 1.03, "grad_norm": 2.785113573074341, "learning_rate": 4.8581391662704074e-05, "loss": 0.1886, "step": 1760 }, { "epoch": 1.03, "grad_norm": 0.07260994613170624, "learning_rate": 4.854176573149469e-05, "loss": 0.4813, "step": 1770 }, { "epoch": 1.03, "grad_norm": 1.9168213605880737, "learning_rate": 4.8502139800285305e-05, "loss": 0.6163, "step": 1780 }, { "epoch": 1.03, "grad_norm": 32.30327224731445, "learning_rate": 4.8462513869075924e-05, "loss": 0.5272, "step": 1790 }, { "epoch": 1.03, "grad_norm": 0.09055186808109283, "learning_rate": 4.842288793786654e-05, "loss": 0.58, "step": 1800 }, { "epoch": 1.03, "grad_norm": 0.15506546199321747, "learning_rate": 4.838326200665716e-05, "loss": 0.1171, "step": 1810 }, { "epoch": 1.03, "grad_norm": 32.89198303222656, "learning_rate": 4.8343636075447773e-05, "loss": 0.4713, "step": 1820 }, { "epoch": 1.03, "grad_norm": 0.009175814688205719, "learning_rate": 4.830401014423839e-05, "loss": 0.6139, "step": 1830 }, { "epoch": 1.03, "grad_norm": 32.81629943847656, "learning_rate": 4.8264384213029004e-05, "loss": 0.6131, "step": 1840 }, { "epoch": 1.03, "grad_norm": 62.49550247192383, "learning_rate": 4.822475828181962e-05, "loss": 0.5677, "step": 1850 }, { "epoch": 1.03, "grad_norm": 2.451925754547119, "learning_rate": 4.818513235061024e-05, "loss": 1.4171, "step": 1860 }, { "epoch": 1.03, "grad_norm": 0.2953392267227173, "learning_rate": 4.814550641940086e-05, "loss": 0.3838, "step": 1870 }, { "epoch": 1.03, "grad_norm": 53.240325927734375, "learning_rate": 4.810588048819147e-05, "loss": 0.1432, "step": 1880 }, { "epoch": 1.03, "grad_norm": 0.1574406921863556, "learning_rate": 4.806625455698209e-05, "loss": 0.318, "step": 1890 }, { "epoch": 1.04, "grad_norm": 35.6072998046875, "learning_rate": 4.802662862577271e-05, "loss": 0.6746, "step": 1900 }, { "epoch": 1.04, "grad_norm": 0.012536413036286831, "learning_rate": 4.798700269456333e-05, "loss": 0.7181, "step": 1910 }, { "epoch": 1.04, "grad_norm": 0.05652592331171036, "learning_rate": 4.794737676335394e-05, "loss": 0.0939, "step": 1920 }, { "epoch": 1.04, "grad_norm": 2.5210182666778564, "learning_rate": 4.790775083214456e-05, "loss": 1.1188, "step": 1930 }, { "epoch": 1.04, "grad_norm": 0.16478443145751953, "learning_rate": 4.786812490093517e-05, "loss": 0.4504, "step": 1940 }, { "epoch": 1.04, "grad_norm": 26.002525329589844, "learning_rate": 4.782849896972579e-05, "loss": 0.8049, "step": 1950 }, { "epoch": 1.04, "grad_norm": 37.67827606201172, "learning_rate": 4.778887303851641e-05, "loss": 0.4756, "step": 1960 }, { "epoch": 1.04, "grad_norm": 36.84476852416992, "learning_rate": 4.774924710730702e-05, "loss": 0.9565, "step": 1970 }, { "epoch": 1.04, "grad_norm": 0.18539421260356903, "learning_rate": 4.770962117609764e-05, "loss": 0.3279, "step": 1980 }, { "epoch": 1.04, "grad_norm": 3.1605958938598633, "learning_rate": 4.766999524488825e-05, "loss": 0.1506, "step": 1990 }, { "epoch": 1.04, "grad_norm": 0.08869732171297073, "learning_rate": 4.763036931367887e-05, "loss": 0.5895, "step": 2000 }, { "epoch": 1.04, "grad_norm": 1.4320793151855469, "learning_rate": 4.759074338246949e-05, "loss": 0.3709, "step": 2010 }, { "epoch": 1.04, "grad_norm": 0.013628893531858921, "learning_rate": 4.755111745126011e-05, "loss": 0.4708, "step": 2020 }, { "epoch": 1.04, "grad_norm": 0.008771849796175957, "learning_rate": 4.751149152005072e-05, "loss": 0.2291, "step": 2030 }, { "epoch": 1.05, "grad_norm": 0.034172721207141876, "learning_rate": 4.747186558884134e-05, "loss": 0.3005, "step": 2040 }, { "epoch": 1.05, "grad_norm": 2.04589581489563, "learning_rate": 4.743223965763195e-05, "loss": 0.7397, "step": 2050 }, { "epoch": 1.05, "grad_norm": 7.416354656219482, "learning_rate": 4.739261372642258e-05, "loss": 0.5153, "step": 2060 }, { "epoch": 1.05, "grad_norm": 59.661014556884766, "learning_rate": 4.735298779521319e-05, "loss": 0.7417, "step": 2070 }, { "epoch": 1.05, "grad_norm": 38.264408111572266, "learning_rate": 4.731336186400381e-05, "loss": 0.4673, "step": 2080 }, { "epoch": 1.05, "grad_norm": 0.010330034419894218, "learning_rate": 4.727373593279442e-05, "loss": 0.7802, "step": 2090 }, { "epoch": 1.05, "grad_norm": 0.009081050753593445, "learning_rate": 4.723411000158504e-05, "loss": 0.5108, "step": 2100 }, { "epoch": 1.05, "grad_norm": 0.9804019331932068, "learning_rate": 4.719448407037566e-05, "loss": 0.1208, "step": 2110 }, { "epoch": 1.05, "grad_norm": 17.980236053466797, "learning_rate": 4.7154858139166276e-05, "loss": 0.3317, "step": 2120 }, { "epoch": 1.05, "grad_norm": 0.03598076477646828, "learning_rate": 4.711523220795689e-05, "loss": 0.4675, "step": 2130 }, { "epoch": 1.05, "grad_norm": 28.66923713684082, "learning_rate": 4.707560627674751e-05, "loss": 0.7232, "step": 2140 }, { "epoch": 1.05, "grad_norm": 0.05090579390525818, "learning_rate": 4.703598034553812e-05, "loss": 0.2404, "step": 2150 }, { "epoch": 1.05, "grad_norm": 39.81483840942383, "learning_rate": 4.6996354414328745e-05, "loss": 0.382, "step": 2160 }, { "epoch": 1.05, "grad_norm": 0.010164987295866013, "learning_rate": 4.695672848311936e-05, "loss": 0.4326, "step": 2170 }, { "epoch": 1.06, "grad_norm": 0.007712522987276316, "learning_rate": 4.691710255190997e-05, "loss": 0.1419, "step": 2180 }, { "epoch": 1.06, "grad_norm": 0.03637077286839485, "learning_rate": 4.687747662070059e-05, "loss": 0.0087, "step": 2190 }, { "epoch": 1.06, "grad_norm": 0.2942463755607605, "learning_rate": 4.68378506894912e-05, "loss": 0.5895, "step": 2200 }, { "epoch": 1.06, "grad_norm": 11.139754295349121, "learning_rate": 4.6798224758281825e-05, "loss": 0.2236, "step": 2210 }, { "epoch": 1.06, "grad_norm": 0.1554485708475113, "learning_rate": 4.675859882707244e-05, "loss": 0.3646, "step": 2220 }, { "epoch": 1.06, "grad_norm": 0.10144095867872238, "learning_rate": 4.6718972895863056e-05, "loss": 0.2301, "step": 2230 }, { "epoch": 1.06, "grad_norm": 34.84081268310547, "learning_rate": 4.667934696465367e-05, "loss": 0.7788, "step": 2240 }, { "epoch": 1.06, "grad_norm": 0.31722915172576904, "learning_rate": 4.663972103344429e-05, "loss": 0.1405, "step": 2250 }, { "epoch": 1.06, "grad_norm": 32.6774787902832, "learning_rate": 4.6600095102234906e-05, "loss": 0.4079, "step": 2260 }, { "epoch": 1.06, "grad_norm": 0.04206797853112221, "learning_rate": 4.6560469171025525e-05, "loss": 0.8566, "step": 2270 }, { "epoch": 1.06, "grad_norm": 0.007674456108361483, "learning_rate": 4.6520843239816137e-05, "loss": 0.6745, "step": 2280 }, { "epoch": 1.06, "grad_norm": 4.521772384643555, "learning_rate": 4.6481217308606755e-05, "loss": 0.2311, "step": 2290 }, { "epoch": 1.06, "grad_norm": 0.43009287118911743, "learning_rate": 4.644159137739737e-05, "loss": 0.9246, "step": 2300 }, { "epoch": 1.06, "grad_norm": 0.108181893825531, "learning_rate": 4.6401965446187986e-05, "loss": 1.0581, "step": 2310 }, { "epoch": 1.07, "grad_norm": 0.0881645604968071, "learning_rate": 4.6362339514978605e-05, "loss": 0.431, "step": 2320 }, { "epoch": 1.07, "grad_norm": 0.03945665806531906, "learning_rate": 4.6322713583769224e-05, "loss": 0.3784, "step": 2330 }, { "epoch": 1.07, "grad_norm": 0.01008934061974287, "learning_rate": 4.6283087652559836e-05, "loss": 0.4657, "step": 2340 }, { "epoch": 1.07, "grad_norm": 1.8987274169921875, "learning_rate": 4.6243461721350455e-05, "loss": 0.3025, "step": 2350 }, { "epoch": 1.07, "grad_norm": 52.36662292480469, "learning_rate": 4.6203835790141073e-05, "loss": 0.1047, "step": 2360 }, { "epoch": 1.07, "grad_norm": 30.82796287536621, "learning_rate": 4.616420985893169e-05, "loss": 0.3864, "step": 2370 }, { "epoch": 1.07, "grad_norm": 28.43499755859375, "learning_rate": 4.6124583927722304e-05, "loss": 0.5029, "step": 2380 }, { "epoch": 1.07, "grad_norm": 36.37118911743164, "learning_rate": 4.608495799651292e-05, "loss": 0.3412, "step": 2390 }, { "epoch": 1.07, "grad_norm": 0.03059449978172779, "learning_rate": 4.6045332065303535e-05, "loss": 1.6759, "step": 2400 }, { "epoch": 1.07, "grad_norm": 0.028554683551192284, "learning_rate": 4.6005706134094154e-05, "loss": 0.4232, "step": 2410 }, { "epoch": 1.07, "grad_norm": 0.3136725425720215, "learning_rate": 4.596608020288477e-05, "loss": 0.4531, "step": 2420 }, { "epoch": 1.07, "grad_norm": 9.164505004882812, "learning_rate": 4.5926454271675385e-05, "loss": 1.4434, "step": 2430 }, { "epoch": 1.07, "grad_norm": 26.755535125732422, "learning_rate": 4.5886828340466004e-05, "loss": 0.5678, "step": 2440 }, { "epoch": 1.07, "grad_norm": 0.027405157685279846, "learning_rate": 4.5847202409256616e-05, "loss": 0.1547, "step": 2450 }, { "epoch": 1.08, "grad_norm": 7.302061080932617, "learning_rate": 4.5807576478047234e-05, "loss": 0.1004, "step": 2460 }, { "epoch": 1.08, "grad_norm": 36.72040557861328, "learning_rate": 4.576795054683785e-05, "loss": 0.4913, "step": 2470 }, { "epoch": 1.08, "grad_norm": 0.10275045782327652, "learning_rate": 4.572832461562847e-05, "loss": 0.0076, "step": 2480 }, { "epoch": 1.08, "grad_norm": 0.008994188159704208, "learning_rate": 4.5688698684419084e-05, "loss": 0.0018, "step": 2490 }, { "epoch": 1.08, "grad_norm": 0.02831762284040451, "learning_rate": 4.56490727532097e-05, "loss": 0.086, "step": 2500 }, { "epoch": 1.08, "grad_norm": 0.6608620285987854, "learning_rate": 4.5609446822000315e-05, "loss": 0.0874, "step": 2510 }, { "epoch": 1.08, "grad_norm": 1.8176270723342896, "learning_rate": 4.556982089079094e-05, "loss": 0.459, "step": 2520 }, { "epoch": 1.08, "grad_norm": 28.674335479736328, "learning_rate": 4.553019495958155e-05, "loss": 0.4304, "step": 2530 }, { "epoch": 1.08, "grad_norm": 0.06465455144643784, "learning_rate": 4.549056902837217e-05, "loss": 0.6094, "step": 2540 }, { "epoch": 1.08, "grad_norm": 0.011676542460918427, "learning_rate": 4.545094309716278e-05, "loss": 0.333, "step": 2550 }, { "epoch": 1.08, "grad_norm": 4.420731544494629, "learning_rate": 4.54113171659534e-05, "loss": 0.1013, "step": 2560 }, { "epoch": 1.08, "grad_norm": 0.1971130520105362, "learning_rate": 4.537169123474402e-05, "loss": 0.0122, "step": 2570 }, { "epoch": 1.08, "grad_norm": 2.309307813644409, "learning_rate": 4.533206530353464e-05, "loss": 0.0247, "step": 2580 }, { "epoch": 1.08, "grad_norm": 0.010364987887442112, "learning_rate": 4.529243937232525e-05, "loss": 0.0751, "step": 2590 }, { "epoch": 1.09, "grad_norm": 30.956512451171875, "learning_rate": 4.525281344111587e-05, "loss": 0.7706, "step": 2600 }, { "epoch": 1.09, "grad_norm": 21.555742263793945, "learning_rate": 4.521318750990648e-05, "loss": 0.0501, "step": 2610 }, { "epoch": 1.09, "grad_norm": 0.02271176129579544, "learning_rate": 4.51735615786971e-05, "loss": 0.219, "step": 2620 }, { "epoch": 1.09, "grad_norm": 0.10638172179460526, "learning_rate": 4.513393564748772e-05, "loss": 0.4009, "step": 2630 }, { "epoch": 1.09, "grad_norm": 0.0012674570316448808, "learning_rate": 4.509430971627833e-05, "loss": 0.0365, "step": 2640 }, { "epoch": 1.09, "grad_norm": 0.01860959082841873, "learning_rate": 4.505468378506895e-05, "loss": 0.064, "step": 2650 }, { "epoch": 1.09, "grad_norm": 0.029620472341775894, "learning_rate": 4.501505785385956e-05, "loss": 0.5788, "step": 2660 }, { "epoch": 1.09, "grad_norm": 69.32429504394531, "learning_rate": 4.497543192265019e-05, "loss": 0.5635, "step": 2670 }, { "epoch": 1.09, "grad_norm": 0.004012781195342541, "learning_rate": 4.49358059914408e-05, "loss": 0.8204, "step": 2680 }, { "epoch": 1.09, "grad_norm": 0.007074211724102497, "learning_rate": 4.489618006023142e-05, "loss": 0.2872, "step": 2690 }, { "epoch": 1.09, "grad_norm": 2.3659746646881104, "learning_rate": 4.485655412902203e-05, "loss": 0.6915, "step": 2700 }, { "epoch": 1.09, "grad_norm": 0.2181590050458908, "learning_rate": 4.481692819781265e-05, "loss": 0.0489, "step": 2710 }, { "epoch": 1.09, "grad_norm": 0.00418996112421155, "learning_rate": 4.477730226660327e-05, "loss": 0.4435, "step": 2720 }, { "epoch": 1.09, "grad_norm": 0.011325598694384098, "learning_rate": 4.473767633539389e-05, "loss": 0.0595, "step": 2730 }, { "epoch": 1.1, "grad_norm": 6.933524131774902, "learning_rate": 4.46980504041845e-05, "loss": 0.6284, "step": 2740 }, { "epoch": 1.1, "grad_norm": 0.04216031730175018, "learning_rate": 4.465842447297512e-05, "loss": 0.1847, "step": 2750 }, { "epoch": 1.1, "grad_norm": 49.92095184326172, "learning_rate": 4.461879854176573e-05, "loss": 0.1075, "step": 2760 }, { "epoch": 1.1, "grad_norm": 0.0068773203529417515, "learning_rate": 4.457917261055635e-05, "loss": 0.7253, "step": 2770 }, { "epoch": 1.1, "grad_norm": 49.53322219848633, "learning_rate": 4.453954667934697e-05, "loss": 0.3937, "step": 2780 }, { "epoch": 1.1, "grad_norm": 0.0059694708324968815, "learning_rate": 4.449992074813759e-05, "loss": 0.3752, "step": 2790 }, { "epoch": 1.1, "grad_norm": 0.006113003473728895, "learning_rate": 4.44602948169282e-05, "loss": 0.2646, "step": 2800 }, { "epoch": 1.1, "eval_accuracy": 0.8857142857142857, "eval_loss": 0.5655186772346497, "eval_runtime": 2204.4149, "eval_samples_per_second": 0.302, "eval_steps_per_second": 0.151, "step": 2806 }, { "epoch": 2.0, "grad_norm": 0.030710767954587936, "learning_rate": 4.442066888571882e-05, "loss": 0.2584, "step": 2810 }, { "epoch": 2.0, "grad_norm": 0.961925208568573, "learning_rate": 4.438104295450943e-05, "loss": 0.0208, "step": 2820 }, { "epoch": 2.0, "grad_norm": 0.034155942499637604, "learning_rate": 4.434141702330005e-05, "loss": 0.0855, "step": 2830 }, { "epoch": 2.0, "grad_norm": 0.023880697786808014, "learning_rate": 4.430179109209067e-05, "loss": 0.0589, "step": 2840 }, { "epoch": 2.0, "grad_norm": 0.4049380421638489, "learning_rate": 4.426216516088128e-05, "loss": 0.0529, "step": 2850 }, { "epoch": 2.0, "grad_norm": 0.3483090102672577, "learning_rate": 4.42225392296719e-05, "loss": 0.6943, "step": 2860 }, { "epoch": 2.0, "grad_norm": 24.336814880371094, "learning_rate": 4.418291329846252e-05, "loss": 0.5243, "step": 2870 }, { "epoch": 2.01, "grad_norm": 0.0032091333996504545, "learning_rate": 4.4143287367253136e-05, "loss": 0.1202, "step": 2880 }, { "epoch": 2.01, "grad_norm": 0.047493454068899155, "learning_rate": 4.410366143604375e-05, "loss": 0.1007, "step": 2890 }, { "epoch": 2.01, "grad_norm": 0.017678333446383476, "learning_rate": 4.406403550483437e-05, "loss": 0.0008, "step": 2900 }, { "epoch": 2.01, "grad_norm": 0.9050219058990479, "learning_rate": 4.402440957362498e-05, "loss": 0.3121, "step": 2910 }, { "epoch": 2.01, "grad_norm": 0.03626730665564537, "learning_rate": 4.39847836424156e-05, "loss": 0.1179, "step": 2920 }, { "epoch": 2.01, "grad_norm": 0.011950280517339706, "learning_rate": 4.3945157711206216e-05, "loss": 0.4693, "step": 2930 }, { "epoch": 2.01, "grad_norm": 0.02412373013794422, "learning_rate": 4.3905531779996835e-05, "loss": 0.8616, "step": 2940 }, { "epoch": 2.01, "grad_norm": 30.13632583618164, "learning_rate": 4.386590584878745e-05, "loss": 0.3553, "step": 2950 }, { "epoch": 2.01, "grad_norm": 0.0061193606816232204, "learning_rate": 4.3826279917578066e-05, "loss": 0.0383, "step": 2960 }, { "epoch": 2.01, "grad_norm": 0.2041795253753662, "learning_rate": 4.378665398636868e-05, "loss": 0.6114, "step": 2970 }, { "epoch": 2.01, "grad_norm": 0.26872244477272034, "learning_rate": 4.3747028055159304e-05, "loss": 0.5494, "step": 2980 }, { "epoch": 2.01, "grad_norm": 0.06702332943677902, "learning_rate": 4.3707402123949916e-05, "loss": 0.0144, "step": 2990 }, { "epoch": 2.01, "grad_norm": 0.0052034310065209866, "learning_rate": 4.3667776192740534e-05, "loss": 0.0128, "step": 3000 }, { "epoch": 2.01, "grad_norm": 2.874134063720703, "learning_rate": 4.3628150261531146e-05, "loss": 0.6844, "step": 3010 }, { "epoch": 2.02, "grad_norm": 1.1829482316970825, "learning_rate": 4.3588524330321765e-05, "loss": 0.1866, "step": 3020 }, { "epoch": 2.02, "grad_norm": 41.475399017333984, "learning_rate": 4.3548898399112384e-05, "loss": 0.156, "step": 3030 }, { "epoch": 2.02, "grad_norm": 0.07054935395717621, "learning_rate": 4.3509272467902996e-05, "loss": 0.3379, "step": 3040 }, { "epoch": 2.02, "grad_norm": 0.44977086782455444, "learning_rate": 4.3469646536693615e-05, "loss": 0.5629, "step": 3050 }, { "epoch": 2.02, "grad_norm": 0.06215721368789673, "learning_rate": 4.343002060548423e-05, "loss": 0.0019, "step": 3060 }, { "epoch": 2.02, "grad_norm": 43.73810958862305, "learning_rate": 4.3390394674274846e-05, "loss": 0.3984, "step": 3070 }, { "epoch": 2.02, "grad_norm": 0.9523270130157471, "learning_rate": 4.3350768743065464e-05, "loss": 0.0888, "step": 3080 }, { "epoch": 2.02, "grad_norm": 0.005942572373896837, "learning_rate": 4.331114281185608e-05, "loss": 0.1961, "step": 3090 }, { "epoch": 2.02, "grad_norm": 0.022418642416596413, "learning_rate": 4.3271516880646695e-05, "loss": 0.0597, "step": 3100 }, { "epoch": 2.02, "grad_norm": 0.04196101427078247, "learning_rate": 4.3231890949437314e-05, "loss": 0.1147, "step": 3110 }, { "epoch": 2.02, "grad_norm": 0.003765852889046073, "learning_rate": 4.3192265018227926e-05, "loss": 0.0416, "step": 3120 }, { "epoch": 2.02, "grad_norm": 0.053471703082323074, "learning_rate": 4.3152639087018545e-05, "loss": 0.3866, "step": 3130 }, { "epoch": 2.02, "grad_norm": 51.969120025634766, "learning_rate": 4.3113013155809164e-05, "loss": 0.0616, "step": 3140 }, { "epoch": 2.02, "grad_norm": 0.005074084736406803, "learning_rate": 4.307338722459978e-05, "loss": 0.1907, "step": 3150 }, { "epoch": 2.03, "grad_norm": 0.0045975870452821255, "learning_rate": 4.3033761293390395e-05, "loss": 0.0072, "step": 3160 }, { "epoch": 2.03, "grad_norm": 0.0040842327289283276, "learning_rate": 4.299413536218101e-05, "loss": 0.2306, "step": 3170 }, { "epoch": 2.03, "grad_norm": 0.008049121126532555, "learning_rate": 4.295450943097163e-05, "loss": 0.0058, "step": 3180 }, { "epoch": 2.03, "grad_norm": 21.22314453125, "learning_rate": 4.291488349976225e-05, "loss": 0.5502, "step": 3190 }, { "epoch": 2.03, "grad_norm": 0.036627013236284256, "learning_rate": 4.287525756855286e-05, "loss": 0.1419, "step": 3200 }, { "epoch": 2.03, "grad_norm": 2.3564202785491943, "learning_rate": 4.283563163734348e-05, "loss": 0.0279, "step": 3210 }, { "epoch": 2.03, "grad_norm": 0.0108193913474679, "learning_rate": 4.2796005706134094e-05, "loss": 0.0004, "step": 3220 }, { "epoch": 2.03, "grad_norm": 0.0201814454048872, "learning_rate": 4.275637977492471e-05, "loss": 0.3249, "step": 3230 }, { "epoch": 2.03, "grad_norm": 0.03389296308159828, "learning_rate": 4.271675384371533e-05, "loss": 0.3386, "step": 3240 }, { "epoch": 2.03, "grad_norm": 0.01544855535030365, "learning_rate": 4.267712791250595e-05, "loss": 0.3866, "step": 3250 }, { "epoch": 2.03, "grad_norm": 0.025016358122229576, "learning_rate": 4.263750198129656e-05, "loss": 0.0013, "step": 3260 }, { "epoch": 2.03, "grad_norm": 0.0923624038696289, "learning_rate": 4.2597876050087174e-05, "loss": 0.2532, "step": 3270 }, { "epoch": 2.03, "grad_norm": 23.150659561157227, "learning_rate": 4.255825011887779e-05, "loss": 0.3962, "step": 3280 }, { "epoch": 2.03, "grad_norm": 0.015515293926000595, "learning_rate": 4.251862418766841e-05, "loss": 0.0004, "step": 3290 }, { "epoch": 2.04, "grad_norm": 0.003917529247701168, "learning_rate": 4.247899825645903e-05, "loss": 0.2484, "step": 3300 }, { "epoch": 2.04, "grad_norm": 28.370773315429688, "learning_rate": 4.243937232524964e-05, "loss": 0.0367, "step": 3310 }, { "epoch": 2.04, "grad_norm": 0.026205556467175484, "learning_rate": 4.239974639404026e-05, "loss": 0.3111, "step": 3320 }, { "epoch": 2.04, "grad_norm": 0.01336923148483038, "learning_rate": 4.2360120462830874e-05, "loss": 0.0047, "step": 3330 }, { "epoch": 2.04, "grad_norm": 0.011190090328454971, "learning_rate": 4.23204945316215e-05, "loss": 0.3007, "step": 3340 }, { "epoch": 2.04, "grad_norm": 0.003655917476862669, "learning_rate": 4.228086860041211e-05, "loss": 0.5659, "step": 3350 }, { "epoch": 2.04, "grad_norm": 0.017216026782989502, "learning_rate": 4.224124266920273e-05, "loss": 0.0933, "step": 3360 }, { "epoch": 2.04, "grad_norm": 0.007373865228146315, "learning_rate": 4.220161673799334e-05, "loss": 0.0298, "step": 3370 }, { "epoch": 2.04, "grad_norm": 0.035991325974464417, "learning_rate": 4.216199080678396e-05, "loss": 0.0916, "step": 3380 }, { "epoch": 2.04, "grad_norm": 0.007277372293174267, "learning_rate": 4.212236487557458e-05, "loss": 0.0008, "step": 3390 }, { "epoch": 2.04, "grad_norm": 0.0012711473973467946, "learning_rate": 4.20827389443652e-05, "loss": 0.049, "step": 3400 }, { "epoch": 2.04, "grad_norm": 0.004262813366949558, "learning_rate": 4.204311301315581e-05, "loss": 0.3255, "step": 3410 }, { "epoch": 2.04, "grad_norm": 0.6016376614570618, "learning_rate": 4.200348708194643e-05, "loss": 0.0016, "step": 3420 }, { "epoch": 2.04, "grad_norm": 0.027856985107064247, "learning_rate": 4.196386115073704e-05, "loss": 0.1706, "step": 3430 }, { "epoch": 2.05, "grad_norm": 36.658660888671875, "learning_rate": 4.192423521952766e-05, "loss": 0.393, "step": 3440 }, { "epoch": 2.05, "grad_norm": 4.459847927093506, "learning_rate": 4.188460928831828e-05, "loss": 0.2113, "step": 3450 }, { "epoch": 2.05, "grad_norm": 0.003763306187465787, "learning_rate": 4.18449833571089e-05, "loss": 0.0946, "step": 3460 }, { "epoch": 2.05, "grad_norm": 0.9358043670654297, "learning_rate": 4.180535742589951e-05, "loss": 0.1248, "step": 3470 }, { "epoch": 2.05, "grad_norm": 5.325794219970703, "learning_rate": 4.176573149469012e-05, "loss": 0.1882, "step": 3480 }, { "epoch": 2.05, "grad_norm": 0.01291597355157137, "learning_rate": 4.172610556348075e-05, "loss": 0.5989, "step": 3490 }, { "epoch": 2.05, "grad_norm": 0.05552150309085846, "learning_rate": 4.168647963227136e-05, "loss": 0.1317, "step": 3500 }, { "epoch": 2.05, "grad_norm": 0.0046797278337180614, "learning_rate": 4.164685370106198e-05, "loss": 0.9324, "step": 3510 }, { "epoch": 2.05, "grad_norm": 0.19884918630123138, "learning_rate": 4.160722776985259e-05, "loss": 0.0027, "step": 3520 }, { "epoch": 2.05, "grad_norm": 0.06361120939254761, "learning_rate": 4.156760183864321e-05, "loss": 0.1294, "step": 3530 }, { "epoch": 2.05, "grad_norm": 0.025872783735394478, "learning_rate": 4.152797590743383e-05, "loss": 0.3453, "step": 3540 }, { "epoch": 2.05, "grad_norm": 0.44598618149757385, "learning_rate": 4.1488349976224446e-05, "loss": 0.0445, "step": 3550 }, { "epoch": 2.05, "grad_norm": 0.00139313330873847, "learning_rate": 4.144872404501506e-05, "loss": 0.4126, "step": 3560 }, { "epoch": 2.05, "grad_norm": 0.004861112684011459, "learning_rate": 4.140909811380568e-05, "loss": 0.3162, "step": 3570 }, { "epoch": 2.06, "grad_norm": 37.97075653076172, "learning_rate": 4.136947218259629e-05, "loss": 0.0275, "step": 3580 }, { "epoch": 2.06, "grad_norm": 0.006260779220610857, "learning_rate": 4.132984625138691e-05, "loss": 0.5518, "step": 3590 }, { "epoch": 2.06, "grad_norm": 10.439234733581543, "learning_rate": 4.129022032017753e-05, "loss": 0.0489, "step": 3600 }, { "epoch": 2.06, "grad_norm": 0.009267416782677174, "learning_rate": 4.1250594388968146e-05, "loss": 0.2203, "step": 3610 }, { "epoch": 2.06, "grad_norm": 0.003436572849750519, "learning_rate": 4.121096845775876e-05, "loss": 0.0489, "step": 3620 }, { "epoch": 2.06, "grad_norm": 0.02378927730023861, "learning_rate": 4.1171342526549377e-05, "loss": 0.3647, "step": 3630 }, { "epoch": 2.06, "grad_norm": 0.04053608328104019, "learning_rate": 4.113171659533999e-05, "loss": 0.4505, "step": 3640 }, { "epoch": 2.06, "grad_norm": 0.8833039402961731, "learning_rate": 4.1092090664130614e-05, "loss": 0.0626, "step": 3650 }, { "epoch": 2.06, "grad_norm": 11.919655799865723, "learning_rate": 4.1052464732921226e-05, "loss": 0.0989, "step": 3660 }, { "epoch": 2.06, "grad_norm": 0.03586142137646675, "learning_rate": 4.1012838801711845e-05, "loss": 0.2583, "step": 3670 }, { "epoch": 2.06, "grad_norm": 0.1854490488767624, "learning_rate": 4.097321287050246e-05, "loss": 0.1368, "step": 3680 }, { "epoch": 2.06, "grad_norm": 0.057375673204660416, "learning_rate": 4.0933586939293076e-05, "loss": 0.345, "step": 3690 }, { "epoch": 2.06, "grad_norm": 0.015717756003141403, "learning_rate": 4.0893961008083695e-05, "loss": 0.0065, "step": 3700 }, { "epoch": 2.06, "grad_norm": 0.02194334752857685, "learning_rate": 4.085433507687431e-05, "loss": 0.0021, "step": 3710 }, { "epoch": 2.07, "grad_norm": 16.584745407104492, "learning_rate": 4.0814709145664925e-05, "loss": 0.0147, "step": 3720 }, { "epoch": 2.07, "grad_norm": 0.0053609260357916355, "learning_rate": 4.077508321445554e-05, "loss": 0.3077, "step": 3730 }, { "epoch": 2.07, "grad_norm": 0.002716219983994961, "learning_rate": 4.0735457283246156e-05, "loss": 0.4135, "step": 3740 }, { "epoch": 2.07, "grad_norm": 0.8324286937713623, "learning_rate": 4.0695831352036775e-05, "loss": 0.3889, "step": 3750 }, { "epoch": 2.07, "grad_norm": 0.004210811574012041, "learning_rate": 4.0656205420827394e-05, "loss": 0.0015, "step": 3760 }, { "epoch": 2.07, "grad_norm": 0.0026693022809922695, "learning_rate": 4.0616579489618006e-05, "loss": 0.566, "step": 3770 }, { "epoch": 2.07, "grad_norm": 0.02392963133752346, "learning_rate": 4.0576953558408625e-05, "loss": 0.0124, "step": 3780 }, { "epoch": 2.07, "grad_norm": 0.020079661160707474, "learning_rate": 4.053732762719924e-05, "loss": 0.0152, "step": 3790 }, { "epoch": 2.07, "grad_norm": 0.0014172615483403206, "learning_rate": 4.049770169598986e-05, "loss": 0.0932, "step": 3800 }, { "epoch": 2.07, "grad_norm": 5.266864776611328, "learning_rate": 4.0458075764780474e-05, "loss": 0.5559, "step": 3810 }, { "epoch": 2.07, "grad_norm": 0.009865287691354752, "learning_rate": 4.041844983357109e-05, "loss": 0.2131, "step": 3820 }, { "epoch": 2.07, "grad_norm": 0.0027454691007733345, "learning_rate": 4.0378823902361705e-05, "loss": 0.0006, "step": 3830 }, { "epoch": 2.07, "grad_norm": 0.15348024666309357, "learning_rate": 4.0339197971152324e-05, "loss": 1.0736, "step": 3840 }, { "epoch": 2.07, "grad_norm": 0.002026822417974472, "learning_rate": 4.029957203994294e-05, "loss": 0.4398, "step": 3850 }, { "epoch": 2.08, "grad_norm": 0.023032035678625107, "learning_rate": 4.025994610873356e-05, "loss": 0.3724, "step": 3860 }, { "epoch": 2.08, "grad_norm": 110.459716796875, "learning_rate": 4.0220320177524174e-05, "loss": 0.4695, "step": 3870 }, { "epoch": 2.08, "grad_norm": 0.026824643835425377, "learning_rate": 4.018069424631479e-05, "loss": 0.4355, "step": 3880 }, { "epoch": 2.08, "grad_norm": 0.007477205712348223, "learning_rate": 4.0141068315105404e-05, "loss": 0.3392, "step": 3890 }, { "epoch": 2.08, "grad_norm": 0.0020925672724843025, "learning_rate": 4.010144238389602e-05, "loss": 0.228, "step": 3900 }, { "epoch": 2.08, "grad_norm": 0.003810058580711484, "learning_rate": 4.006181645268664e-05, "loss": 0.2477, "step": 3910 }, { "epoch": 2.08, "grad_norm": 0.0076815299689769745, "learning_rate": 4.0022190521477254e-05, "loss": 0.4539, "step": 3920 }, { "epoch": 2.08, "grad_norm": 0.005379770882427692, "learning_rate": 3.998256459026787e-05, "loss": 0.0341, "step": 3930 }, { "epoch": 2.08, "grad_norm": 0.003831785172224045, "learning_rate": 3.9942938659058485e-05, "loss": 0.5954, "step": 3940 }, { "epoch": 2.08, "grad_norm": 0.7539482116699219, "learning_rate": 3.9903312727849104e-05, "loss": 0.3885, "step": 3950 }, { "epoch": 2.08, "grad_norm": 0.005533235147595406, "learning_rate": 3.986368679663972e-05, "loss": 0.0053, "step": 3960 }, { "epoch": 2.08, "grad_norm": 0.008866420015692711, "learning_rate": 3.982406086543034e-05, "loss": 0.0453, "step": 3970 }, { "epoch": 2.08, "grad_norm": 0.014108781702816486, "learning_rate": 3.978443493422095e-05, "loss": 0.2954, "step": 3980 }, { "epoch": 2.08, "grad_norm": 0.016585228964686394, "learning_rate": 3.974480900301157e-05, "loss": 0.0076, "step": 3990 }, { "epoch": 2.09, "grad_norm": 3.2773778438568115, "learning_rate": 3.970518307180219e-05, "loss": 0.1556, "step": 4000 }, { "epoch": 2.09, "grad_norm": 7.254385948181152, "learning_rate": 3.966555714059281e-05, "loss": 0.1696, "step": 4010 }, { "epoch": 2.09, "grad_norm": 0.0035600659903138876, "learning_rate": 3.962593120938342e-05, "loss": 0.0074, "step": 4020 }, { "epoch": 2.09, "grad_norm": 8.71975040435791, "learning_rate": 3.958630527817404e-05, "loss": 0.3048, "step": 4030 }, { "epoch": 2.09, "grad_norm": 0.0020627696067094803, "learning_rate": 3.954667934696465e-05, "loss": 0.7165, "step": 4040 }, { "epoch": 2.09, "grad_norm": 0.007494701538234949, "learning_rate": 3.950705341575527e-05, "loss": 0.5529, "step": 4050 }, { "epoch": 2.09, "grad_norm": 0.016065679490566254, "learning_rate": 3.946742748454589e-05, "loss": 0.0128, "step": 4060 }, { "epoch": 2.09, "grad_norm": 0.322768896818161, "learning_rate": 3.942780155333651e-05, "loss": 0.0628, "step": 4070 }, { "epoch": 2.09, "grad_norm": 0.023394137620925903, "learning_rate": 3.938817562212712e-05, "loss": 0.4743, "step": 4080 }, { "epoch": 2.09, "grad_norm": 0.0229184590280056, "learning_rate": 3.934854969091774e-05, "loss": 0.2818, "step": 4090 }, { "epoch": 2.09, "grad_norm": 0.006755081005394459, "learning_rate": 3.930892375970835e-05, "loss": 0.3635, "step": 4100 }, { "epoch": 2.09, "grad_norm": 0.004403649363666773, "learning_rate": 3.926929782849898e-05, "loss": 0.0568, "step": 4110 }, { "epoch": 2.09, "grad_norm": 0.0034377514384686947, "learning_rate": 3.922967189728959e-05, "loss": 0.1624, "step": 4120 }, { "epoch": 2.09, "grad_norm": 0.005851461086422205, "learning_rate": 3.91900459660802e-05, "loss": 0.6674, "step": 4130 }, { "epoch": 2.1, "grad_norm": 0.004862835630774498, "learning_rate": 3.915042003487082e-05, "loss": 0.5013, "step": 4140 }, { "epoch": 2.1, "grad_norm": 42.758365631103516, "learning_rate": 3.911079410366143e-05, "loss": 0.4173, "step": 4150 }, { "epoch": 2.1, "grad_norm": 0.004606719594448805, "learning_rate": 3.907116817245206e-05, "loss": 0.5328, "step": 4160 }, { "epoch": 2.1, "grad_norm": 78.40693664550781, "learning_rate": 3.903154224124267e-05, "loss": 0.4105, "step": 4170 }, { "epoch": 2.1, "grad_norm": 23.919864654541016, "learning_rate": 3.899191631003329e-05, "loss": 1.4324, "step": 4180 }, { "epoch": 2.1, "grad_norm": 0.00816379301249981, "learning_rate": 3.89522903788239e-05, "loss": 0.3002, "step": 4190 }, { "epoch": 2.1, "grad_norm": 0.022155677899718285, "learning_rate": 3.891266444761452e-05, "loss": 0.0785, "step": 4200 }, { "epoch": 2.1, "eval_accuracy": 0.9052631578947369, "eval_loss": 0.4806475341320038, "eval_runtime": 2299.8623, "eval_samples_per_second": 0.289, "eval_steps_per_second": 0.145, "step": 4209 }, { "epoch": 3.0, "grad_norm": 0.0499810166656971, "learning_rate": 3.887303851640514e-05, "loss": 0.1101, "step": 4210 }, { "epoch": 3.0, "grad_norm": 0.00219921232201159, "learning_rate": 3.883341258519576e-05, "loss": 0.0019, "step": 4220 }, { "epoch": 3.0, "grad_norm": 0.05688053369522095, "learning_rate": 3.879378665398637e-05, "loss": 0.4249, "step": 4230 }, { "epoch": 3.0, "grad_norm": 0.004060626961290836, "learning_rate": 3.875416072277699e-05, "loss": 0.329, "step": 4240 }, { "epoch": 3.0, "grad_norm": 0.4057186245918274, "learning_rate": 3.87145347915676e-05, "loss": 0.0089, "step": 4250 }, { "epoch": 3.0, "grad_norm": 0.0447358600795269, "learning_rate": 3.8674908860358226e-05, "loss": 0.0034, "step": 4260 }, { "epoch": 3.0, "grad_norm": 0.003750765696167946, "learning_rate": 3.863528292914884e-05, "loss": 0.0953, "step": 4270 }, { "epoch": 3.01, "grad_norm": 6.533902168273926, "learning_rate": 3.8595656997939456e-05, "loss": 0.0106, "step": 4280 }, { "epoch": 3.01, "grad_norm": 0.001664067734964192, "learning_rate": 3.855603106673007e-05, "loss": 0.0162, "step": 4290 }, { "epoch": 3.01, "grad_norm": 0.8516010046005249, "learning_rate": 3.851640513552069e-05, "loss": 0.4751, "step": 4300 }, { "epoch": 3.01, "grad_norm": 0.03567550331354141, "learning_rate": 3.8476779204311306e-05, "loss": 0.161, "step": 4310 }, { "epoch": 3.01, "grad_norm": 0.0029626258183270693, "learning_rate": 3.8437153273101925e-05, "loss": 0.1448, "step": 4320 }, { "epoch": 3.01, "grad_norm": 0.017234837636351585, "learning_rate": 3.839752734189254e-05, "loss": 0.0052, "step": 4330 }, { "epoch": 3.01, "grad_norm": 0.14725999534130096, "learning_rate": 3.835790141068315e-05, "loss": 0.0645, "step": 4340 }, { "epoch": 3.01, "grad_norm": 0.002782195108011365, "learning_rate": 3.831827547947377e-05, "loss": 0.0004, "step": 4350 }, { "epoch": 3.01, "grad_norm": 34.547061920166016, "learning_rate": 3.8278649548264386e-05, "loss": 0.4315, "step": 4360 }, { "epoch": 3.01, "grad_norm": 0.0030270384158939123, "learning_rate": 3.8239023617055005e-05, "loss": 0.3151, "step": 4370 }, { "epoch": 3.01, "grad_norm": 0.008927990682423115, "learning_rate": 3.819939768584562e-05, "loss": 0.0002, "step": 4380 }, { "epoch": 3.01, "grad_norm": 0.11368348449468613, "learning_rate": 3.8159771754636236e-05, "loss": 0.0077, "step": 4390 }, { "epoch": 3.01, "grad_norm": 0.10815131664276123, "learning_rate": 3.812014582342685e-05, "loss": 0.0182, "step": 4400 }, { "epoch": 3.01, "grad_norm": 0.020075034350156784, "learning_rate": 3.808051989221747e-05, "loss": 0.0007, "step": 4410 }, { "epoch": 3.02, "grad_norm": 0.001029517618007958, "learning_rate": 3.8040893961008086e-05, "loss": 0.0397, "step": 4420 }, { "epoch": 3.02, "grad_norm": 0.003121725283563137, "learning_rate": 3.8001268029798704e-05, "loss": 0.0001, "step": 4430 }, { "epoch": 3.02, "grad_norm": 5.35357141494751, "learning_rate": 3.7961642098589316e-05, "loss": 0.0069, "step": 4440 }, { "epoch": 3.02, "grad_norm": 0.013706800527870655, "learning_rate": 3.7922016167379935e-05, "loss": 0.0007, "step": 4450 }, { "epoch": 3.02, "grad_norm": 0.09196832776069641, "learning_rate": 3.7882390236170554e-05, "loss": 0.0003, "step": 4460 }, { "epoch": 3.02, "grad_norm": 0.003602321958169341, "learning_rate": 3.784276430496117e-05, "loss": 0.2969, "step": 4470 }, { "epoch": 3.02, "grad_norm": 20.944992065429688, "learning_rate": 3.7803138373751785e-05, "loss": 0.0272, "step": 4480 }, { "epoch": 3.02, "grad_norm": 0.002105366438627243, "learning_rate": 3.7763512442542404e-05, "loss": 0.0002, "step": 4490 }, { "epoch": 3.02, "grad_norm": 0.004411764442920685, "learning_rate": 3.7723886511333016e-05, "loss": 0.0076, "step": 4500 }, { "epoch": 3.02, "grad_norm": 0.005865162704139948, "learning_rate": 3.7684260580123635e-05, "loss": 0.0059, "step": 4510 }, { "epoch": 3.02, "grad_norm": 0.011046779341995716, "learning_rate": 3.764463464891425e-05, "loss": 0.0043, "step": 4520 }, { "epoch": 3.02, "grad_norm": 0.023666031658649445, "learning_rate": 3.760500871770487e-05, "loss": 0.0009, "step": 4530 }, { "epoch": 3.02, "grad_norm": 53.04268264770508, "learning_rate": 3.7565382786495484e-05, "loss": 0.3129, "step": 4540 }, { "epoch": 3.02, "grad_norm": 16.536462783813477, "learning_rate": 3.7525756855286096e-05, "loss": 0.0079, "step": 4550 }, { "epoch": 3.03, "grad_norm": 0.004224766045808792, "learning_rate": 3.7486130924076715e-05, "loss": 0.073, "step": 4560 }, { "epoch": 3.03, "grad_norm": 0.005598429590463638, "learning_rate": 3.7446504992867334e-05, "loss": 0.2742, "step": 4570 }, { "epoch": 3.03, "grad_norm": 0.0030881077982485294, "learning_rate": 3.740687906165795e-05, "loss": 0.2948, "step": 4580 }, { "epoch": 3.03, "grad_norm": 0.019152648746967316, "learning_rate": 3.7367253130448565e-05, "loss": 0.0047, "step": 4590 }, { "epoch": 3.03, "grad_norm": 0.001949524856172502, "learning_rate": 3.7327627199239183e-05, "loss": 0.0003, "step": 4600 }, { "epoch": 3.03, "grad_norm": 0.001139726140536368, "learning_rate": 3.7288001268029795e-05, "loss": 0.0033, "step": 4610 }, { "epoch": 3.03, "grad_norm": 0.0009636884205974638, "learning_rate": 3.724837533682042e-05, "loss": 0.5236, "step": 4620 }, { "epoch": 3.03, "grad_norm": 0.0024904939346015453, "learning_rate": 3.720874940561103e-05, "loss": 0.0005, "step": 4630 }, { "epoch": 3.03, "grad_norm": 0.004561484791338444, "learning_rate": 3.716912347440165e-05, "loss": 0.4394, "step": 4640 }, { "epoch": 3.03, "grad_norm": 21.228055953979492, "learning_rate": 3.7129497543192264e-05, "loss": 0.5905, "step": 4650 }, { "epoch": 3.03, "grad_norm": 38.67287063598633, "learning_rate": 3.708987161198288e-05, "loss": 0.0304, "step": 4660 }, { "epoch": 3.03, "grad_norm": 0.002863664412871003, "learning_rate": 3.70502456807735e-05, "loss": 0.4688, "step": 4670 }, { "epoch": 3.03, "grad_norm": 0.0070022111758589745, "learning_rate": 3.701061974956412e-05, "loss": 0.0044, "step": 4680 }, { "epoch": 3.03, "grad_norm": 98.50983428955078, "learning_rate": 3.697099381835473e-05, "loss": 0.2539, "step": 4690 }, { "epoch": 3.04, "grad_norm": 0.044561292976140976, "learning_rate": 3.693136788714535e-05, "loss": 0.0002, "step": 4700 }, { "epoch": 3.04, "grad_norm": 2.370043992996216, "learning_rate": 3.689174195593596e-05, "loss": 0.0055, "step": 4710 }, { "epoch": 3.04, "grad_norm": 12.61652660369873, "learning_rate": 3.685211602472658e-05, "loss": 0.456, "step": 4720 }, { "epoch": 3.04, "grad_norm": 0.020174263045191765, "learning_rate": 3.68124900935172e-05, "loss": 0.0023, "step": 4730 }, { "epoch": 3.04, "grad_norm": 0.032532501965761185, "learning_rate": 3.677286416230782e-05, "loss": 0.0004, "step": 4740 }, { "epoch": 3.04, "grad_norm": 39.96610641479492, "learning_rate": 3.673323823109843e-05, "loss": 0.5033, "step": 4750 }, { "epoch": 3.04, "grad_norm": 0.006895292084664106, "learning_rate": 3.669361229988905e-05, "loss": 0.2369, "step": 4760 }, { "epoch": 3.04, "grad_norm": 0.0018528720829635859, "learning_rate": 3.665398636867967e-05, "loss": 0.0005, "step": 4770 }, { "epoch": 3.04, "grad_norm": 57.440799713134766, "learning_rate": 3.661436043747028e-05, "loss": 0.7416, "step": 4780 }, { "epoch": 3.04, "grad_norm": 0.15606503188610077, "learning_rate": 3.65747345062609e-05, "loss": 0.0005, "step": 4790 }, { "epoch": 3.04, "grad_norm": 0.06342484056949615, "learning_rate": 3.653510857505151e-05, "loss": 0.0008, "step": 4800 }, { "epoch": 3.04, "grad_norm": 0.0007686218596063554, "learning_rate": 3.649548264384213e-05, "loss": 0.0083, "step": 4810 }, { "epoch": 3.04, "grad_norm": 0.007868933491408825, "learning_rate": 3.645585671263275e-05, "loss": 0.0002, "step": 4820 }, { "epoch": 3.04, "grad_norm": 0.0038664869498461485, "learning_rate": 3.641623078142337e-05, "loss": 0.5171, "step": 4830 }, { "epoch": 3.05, "grad_norm": 139.34559631347656, "learning_rate": 3.637660485021398e-05, "loss": 0.1279, "step": 4840 }, { "epoch": 3.05, "grad_norm": 0.01549526583403349, "learning_rate": 3.63369789190046e-05, "loss": 0.0024, "step": 4850 }, { "epoch": 3.05, "grad_norm": 0.009506451897323132, "learning_rate": 3.629735298779521e-05, "loss": 0.0322, "step": 4860 }, { "epoch": 3.05, "grad_norm": 0.01199623104184866, "learning_rate": 3.625772705658583e-05, "loss": 0.5853, "step": 4870 }, { "epoch": 3.05, "grad_norm": 0.023425359278917313, "learning_rate": 3.621810112537645e-05, "loss": 0.0074, "step": 4880 }, { "epoch": 3.05, "grad_norm": 0.0029667699709534645, "learning_rate": 3.617847519416707e-05, "loss": 0.0003, "step": 4890 }, { "epoch": 3.05, "grad_norm": 0.0053655593656003475, "learning_rate": 3.613884926295768e-05, "loss": 0.0022, "step": 4900 }, { "epoch": 3.05, "grad_norm": 0.002650972455739975, "learning_rate": 3.60992233317483e-05, "loss": 0.0286, "step": 4910 }, { "epoch": 3.05, "grad_norm": 0.029404861852526665, "learning_rate": 3.605959740053891e-05, "loss": 0.355, "step": 4920 }, { "epoch": 3.05, "grad_norm": 0.0018920317525044084, "learning_rate": 3.6019971469329536e-05, "loss": 0.0105, "step": 4930 }, { "epoch": 3.05, "grad_norm": 0.2511395812034607, "learning_rate": 3.598034553812015e-05, "loss": 0.0721, "step": 4940 }, { "epoch": 3.05, "grad_norm": 0.0022750215139240026, "learning_rate": 3.594071960691077e-05, "loss": 0.0011, "step": 4950 }, { "epoch": 3.05, "grad_norm": 0.0004970223526470363, "learning_rate": 3.590109367570138e-05, "loss": 0.0493, "step": 4960 }, { "epoch": 3.05, "grad_norm": 0.0018257640767842531, "learning_rate": 3.5861467744492e-05, "loss": 0.0041, "step": 4970 }, { "epoch": 3.05, "grad_norm": 0.002615696983411908, "learning_rate": 3.5821841813282617e-05, "loss": 0.0002, "step": 4980 }, { "epoch": 3.06, "grad_norm": 0.0015312007162719965, "learning_rate": 3.578221588207323e-05, "loss": 0.0048, "step": 4990 }, { "epoch": 3.06, "grad_norm": 0.003842801321297884, "learning_rate": 3.574258995086385e-05, "loss": 0.0001, "step": 5000 }, { "epoch": 3.06, "grad_norm": 0.003976788371801376, "learning_rate": 3.570296401965446e-05, "loss": 0.0002, "step": 5010 }, { "epoch": 3.06, "grad_norm": 0.0027057684492319822, "learning_rate": 3.566333808844508e-05, "loss": 0.0001, "step": 5020 }, { "epoch": 3.06, "grad_norm": 0.0013581090606749058, "learning_rate": 3.56237121572357e-05, "loss": 0.0643, "step": 5030 }, { "epoch": 3.06, "grad_norm": 73.48147583007812, "learning_rate": 3.5584086226026316e-05, "loss": 0.1412, "step": 5040 }, { "epoch": 3.06, "grad_norm": 0.05521896854043007, "learning_rate": 3.554446029481693e-05, "loss": 0.8526, "step": 5050 }, { "epoch": 3.06, "grad_norm": 0.01980687491595745, "learning_rate": 3.550483436360755e-05, "loss": 0.3712, "step": 5060 }, { "epoch": 3.06, "grad_norm": 0.0016443756176158786, "learning_rate": 3.546520843239816e-05, "loss": 0.0004, "step": 5070 }, { "epoch": 3.06, "grad_norm": 2.7786030769348145, "learning_rate": 3.5425582501188784e-05, "loss": 0.0024, "step": 5080 }, { "epoch": 3.06, "grad_norm": 0.002752843778580427, "learning_rate": 3.5385956569979396e-05, "loss": 0.0989, "step": 5090 }, { "epoch": 3.06, "grad_norm": 0.07084832340478897, "learning_rate": 3.5346330638770015e-05, "loss": 0.006, "step": 5100 }, { "epoch": 3.06, "grad_norm": 0.0023438192438334227, "learning_rate": 3.530670470756063e-05, "loss": 0.0631, "step": 5110 }, { "epoch": 3.06, "grad_norm": 0.023146087303757668, "learning_rate": 3.5267078776351246e-05, "loss": 0.3281, "step": 5120 }, { "epoch": 3.07, "grad_norm": 0.0026536276564002037, "learning_rate": 3.5227452845141865e-05, "loss": 0.2622, "step": 5130 }, { "epoch": 3.07, "grad_norm": 25.2746639251709, "learning_rate": 3.5187826913932483e-05, "loss": 0.9971, "step": 5140 }, { "epoch": 3.07, "grad_norm": 2.3518447875976562, "learning_rate": 3.5148200982723095e-05, "loss": 0.0101, "step": 5150 }, { "epoch": 3.07, "grad_norm": 0.004120847675949335, "learning_rate": 3.5108575051513714e-05, "loss": 0.2435, "step": 5160 }, { "epoch": 3.07, "grad_norm": 0.0018114675767719746, "learning_rate": 3.5068949120304326e-05, "loss": 0.0002, "step": 5170 }, { "epoch": 3.07, "grad_norm": 0.003768153488636017, "learning_rate": 3.5029323189094945e-05, "loss": 0.0041, "step": 5180 }, { "epoch": 3.07, "grad_norm": 0.07695072889328003, "learning_rate": 3.4989697257885564e-05, "loss": 0.0025, "step": 5190 }, { "epoch": 3.07, "grad_norm": 57.52178955078125, "learning_rate": 3.4950071326676176e-05, "loss": 0.0468, "step": 5200 }, { "epoch": 3.07, "grad_norm": 0.001811747089959681, "learning_rate": 3.4910445395466795e-05, "loss": 0.0007, "step": 5210 }, { "epoch": 3.07, "grad_norm": 0.0032631447538733482, "learning_rate": 3.487081946425741e-05, "loss": 0.0962, "step": 5220 }, { "epoch": 3.07, "grad_norm": 0.0040063695050776005, "learning_rate": 3.4831193533048026e-05, "loss": 0.0718, "step": 5230 }, { "epoch": 3.07, "grad_norm": 0.042804840952157974, "learning_rate": 3.4791567601838644e-05, "loss": 0.0038, "step": 5240 }, { "epoch": 3.07, "grad_norm": 0.0023616242688149214, "learning_rate": 3.475194167062926e-05, "loss": 0.0002, "step": 5250 }, { "epoch": 3.07, "grad_norm": 0.002275130245834589, "learning_rate": 3.4712315739419875e-05, "loss": 0.6309, "step": 5260 }, { "epoch": 3.08, "grad_norm": 0.011256784200668335, "learning_rate": 3.4672689808210494e-05, "loss": 0.0002, "step": 5270 }, { "epoch": 3.08, "grad_norm": 0.0045999023132026196, "learning_rate": 3.463306387700111e-05, "loss": 0.1079, "step": 5280 }, { "epoch": 3.08, "grad_norm": 0.001873884117230773, "learning_rate": 3.459343794579173e-05, "loss": 0.0004, "step": 5290 }, { "epoch": 3.08, "grad_norm": 0.003349520266056061, "learning_rate": 3.4553812014582344e-05, "loss": 0.0392, "step": 5300 }, { "epoch": 3.08, "grad_norm": 0.07745254039764404, "learning_rate": 3.451418608337296e-05, "loss": 0.1259, "step": 5310 }, { "epoch": 3.08, "grad_norm": 0.001707065268419683, "learning_rate": 3.4474560152163574e-05, "loss": 0.2405, "step": 5320 }, { "epoch": 3.08, "grad_norm": 0.00230118609033525, "learning_rate": 3.443493422095419e-05, "loss": 0.0026, "step": 5330 }, { "epoch": 3.08, "grad_norm": 0.03412836417555809, "learning_rate": 3.439530828974481e-05, "loss": 0.1592, "step": 5340 }, { "epoch": 3.08, "grad_norm": 0.006646712776273489, "learning_rate": 3.435568235853543e-05, "loss": 0.0253, "step": 5350 }, { "epoch": 3.08, "grad_norm": 0.10694713890552521, "learning_rate": 3.431605642732604e-05, "loss": 0.3725, "step": 5360 }, { "epoch": 3.08, "grad_norm": 0.14875371754169464, "learning_rate": 3.427643049611666e-05, "loss": 0.7354, "step": 5370 }, { "epoch": 3.08, "grad_norm": 0.06602335721254349, "learning_rate": 3.4236804564907274e-05, "loss": 0.3752, "step": 5380 }, { "epoch": 3.08, "grad_norm": 0.002122233621776104, "learning_rate": 3.41971786336979e-05, "loss": 0.0014, "step": 5390 }, { "epoch": 3.08, "grad_norm": 0.020870821550488472, "learning_rate": 3.415755270248851e-05, "loss": 0.001, "step": 5400 }, { "epoch": 3.09, "grad_norm": 51.36176681518555, "learning_rate": 3.411792677127912e-05, "loss": 0.0521, "step": 5410 }, { "epoch": 3.09, "grad_norm": 0.002612057374790311, "learning_rate": 3.407830084006974e-05, "loss": 0.3004, "step": 5420 }, { "epoch": 3.09, "grad_norm": 0.006323930341750383, "learning_rate": 3.4038674908860354e-05, "loss": 0.0007, "step": 5430 }, { "epoch": 3.09, "grad_norm": 0.010717890225350857, "learning_rate": 3.399904897765098e-05, "loss": 0.0003, "step": 5440 }, { "epoch": 3.09, "grad_norm": 0.20171226561069489, "learning_rate": 3.395942304644159e-05, "loss": 0.0007, "step": 5450 }, { "epoch": 3.09, "grad_norm": 0.0015600892947986722, "learning_rate": 3.391979711523221e-05, "loss": 0.0011, "step": 5460 }, { "epoch": 3.09, "grad_norm": 1.7858773469924927, "learning_rate": 3.388017118402282e-05, "loss": 0.0032, "step": 5470 }, { "epoch": 3.09, "grad_norm": 0.0012454432435333729, "learning_rate": 3.384054525281344e-05, "loss": 0.0002, "step": 5480 }, { "epoch": 3.09, "grad_norm": 0.0015952313551679254, "learning_rate": 3.380091932160406e-05, "loss": 0.001, "step": 5490 }, { "epoch": 3.09, "grad_norm": 0.002770837862044573, "learning_rate": 3.376129339039468e-05, "loss": 0.0008, "step": 5500 }, { "epoch": 3.09, "grad_norm": 1.3940935134887695, "learning_rate": 3.372166745918529e-05, "loss": 0.0037, "step": 5510 }, { "epoch": 3.09, "grad_norm": 0.1070881336927414, "learning_rate": 3.368204152797591e-05, "loss": 0.0003, "step": 5520 }, { "epoch": 3.09, "grad_norm": 60.142276763916016, "learning_rate": 3.364241559676652e-05, "loss": 0.4676, "step": 5530 }, { "epoch": 3.09, "grad_norm": 0.0014883485855534673, "learning_rate": 3.360278966555714e-05, "loss": 0.0003, "step": 5540 }, { "epoch": 3.1, "grad_norm": 0.002981774276122451, "learning_rate": 3.356316373434776e-05, "loss": 0.0002, "step": 5550 }, { "epoch": 3.1, "grad_norm": 0.000889226037543267, "learning_rate": 3.352353780313838e-05, "loss": 0.0001, "step": 5560 }, { "epoch": 3.1, "grad_norm": 0.006324201822280884, "learning_rate": 3.348391187192899e-05, "loss": 0.0002, "step": 5570 }, { "epoch": 3.1, "grad_norm": 0.013741032220423222, "learning_rate": 3.344428594071961e-05, "loss": 0.0038, "step": 5580 }, { "epoch": 3.1, "grad_norm": 0.0982193648815155, "learning_rate": 3.340466000951023e-05, "loss": 0.0002, "step": 5590 }, { "epoch": 3.1, "grad_norm": 0.0023921611718833447, "learning_rate": 3.336503407830085e-05, "loss": 0.0001, "step": 5600 }, { "epoch": 3.1, "grad_norm": 0.004157126881182194, "learning_rate": 3.332540814709146e-05, "loss": 0.0001, "step": 5610 }, { "epoch": 3.1, "eval_accuracy": 0.9398496240601504, "eval_loss": 0.3705739974975586, "eval_runtime": 2358.2538, "eval_samples_per_second": 0.282, "eval_steps_per_second": 0.141, "step": 5612 }, { "epoch": 4.0, "grad_norm": 0.0009104391792789102, "learning_rate": 3.328578221588208e-05, "loss": 0.3095, "step": 5620 }, { "epoch": 4.0, "grad_norm": 0.002629748312756419, "learning_rate": 3.324615628467269e-05, "loss": 0.1543, "step": 5630 }, { "epoch": 4.0, "grad_norm": 0.005159564781934023, "learning_rate": 3.320653035346331e-05, "loss": 0.0003, "step": 5640 }, { "epoch": 4.0, "grad_norm": 0.000841008557472378, "learning_rate": 3.316690442225393e-05, "loss": 0.0004, "step": 5650 }, { "epoch": 4.0, "grad_norm": 0.004792694002389908, "learning_rate": 3.312727849104454e-05, "loss": 0.0017, "step": 5660 }, { "epoch": 4.0, "grad_norm": 0.0014269945677369833, "learning_rate": 3.308765255983516e-05, "loss": 0.0002, "step": 5670 }, { "epoch": 4.0, "grad_norm": 0.0021025442983955145, "learning_rate": 3.304802662862577e-05, "loss": 0.115, "step": 5680 }, { "epoch": 4.01, "grad_norm": 0.0010108908172696829, "learning_rate": 3.300840069741639e-05, "loss": 0.0034, "step": 5690 }, { "epoch": 4.01, "grad_norm": 0.012116851285099983, "learning_rate": 3.296877476620701e-05, "loss": 0.001, "step": 5700 }, { "epoch": 4.01, "grad_norm": 28.641616821289062, "learning_rate": 3.2929148834997626e-05, "loss": 0.5096, "step": 5710 }, { "epoch": 4.01, "grad_norm": 21.132633209228516, "learning_rate": 3.288952290378824e-05, "loss": 0.0347, "step": 5720 }, { "epoch": 4.01, "grad_norm": 0.0044413842260837555, "learning_rate": 3.284989697257886e-05, "loss": 0.0145, "step": 5730 }, { "epoch": 4.01, "grad_norm": 48.15180206298828, "learning_rate": 3.281027104136947e-05, "loss": 0.0224, "step": 5740 }, { "epoch": 4.01, "grad_norm": 0.003202601335942745, "learning_rate": 3.2770645110160095e-05, "loss": 0.1978, "step": 5750 }, { "epoch": 4.01, "grad_norm": 20.207809448242188, "learning_rate": 3.273101917895071e-05, "loss": 0.0856, "step": 5760 }, { "epoch": 4.01, "grad_norm": 0.0013485507806763053, "learning_rate": 3.2691393247741326e-05, "loss": 0.0013, "step": 5770 }, { "epoch": 4.01, "grad_norm": 0.0005685106734745204, "learning_rate": 3.265176731653194e-05, "loss": 0.4924, "step": 5780 }, { "epoch": 4.01, "grad_norm": 0.0010126458946615458, "learning_rate": 3.2612141385322556e-05, "loss": 0.0096, "step": 5790 }, { "epoch": 4.01, "grad_norm": 95.05339813232422, "learning_rate": 3.2572515454113175e-05, "loss": 0.1921, "step": 5800 }, { "epoch": 4.01, "grad_norm": 0.002471966203302145, "learning_rate": 3.2532889522903794e-05, "loss": 0.0008, "step": 5810 }, { "epoch": 4.01, "grad_norm": 0.001297266804613173, "learning_rate": 3.2493263591694406e-05, "loss": 0.0002, "step": 5820 }, { "epoch": 4.02, "grad_norm": 19.474943161010742, "learning_rate": 3.2453637660485025e-05, "loss": 0.3726, "step": 5830 }, { "epoch": 4.02, "grad_norm": 0.015567510388791561, "learning_rate": 3.241401172927564e-05, "loss": 0.0003, "step": 5840 }, { "epoch": 4.02, "grad_norm": 0.002833213657140732, "learning_rate": 3.2374385798066256e-05, "loss": 0.6715, "step": 5850 }, { "epoch": 4.02, "grad_norm": 1.4238035678863525, "learning_rate": 3.2334759866856875e-05, "loss": 0.0008, "step": 5860 }, { "epoch": 4.02, "grad_norm": 0.0025125148240476847, "learning_rate": 3.2295133935647487e-05, "loss": 0.055, "step": 5870 }, { "epoch": 4.02, "grad_norm": 0.001612589810974896, "learning_rate": 3.2255508004438105e-05, "loss": 0.1114, "step": 5880 }, { "epoch": 4.02, "grad_norm": 0.0115219596773386, "learning_rate": 3.221588207322872e-05, "loss": 0.0002, "step": 5890 }, { "epoch": 4.02, "grad_norm": 0.002122466452419758, "learning_rate": 3.217625614201934e-05, "loss": 0.0463, "step": 5900 }, { "epoch": 4.02, "grad_norm": 0.012210741639137268, "learning_rate": 3.2136630210809955e-05, "loss": 0.0038, "step": 5910 }, { "epoch": 4.02, "grad_norm": 0.006696117110550404, "learning_rate": 3.2097004279600574e-05, "loss": 0.0006, "step": 5920 }, { "epoch": 4.02, "grad_norm": 0.003299353178590536, "learning_rate": 3.2057378348391186e-05, "loss": 0.4329, "step": 5930 }, { "epoch": 4.02, "grad_norm": 0.017031671479344368, "learning_rate": 3.2017752417181805e-05, "loss": 0.0014, "step": 5940 }, { "epoch": 4.02, "grad_norm": 0.008915259502828121, "learning_rate": 3.197812648597242e-05, "loss": 0.0003, "step": 5950 }, { "epoch": 4.02, "grad_norm": 0.0033541598822921515, "learning_rate": 3.193850055476304e-05, "loss": 0.0261, "step": 5960 }, { "epoch": 4.03, "grad_norm": 0.0039758519269526005, "learning_rate": 3.1898874623553654e-05, "loss": 0.0701, "step": 5970 }, { "epoch": 4.03, "grad_norm": 0.0859561562538147, "learning_rate": 3.185924869234427e-05, "loss": 0.0012, "step": 5980 }, { "epoch": 4.03, "grad_norm": 0.0011740931076928973, "learning_rate": 3.1819622761134885e-05, "loss": 0.0009, "step": 5990 }, { "epoch": 4.03, "grad_norm": 0.0011881589889526367, "learning_rate": 3.1779996829925504e-05, "loss": 0.1377, "step": 6000 }, { "epoch": 4.03, "grad_norm": 0.3393727242946625, "learning_rate": 3.174037089871612e-05, "loss": 0.0085, "step": 6010 }, { "epoch": 4.03, "grad_norm": 0.26653632521629333, "learning_rate": 3.170074496750674e-05, "loss": 0.5985, "step": 6020 }, { "epoch": 4.03, "grad_norm": 71.80652618408203, "learning_rate": 3.1661119036297353e-05, "loss": 0.1081, "step": 6030 }, { "epoch": 4.03, "grad_norm": 0.3816182017326355, "learning_rate": 3.162149310508797e-05, "loss": 0.0015, "step": 6040 }, { "epoch": 4.03, "grad_norm": 3.339017629623413, "learning_rate": 3.1581867173878584e-05, "loss": 0.0737, "step": 6050 }, { "epoch": 4.03, "grad_norm": 0.0013679158873856068, "learning_rate": 3.15422412426692e-05, "loss": 0.2465, "step": 6060 }, { "epoch": 4.03, "grad_norm": 0.003948609344661236, "learning_rate": 3.150261531145982e-05, "loss": 0.2111, "step": 6070 }, { "epoch": 4.03, "grad_norm": 0.005721264984458685, "learning_rate": 3.1462989380250434e-05, "loss": 0.0024, "step": 6080 }, { "epoch": 4.03, "grad_norm": 0.0018883657176047564, "learning_rate": 3.142336344904105e-05, "loss": 0.2687, "step": 6090 }, { "epoch": 4.03, "grad_norm": 0.008159175515174866, "learning_rate": 3.138373751783167e-05, "loss": 0.1406, "step": 6100 }, { "epoch": 4.04, "grad_norm": 0.009790794923901558, "learning_rate": 3.134411158662229e-05, "loss": 0.2122, "step": 6110 }, { "epoch": 4.04, "grad_norm": 0.645839273929596, "learning_rate": 3.13044856554129e-05, "loss": 0.0223, "step": 6120 }, { "epoch": 4.04, "grad_norm": 0.0012109485687687993, "learning_rate": 3.126485972420352e-05, "loss": 0.2131, "step": 6130 }, { "epoch": 4.04, "grad_norm": 0.005074062384665012, "learning_rate": 3.122523379299413e-05, "loss": 0.4669, "step": 6140 }, { "epoch": 4.04, "grad_norm": 0.04010836407542229, "learning_rate": 3.118560786178475e-05, "loss": 0.012, "step": 6150 }, { "epoch": 4.04, "grad_norm": 0.018426967784762383, "learning_rate": 3.114598193057537e-05, "loss": 0.0008, "step": 6160 }, { "epoch": 4.04, "grad_norm": 0.0035447929985821247, "learning_rate": 3.110635599936599e-05, "loss": 0.1271, "step": 6170 }, { "epoch": 4.04, "grad_norm": 0.012344791553914547, "learning_rate": 3.10667300681566e-05, "loss": 0.0002, "step": 6180 }, { "epoch": 4.04, "grad_norm": 0.0015085155609995127, "learning_rate": 3.102710413694722e-05, "loss": 0.0064, "step": 6190 }, { "epoch": 4.04, "grad_norm": 0.0013396035647019744, "learning_rate": 3.098747820573783e-05, "loss": 0.0003, "step": 6200 }, { "epoch": 4.04, "grad_norm": 0.007324972189962864, "learning_rate": 3.094785227452846e-05, "loss": 0.0001, "step": 6210 }, { "epoch": 4.04, "grad_norm": 0.029165761545300484, "learning_rate": 3.090822634331907e-05, "loss": 0.0002, "step": 6220 }, { "epoch": 4.04, "grad_norm": 0.006251147948205471, "learning_rate": 3.086860041210969e-05, "loss": 0.0001, "step": 6230 }, { "epoch": 4.04, "grad_norm": 0.0033136485144495964, "learning_rate": 3.08289744809003e-05, "loss": 0.1959, "step": 6240 }, { "epoch": 4.05, "grad_norm": 15.712539672851562, "learning_rate": 3.078934854969092e-05, "loss": 0.0053, "step": 6250 }, { "epoch": 4.05, "grad_norm": 0.004770079627633095, "learning_rate": 3.074972261848154e-05, "loss": 0.2429, "step": 6260 }, { "epoch": 4.05, "grad_norm": 0.001170918345451355, "learning_rate": 3.071009668727215e-05, "loss": 0.4537, "step": 6270 }, { "epoch": 4.05, "grad_norm": 0.003140375716611743, "learning_rate": 3.067047075606277e-05, "loss": 0.0003, "step": 6280 }, { "epoch": 4.05, "grad_norm": 0.005154268350452185, "learning_rate": 3.063084482485338e-05, "loss": 0.0002, "step": 6290 }, { "epoch": 4.05, "grad_norm": 0.718346357345581, "learning_rate": 3.0591218893644e-05, "loss": 0.0039, "step": 6300 }, { "epoch": 4.05, "grad_norm": 0.29760679602622986, "learning_rate": 3.055159296243462e-05, "loss": 0.0325, "step": 6310 }, { "epoch": 4.05, "grad_norm": 0.0015770556638017297, "learning_rate": 3.0511967031225234e-05, "loss": 0.1031, "step": 6320 }, { "epoch": 4.05, "grad_norm": 14.039325714111328, "learning_rate": 3.047234110001585e-05, "loss": 0.0254, "step": 6330 }, { "epoch": 4.05, "grad_norm": 12.89113998413086, "learning_rate": 3.043271516880647e-05, "loss": 0.0182, "step": 6340 }, { "epoch": 4.05, "grad_norm": 0.0020349326077848673, "learning_rate": 3.0393089237597084e-05, "loss": 0.0047, "step": 6350 }, { "epoch": 4.05, "grad_norm": 0.0006648111157119274, "learning_rate": 3.0353463306387703e-05, "loss": 0.0111, "step": 6360 }, { "epoch": 4.05, "grad_norm": 0.00324794533662498, "learning_rate": 3.0313837375178318e-05, "loss": 0.001, "step": 6370 }, { "epoch": 4.05, "grad_norm": 0.002352567156776786, "learning_rate": 3.0274211443968937e-05, "loss": 0.5155, "step": 6380 }, { "epoch": 4.06, "grad_norm": 0.0007183744455687702, "learning_rate": 3.023458551275955e-05, "loss": 0.1828, "step": 6390 }, { "epoch": 4.06, "grad_norm": 0.0010205942671746016, "learning_rate": 3.019495958155017e-05, "loss": 0.0004, "step": 6400 }, { "epoch": 4.06, "grad_norm": 0.0007507322006858885, "learning_rate": 3.0155333650340783e-05, "loss": 0.0078, "step": 6410 }, { "epoch": 4.06, "grad_norm": 0.0010719618294388056, "learning_rate": 3.0115707719131402e-05, "loss": 0.0024, "step": 6420 }, { "epoch": 4.06, "grad_norm": 0.004630456678569317, "learning_rate": 3.0076081787922017e-05, "loss": 0.0001, "step": 6430 }, { "epoch": 4.06, "grad_norm": 110.9379653930664, "learning_rate": 3.0036455856712636e-05, "loss": 0.2711, "step": 6440 }, { "epoch": 4.06, "grad_norm": 0.0028752069920301437, "learning_rate": 2.999682992550325e-05, "loss": 0.1684, "step": 6450 }, { "epoch": 4.06, "grad_norm": 0.00176974234636873, "learning_rate": 2.995720399429387e-05, "loss": 0.1832, "step": 6460 }, { "epoch": 4.06, "grad_norm": 0.0004082988016307354, "learning_rate": 2.9917578063084482e-05, "loss": 0.1391, "step": 6470 }, { "epoch": 4.06, "grad_norm": 91.996337890625, "learning_rate": 2.9877952131875105e-05, "loss": 0.0717, "step": 6480 }, { "epoch": 4.06, "grad_norm": 0.3914591372013092, "learning_rate": 2.9838326200665717e-05, "loss": 0.0006, "step": 6490 }, { "epoch": 4.06, "grad_norm": 0.0014606121694669127, "learning_rate": 2.9798700269456332e-05, "loss": 0.0002, "step": 6500 }, { "epoch": 4.06, "grad_norm": 0.002047004410997033, "learning_rate": 2.975907433824695e-05, "loss": 0.243, "step": 6510 }, { "epoch": 4.06, "grad_norm": 0.0009985043434426188, "learning_rate": 2.9719448407037563e-05, "loss": 0.0339, "step": 6520 }, { "epoch": 4.07, "grad_norm": 0.0007074224413372576, "learning_rate": 2.9679822475828185e-05, "loss": 0.0015, "step": 6530 }, { "epoch": 4.07, "grad_norm": 0.004130239132791758, "learning_rate": 2.9640196544618797e-05, "loss": 0.0001, "step": 6540 }, { "epoch": 4.07, "grad_norm": 0.004487643018364906, "learning_rate": 2.9600570613409416e-05, "loss": 0.0002, "step": 6550 }, { "epoch": 4.07, "grad_norm": 0.001936771790497005, "learning_rate": 2.956094468220003e-05, "loss": 0.0001, "step": 6560 }, { "epoch": 4.07, "grad_norm": 0.004075042437762022, "learning_rate": 2.952131875099065e-05, "loss": 0.3415, "step": 6570 }, { "epoch": 4.07, "grad_norm": 0.05164702981710434, "learning_rate": 2.9481692819781266e-05, "loss": 0.0001, "step": 6580 }, { "epoch": 4.07, "grad_norm": 0.0014617941342294216, "learning_rate": 2.9442066888571884e-05, "loss": 0.0001, "step": 6590 }, { "epoch": 4.07, "grad_norm": 0.0017148368060588837, "learning_rate": 2.94024409573625e-05, "loss": 0.0753, "step": 6600 }, { "epoch": 4.07, "grad_norm": 0.003370764898136258, "learning_rate": 2.936281502615312e-05, "loss": 0.0072, "step": 6610 }, { "epoch": 4.07, "grad_norm": 0.003846656298264861, "learning_rate": 2.932318909494373e-05, "loss": 0.0001, "step": 6620 }, { "epoch": 4.07, "grad_norm": 0.002365513239055872, "learning_rate": 2.928356316373435e-05, "loss": 0.0008, "step": 6630 }, { "epoch": 4.07, "grad_norm": 0.0008402117528021336, "learning_rate": 2.9243937232524965e-05, "loss": 0.0001, "step": 6640 }, { "epoch": 4.07, "grad_norm": 0.004054752178490162, "learning_rate": 2.9204311301315584e-05, "loss": 0.0548, "step": 6650 }, { "epoch": 4.07, "grad_norm": 0.0017859063809737563, "learning_rate": 2.91646853701062e-05, "loss": 0.0001, "step": 6660 }, { "epoch": 4.08, "grad_norm": 3.045167922973633, "learning_rate": 2.9125059438896818e-05, "loss": 0.0106, "step": 6670 }, { "epoch": 4.08, "grad_norm": 0.034478865563869476, "learning_rate": 2.9085433507687433e-05, "loss": 0.001, "step": 6680 }, { "epoch": 4.08, "grad_norm": 0.0020598298870027065, "learning_rate": 2.9045807576478052e-05, "loss": 0.9616, "step": 6690 }, { "epoch": 4.08, "grad_norm": 0.009513617493212223, "learning_rate": 2.9006181645268664e-05, "loss": 0.0601, "step": 6700 }, { "epoch": 4.08, "grad_norm": 0.10755365341901779, "learning_rate": 2.896655571405928e-05, "loss": 0.4313, "step": 6710 }, { "epoch": 4.08, "grad_norm": 89.41072845458984, "learning_rate": 2.8926929782849898e-05, "loss": 0.7753, "step": 6720 }, { "epoch": 4.08, "grad_norm": 0.005557952914386988, "learning_rate": 2.8887303851640514e-05, "loss": 0.022, "step": 6730 }, { "epoch": 4.08, "grad_norm": 0.07544991374015808, "learning_rate": 2.8847677920431132e-05, "loss": 0.1043, "step": 6740 }, { "epoch": 4.08, "grad_norm": 0.004230760037899017, "learning_rate": 2.8808051989221744e-05, "loss": 0.2305, "step": 6750 }, { "epoch": 4.08, "grad_norm": 0.0005384175456129014, "learning_rate": 2.8768426058012367e-05, "loss": 0.0003, "step": 6760 }, { "epoch": 4.08, "grad_norm": 0.0020217718556523323, "learning_rate": 2.872880012680298e-05, "loss": 0.0008, "step": 6770 }, { "epoch": 4.08, "grad_norm": 0.001009553438052535, "learning_rate": 2.8689174195593598e-05, "loss": 0.3165, "step": 6780 }, { "epoch": 4.08, "grad_norm": 0.002491355175152421, "learning_rate": 2.8649548264384213e-05, "loss": 0.5646, "step": 6790 }, { "epoch": 4.08, "grad_norm": 0.002977263880893588, "learning_rate": 2.8609922333174832e-05, "loss": 0.0001, "step": 6800 }, { "epoch": 4.09, "grad_norm": 0.0012742202961817384, "learning_rate": 2.8570296401965447e-05, "loss": 0.2827, "step": 6810 }, { "epoch": 4.09, "grad_norm": 0.0030132278334349394, "learning_rate": 2.8530670470756066e-05, "loss": 0.0006, "step": 6820 }, { "epoch": 4.09, "grad_norm": 0.06876442581415176, "learning_rate": 2.8491044539546678e-05, "loss": 0.0062, "step": 6830 }, { "epoch": 4.09, "grad_norm": 0.008195163682103157, "learning_rate": 2.84514186083373e-05, "loss": 0.0145, "step": 6840 }, { "epoch": 4.09, "grad_norm": 0.0023913795594125986, "learning_rate": 2.8411792677127912e-05, "loss": 0.022, "step": 6850 }, { "epoch": 4.09, "grad_norm": 0.0004799796442966908, "learning_rate": 2.837216674591853e-05, "loss": 0.2252, "step": 6860 }, { "epoch": 4.09, "grad_norm": 0.11730991303920746, "learning_rate": 2.8332540814709146e-05, "loss": 0.0033, "step": 6870 }, { "epoch": 4.09, "grad_norm": 0.01119227148592472, "learning_rate": 2.8292914883499765e-05, "loss": 0.0764, "step": 6880 }, { "epoch": 4.09, "grad_norm": 1.4117075204849243, "learning_rate": 2.825328895229038e-05, "loss": 0.0313, "step": 6890 }, { "epoch": 4.09, "grad_norm": 0.9569471478462219, "learning_rate": 2.8213663021081e-05, "loss": 0.0037, "step": 6900 }, { "epoch": 4.09, "grad_norm": 0.001442433800548315, "learning_rate": 2.8174037089871615e-05, "loss": 0.0001, "step": 6910 }, { "epoch": 4.09, "grad_norm": 0.0015686535043641925, "learning_rate": 2.8134411158662227e-05, "loss": 0.2835, "step": 6920 }, { "epoch": 4.09, "grad_norm": 0.00151319510769099, "learning_rate": 2.8094785227452846e-05, "loss": 0.0004, "step": 6930 }, { "epoch": 4.09, "grad_norm": 0.00654405914247036, "learning_rate": 2.805515929624346e-05, "loss": 0.0011, "step": 6940 }, { "epoch": 4.1, "grad_norm": 0.0008709866087883711, "learning_rate": 2.801553336503408e-05, "loss": 0.0128, "step": 6950 }, { "epoch": 4.1, "grad_norm": 0.00043904109043069184, "learning_rate": 2.7975907433824695e-05, "loss": 0.8107, "step": 6960 }, { "epoch": 4.1, "grad_norm": 0.00742549542337656, "learning_rate": 2.7936281502615314e-05, "loss": 0.3098, "step": 6970 }, { "epoch": 4.1, "grad_norm": 0.0007969782454892993, "learning_rate": 2.7896655571405926e-05, "loss": 0.0001, "step": 6980 }, { "epoch": 4.1, "grad_norm": 52.47648620605469, "learning_rate": 2.7857029640196548e-05, "loss": 0.4694, "step": 6990 }, { "epoch": 4.1, "grad_norm": 0.0088576078414917, "learning_rate": 2.781740370898716e-05, "loss": 0.0016, "step": 7000 }, { "epoch": 4.1, "grad_norm": 3.6093878746032715, "learning_rate": 2.777777777777778e-05, "loss": 0.054, "step": 7010 }, { "epoch": 4.1, "eval_accuracy": 0.9368421052631579, "eval_loss": 0.4006503224372864, "eval_runtime": 2328.3346, "eval_samples_per_second": 0.286, "eval_steps_per_second": 0.143, "step": 7015 }, { "epoch": 5.0, "grad_norm": 0.0007481848588213325, "learning_rate": 2.7738151846568395e-05, "loss": 0.0007, "step": 7020 }, { "epoch": 5.0, "grad_norm": 162.9553680419922, "learning_rate": 2.7698525915359013e-05, "loss": 0.4298, "step": 7030 }, { "epoch": 5.0, "grad_norm": 0.0006780726835131645, "learning_rate": 2.765889998414963e-05, "loss": 0.4696, "step": 7040 }, { "epoch": 5.0, "grad_norm": 0.0014015401247888803, "learning_rate": 2.7619274052940248e-05, "loss": 0.028, "step": 7050 }, { "epoch": 5.0, "grad_norm": 0.00443660095334053, "learning_rate": 2.757964812173086e-05, "loss": 0.0078, "step": 7060 }, { "epoch": 5.0, "grad_norm": 4.740740776062012, "learning_rate": 2.7540022190521482e-05, "loss": 0.1712, "step": 7070 }, { "epoch": 5.0, "grad_norm": 0.0052452534437179565, "learning_rate": 2.7500396259312094e-05, "loss": 0.0001, "step": 7080 }, { "epoch": 5.01, "grad_norm": 0.0006377240642905235, "learning_rate": 2.7460770328102713e-05, "loss": 0.045, "step": 7090 }, { "epoch": 5.01, "grad_norm": 0.0011151348007842898, "learning_rate": 2.7421144396893328e-05, "loss": 0.0001, "step": 7100 }, { "epoch": 5.01, "grad_norm": 0.0006300232489593327, "learning_rate": 2.7381518465683947e-05, "loss": 0.0004, "step": 7110 }, { "epoch": 5.01, "grad_norm": 0.008152210153639317, "learning_rate": 2.7341892534474562e-05, "loss": 0.0002, "step": 7120 }, { "epoch": 5.01, "grad_norm": 0.0016102171503007412, "learning_rate": 2.7302266603265174e-05, "loss": 0.0302, "step": 7130 }, { "epoch": 5.01, "grad_norm": 0.0014644188340753317, "learning_rate": 2.7262640672055796e-05, "loss": 0.0, "step": 7140 }, { "epoch": 5.01, "grad_norm": 0.0012343927519395947, "learning_rate": 2.722301474084641e-05, "loss": 0.001, "step": 7150 }, { "epoch": 5.01, "grad_norm": 0.002109797904267907, "learning_rate": 2.7183388809637027e-05, "loss": 0.0003, "step": 7160 }, { "epoch": 5.01, "grad_norm": 0.0012583807110786438, "learning_rate": 2.7143762878427643e-05, "loss": 0.0001, "step": 7170 }, { "epoch": 5.01, "grad_norm": 0.0009702452807687223, "learning_rate": 2.710413694721826e-05, "loss": 0.1802, "step": 7180 }, { "epoch": 5.01, "grad_norm": 0.004518999718129635, "learning_rate": 2.7064511016008877e-05, "loss": 0.0001, "step": 7190 }, { "epoch": 5.01, "grad_norm": 0.0008531950297765434, "learning_rate": 2.7024885084799496e-05, "loss": 0.0001, "step": 7200 }, { "epoch": 5.01, "grad_norm": 0.003954921383410692, "learning_rate": 2.6985259153590108e-05, "loss": 0.0001, "step": 7210 }, { "epoch": 5.01, "grad_norm": 0.0006554504507221282, "learning_rate": 2.694563322238073e-05, "loss": 0.0002, "step": 7220 }, { "epoch": 5.02, "grad_norm": 0.0011577644618228078, "learning_rate": 2.6906007291171342e-05, "loss": 0.006, "step": 7230 }, { "epoch": 5.02, "grad_norm": 0.0004994067130610347, "learning_rate": 2.686638135996196e-05, "loss": 0.0001, "step": 7240 }, { "epoch": 5.02, "grad_norm": 0.006224981974810362, "learning_rate": 2.6826755428752576e-05, "loss": 0.4425, "step": 7250 }, { "epoch": 5.02, "grad_norm": 0.00843863096088171, "learning_rate": 2.6787129497543195e-05, "loss": 0.1975, "step": 7260 }, { "epoch": 5.02, "grad_norm": 0.0011182260932400823, "learning_rate": 2.674750356633381e-05, "loss": 0.0002, "step": 7270 }, { "epoch": 5.02, "grad_norm": 0.0012028939090669155, "learning_rate": 2.670787763512443e-05, "loss": 0.0001, "step": 7280 }, { "epoch": 5.02, "grad_norm": 0.0064741140231490135, "learning_rate": 2.666825170391504e-05, "loss": 0.0066, "step": 7290 }, { "epoch": 5.02, "grad_norm": 0.0013653126079589128, "learning_rate": 2.6628625772705663e-05, "loss": 0.0802, "step": 7300 }, { "epoch": 5.02, "grad_norm": 0.0032840375788509846, "learning_rate": 2.6588999841496275e-05, "loss": 0.0495, "step": 7310 }, { "epoch": 5.02, "grad_norm": 0.006207801401615143, "learning_rate": 2.6549373910286894e-05, "loss": 0.0001, "step": 7320 }, { "epoch": 5.02, "grad_norm": 0.0015818944666534662, "learning_rate": 2.650974797907751e-05, "loss": 0.0935, "step": 7330 }, { "epoch": 5.02, "grad_norm": 0.0013846838846802711, "learning_rate": 2.647012204786813e-05, "loss": 0.0101, "step": 7340 }, { "epoch": 5.02, "grad_norm": 0.0015213302103802562, "learning_rate": 2.6430496116658744e-05, "loss": 0.0001, "step": 7350 }, { "epoch": 5.02, "grad_norm": 0.0016765915788710117, "learning_rate": 2.6390870185449356e-05, "loss": 0.0008, "step": 7360 }, { "epoch": 5.03, "grad_norm": 0.0029850786086171865, "learning_rate": 2.6351244254239975e-05, "loss": 0.5417, "step": 7370 }, { "epoch": 5.03, "grad_norm": 0.0028296930249780416, "learning_rate": 2.631161832303059e-05, "loss": 0.0029, "step": 7380 }, { "epoch": 5.03, "grad_norm": 0.19774562120437622, "learning_rate": 2.627199239182121e-05, "loss": 0.0424, "step": 7390 }, { "epoch": 5.03, "grad_norm": 0.20521485805511475, "learning_rate": 2.6232366460611824e-05, "loss": 0.0003, "step": 7400 }, { "epoch": 5.03, "grad_norm": 3.243302822113037, "learning_rate": 2.6192740529402443e-05, "loss": 0.0033, "step": 7410 }, { "epoch": 5.03, "grad_norm": 0.002176284557208419, "learning_rate": 2.615311459819306e-05, "loss": 0.0905, "step": 7420 }, { "epoch": 5.03, "grad_norm": 0.00346784177236259, "learning_rate": 2.6113488666983677e-05, "loss": 0.0058, "step": 7430 }, { "epoch": 5.03, "grad_norm": 0.0022136277984827757, "learning_rate": 2.607386273577429e-05, "loss": 0.0145, "step": 7440 }, { "epoch": 5.03, "grad_norm": 0.0054547772742807865, "learning_rate": 2.603423680456491e-05, "loss": 0.0001, "step": 7450 }, { "epoch": 5.03, "grad_norm": 0.0017041038954630494, "learning_rate": 2.5994610873355524e-05, "loss": 0.0043, "step": 7460 }, { "epoch": 5.03, "grad_norm": 0.00526059465482831, "learning_rate": 2.5954984942146142e-05, "loss": 0.0001, "step": 7470 }, { "epoch": 5.03, "grad_norm": 0.0015646722167730331, "learning_rate": 2.5915359010936758e-05, "loss": 0.0001, "step": 7480 }, { "epoch": 5.03, "grad_norm": 0.0014299266040325165, "learning_rate": 2.5875733079727377e-05, "loss": 0.0001, "step": 7490 }, { "epoch": 5.03, "grad_norm": 0.857555627822876, "learning_rate": 2.5836107148517992e-05, "loss": 0.0494, "step": 7500 }, { "epoch": 5.04, "grad_norm": 0.0019163636025041342, "learning_rate": 2.579648121730861e-05, "loss": 0.0001, "step": 7510 }, { "epoch": 5.04, "grad_norm": 0.001081604859791696, "learning_rate": 2.5756855286099223e-05, "loss": 0.0001, "step": 7520 }, { "epoch": 5.04, "grad_norm": 0.002402815269306302, "learning_rate": 2.5717229354889845e-05, "loss": 0.0001, "step": 7530 }, { "epoch": 5.04, "grad_norm": 0.0032065189443528652, "learning_rate": 2.5677603423680457e-05, "loss": 0.5271, "step": 7540 }, { "epoch": 5.04, "grad_norm": 0.0037377572152763605, "learning_rate": 2.5637977492471076e-05, "loss": 0.0001, "step": 7550 }, { "epoch": 5.04, "grad_norm": 0.0010730663780122995, "learning_rate": 2.559835156126169e-05, "loss": 0.0001, "step": 7560 }, { "epoch": 5.04, "grad_norm": 0.018039198592305183, "learning_rate": 2.5558725630052303e-05, "loss": 0.1574, "step": 7570 }, { "epoch": 5.04, "grad_norm": 0.0008627079077996314, "learning_rate": 2.5519099698842925e-05, "loss": 0.0004, "step": 7580 }, { "epoch": 5.04, "grad_norm": 0.00304847932420671, "learning_rate": 2.5479473767633537e-05, "loss": 0.0002, "step": 7590 }, { "epoch": 5.04, "grad_norm": 56.73731231689453, "learning_rate": 2.5439847836424156e-05, "loss": 0.2908, "step": 7600 }, { "epoch": 5.04, "grad_norm": 0.0014052072074264288, "learning_rate": 2.540022190521477e-05, "loss": 0.0001, "step": 7610 }, { "epoch": 5.04, "grad_norm": 0.0024271756410598755, "learning_rate": 2.536059597400539e-05, "loss": 0.0363, "step": 7620 }, { "epoch": 5.04, "grad_norm": 0.0011607712367549539, "learning_rate": 2.5320970042796006e-05, "loss": 0.0703, "step": 7630 }, { "epoch": 5.04, "grad_norm": 0.0010089229326695204, "learning_rate": 2.5281344111586625e-05, "loss": 0.0001, "step": 7640 }, { "epoch": 5.05, "grad_norm": 0.0012477770214900374, "learning_rate": 2.524171818037724e-05, "loss": 0.471, "step": 7650 }, { "epoch": 5.05, "grad_norm": 0.0015396666713058949, "learning_rate": 2.520209224916786e-05, "loss": 0.2129, "step": 7660 }, { "epoch": 5.05, "grad_norm": 0.000801810878328979, "learning_rate": 2.516246631795847e-05, "loss": 0.0314, "step": 7670 }, { "epoch": 5.05, "grad_norm": 0.0009846306638792157, "learning_rate": 2.512284038674909e-05, "loss": 0.0003, "step": 7680 }, { "epoch": 5.05, "grad_norm": 0.03625110909342766, "learning_rate": 2.5083214455539705e-05, "loss": 0.0016, "step": 7690 }, { "epoch": 5.05, "grad_norm": 0.14931851625442505, "learning_rate": 2.5043588524330324e-05, "loss": 0.4488, "step": 7700 }, { "epoch": 5.05, "grad_norm": 0.007826775312423706, "learning_rate": 2.500396259312094e-05, "loss": 0.0002, "step": 7710 }, { "epoch": 5.05, "grad_norm": 0.00988730974495411, "learning_rate": 2.4964336661911555e-05, "loss": 0.0001, "step": 7720 }, { "epoch": 5.05, "grad_norm": 0.0005387517157942057, "learning_rate": 2.4924710730702174e-05, "loss": 0.7611, "step": 7730 }, { "epoch": 5.05, "grad_norm": 0.0011877217330038548, "learning_rate": 2.488508479949279e-05, "loss": 0.0001, "step": 7740 }, { "epoch": 5.05, "grad_norm": 0.019128194078803062, "learning_rate": 2.4845458868283404e-05, "loss": 0.517, "step": 7750 }, { "epoch": 5.05, "grad_norm": 1.5278313159942627, "learning_rate": 2.4805832937074023e-05, "loss": 0.0012, "step": 7760 }, { "epoch": 5.05, "grad_norm": 0.0027036985848098993, "learning_rate": 2.476620700586464e-05, "loss": 0.5882, "step": 7770 }, { "epoch": 5.05, "grad_norm": 0.002757065463811159, "learning_rate": 2.4726581074655254e-05, "loss": 0.0155, "step": 7780 }, { "epoch": 5.06, "grad_norm": 0.004905847366899252, "learning_rate": 2.4686955143445873e-05, "loss": 0.0102, "step": 7790 }, { "epoch": 5.06, "grad_norm": 0.0014356361934915185, "learning_rate": 2.4647329212236488e-05, "loss": 0.0001, "step": 7800 }, { "epoch": 5.06, "grad_norm": 3.6968801021575928, "learning_rate": 2.4607703281027107e-05, "loss": 0.2234, "step": 7810 }, { "epoch": 5.06, "grad_norm": 66.777099609375, "learning_rate": 2.4568077349817722e-05, "loss": 0.3216, "step": 7820 }, { "epoch": 5.06, "grad_norm": 0.001242569531314075, "learning_rate": 2.4528451418608338e-05, "loss": 0.0003, "step": 7830 }, { "epoch": 5.06, "grad_norm": 0.0016161628300324082, "learning_rate": 2.4488825487398957e-05, "loss": 0.024, "step": 7840 }, { "epoch": 5.06, "grad_norm": 0.06756754219532013, "learning_rate": 2.4449199556189572e-05, "loss": 0.0614, "step": 7850 }, { "epoch": 5.06, "grad_norm": 0.0006389593472704291, "learning_rate": 2.440957362498019e-05, "loss": 0.1259, "step": 7860 }, { "epoch": 5.06, "grad_norm": 0.004206878133118153, "learning_rate": 2.4369947693770806e-05, "loss": 0.2293, "step": 7870 }, { "epoch": 5.06, "grad_norm": 0.0025491828564554453, "learning_rate": 2.433032176256142e-05, "loss": 0.0004, "step": 7880 }, { "epoch": 5.06, "grad_norm": 0.0015132069820538163, "learning_rate": 2.4290695831352037e-05, "loss": 0.0453, "step": 7890 }, { "epoch": 5.06, "grad_norm": 0.0013023455394431949, "learning_rate": 2.4251069900142652e-05, "loss": 0.515, "step": 7900 }, { "epoch": 5.06, "grad_norm": 0.0006147758103907108, "learning_rate": 2.421144396893327e-05, "loss": 0.0004, "step": 7910 }, { "epoch": 5.06, "grad_norm": 0.0013257160317152739, "learning_rate": 2.4171818037723887e-05, "loss": 0.0004, "step": 7920 }, { "epoch": 5.07, "grad_norm": 0.0010351515375077724, "learning_rate": 2.4132192106514502e-05, "loss": 0.2861, "step": 7930 }, { "epoch": 5.07, "grad_norm": 0.004010920412838459, "learning_rate": 2.409256617530512e-05, "loss": 0.144, "step": 7940 }, { "epoch": 5.07, "grad_norm": 0.002655152464285493, "learning_rate": 2.4052940244095736e-05, "loss": 0.5804, "step": 7950 }, { "epoch": 5.07, "grad_norm": 0.009208135306835175, "learning_rate": 2.4013314312886355e-05, "loss": 0.0008, "step": 7960 }, { "epoch": 5.07, "grad_norm": 124.37940979003906, "learning_rate": 2.397368838167697e-05, "loss": 0.1359, "step": 7970 }, { "epoch": 5.07, "grad_norm": 0.0007841124897822738, "learning_rate": 2.3934062450467586e-05, "loss": 0.0073, "step": 7980 }, { "epoch": 5.07, "grad_norm": 14.345431327819824, "learning_rate": 2.3894436519258205e-05, "loss": 0.009, "step": 7990 }, { "epoch": 5.07, "grad_norm": 0.0012639712076634169, "learning_rate": 2.385481058804882e-05, "loss": 0.0001, "step": 8000 }, { "epoch": 5.07, "grad_norm": 0.004882665816694498, "learning_rate": 2.3815184656839436e-05, "loss": 0.765, "step": 8010 }, { "epoch": 5.07, "grad_norm": 1.992924690246582, "learning_rate": 2.3775558725630054e-05, "loss": 0.0199, "step": 8020 }, { "epoch": 5.07, "grad_norm": 0.008574814535677433, "learning_rate": 2.373593279442067e-05, "loss": 0.0121, "step": 8030 }, { "epoch": 5.07, "grad_norm": 0.0031569607090204954, "learning_rate": 2.369630686321129e-05, "loss": 0.0105, "step": 8040 }, { "epoch": 5.07, "grad_norm": 0.005381352733820677, "learning_rate": 2.3656680932001904e-05, "loss": 0.0002, "step": 8050 }, { "epoch": 5.07, "grad_norm": 0.0014025687705725431, "learning_rate": 2.361705500079252e-05, "loss": 0.1309, "step": 8060 }, { "epoch": 5.08, "grad_norm": 0.00232652947306633, "learning_rate": 2.3577429069583138e-05, "loss": 0.0253, "step": 8070 }, { "epoch": 5.08, "grad_norm": 0.004494811408221722, "learning_rate": 2.3537803138373754e-05, "loss": 0.0004, "step": 8080 }, { "epoch": 5.08, "grad_norm": 0.007132168859243393, "learning_rate": 2.3498177207164372e-05, "loss": 0.0002, "step": 8090 }, { "epoch": 5.08, "grad_norm": 0.002315562916919589, "learning_rate": 2.3458551275954984e-05, "loss": 0.0048, "step": 8100 }, { "epoch": 5.08, "grad_norm": 0.0011102244025096297, "learning_rate": 2.34189253447456e-05, "loss": 0.1166, "step": 8110 }, { "epoch": 5.08, "grad_norm": 0.0011376317124813795, "learning_rate": 2.337929941353622e-05, "loss": 0.0001, "step": 8120 }, { "epoch": 5.08, "grad_norm": 0.009772238321602345, "learning_rate": 2.3339673482326834e-05, "loss": 0.1212, "step": 8130 }, { "epoch": 5.08, "grad_norm": 0.0009250590810552239, "learning_rate": 2.3300047551117453e-05, "loss": 0.0077, "step": 8140 }, { "epoch": 5.08, "grad_norm": 0.0008343447698280215, "learning_rate": 2.3260421619908068e-05, "loss": 0.0001, "step": 8150 }, { "epoch": 5.08, "grad_norm": 0.005889697000384331, "learning_rate": 2.3220795688698684e-05, "loss": 0.2522, "step": 8160 }, { "epoch": 5.08, "grad_norm": 0.004577580373734236, "learning_rate": 2.3181169757489303e-05, "loss": 0.0055, "step": 8170 }, { "epoch": 5.08, "grad_norm": 0.0006038689170964062, "learning_rate": 2.3141543826279918e-05, "loss": 0.22, "step": 8180 }, { "epoch": 5.08, "grad_norm": 119.69172668457031, "learning_rate": 2.3101917895070537e-05, "loss": 0.2874, "step": 8190 }, { "epoch": 5.08, "grad_norm": 0.01207007933408022, "learning_rate": 2.3062291963861152e-05, "loss": 0.0003, "step": 8200 }, { "epoch": 5.09, "grad_norm": 0.005133229307830334, "learning_rate": 2.3022666032651768e-05, "loss": 0.0002, "step": 8210 }, { "epoch": 5.09, "grad_norm": 0.0014045186107978225, "learning_rate": 2.2983040101442386e-05, "loss": 0.0003, "step": 8220 }, { "epoch": 5.09, "grad_norm": 0.005631518550217152, "learning_rate": 2.2943414170233002e-05, "loss": 0.0002, "step": 8230 }, { "epoch": 5.09, "grad_norm": 0.0011396125191822648, "learning_rate": 2.2903788239023617e-05, "loss": 0.0004, "step": 8240 }, { "epoch": 5.09, "grad_norm": 0.16508010029792786, "learning_rate": 2.2864162307814236e-05, "loss": 0.0002, "step": 8250 }, { "epoch": 5.09, "grad_norm": 0.005040541756898165, "learning_rate": 2.282453637660485e-05, "loss": 0.016, "step": 8260 }, { "epoch": 5.09, "grad_norm": 0.0026673241518437862, "learning_rate": 2.278491044539547e-05, "loss": 0.0024, "step": 8270 }, { "epoch": 5.09, "grad_norm": 0.0025323168374598026, "learning_rate": 2.2745284514186086e-05, "loss": 0.0001, "step": 8280 }, { "epoch": 5.09, "grad_norm": 0.002470273757353425, "learning_rate": 2.27056585829767e-05, "loss": 0.0001, "step": 8290 }, { "epoch": 5.09, "grad_norm": 0.0011150416685268283, "learning_rate": 2.266603265176732e-05, "loss": 0.0027, "step": 8300 }, { "epoch": 5.09, "grad_norm": 0.0062728519551455975, "learning_rate": 2.2626406720557935e-05, "loss": 0.0006, "step": 8310 }, { "epoch": 5.09, "grad_norm": 0.001863997895270586, "learning_rate": 2.258678078934855e-05, "loss": 0.0001, "step": 8320 }, { "epoch": 5.09, "grad_norm": 0.0009478493593633175, "learning_rate": 2.2547154858139166e-05, "loss": 0.0179, "step": 8330 }, { "epoch": 5.09, "grad_norm": 0.0012072144309058785, "learning_rate": 2.250752892692978e-05, "loss": 0.2482, "step": 8340 }, { "epoch": 5.1, "grad_norm": 0.0013612033799290657, "learning_rate": 2.24679029957204e-05, "loss": 0.0001, "step": 8350 }, { "epoch": 5.1, "grad_norm": 0.001653852523304522, "learning_rate": 2.2428277064511016e-05, "loss": 0.0001, "step": 8360 }, { "epoch": 5.1, "grad_norm": 0.004468216095119715, "learning_rate": 2.2388651133301634e-05, "loss": 0.0003, "step": 8370 }, { "epoch": 5.1, "grad_norm": 0.21759329736232758, "learning_rate": 2.234902520209225e-05, "loss": 0.0004, "step": 8380 }, { "epoch": 5.1, "grad_norm": 0.002769963815808296, "learning_rate": 2.2309399270882865e-05, "loss": 0.0002, "step": 8390 }, { "epoch": 5.1, "grad_norm": 0.0010608519660308957, "learning_rate": 2.2269773339673484e-05, "loss": 0.1718, "step": 8400 }, { "epoch": 5.1, "grad_norm": 0.0008747797110117972, "learning_rate": 2.22301474084641e-05, "loss": 0.0003, "step": 8410 }, { "epoch": 5.1, "eval_accuracy": 0.9669172932330827, "eval_loss": 0.23544873297214508, "eval_runtime": 2342.8874, "eval_samples_per_second": 0.284, "eval_steps_per_second": 0.142, "step": 8418 }, { "epoch": 6.0, "grad_norm": 0.0007297981064766645, "learning_rate": 2.2190521477254715e-05, "loss": 0.0001, "step": 8420 }, { "epoch": 6.0, "grad_norm": 0.007375821936875582, "learning_rate": 2.2150895546045334e-05, "loss": 0.0001, "step": 8430 }, { "epoch": 6.0, "grad_norm": 0.0019510581623762846, "learning_rate": 2.211126961483595e-05, "loss": 0.0001, "step": 8440 }, { "epoch": 6.0, "grad_norm": 0.009307813830673695, "learning_rate": 2.2071643683626568e-05, "loss": 0.0002, "step": 8450 }, { "epoch": 6.0, "grad_norm": 0.07272663712501526, "learning_rate": 2.2032017752417183e-05, "loss": 0.0002, "step": 8460 }, { "epoch": 6.0, "grad_norm": 0.004176029469817877, "learning_rate": 2.19923918212078e-05, "loss": 0.058, "step": 8470 }, { "epoch": 6.0, "grad_norm": 0.0019298582337796688, "learning_rate": 2.1952765889998418e-05, "loss": 0.0001, "step": 8480 }, { "epoch": 6.01, "grad_norm": 137.64112854003906, "learning_rate": 2.1913139958789033e-05, "loss": 0.144, "step": 8490 }, { "epoch": 6.01, "grad_norm": 0.0035788225941359997, "learning_rate": 2.1873514027579652e-05, "loss": 0.1903, "step": 8500 }, { "epoch": 6.01, "grad_norm": 0.00900218915194273, "learning_rate": 2.1833888096370267e-05, "loss": 0.3162, "step": 8510 }, { "epoch": 6.01, "grad_norm": 0.006812531501054764, "learning_rate": 2.1794262165160883e-05, "loss": 0.0001, "step": 8520 }, { "epoch": 6.01, "grad_norm": 0.011043643578886986, "learning_rate": 2.1754636233951498e-05, "loss": 0.0001, "step": 8530 }, { "epoch": 6.01, "grad_norm": 0.0009386019664816558, "learning_rate": 2.1715010302742113e-05, "loss": 0.0132, "step": 8540 }, { "epoch": 6.01, "grad_norm": 0.0009653670713305473, "learning_rate": 2.1675384371532732e-05, "loss": 0.0001, "step": 8550 }, { "epoch": 6.01, "grad_norm": 0.000631912553217262, "learning_rate": 2.1635758440323348e-05, "loss": 0.0046, "step": 8560 }, { "epoch": 6.01, "grad_norm": 0.005377355497330427, "learning_rate": 2.1596132509113963e-05, "loss": 0.0001, "step": 8570 }, { "epoch": 6.01, "grad_norm": 0.0015233962330967188, "learning_rate": 2.1556506577904582e-05, "loss": 0.26, "step": 8580 }, { "epoch": 6.01, "grad_norm": 0.003712683217599988, "learning_rate": 2.1516880646695197e-05, "loss": 0.0103, "step": 8590 }, { "epoch": 6.01, "grad_norm": 0.002746036509051919, "learning_rate": 2.1477254715485816e-05, "loss": 0.0001, "step": 8600 }, { "epoch": 6.01, "grad_norm": 0.001353266416117549, "learning_rate": 2.143762878427643e-05, "loss": 0.0001, "step": 8610 }, { "epoch": 6.01, "grad_norm": 0.05317896232008934, "learning_rate": 2.1398002853067047e-05, "loss": 0.0002, "step": 8620 }, { "epoch": 6.02, "grad_norm": 0.002108694287016988, "learning_rate": 2.1358376921857666e-05, "loss": 0.0001, "step": 8630 }, { "epoch": 6.02, "grad_norm": 0.0015535557176917791, "learning_rate": 2.131875099064828e-05, "loss": 0.2856, "step": 8640 }, { "epoch": 6.02, "grad_norm": 0.0007479583146050572, "learning_rate": 2.1279125059438897e-05, "loss": 0.0001, "step": 8650 }, { "epoch": 6.02, "grad_norm": 0.0013678164687007666, "learning_rate": 2.1239499128229515e-05, "loss": 0.0001, "step": 8660 }, { "epoch": 6.02, "grad_norm": 0.0011460609966889024, "learning_rate": 2.119987319702013e-05, "loss": 0.1748, "step": 8670 }, { "epoch": 6.02, "grad_norm": 0.005598797462880611, "learning_rate": 2.116024726581075e-05, "loss": 0.0005, "step": 8680 }, { "epoch": 6.02, "grad_norm": 0.0058416505344212055, "learning_rate": 2.1120621334601365e-05, "loss": 0.003, "step": 8690 }, { "epoch": 6.02, "grad_norm": 0.0018327133730053902, "learning_rate": 2.108099540339198e-05, "loss": 0.0001, "step": 8700 }, { "epoch": 6.02, "grad_norm": 0.0008349318522959948, "learning_rate": 2.10413694721826e-05, "loss": 0.0014, "step": 8710 }, { "epoch": 6.02, "grad_norm": 0.0007587561849504709, "learning_rate": 2.1001743540973215e-05, "loss": 0.0012, "step": 8720 }, { "epoch": 6.02, "grad_norm": 0.003939191345125437, "learning_rate": 2.096211760976383e-05, "loss": 0.0001, "step": 8730 }, { "epoch": 6.02, "grad_norm": 0.007147368974983692, "learning_rate": 2.092249167855445e-05, "loss": 0.0003, "step": 8740 }, { "epoch": 6.02, "grad_norm": 0.0007460744236595929, "learning_rate": 2.088286574734506e-05, "loss": 0.0, "step": 8750 }, { "epoch": 6.02, "grad_norm": 0.005187608767300844, "learning_rate": 2.084323981613568e-05, "loss": 0.0001, "step": 8760 }, { "epoch": 6.03, "grad_norm": 0.0012044048635289073, "learning_rate": 2.0803613884926295e-05, "loss": 0.0003, "step": 8770 }, { "epoch": 6.03, "grad_norm": 0.005269182845950127, "learning_rate": 2.0763987953716914e-05, "loss": 0.1424, "step": 8780 }, { "epoch": 6.03, "grad_norm": 0.0014458984369412065, "learning_rate": 2.072436202250753e-05, "loss": 0.1836, "step": 8790 }, { "epoch": 6.03, "grad_norm": 0.003018228802829981, "learning_rate": 2.0684736091298145e-05, "loss": 0.0002, "step": 8800 }, { "epoch": 6.03, "grad_norm": 0.0005208718357607722, "learning_rate": 2.0645110160088763e-05, "loss": 0.276, "step": 8810 }, { "epoch": 6.03, "grad_norm": 0.0005419257213361561, "learning_rate": 2.060548422887938e-05, "loss": 0.0, "step": 8820 }, { "epoch": 6.03, "grad_norm": 0.0056818630546331406, "learning_rate": 2.0565858297669994e-05, "loss": 0.0003, "step": 8830 }, { "epoch": 6.03, "grad_norm": 0.0021387594752013683, "learning_rate": 2.0526232366460613e-05, "loss": 0.0001, "step": 8840 }, { "epoch": 6.03, "grad_norm": 0.0017361573409289122, "learning_rate": 2.048660643525123e-05, "loss": 0.0235, "step": 8850 }, { "epoch": 6.03, "grad_norm": 0.0031765319872647524, "learning_rate": 2.0446980504041847e-05, "loss": 0.0002, "step": 8860 }, { "epoch": 6.03, "grad_norm": 0.0006492682150565088, "learning_rate": 2.0407354572832463e-05, "loss": 0.0, "step": 8870 }, { "epoch": 6.03, "grad_norm": 0.009603900834918022, "learning_rate": 2.0367728641623078e-05, "loss": 0.0116, "step": 8880 }, { "epoch": 6.03, "grad_norm": 0.0014260296011343598, "learning_rate": 2.0328102710413697e-05, "loss": 0.0272, "step": 8890 }, { "epoch": 6.03, "grad_norm": 0.001238304190337658, "learning_rate": 2.0288476779204312e-05, "loss": 0.0001, "step": 8900 }, { "epoch": 6.04, "grad_norm": 0.004389143083244562, "learning_rate": 2.024885084799493e-05, "loss": 0.0001, "step": 8910 }, { "epoch": 6.04, "grad_norm": 0.0006919855368323624, "learning_rate": 2.0209224916785547e-05, "loss": 0.0015, "step": 8920 }, { "epoch": 6.04, "grad_norm": 0.0013250050833448768, "learning_rate": 2.0169598985576162e-05, "loss": 0.0, "step": 8930 }, { "epoch": 6.04, "grad_norm": 0.0006862548179924488, "learning_rate": 2.012997305436678e-05, "loss": 0.005, "step": 8940 }, { "epoch": 6.04, "grad_norm": 0.0006481676246039569, "learning_rate": 2.0090347123157396e-05, "loss": 0.0002, "step": 8950 }, { "epoch": 6.04, "grad_norm": 0.0009765150607563555, "learning_rate": 2.005072119194801e-05, "loss": 0.3095, "step": 8960 }, { "epoch": 6.04, "grad_norm": 0.0008786149555817246, "learning_rate": 2.0011095260738627e-05, "loss": 0.1903, "step": 8970 }, { "epoch": 6.04, "grad_norm": 0.00043602605001069605, "learning_rate": 1.9971469329529242e-05, "loss": 0.0002, "step": 8980 }, { "epoch": 6.04, "grad_norm": 0.0006052827229723334, "learning_rate": 1.993184339831986e-05, "loss": 0.0028, "step": 8990 }, { "epoch": 6.04, "grad_norm": 0.0027263278607279062, "learning_rate": 1.9892217467110477e-05, "loss": 0.0988, "step": 9000 }, { "epoch": 6.04, "grad_norm": 0.0004901738138869405, "learning_rate": 1.9852591535901095e-05, "loss": 0.0008, "step": 9010 }, { "epoch": 6.04, "grad_norm": 0.004134719260036945, "learning_rate": 1.981296560469171e-05, "loss": 0.0001, "step": 9020 }, { "epoch": 6.04, "grad_norm": 6.425068378448486, "learning_rate": 1.9773339673482326e-05, "loss": 0.0009, "step": 9030 }, { "epoch": 6.04, "grad_norm": 0.0021010099444538355, "learning_rate": 1.9733713742272945e-05, "loss": 0.0139, "step": 9040 }, { "epoch": 6.05, "grad_norm": 0.0003429889620747417, "learning_rate": 1.969408781106356e-05, "loss": 0.0001, "step": 9050 }, { "epoch": 6.05, "grad_norm": 0.00465469341725111, "learning_rate": 1.9654461879854176e-05, "loss": 0.0048, "step": 9060 }, { "epoch": 6.05, "grad_norm": 0.0007626991719007492, "learning_rate": 1.9614835948644795e-05, "loss": 0.0694, "step": 9070 }, { "epoch": 6.05, "grad_norm": 0.0005379422218538821, "learning_rate": 1.957521001743541e-05, "loss": 0.0001, "step": 9080 }, { "epoch": 6.05, "grad_norm": 0.0018008677288889885, "learning_rate": 1.953558408622603e-05, "loss": 0.1537, "step": 9090 }, { "epoch": 6.05, "grad_norm": 0.005486232694238424, "learning_rate": 1.9495958155016644e-05, "loss": 0.0001, "step": 9100 }, { "epoch": 6.05, "grad_norm": 0.0016153625911101699, "learning_rate": 1.945633222380726e-05, "loss": 0.1517, "step": 9110 }, { "epoch": 6.05, "grad_norm": 0.00048393840552307665, "learning_rate": 1.941670629259788e-05, "loss": 0.0515, "step": 9120 }, { "epoch": 6.05, "grad_norm": 0.00044351426186040044, "learning_rate": 1.9377080361388494e-05, "loss": 0.0, "step": 9130 }, { "epoch": 6.05, "grad_norm": 0.003928069956600666, "learning_rate": 1.9337454430179113e-05, "loss": 0.0001, "step": 9140 }, { "epoch": 6.05, "grad_norm": 0.0009555955766700208, "learning_rate": 1.9297828498969728e-05, "loss": 0.0001, "step": 9150 }, { "epoch": 6.05, "grad_norm": 0.003042226191610098, "learning_rate": 1.9258202567760344e-05, "loss": 0.0, "step": 9160 }, { "epoch": 6.05, "grad_norm": 0.0003893129760399461, "learning_rate": 1.9218576636550962e-05, "loss": 0.0, "step": 9170 }, { "epoch": 6.05, "grad_norm": 0.0008289095130749047, "learning_rate": 1.9178950705341574e-05, "loss": 0.0, "step": 9180 }, { "epoch": 6.06, "grad_norm": 0.0010318297427147627, "learning_rate": 1.9139324774132193e-05, "loss": 0.0, "step": 9190 }, { "epoch": 6.06, "grad_norm": 0.0007037441828288138, "learning_rate": 1.909969884292281e-05, "loss": 0.0837, "step": 9200 }, { "epoch": 6.06, "grad_norm": 44.11083221435547, "learning_rate": 1.9060072911713424e-05, "loss": 0.0226, "step": 9210 }, { "epoch": 6.06, "grad_norm": 0.0010193975176662207, "learning_rate": 1.9020446980504043e-05, "loss": 0.0018, "step": 9220 }, { "epoch": 6.06, "grad_norm": 0.0026808753609657288, "learning_rate": 1.8980821049294658e-05, "loss": 0.0001, "step": 9230 }, { "epoch": 6.06, "grad_norm": 0.0013365477789193392, "learning_rate": 1.8941195118085277e-05, "loss": 0.0, "step": 9240 }, { "epoch": 6.06, "grad_norm": 33.180870056152344, "learning_rate": 1.8901569186875892e-05, "loss": 0.3046, "step": 9250 }, { "epoch": 6.06, "grad_norm": 0.001624317723326385, "learning_rate": 1.8861943255666508e-05, "loss": 0.2476, "step": 9260 }, { "epoch": 6.06, "grad_norm": 0.002660261234268546, "learning_rate": 1.8822317324457127e-05, "loss": 0.329, "step": 9270 }, { "epoch": 6.06, "grad_norm": 0.001928847748786211, "learning_rate": 1.8782691393247742e-05, "loss": 0.0116, "step": 9280 }, { "epoch": 6.06, "grad_norm": 0.0004771009262185544, "learning_rate": 1.8743065462038357e-05, "loss": 0.0, "step": 9290 }, { "epoch": 6.06, "grad_norm": 0.0006694819312542677, "learning_rate": 1.8703439530828976e-05, "loss": 0.0743, "step": 9300 }, { "epoch": 6.06, "grad_norm": 0.010220357216894627, "learning_rate": 1.8663813599619592e-05, "loss": 0.0001, "step": 9310 }, { "epoch": 6.06, "grad_norm": 0.0014199281577020884, "learning_rate": 1.862418766841021e-05, "loss": 0.0103, "step": 9320 }, { "epoch": 6.07, "grad_norm": 0.001806290470995009, "learning_rate": 1.8584561737200826e-05, "loss": 0.0006, "step": 9330 }, { "epoch": 6.07, "grad_norm": 0.0005750704440288246, "learning_rate": 1.854493580599144e-05, "loss": 0.0003, "step": 9340 }, { "epoch": 6.07, "grad_norm": 0.0009846296161413193, "learning_rate": 1.850530987478206e-05, "loss": 0.0013, "step": 9350 }, { "epoch": 6.07, "grad_norm": 0.0016641179099678993, "learning_rate": 1.8465683943572676e-05, "loss": 0.06, "step": 9360 }, { "epoch": 6.07, "grad_norm": 0.0014823460951447487, "learning_rate": 1.842605801236329e-05, "loss": 0.0008, "step": 9370 }, { "epoch": 6.07, "grad_norm": 0.0026860409416258335, "learning_rate": 1.838643208115391e-05, "loss": 0.0005, "step": 9380 }, { "epoch": 6.07, "grad_norm": 0.0014451199676841497, "learning_rate": 1.8346806149944525e-05, "loss": 0.0001, "step": 9390 }, { "epoch": 6.07, "grad_norm": 0.004795750603079796, "learning_rate": 1.830718021873514e-05, "loss": 0.0003, "step": 9400 }, { "epoch": 6.07, "grad_norm": 0.0025767534971237183, "learning_rate": 1.8267554287525756e-05, "loss": 0.0002, "step": 9410 }, { "epoch": 6.07, "grad_norm": 0.0006194358575157821, "learning_rate": 1.8227928356316375e-05, "loss": 0.0738, "step": 9420 }, { "epoch": 6.07, "grad_norm": 0.007454677484929562, "learning_rate": 1.818830242510699e-05, "loss": 0.0048, "step": 9430 }, { "epoch": 6.07, "grad_norm": 0.0012314959894865751, "learning_rate": 1.8148676493897606e-05, "loss": 0.0045, "step": 9440 }, { "epoch": 6.07, "grad_norm": 0.0007009029504843056, "learning_rate": 1.8109050562688224e-05, "loss": 0.2806, "step": 9450 }, { "epoch": 6.07, "grad_norm": 0.0005554054514504969, "learning_rate": 1.806942463147884e-05, "loss": 0.0001, "step": 9460 }, { "epoch": 6.08, "grad_norm": 0.00048346296534873545, "learning_rate": 1.8029798700269455e-05, "loss": 0.0458, "step": 9470 }, { "epoch": 6.08, "grad_norm": 0.0011084218276664615, "learning_rate": 1.7990172769060074e-05, "loss": 0.0, "step": 9480 }, { "epoch": 6.08, "grad_norm": 0.0003880435542669147, "learning_rate": 1.795054683785069e-05, "loss": 0.0142, "step": 9490 }, { "epoch": 6.08, "grad_norm": 0.0006134477443993092, "learning_rate": 1.7910920906641308e-05, "loss": 0.0, "step": 9500 }, { "epoch": 6.08, "grad_norm": 0.0005665639764629304, "learning_rate": 1.7871294975431924e-05, "loss": 0.0, "step": 9510 }, { "epoch": 6.08, "grad_norm": 0.0003921152965631336, "learning_rate": 1.783166904422254e-05, "loss": 0.0, "step": 9520 }, { "epoch": 6.08, "grad_norm": 0.001750220195390284, "learning_rate": 1.7792043113013158e-05, "loss": 0.0437, "step": 9530 }, { "epoch": 6.08, "grad_norm": 0.0012650929857045412, "learning_rate": 1.7752417181803773e-05, "loss": 0.3903, "step": 9540 }, { "epoch": 6.08, "grad_norm": 57.58509063720703, "learning_rate": 1.7712791250594392e-05, "loss": 0.0546, "step": 9550 }, { "epoch": 6.08, "grad_norm": 0.00026887169224210083, "learning_rate": 1.7673165319385008e-05, "loss": 0.0001, "step": 9560 }, { "epoch": 6.08, "grad_norm": 0.0019770157523453236, "learning_rate": 1.7633539388175623e-05, "loss": 0.1362, "step": 9570 }, { "epoch": 6.08, "grad_norm": 0.0007267651380971074, "learning_rate": 1.7593913456966242e-05, "loss": 0.0006, "step": 9580 }, { "epoch": 6.08, "grad_norm": 0.001434961101040244, "learning_rate": 1.7554287525756857e-05, "loss": 0.0263, "step": 9590 }, { "epoch": 6.08, "grad_norm": 0.00044755820999853313, "learning_rate": 1.7514661594547473e-05, "loss": 0.0, "step": 9600 }, { "epoch": 6.09, "grad_norm": 0.000376471463823691, "learning_rate": 1.7475035663338088e-05, "loss": 0.0207, "step": 9610 }, { "epoch": 6.09, "grad_norm": 0.014877337031066418, "learning_rate": 1.7435409732128703e-05, "loss": 0.0, "step": 9620 }, { "epoch": 6.09, "grad_norm": 0.0012328572338446975, "learning_rate": 1.7395783800919322e-05, "loss": 0.0259, "step": 9630 }, { "epoch": 6.09, "grad_norm": 0.0011149095371365547, "learning_rate": 1.7356157869709938e-05, "loss": 0.0, "step": 9640 }, { "epoch": 6.09, "grad_norm": 0.000868526753038168, "learning_rate": 1.7316531938500556e-05, "loss": 0.0107, "step": 9650 }, { "epoch": 6.09, "grad_norm": 0.0003520081809256226, "learning_rate": 1.7276906007291172e-05, "loss": 0.0, "step": 9660 }, { "epoch": 6.09, "grad_norm": 0.00045317449257709086, "learning_rate": 1.7237280076081787e-05, "loss": 0.1845, "step": 9670 }, { "epoch": 6.09, "grad_norm": 0.00035488023422658443, "learning_rate": 1.7197654144872406e-05, "loss": 0.0, "step": 9680 }, { "epoch": 6.09, "grad_norm": 0.0007327714120037854, "learning_rate": 1.715802821366302e-05, "loss": 0.0001, "step": 9690 }, { "epoch": 6.09, "grad_norm": 0.0025048046372830868, "learning_rate": 1.7118402282453637e-05, "loss": 0.0001, "step": 9700 }, { "epoch": 6.09, "grad_norm": 0.00043331715278327465, "learning_rate": 1.7078776351244256e-05, "loss": 0.0167, "step": 9710 }, { "epoch": 6.09, "grad_norm": 0.0004680192796513438, "learning_rate": 1.703915042003487e-05, "loss": 0.0, "step": 9720 }, { "epoch": 6.09, "grad_norm": 0.0005406651180237532, "learning_rate": 1.699952448882549e-05, "loss": 0.0, "step": 9730 }, { "epoch": 6.09, "grad_norm": 1.6138055324554443, "learning_rate": 1.6959898557616105e-05, "loss": 0.0005, "step": 9740 }, { "epoch": 6.1, "grad_norm": 0.0005159827414900064, "learning_rate": 1.692027262640672e-05, "loss": 0.0001, "step": 9750 }, { "epoch": 6.1, "grad_norm": 0.001156438491307199, "learning_rate": 1.688064669519734e-05, "loss": 0.0, "step": 9760 }, { "epoch": 6.1, "grad_norm": 0.00034518956090323627, "learning_rate": 1.6841020763987955e-05, "loss": 0.0316, "step": 9770 }, { "epoch": 6.1, "grad_norm": 0.0007839056779630482, "learning_rate": 1.680139483277857e-05, "loss": 0.0016, "step": 9780 }, { "epoch": 6.1, "grad_norm": 0.000456125068012625, "learning_rate": 1.676176890156919e-05, "loss": 0.0, "step": 9790 }, { "epoch": 6.1, "grad_norm": 0.0007673576474189758, "learning_rate": 1.6722142970359805e-05, "loss": 0.0, "step": 9800 }, { "epoch": 6.1, "grad_norm": 0.000683379708789289, "learning_rate": 1.6682517039150423e-05, "loss": 0.0, "step": 9810 }, { "epoch": 6.1, "grad_norm": 0.0009253775351680815, "learning_rate": 1.664289110794104e-05, "loss": 0.0001, "step": 9820 }, { "epoch": 6.1, "eval_accuracy": 0.9473684210526315, "eval_loss": 0.3900492191314697, "eval_runtime": 2421.9145, "eval_samples_per_second": 0.275, "eval_steps_per_second": 0.137, "step": 9821 }, { "epoch": 7.0, "grad_norm": 0.0004204540455248207, "learning_rate": 1.6603265176731654e-05, "loss": 0.5604, "step": 9830 }, { "epoch": 7.0, "grad_norm": 0.00036831918987445533, "learning_rate": 1.656363924552227e-05, "loss": 0.0001, "step": 9840 }, { "epoch": 7.0, "grad_norm": 0.00044371382682584226, "learning_rate": 1.6524013314312885e-05, "loss": 0.0, "step": 9850 }, { "epoch": 7.0, "grad_norm": 0.0005366410478018224, "learning_rate": 1.6484387383103504e-05, "loss": 0.0, "step": 9860 }, { "epoch": 7.0, "grad_norm": 0.0006946607609279454, "learning_rate": 1.644476145189412e-05, "loss": 0.0, "step": 9870 }, { "epoch": 7.0, "grad_norm": 0.00034042325569316745, "learning_rate": 1.6405135520684735e-05, "loss": 0.0001, "step": 9880 }, { "epoch": 7.0, "grad_norm": 0.00025543957599438727, "learning_rate": 1.6365509589475353e-05, "loss": 0.0001, "step": 9890 }, { "epoch": 7.01, "grad_norm": 0.0005577169358730316, "learning_rate": 1.632588365826597e-05, "loss": 0.0001, "step": 9900 }, { "epoch": 7.01, "grad_norm": 0.0007238492253236473, "learning_rate": 1.6286257727056588e-05, "loss": 0.0001, "step": 9910 }, { "epoch": 7.01, "grad_norm": 0.00047818326856940985, "learning_rate": 1.6246631795847203e-05, "loss": 0.2815, "step": 9920 }, { "epoch": 7.01, "grad_norm": 0.004355975892394781, "learning_rate": 1.620700586463782e-05, "loss": 0.0, "step": 9930 }, { "epoch": 7.01, "grad_norm": 0.0002552367513999343, "learning_rate": 1.6167379933428437e-05, "loss": 0.0, "step": 9940 }, { "epoch": 7.01, "grad_norm": 0.0011531308991834521, "learning_rate": 1.6127754002219053e-05, "loss": 0.0, "step": 9950 }, { "epoch": 7.01, "grad_norm": 0.0009820818668231368, "learning_rate": 1.608812807100967e-05, "loss": 0.0, "step": 9960 }, { "epoch": 7.01, "grad_norm": 0.0006331288604997098, "learning_rate": 1.6048502139800287e-05, "loss": 0.0001, "step": 9970 }, { "epoch": 7.01, "grad_norm": 23.247167587280273, "learning_rate": 1.6008876208590902e-05, "loss": 0.4045, "step": 9980 }, { "epoch": 7.01, "grad_norm": 0.0005796991754323244, "learning_rate": 1.596925027738152e-05, "loss": 0.2264, "step": 9990 }, { "epoch": 7.01, "grad_norm": 0.0002432822366245091, "learning_rate": 1.5929624346172137e-05, "loss": 0.0, "step": 10000 }, { "epoch": 7.01, "grad_norm": 0.00044970333692617714, "learning_rate": 1.5889998414962752e-05, "loss": 0.5798, "step": 10010 }, { "epoch": 7.01, "grad_norm": 0.00043129053665325046, "learning_rate": 1.585037248375337e-05, "loss": 0.0, "step": 10020 }, { "epoch": 7.01, "grad_norm": 0.0009400318958796561, "learning_rate": 1.5810746552543986e-05, "loss": 0.0001, "step": 10030 }, { "epoch": 7.02, "grad_norm": 0.001612617983482778, "learning_rate": 1.57711206213346e-05, "loss": 0.184, "step": 10040 }, { "epoch": 7.02, "grad_norm": 0.003397996537387371, "learning_rate": 1.5731494690125217e-05, "loss": 0.0001, "step": 10050 }, { "epoch": 7.02, "grad_norm": 0.003118938999250531, "learning_rate": 1.5691868758915836e-05, "loss": 0.0118, "step": 10060 }, { "epoch": 7.02, "grad_norm": 0.0016245280858129263, "learning_rate": 1.565224282770645e-05, "loss": 0.0002, "step": 10070 }, { "epoch": 7.02, "grad_norm": 0.003330792533233762, "learning_rate": 1.5612616896497067e-05, "loss": 0.0001, "step": 10080 }, { "epoch": 7.02, "grad_norm": 0.01675890013575554, "learning_rate": 1.5572990965287685e-05, "loss": 0.1361, "step": 10090 }, { "epoch": 7.02, "grad_norm": 0.0016782371094450355, "learning_rate": 1.55333650340783e-05, "loss": 0.0001, "step": 10100 }, { "epoch": 7.02, "grad_norm": 0.0006982790655456483, "learning_rate": 1.5493739102868916e-05, "loss": 0.0, "step": 10110 }, { "epoch": 7.02, "grad_norm": 0.004016962368041277, "learning_rate": 1.5454113171659535e-05, "loss": 0.0002, "step": 10120 }, { "epoch": 7.02, "grad_norm": 0.0016343995230272412, "learning_rate": 1.541448724045015e-05, "loss": 0.0001, "step": 10130 }, { "epoch": 7.02, "grad_norm": 0.0003891861706506461, "learning_rate": 1.537486130924077e-05, "loss": 0.0002, "step": 10140 }, { "epoch": 7.02, "grad_norm": 0.0005568304331973195, "learning_rate": 1.5335235378031385e-05, "loss": 0.0, "step": 10150 }, { "epoch": 7.02, "grad_norm": 0.0009192074066959321, "learning_rate": 1.5295609446822e-05, "loss": 0.0001, "step": 10160 }, { "epoch": 7.02, "grad_norm": 0.00041831223643384874, "learning_rate": 1.5255983515612617e-05, "loss": 0.0, "step": 10170 }, { "epoch": 7.03, "grad_norm": 0.002276873914524913, "learning_rate": 1.5216357584403234e-05, "loss": 0.0001, "step": 10180 }, { "epoch": 7.03, "grad_norm": 0.0021974798291921616, "learning_rate": 1.5176731653193851e-05, "loss": 0.0, "step": 10190 }, { "epoch": 7.03, "grad_norm": 0.03672347217798233, "learning_rate": 1.5137105721984468e-05, "loss": 0.0001, "step": 10200 }, { "epoch": 7.03, "grad_norm": 0.0004960880614817142, "learning_rate": 1.5097479790775086e-05, "loss": 0.0001, "step": 10210 }, { "epoch": 7.03, "grad_norm": 0.0003698187356349081, "learning_rate": 1.5057853859565701e-05, "loss": 0.0, "step": 10220 }, { "epoch": 7.03, "grad_norm": 30.79059600830078, "learning_rate": 1.5018227928356318e-05, "loss": 0.013, "step": 10230 }, { "epoch": 7.03, "grad_norm": 0.00040281921974383295, "learning_rate": 1.4978601997146935e-05, "loss": 0.0001, "step": 10240 }, { "epoch": 7.03, "grad_norm": 0.001930213999003172, "learning_rate": 1.4938976065937552e-05, "loss": 0.0005, "step": 10250 }, { "epoch": 7.03, "grad_norm": 0.0016294418601319194, "learning_rate": 1.4899350134728166e-05, "loss": 0.0001, "step": 10260 }, { "epoch": 7.03, "grad_norm": 0.0029418901540338993, "learning_rate": 1.4859724203518781e-05, "loss": 0.0001, "step": 10270 }, { "epoch": 7.03, "grad_norm": 0.0005179463187232614, "learning_rate": 1.4820098272309399e-05, "loss": 0.0, "step": 10280 }, { "epoch": 7.03, "grad_norm": 117.77789306640625, "learning_rate": 1.4780472341100016e-05, "loss": 0.0485, "step": 10290 }, { "epoch": 7.03, "grad_norm": 0.0006345610017888248, "learning_rate": 1.4740846409890633e-05, "loss": 0.0072, "step": 10300 }, { "epoch": 7.03, "grad_norm": 0.004750640131533146, "learning_rate": 1.470122047868125e-05, "loss": 0.0001, "step": 10310 }, { "epoch": 7.04, "grad_norm": 0.0016635819338262081, "learning_rate": 1.4661594547471865e-05, "loss": 0.5727, "step": 10320 }, { "epoch": 7.04, "grad_norm": 0.0009257107740268111, "learning_rate": 1.4621968616262482e-05, "loss": 0.019, "step": 10330 }, { "epoch": 7.04, "grad_norm": 0.0004995065974071622, "learning_rate": 1.45823426850531e-05, "loss": 0.0, "step": 10340 }, { "epoch": 7.04, "grad_norm": 0.003641214920207858, "learning_rate": 1.4542716753843717e-05, "loss": 0.3737, "step": 10350 }, { "epoch": 7.04, "grad_norm": 0.0005538859404623508, "learning_rate": 1.4503090822634332e-05, "loss": 0.1959, "step": 10360 }, { "epoch": 7.04, "grad_norm": 0.0024865760933607817, "learning_rate": 1.4463464891424949e-05, "loss": 0.0, "step": 10370 }, { "epoch": 7.04, "grad_norm": 0.0012498443247750401, "learning_rate": 1.4423838960215566e-05, "loss": 0.0003, "step": 10380 }, { "epoch": 7.04, "grad_norm": 0.003093864070251584, "learning_rate": 1.4384213029006183e-05, "loss": 0.0001, "step": 10390 }, { "epoch": 7.04, "grad_norm": 0.0016697756946086884, "learning_rate": 1.4344587097796799e-05, "loss": 0.0001, "step": 10400 }, { "epoch": 7.04, "grad_norm": 0.0024545500054955482, "learning_rate": 1.4304961166587416e-05, "loss": 0.0, "step": 10410 }, { "epoch": 7.04, "grad_norm": 0.0010031814454123378, "learning_rate": 1.4265335235378033e-05, "loss": 0.0, "step": 10420 }, { "epoch": 7.04, "grad_norm": 0.000834242207929492, "learning_rate": 1.422570930416865e-05, "loss": 0.214, "step": 10430 }, { "epoch": 7.04, "grad_norm": 0.0008862247341312468, "learning_rate": 1.4186083372959265e-05, "loss": 0.0001, "step": 10440 }, { "epoch": 7.04, "grad_norm": 0.0010633807396516204, "learning_rate": 1.4146457441749883e-05, "loss": 0.0104, "step": 10450 }, { "epoch": 7.05, "grad_norm": 24.041336059570312, "learning_rate": 1.41068315105405e-05, "loss": 0.0049, "step": 10460 }, { "epoch": 7.05, "grad_norm": 0.0011859643273055553, "learning_rate": 1.4067205579331113e-05, "loss": 0.0001, "step": 10470 }, { "epoch": 7.05, "grad_norm": 0.0006510709063149989, "learning_rate": 1.402757964812173e-05, "loss": 0.001, "step": 10480 }, { "epoch": 7.05, "grad_norm": 0.000353335402905941, "learning_rate": 1.3987953716912348e-05, "loss": 0.0002, "step": 10490 }, { "epoch": 7.05, "grad_norm": 0.0005472557386383414, "learning_rate": 1.3948327785702963e-05, "loss": 0.0001, "step": 10500 }, { "epoch": 7.05, "grad_norm": 0.0006235065520741045, "learning_rate": 1.390870185449358e-05, "loss": 0.0004, "step": 10510 }, { "epoch": 7.05, "grad_norm": 0.00039498330443166196, "learning_rate": 1.3869075923284197e-05, "loss": 0.0, "step": 10520 }, { "epoch": 7.05, "grad_norm": 0.0009459428838454187, "learning_rate": 1.3829449992074814e-05, "loss": 0.125, "step": 10530 }, { "epoch": 7.05, "grad_norm": 0.000288288458250463, "learning_rate": 1.378982406086543e-05, "loss": 0.0005, "step": 10540 }, { "epoch": 7.05, "grad_norm": 0.0010236125672236085, "learning_rate": 1.3750198129656047e-05, "loss": 0.0001, "step": 10550 }, { "epoch": 7.05, "grad_norm": 0.0005923935095779598, "learning_rate": 1.3710572198446664e-05, "loss": 0.0, "step": 10560 }, { "epoch": 7.05, "grad_norm": 0.001925037824548781, "learning_rate": 1.3670946267237281e-05, "loss": 0.0, "step": 10570 }, { "epoch": 7.05, "grad_norm": 0.0010172594338655472, "learning_rate": 1.3631320336027898e-05, "loss": 0.0, "step": 10580 }, { "epoch": 7.05, "grad_norm": 0.00022477912716567516, "learning_rate": 1.3591694404818514e-05, "loss": 0.4117, "step": 10590 }, { "epoch": 7.06, "grad_norm": 0.001964542781934142, "learning_rate": 1.355206847360913e-05, "loss": 0.0, "step": 10600 }, { "epoch": 7.06, "grad_norm": 0.0008729117107577622, "learning_rate": 1.3512442542399748e-05, "loss": 0.005, "step": 10610 }, { "epoch": 7.06, "grad_norm": 0.0013933833688497543, "learning_rate": 1.3472816611190365e-05, "loss": 0.0, "step": 10620 }, { "epoch": 7.06, "grad_norm": 0.720376193523407, "learning_rate": 1.343319067998098e-05, "loss": 0.0005, "step": 10630 }, { "epoch": 7.06, "grad_norm": 0.0024294324684888124, "learning_rate": 1.3393564748771597e-05, "loss": 0.5075, "step": 10640 }, { "epoch": 7.06, "grad_norm": 0.00034565231180749834, "learning_rate": 1.3353938817562215e-05, "loss": 0.0017, "step": 10650 }, { "epoch": 7.06, "grad_norm": 0.0005883481935597956, "learning_rate": 1.3314312886352832e-05, "loss": 0.0, "step": 10660 }, { "epoch": 7.06, "grad_norm": 0.001018638489767909, "learning_rate": 1.3274686955143447e-05, "loss": 0.0005, "step": 10670 }, { "epoch": 7.06, "grad_norm": 0.000567563867662102, "learning_rate": 1.3235061023934064e-05, "loss": 0.0071, "step": 10680 }, { "epoch": 7.06, "grad_norm": 0.0006969044334255159, "learning_rate": 1.3195435092724678e-05, "loss": 0.2151, "step": 10690 }, { "epoch": 7.06, "grad_norm": 0.000248556025326252, "learning_rate": 1.3155809161515295e-05, "loss": 0.0, "step": 10700 }, { "epoch": 7.06, "grad_norm": 0.0008631858509033918, "learning_rate": 1.3116183230305912e-05, "loss": 0.0001, "step": 10710 }, { "epoch": 7.06, "grad_norm": 0.001508180401287973, "learning_rate": 1.307655729909653e-05, "loss": 0.0001, "step": 10720 }, { "epoch": 7.06, "grad_norm": 0.0005554750678129494, "learning_rate": 1.3036931367887145e-05, "loss": 0.0, "step": 10730 }, { "epoch": 7.07, "grad_norm": 0.0003934628330171108, "learning_rate": 1.2997305436677762e-05, "loss": 0.0, "step": 10740 }, { "epoch": 7.07, "grad_norm": 0.001727793482132256, "learning_rate": 1.2957679505468379e-05, "loss": 0.0, "step": 10750 }, { "epoch": 7.07, "grad_norm": 0.002404275583103299, "learning_rate": 1.2918053574258996e-05, "loss": 0.0, "step": 10760 }, { "epoch": 7.07, "grad_norm": 0.0008175792172551155, "learning_rate": 1.2878427643049611e-05, "loss": 0.0, "step": 10770 }, { "epoch": 7.07, "grad_norm": 0.0022247559390962124, "learning_rate": 1.2838801711840228e-05, "loss": 0.0001, "step": 10780 }, { "epoch": 7.07, "grad_norm": 0.0014646403724327683, "learning_rate": 1.2799175780630846e-05, "loss": 0.001, "step": 10790 }, { "epoch": 7.07, "grad_norm": 0.0020718346349895, "learning_rate": 1.2759549849421463e-05, "loss": 0.0001, "step": 10800 }, { "epoch": 7.07, "grad_norm": 0.0008824109099805355, "learning_rate": 1.2719923918212078e-05, "loss": 0.5802, "step": 10810 }, { "epoch": 7.07, "grad_norm": 0.0007525623659603298, "learning_rate": 1.2680297987002695e-05, "loss": 0.0, "step": 10820 }, { "epoch": 7.07, "grad_norm": 64.19054412841797, "learning_rate": 1.2640672055793312e-05, "loss": 0.4105, "step": 10830 }, { "epoch": 7.07, "grad_norm": 0.0019008672097697854, "learning_rate": 1.260104612458393e-05, "loss": 0.0005, "step": 10840 }, { "epoch": 7.07, "grad_norm": 0.0010036254534497857, "learning_rate": 1.2561420193374545e-05, "loss": 0.0006, "step": 10850 }, { "epoch": 7.07, "grad_norm": 0.0006118649616837502, "learning_rate": 1.2521794262165162e-05, "loss": 0.0, "step": 10860 }, { "epoch": 7.07, "grad_norm": 0.003166553797200322, "learning_rate": 1.2482168330955777e-05, "loss": 0.0001, "step": 10870 }, { "epoch": 7.08, "grad_norm": 0.001118882093578577, "learning_rate": 1.2442542399746394e-05, "loss": 0.0, "step": 10880 }, { "epoch": 7.08, "grad_norm": 0.0010632872581481934, "learning_rate": 1.2402916468537012e-05, "loss": 0.0001, "step": 10890 }, { "epoch": 7.08, "grad_norm": 0.001252860063686967, "learning_rate": 1.2363290537327627e-05, "loss": 0.0001, "step": 10900 }, { "epoch": 7.08, "grad_norm": 0.003005104372277856, "learning_rate": 1.2323664606118244e-05, "loss": 0.0001, "step": 10910 }, { "epoch": 7.08, "grad_norm": 0.004219905007630587, "learning_rate": 1.2284038674908861e-05, "loss": 0.0006, "step": 10920 }, { "epoch": 7.08, "grad_norm": 0.0003512962721288204, "learning_rate": 1.2244412743699478e-05, "loss": 0.0, "step": 10930 }, { "epoch": 7.08, "grad_norm": 0.0026769828982651234, "learning_rate": 1.2204786812490095e-05, "loss": 0.0001, "step": 10940 }, { "epoch": 7.08, "grad_norm": 0.0003416830440983176, "learning_rate": 1.216516088128071e-05, "loss": 0.0, "step": 10950 }, { "epoch": 7.08, "grad_norm": 0.0010573529871180654, "learning_rate": 1.2125534950071326e-05, "loss": 0.0, "step": 10960 }, { "epoch": 7.08, "grad_norm": 0.0013822006294503808, "learning_rate": 1.2085909018861943e-05, "loss": 0.2014, "step": 10970 }, { "epoch": 7.08, "grad_norm": 0.0003184191882610321, "learning_rate": 1.204628308765256e-05, "loss": 0.0, "step": 10980 }, { "epoch": 7.08, "grad_norm": 0.004402919672429562, "learning_rate": 1.2006657156443178e-05, "loss": 0.006, "step": 10990 }, { "epoch": 7.08, "grad_norm": 4.32048225402832, "learning_rate": 1.1967031225233793e-05, "loss": 0.0019, "step": 11000 }, { "epoch": 7.08, "grad_norm": 0.0006507772486656904, "learning_rate": 1.192740529402441e-05, "loss": 0.0, "step": 11010 }, { "epoch": 7.09, "grad_norm": 0.0004223829018883407, "learning_rate": 1.1887779362815027e-05, "loss": 0.0001, "step": 11020 }, { "epoch": 7.09, "grad_norm": 0.0006153634749352932, "learning_rate": 1.1848153431605644e-05, "loss": 0.0001, "step": 11030 }, { "epoch": 7.09, "grad_norm": 0.0007072246517054737, "learning_rate": 1.180852750039626e-05, "loss": 0.0, "step": 11040 }, { "epoch": 7.09, "grad_norm": 0.0007087733829393983, "learning_rate": 1.1768901569186877e-05, "loss": 0.0591, "step": 11050 }, { "epoch": 7.09, "grad_norm": 0.00040967803215608, "learning_rate": 1.1729275637977492e-05, "loss": 0.0, "step": 11060 }, { "epoch": 7.09, "grad_norm": 0.0018612256972119212, "learning_rate": 1.168964970676811e-05, "loss": 0.0001, "step": 11070 }, { "epoch": 7.09, "grad_norm": 0.0016640513204038143, "learning_rate": 1.1650023775558726e-05, "loss": 0.0001, "step": 11080 }, { "epoch": 7.09, "grad_norm": 0.004190579988062382, "learning_rate": 1.1610397844349342e-05, "loss": 0.0, "step": 11090 }, { "epoch": 7.09, "grad_norm": 0.001836647279560566, "learning_rate": 1.1570771913139959e-05, "loss": 0.0, "step": 11100 }, { "epoch": 7.09, "grad_norm": 0.0005556776304729283, "learning_rate": 1.1531145981930576e-05, "loss": 0.0001, "step": 11110 }, { "epoch": 7.09, "grad_norm": 0.0008808193379081786, "learning_rate": 1.1491520050721193e-05, "loss": 0.0001, "step": 11120 }, { "epoch": 7.09, "grad_norm": 0.0001681848953012377, "learning_rate": 1.1451894119511809e-05, "loss": 0.2524, "step": 11130 }, { "epoch": 7.09, "grad_norm": 0.0008354588062502444, "learning_rate": 1.1412268188302426e-05, "loss": 0.0, "step": 11140 }, { "epoch": 7.09, "grad_norm": 0.00051628437358886, "learning_rate": 1.1372642257093043e-05, "loss": 0.0, "step": 11150 }, { "epoch": 7.1, "grad_norm": 0.0008145067258737981, "learning_rate": 1.133301632588366e-05, "loss": 0.0001, "step": 11160 }, { "epoch": 7.1, "grad_norm": 0.0004701870202552527, "learning_rate": 1.1293390394674275e-05, "loss": 0.1907, "step": 11170 }, { "epoch": 7.1, "grad_norm": 0.0011952131753787398, "learning_rate": 1.125376446346489e-05, "loss": 0.0, "step": 11180 }, { "epoch": 7.1, "grad_norm": 0.00032050846493802965, "learning_rate": 1.1214138532255508e-05, "loss": 0.0, "step": 11190 }, { "epoch": 7.1, "grad_norm": 0.0006612459546886384, "learning_rate": 1.1174512601046125e-05, "loss": 0.0001, "step": 11200 }, { "epoch": 7.1, "grad_norm": 0.0030058922711759806, "learning_rate": 1.1134886669836742e-05, "loss": 0.0, "step": 11210 }, { "epoch": 7.1, "grad_norm": 0.0034754828084260225, "learning_rate": 1.1095260738627357e-05, "loss": 0.0003, "step": 11220 }, { "epoch": 7.1, "eval_accuracy": 0.9578947368421052, "eval_loss": 0.2666740119457245, "eval_runtime": 2322.4119, "eval_samples_per_second": 0.286, "eval_steps_per_second": 0.143, "step": 11224 }, { "epoch": 8.0, "grad_norm": 0.004194905515760183, "learning_rate": 1.1055634807417975e-05, "loss": 0.0001, "step": 11230 }, { "epoch": 8.0, "grad_norm": 0.0024937952402979136, "learning_rate": 1.1016008876208592e-05, "loss": 0.0, "step": 11240 }, { "epoch": 8.0, "grad_norm": 0.00039031429332681, "learning_rate": 1.0976382944999209e-05, "loss": 0.0, "step": 11250 }, { "epoch": 8.0, "grad_norm": 0.005691041238605976, "learning_rate": 1.0936757013789826e-05, "loss": 0.0001, "step": 11260 }, { "epoch": 8.0, "grad_norm": 0.00017179737915284932, "learning_rate": 1.0897131082580441e-05, "loss": 0.0001, "step": 11270 }, { "epoch": 8.0, "grad_norm": 0.000949267705436796, "learning_rate": 1.0857505151371057e-05, "loss": 0.0001, "step": 11280 }, { "epoch": 8.0, "grad_norm": 0.0003036385169252753, "learning_rate": 1.0817879220161674e-05, "loss": 0.0001, "step": 11290 }, { "epoch": 8.01, "grad_norm": 0.004243906121701002, "learning_rate": 1.0778253288952291e-05, "loss": 0.0005, "step": 11300 }, { "epoch": 8.01, "grad_norm": 0.0010142240207642317, "learning_rate": 1.0738627357742908e-05, "loss": 0.0, "step": 11310 }, { "epoch": 8.01, "grad_norm": 0.0010380720486864448, "learning_rate": 1.0699001426533523e-05, "loss": 0.0001, "step": 11320 }, { "epoch": 8.01, "grad_norm": 0.0005737761966884136, "learning_rate": 1.065937549532414e-05, "loss": 0.0001, "step": 11330 }, { "epoch": 8.01, "grad_norm": 0.001465731067582965, "learning_rate": 1.0619749564114758e-05, "loss": 0.0, "step": 11340 }, { "epoch": 8.01, "grad_norm": 0.002500841859728098, "learning_rate": 1.0580123632905375e-05, "loss": 0.0, "step": 11350 }, { "epoch": 8.01, "grad_norm": 0.00024287942505907267, "learning_rate": 1.054049770169599e-05, "loss": 0.0, "step": 11360 }, { "epoch": 8.01, "grad_norm": 0.0006320082466118038, "learning_rate": 1.0500871770486607e-05, "loss": 0.0001, "step": 11370 }, { "epoch": 8.01, "grad_norm": 0.00030024844454601407, "learning_rate": 1.0461245839277224e-05, "loss": 0.0097, "step": 11380 }, { "epoch": 8.01, "grad_norm": 0.00043432554230093956, "learning_rate": 1.042161990806784e-05, "loss": 0.0421, "step": 11390 }, { "epoch": 8.01, "grad_norm": 0.002737953094765544, "learning_rate": 1.0381993976858457e-05, "loss": 0.0, "step": 11400 }, { "epoch": 8.01, "grad_norm": 0.000816858431790024, "learning_rate": 1.0342368045649072e-05, "loss": 0.0, "step": 11410 }, { "epoch": 8.01, "grad_norm": 0.00036986047052778304, "learning_rate": 1.030274211443969e-05, "loss": 0.0, "step": 11420 }, { "epoch": 8.01, "grad_norm": 0.0004323932225815952, "learning_rate": 1.0263116183230307e-05, "loss": 0.0002, "step": 11430 }, { "epoch": 8.02, "grad_norm": 0.0004024511144962162, "learning_rate": 1.0223490252020924e-05, "loss": 0.421, "step": 11440 }, { "epoch": 8.02, "grad_norm": 0.0024430027697235346, "learning_rate": 1.0183864320811539e-05, "loss": 0.0, "step": 11450 }, { "epoch": 8.02, "grad_norm": 0.001345345051959157, "learning_rate": 1.0144238389602156e-05, "loss": 0.0, "step": 11460 }, { "epoch": 8.02, "grad_norm": 0.0006153620779514313, "learning_rate": 1.0104612458392773e-05, "loss": 0.0001, "step": 11470 }, { "epoch": 8.02, "grad_norm": 0.0015972702531144023, "learning_rate": 1.006498652718339e-05, "loss": 0.0, "step": 11480 }, { "epoch": 8.02, "grad_norm": 0.0008706132066436112, "learning_rate": 1.0025360595974006e-05, "loss": 0.0001, "step": 11490 }, { "epoch": 8.02, "grad_norm": 0.001384895178489387, "learning_rate": 9.985734664764621e-06, "loss": 0.0001, "step": 11500 }, { "epoch": 8.02, "grad_norm": 0.0010631100740283728, "learning_rate": 9.946108733555238e-06, "loss": 0.0002, "step": 11510 }, { "epoch": 8.02, "grad_norm": 0.0007243629661388695, "learning_rate": 9.906482802345855e-06, "loss": 0.0001, "step": 11520 }, { "epoch": 8.02, "grad_norm": 74.74536895751953, "learning_rate": 9.866856871136473e-06, "loss": 0.0797, "step": 11530 }, { "epoch": 8.02, "grad_norm": 0.005114846862852573, "learning_rate": 9.827230939927088e-06, "loss": 0.0001, "step": 11540 }, { "epoch": 8.02, "grad_norm": 0.0024818070232868195, "learning_rate": 9.787605008717705e-06, "loss": 0.0001, "step": 11550 }, { "epoch": 8.02, "grad_norm": 0.00041646783938631415, "learning_rate": 9.747979077508322e-06, "loss": 0.0, "step": 11560 }, { "epoch": 8.02, "grad_norm": 0.0007332797977142036, "learning_rate": 9.70835314629894e-06, "loss": 0.0116, "step": 11570 }, { "epoch": 8.03, "grad_norm": 0.0007879806798882782, "learning_rate": 9.668727215089556e-06, "loss": 0.0, "step": 11580 }, { "epoch": 8.03, "grad_norm": 0.0009714935440570116, "learning_rate": 9.629101283880172e-06, "loss": 0.0001, "step": 11590 }, { "epoch": 8.03, "grad_norm": 0.0009343309211544693, "learning_rate": 9.589475352670787e-06, "loss": 0.5311, "step": 11600 }, { "epoch": 8.03, "grad_norm": 0.00037891563260927796, "learning_rate": 9.549849421461404e-06, "loss": 0.002, "step": 11610 }, { "epoch": 8.03, "grad_norm": 0.001986218150705099, "learning_rate": 9.510223490252021e-06, "loss": 0.0111, "step": 11620 }, { "epoch": 8.03, "grad_norm": 0.0015318701043725014, "learning_rate": 9.470597559042639e-06, "loss": 0.0001, "step": 11630 }, { "epoch": 8.03, "grad_norm": 0.0006765589932911098, "learning_rate": 9.430971627833254e-06, "loss": 0.0, "step": 11640 }, { "epoch": 8.03, "grad_norm": 0.0005000099190510809, "learning_rate": 9.391345696623871e-06, "loss": 0.0, "step": 11650 }, { "epoch": 8.03, "grad_norm": 0.0017080691177397966, "learning_rate": 9.351719765414488e-06, "loss": 0.0002, "step": 11660 }, { "epoch": 8.03, "grad_norm": 0.0017356324242427945, "learning_rate": 9.312093834205105e-06, "loss": 0.0001, "step": 11670 }, { "epoch": 8.03, "grad_norm": 0.0010568661382421851, "learning_rate": 9.27246790299572e-06, "loss": 0.0, "step": 11680 }, { "epoch": 8.03, "grad_norm": 0.0014095234218984842, "learning_rate": 9.232841971786338e-06, "loss": 0.0021, "step": 11690 }, { "epoch": 8.03, "grad_norm": 0.00167833489831537, "learning_rate": 9.193216040576955e-06, "loss": 0.0001, "step": 11700 }, { "epoch": 8.03, "grad_norm": 0.0016795884585008025, "learning_rate": 9.15359010936757e-06, "loss": 0.0002, "step": 11710 }, { "epoch": 8.04, "grad_norm": 0.0003502909676171839, "learning_rate": 9.113964178158187e-06, "loss": 0.0, "step": 11720 }, { "epoch": 8.04, "grad_norm": 0.10941363871097565, "learning_rate": 9.074338246948803e-06, "loss": 0.0001, "step": 11730 }, { "epoch": 8.04, "grad_norm": 0.0014083647402003407, "learning_rate": 9.03471231573942e-06, "loss": 0.3081, "step": 11740 }, { "epoch": 8.04, "grad_norm": 0.0014537267852574587, "learning_rate": 8.995086384530037e-06, "loss": 0.0, "step": 11750 }, { "epoch": 8.04, "grad_norm": 0.0005781695363111794, "learning_rate": 8.955460453320654e-06, "loss": 0.0, "step": 11760 }, { "epoch": 8.04, "grad_norm": 0.0007176825893111527, "learning_rate": 8.91583452211127e-06, "loss": 0.0, "step": 11770 }, { "epoch": 8.04, "grad_norm": 0.000545515853445977, "learning_rate": 8.876208590901887e-06, "loss": 0.0, "step": 11780 }, { "epoch": 8.04, "grad_norm": 0.0025596795603632927, "learning_rate": 8.836582659692504e-06, "loss": 0.0002, "step": 11790 }, { "epoch": 8.04, "grad_norm": 0.030005350708961487, "learning_rate": 8.796956728483121e-06, "loss": 0.0001, "step": 11800 }, { "epoch": 8.04, "grad_norm": 0.00035480278893373907, "learning_rate": 8.757330797273736e-06, "loss": 0.0018, "step": 11810 }, { "epoch": 8.04, "grad_norm": 0.004515402484685183, "learning_rate": 8.717704866064352e-06, "loss": 0.0, "step": 11820 }, { "epoch": 8.04, "grad_norm": 0.0032044288236647844, "learning_rate": 8.678078934854969e-06, "loss": 0.0036, "step": 11830 }, { "epoch": 8.04, "grad_norm": 0.0009629257838241756, "learning_rate": 8.638453003645586e-06, "loss": 0.149, "step": 11840 }, { "epoch": 8.04, "grad_norm": 0.0024080132134258747, "learning_rate": 8.598827072436203e-06, "loss": 0.0003, "step": 11850 }, { "epoch": 8.05, "grad_norm": 0.0015089749358594418, "learning_rate": 8.559201141226818e-06, "loss": 0.0, "step": 11860 }, { "epoch": 8.05, "grad_norm": 0.0019321951549500227, "learning_rate": 8.519575210017436e-06, "loss": 0.0, "step": 11870 }, { "epoch": 8.05, "grad_norm": 0.005924368277192116, "learning_rate": 8.479949278808053e-06, "loss": 0.0, "step": 11880 }, { "epoch": 8.05, "grad_norm": 0.0007942487136460841, "learning_rate": 8.44032334759867e-06, "loss": 0.0, "step": 11890 }, { "epoch": 8.05, "grad_norm": 0.0022497123572975397, "learning_rate": 8.400697416389285e-06, "loss": 0.1055, "step": 11900 }, { "epoch": 8.05, "grad_norm": 0.0006818937254138291, "learning_rate": 8.361071485179902e-06, "loss": 0.0001, "step": 11910 }, { "epoch": 8.05, "grad_norm": 0.0004379069432616234, "learning_rate": 8.32144555397052e-06, "loss": 0.0, "step": 11920 }, { "epoch": 8.05, "grad_norm": 0.00047276023542508483, "learning_rate": 8.281819622761135e-06, "loss": 0.0, "step": 11930 }, { "epoch": 8.05, "grad_norm": 0.0004771367821376771, "learning_rate": 8.242193691551752e-06, "loss": 0.0, "step": 11940 }, { "epoch": 8.05, "grad_norm": 0.0005501986015588045, "learning_rate": 8.202567760342367e-06, "loss": 0.0001, "step": 11950 }, { "epoch": 8.05, "grad_norm": 0.0011177220148965716, "learning_rate": 8.162941829132984e-06, "loss": 0.0703, "step": 11960 }, { "epoch": 8.05, "grad_norm": 0.0004951581358909607, "learning_rate": 8.123315897923602e-06, "loss": 0.0, "step": 11970 }, { "epoch": 8.05, "grad_norm": 0.0008309069671668112, "learning_rate": 8.083689966714219e-06, "loss": 0.0, "step": 11980 }, { "epoch": 8.05, "grad_norm": 0.000472767511382699, "learning_rate": 8.044064035504836e-06, "loss": 0.5261, "step": 11990 }, { "epoch": 8.06, "grad_norm": 0.00044904148671776056, "learning_rate": 8.004438104295451e-06, "loss": 0.0, "step": 12000 }, { "epoch": 8.06, "grad_norm": 0.0004107660206500441, "learning_rate": 7.964812173086068e-06, "loss": 0.0, "step": 12010 }, { "epoch": 8.06, "grad_norm": 0.00042746157851070166, "learning_rate": 7.925186241876685e-06, "loss": 0.0, "step": 12020 }, { "epoch": 8.06, "grad_norm": 0.0007110532023943961, "learning_rate": 7.8855603106673e-06, "loss": 0.0, "step": 12030 }, { "epoch": 8.06, "grad_norm": 0.0007705994066782296, "learning_rate": 7.845934379457918e-06, "loss": 0.0, "step": 12040 }, { "epoch": 8.06, "grad_norm": 0.0006966418586671352, "learning_rate": 7.806308448248533e-06, "loss": 0.0, "step": 12050 }, { "epoch": 8.06, "grad_norm": 0.020446307957172394, "learning_rate": 7.76668251703915e-06, "loss": 0.0001, "step": 12060 }, { "epoch": 8.06, "grad_norm": 0.0004377638688310981, "learning_rate": 7.727056585829768e-06, "loss": 0.0, "step": 12070 }, { "epoch": 8.06, "grad_norm": 0.00036184967029839754, "learning_rate": 7.687430654620385e-06, "loss": 0.0002, "step": 12080 }, { "epoch": 8.06, "grad_norm": 0.00029569625621661544, "learning_rate": 7.647804723411e-06, "loss": 0.0001, "step": 12090 }, { "epoch": 8.06, "grad_norm": 0.0003205812827218324, "learning_rate": 7.608178792201617e-06, "loss": 0.0236, "step": 12100 }, { "epoch": 8.06, "grad_norm": 0.00043995011947117746, "learning_rate": 7.568552860992234e-06, "loss": 0.0001, "step": 12110 }, { "epoch": 8.06, "grad_norm": 0.0021792801562696695, "learning_rate": 7.5289269297828505e-06, "loss": 0.0, "step": 12120 }, { "epoch": 8.06, "grad_norm": 0.003733986523002386, "learning_rate": 7.489300998573468e-06, "loss": 0.0, "step": 12130 }, { "epoch": 8.07, "grad_norm": 0.001138021470978856, "learning_rate": 7.449675067364083e-06, "loss": 0.0001, "step": 12140 }, { "epoch": 8.07, "grad_norm": 0.0003544053470250219, "learning_rate": 7.410049136154699e-06, "loss": 0.0, "step": 12150 }, { "epoch": 8.07, "grad_norm": 0.0007718518027104437, "learning_rate": 7.370423204945316e-06, "loss": 0.0, "step": 12160 }, { "epoch": 8.07, "grad_norm": 0.000794577703345567, "learning_rate": 7.330797273735933e-06, "loss": 0.0, "step": 12170 }, { "epoch": 8.07, "grad_norm": 0.0007835258147679269, "learning_rate": 7.29117134252655e-06, "loss": 0.0, "step": 12180 }, { "epoch": 8.07, "grad_norm": 0.0008351559517905116, "learning_rate": 7.251545411317166e-06, "loss": 0.0, "step": 12190 }, { "epoch": 8.07, "grad_norm": 0.001067393459379673, "learning_rate": 7.211919480107783e-06, "loss": 0.0, "step": 12200 }, { "epoch": 8.07, "grad_norm": 0.0005535806412808597, "learning_rate": 7.172293548898399e-06, "loss": 0.0, "step": 12210 }, { "epoch": 8.07, "grad_norm": 0.0013392162509262562, "learning_rate": 7.1326676176890165e-06, "loss": 0.0, "step": 12220 }, { "epoch": 8.07, "grad_norm": 0.011801800690591335, "learning_rate": 7.093041686479633e-06, "loss": 0.0001, "step": 12230 }, { "epoch": 8.07, "grad_norm": 0.0003349117760080844, "learning_rate": 7.05341575527025e-06, "loss": 0.0, "step": 12240 }, { "epoch": 8.07, "grad_norm": 0.0009791525080800056, "learning_rate": 7.013789824060865e-06, "loss": 0.0, "step": 12250 }, { "epoch": 8.07, "grad_norm": 0.0003134564030915499, "learning_rate": 6.9741638928514815e-06, "loss": 0.0, "step": 12260 }, { "epoch": 8.07, "grad_norm": 0.0011281302431598306, "learning_rate": 6.934537961642099e-06, "loss": 0.0003, "step": 12270 }, { "epoch": 8.08, "grad_norm": 0.0004596656945068389, "learning_rate": 6.894912030432715e-06, "loss": 0.0, "step": 12280 }, { "epoch": 8.08, "grad_norm": 0.017007848247885704, "learning_rate": 6.855286099223332e-06, "loss": 0.1894, "step": 12290 }, { "epoch": 8.08, "grad_norm": 0.0009561624028719962, "learning_rate": 6.815660168013949e-06, "loss": 0.0001, "step": 12300 }, { "epoch": 8.08, "grad_norm": 0.0006208363920450211, "learning_rate": 6.776034236804565e-06, "loss": 0.0, "step": 12310 }, { "epoch": 8.08, "grad_norm": 0.00040551909478381276, "learning_rate": 6.7364083055951825e-06, "loss": 0.0, "step": 12320 }, { "epoch": 8.08, "grad_norm": 0.0010045063681900501, "learning_rate": 6.696782374385799e-06, "loss": 0.0001, "step": 12330 }, { "epoch": 8.08, "grad_norm": 0.001559635391458869, "learning_rate": 6.657156443176416e-06, "loss": 0.1317, "step": 12340 }, { "epoch": 8.08, "grad_norm": 0.00036661443300545216, "learning_rate": 6.617530511967032e-06, "loss": 0.0, "step": 12350 }, { "epoch": 8.08, "grad_norm": 0.0022761470172554255, "learning_rate": 6.5779045807576475e-06, "loss": 0.0, "step": 12360 }, { "epoch": 8.08, "grad_norm": 0.0002771701547317207, "learning_rate": 6.538278649548265e-06, "loss": 0.0, "step": 12370 }, { "epoch": 8.08, "grad_norm": 0.0009405760793015361, "learning_rate": 6.498652718338881e-06, "loss": 0.0, "step": 12380 }, { "epoch": 8.08, "grad_norm": 0.0011777085019275546, "learning_rate": 6.459026787129498e-06, "loss": 0.0, "step": 12390 }, { "epoch": 8.08, "grad_norm": 0.0007950080907903612, "learning_rate": 6.419400855920114e-06, "loss": 0.0023, "step": 12400 }, { "epoch": 8.08, "grad_norm": 0.000328573863953352, "learning_rate": 6.379774924710731e-06, "loss": 0.0001, "step": 12410 }, { "epoch": 8.09, "grad_norm": 0.000489677709992975, "learning_rate": 6.340148993501348e-06, "loss": 0.0, "step": 12420 }, { "epoch": 8.09, "grad_norm": 19.678516387939453, "learning_rate": 6.300523062291965e-06, "loss": 0.2121, "step": 12430 }, { "epoch": 8.09, "grad_norm": 0.001576061244122684, "learning_rate": 6.260897131082581e-06, "loss": 0.2006, "step": 12440 }, { "epoch": 8.09, "grad_norm": 0.0010969837894663215, "learning_rate": 6.221271199873197e-06, "loss": 0.0089, "step": 12450 }, { "epoch": 8.09, "grad_norm": 0.0006820796988904476, "learning_rate": 6.1816452686638135e-06, "loss": 0.0001, "step": 12460 }, { "epoch": 8.09, "grad_norm": 0.0039375657215714455, "learning_rate": 6.142019337454431e-06, "loss": 0.0, "step": 12470 }, { "epoch": 8.09, "grad_norm": 0.00018676265608519316, "learning_rate": 6.102393406245048e-06, "loss": 0.0002, "step": 12480 }, { "epoch": 8.09, "grad_norm": 0.0015864548040553927, "learning_rate": 6.062767475035663e-06, "loss": 0.0, "step": 12490 }, { "epoch": 8.09, "grad_norm": 0.0005812132731080055, "learning_rate": 6.02314154382628e-06, "loss": 0.0001, "step": 12500 }, { "epoch": 8.09, "grad_norm": 0.0015394919319078326, "learning_rate": 5.9835156126168965e-06, "loss": 0.0, "step": 12510 }, { "epoch": 8.09, "grad_norm": 0.5876509547233582, "learning_rate": 5.943889681407514e-06, "loss": 0.0002, "step": 12520 }, { "epoch": 8.09, "grad_norm": 0.001257477910257876, "learning_rate": 5.90426375019813e-06, "loss": 0.0, "step": 12530 }, { "epoch": 8.09, "grad_norm": 0.007748996838927269, "learning_rate": 5.864637818988746e-06, "loss": 0.0002, "step": 12540 }, { "epoch": 8.09, "grad_norm": 0.0004220679693389684, "learning_rate": 5.825011887779363e-06, "loss": 0.0001, "step": 12550 }, { "epoch": 8.1, "grad_norm": 0.0003514468262437731, "learning_rate": 5.7853859565699795e-06, "loss": 0.0062, "step": 12560 }, { "epoch": 8.1, "grad_norm": 0.0004685299936681986, "learning_rate": 5.745760025360597e-06, "loss": 0.0016, "step": 12570 }, { "epoch": 8.1, "grad_norm": 0.0002851441968232393, "learning_rate": 5.706134094151213e-06, "loss": 0.0004, "step": 12580 }, { "epoch": 8.1, "grad_norm": 0.0006324647110886872, "learning_rate": 5.66650816294183e-06, "loss": 0.0, "step": 12590 }, { "epoch": 8.1, "grad_norm": 0.000717841787263751, "learning_rate": 5.626882231732445e-06, "loss": 0.0, "step": 12600 }, { "epoch": 8.1, "grad_norm": 0.001114896615035832, "learning_rate": 5.5872563005230625e-06, "loss": 0.0, "step": 12610 }, { "epoch": 8.1, "grad_norm": 0.0011514411307871342, "learning_rate": 5.547630369313679e-06, "loss": 0.0001, "step": 12620 }, { "epoch": 8.1, "eval_accuracy": 0.9654135338345865, "eval_loss": 0.2435862421989441, "eval_runtime": 2357.1776, "eval_samples_per_second": 0.282, "eval_steps_per_second": 0.141, "step": 12627 }, { "epoch": 9.0, "grad_norm": 0.00044704281026497483, "learning_rate": 5.508004438104296e-06, "loss": 0.0, "step": 12630 }, { "epoch": 9.0, "grad_norm": 0.00041269470239058137, "learning_rate": 5.468378506894913e-06, "loss": 0.0, "step": 12640 }, { "epoch": 9.0, "grad_norm": 0.0003670216246973723, "learning_rate": 5.428752575685528e-06, "loss": 0.0, "step": 12650 }, { "epoch": 9.0, "grad_norm": 0.003106119344010949, "learning_rate": 5.3891266444761455e-06, "loss": 0.0, "step": 12660 }, { "epoch": 9.0, "grad_norm": 0.00040537622408010066, "learning_rate": 5.349500713266762e-06, "loss": 0.0, "step": 12670 }, { "epoch": 9.0, "grad_norm": 0.00037262984551489353, "learning_rate": 5.309874782057379e-06, "loss": 0.0, "step": 12680 }, { "epoch": 9.0, "grad_norm": 0.000418797048041597, "learning_rate": 5.270248850847995e-06, "loss": 0.0, "step": 12690 }, { "epoch": 9.01, "grad_norm": 0.0015914670657366514, "learning_rate": 5.230622919638612e-06, "loss": 0.0, "step": 12700 }, { "epoch": 9.01, "grad_norm": 0.005690779071301222, "learning_rate": 5.1909969884292285e-06, "loss": 0.0976, "step": 12710 }, { "epoch": 9.01, "grad_norm": 0.001181070227175951, "learning_rate": 5.151371057219845e-06, "loss": 0.0, "step": 12720 }, { "epoch": 9.01, "grad_norm": 0.0007823907653801143, "learning_rate": 5.111745126010462e-06, "loss": 0.0, "step": 12730 }, { "epoch": 9.01, "grad_norm": 0.0010620895773172379, "learning_rate": 5.072119194801078e-06, "loss": 0.0, "step": 12740 }, { "epoch": 9.01, "grad_norm": 0.00028126072720624506, "learning_rate": 5.032493263591695e-06, "loss": 0.0052, "step": 12750 }, { "epoch": 9.01, "grad_norm": 0.0005754511221311986, "learning_rate": 4.992867332382311e-06, "loss": 0.0, "step": 12760 }, { "epoch": 9.01, "grad_norm": 0.000247256743023172, "learning_rate": 4.953241401172928e-06, "loss": 0.0, "step": 12770 }, { "epoch": 9.01, "grad_norm": 0.0017203809693455696, "learning_rate": 4.913615469963544e-06, "loss": 0.0001, "step": 12780 }, { "epoch": 9.01, "grad_norm": 0.0005222621257416904, "learning_rate": 4.873989538754161e-06, "loss": 0.0, "step": 12790 }, { "epoch": 9.01, "grad_norm": 0.00047639888362027705, "learning_rate": 4.834363607544778e-06, "loss": 0.0001, "step": 12800 }, { "epoch": 9.01, "grad_norm": 0.0015658453339710832, "learning_rate": 4.794737676335394e-06, "loss": 0.0, "step": 12810 }, { "epoch": 9.01, "grad_norm": 0.0002700120967347175, "learning_rate": 4.755111745126011e-06, "loss": 0.0, "step": 12820 }, { "epoch": 9.01, "grad_norm": 0.00036174681736156344, "learning_rate": 4.715485813916627e-06, "loss": 0.0, "step": 12830 }, { "epoch": 9.02, "grad_norm": 0.00048193742986768484, "learning_rate": 4.675859882707244e-06, "loss": 0.0001, "step": 12840 }, { "epoch": 9.02, "grad_norm": 0.00021181856573093683, "learning_rate": 4.63623395149786e-06, "loss": 0.0, "step": 12850 }, { "epoch": 9.02, "grad_norm": 0.0007221151608973742, "learning_rate": 4.5966080202884774e-06, "loss": 0.0001, "step": 12860 }, { "epoch": 9.02, "grad_norm": 0.0008499003597535193, "learning_rate": 4.556982089079094e-06, "loss": 0.0, "step": 12870 }, { "epoch": 9.02, "grad_norm": 0.00024478594423271716, "learning_rate": 4.51735615786971e-06, "loss": 0.0, "step": 12880 }, { "epoch": 9.02, "grad_norm": 0.000799850036855787, "learning_rate": 4.477730226660327e-06, "loss": 0.0, "step": 12890 }, { "epoch": 9.02, "grad_norm": 0.0012479170691221952, "learning_rate": 4.438104295450943e-06, "loss": 0.0007, "step": 12900 }, { "epoch": 9.02, "grad_norm": 0.0008572249207645655, "learning_rate": 4.3984783642415604e-06, "loss": 0.0, "step": 12910 }, { "epoch": 9.02, "grad_norm": 0.00028230881434865296, "learning_rate": 4.358852433032176e-06, "loss": 0.0773, "step": 12920 }, { "epoch": 9.02, "grad_norm": 0.0003641119983512908, "learning_rate": 4.319226501822793e-06, "loss": 0.0, "step": 12930 }, { "epoch": 9.02, "grad_norm": 0.0009531981777399778, "learning_rate": 4.279600570613409e-06, "loss": 0.0, "step": 12940 }, { "epoch": 9.02, "grad_norm": 0.00067020149435848, "learning_rate": 4.239974639404026e-06, "loss": 0.0, "step": 12950 }, { "epoch": 9.02, "grad_norm": 0.0001659138360992074, "learning_rate": 4.200348708194643e-06, "loss": 0.0, "step": 12960 }, { "epoch": 9.02, "grad_norm": 0.0005148449563421309, "learning_rate": 4.16072277698526e-06, "loss": 0.0107, "step": 12970 }, { "epoch": 9.03, "grad_norm": 0.000638917728792876, "learning_rate": 4.121096845775876e-06, "loss": 0.0, "step": 12980 }, { "epoch": 9.03, "grad_norm": 0.00047383896890096366, "learning_rate": 4.081470914566492e-06, "loss": 0.0, "step": 12990 }, { "epoch": 9.03, "grad_norm": 0.0007675238302908838, "learning_rate": 4.041844983357109e-06, "loss": 0.0, "step": 13000 }, { "epoch": 9.03, "grad_norm": 0.001697351224720478, "learning_rate": 4.0022190521477256e-06, "loss": 0.0, "step": 13010 }, { "epoch": 9.03, "grad_norm": 0.00020665867486968637, "learning_rate": 3.962593120938343e-06, "loss": 0.0, "step": 13020 }, { "epoch": 9.03, "grad_norm": 0.001027750549837947, "learning_rate": 3.922967189728959e-06, "loss": 0.2632, "step": 13030 }, { "epoch": 9.03, "grad_norm": 0.003146632807329297, "learning_rate": 3.883341258519575e-06, "loss": 0.0, "step": 13040 }, { "epoch": 9.03, "grad_norm": 0.0007864089566282928, "learning_rate": 3.843715327310192e-06, "loss": 0.0044, "step": 13050 }, { "epoch": 9.03, "grad_norm": 0.00022077991161495447, "learning_rate": 3.8040893961008086e-06, "loss": 0.0, "step": 13060 }, { "epoch": 9.03, "grad_norm": 0.0005595972179435194, "learning_rate": 3.7644634648914252e-06, "loss": 0.0, "step": 13070 }, { "epoch": 9.03, "grad_norm": 0.0005725977243855596, "learning_rate": 3.7248375336820415e-06, "loss": 0.0, "step": 13080 }, { "epoch": 9.03, "grad_norm": 0.0011127095203846693, "learning_rate": 3.685211602472658e-06, "loss": 0.0, "step": 13090 }, { "epoch": 9.03, "grad_norm": 0.001887647551484406, "learning_rate": 3.645585671263275e-06, "loss": 0.0, "step": 13100 }, { "epoch": 9.03, "grad_norm": 0.0005976618267595768, "learning_rate": 3.6059597400538916e-06, "loss": 0.0003, "step": 13110 }, { "epoch": 9.04, "grad_norm": 0.0006656855694018304, "learning_rate": 3.5663338088445082e-06, "loss": 0.0, "step": 13120 }, { "epoch": 9.04, "grad_norm": 0.003439901163801551, "learning_rate": 3.526707877635125e-06, "loss": 0.0, "step": 13130 }, { "epoch": 9.04, "grad_norm": 0.00043997442116960883, "learning_rate": 3.4870819464257408e-06, "loss": 0.0, "step": 13140 }, { "epoch": 9.04, "grad_norm": 0.0005484743160195649, "learning_rate": 3.4474560152163574e-06, "loss": 0.0, "step": 13150 }, { "epoch": 9.04, "grad_norm": 0.00040827819611877203, "learning_rate": 3.4078300840069746e-06, "loss": 0.0, "step": 13160 }, { "epoch": 9.04, "grad_norm": 0.005499335937201977, "learning_rate": 3.3682041527975912e-06, "loss": 0.0001, "step": 13170 }, { "epoch": 9.04, "grad_norm": 0.001736334292218089, "learning_rate": 3.328578221588208e-06, "loss": 0.0, "step": 13180 }, { "epoch": 9.04, "grad_norm": 0.0006113905692473054, "learning_rate": 3.2889522903788238e-06, "loss": 0.0, "step": 13190 }, { "epoch": 9.04, "grad_norm": 0.001001613331027329, "learning_rate": 3.2493263591694404e-06, "loss": 0.1631, "step": 13200 }, { "epoch": 9.04, "grad_norm": 0.0003023295139428228, "learning_rate": 3.209700427960057e-06, "loss": 0.0, "step": 13210 }, { "epoch": 9.04, "grad_norm": 0.0009469907963648438, "learning_rate": 3.170074496750674e-06, "loss": 0.0, "step": 13220 }, { "epoch": 9.04, "grad_norm": 0.0007909215637482703, "learning_rate": 3.1304485655412905e-06, "loss": 0.0, "step": 13230 }, { "epoch": 9.04, "grad_norm": 0.001787104643881321, "learning_rate": 3.0908226343319067e-06, "loss": 0.0, "step": 13240 }, { "epoch": 9.04, "grad_norm": 0.0008837388013489544, "learning_rate": 3.051196703122524e-06, "loss": 0.0, "step": 13250 }, { "epoch": 9.05, "grad_norm": 0.0007934242021292448, "learning_rate": 3.01157077191314e-06, "loss": 0.0004, "step": 13260 }, { "epoch": 9.05, "grad_norm": 0.0011570610804483294, "learning_rate": 2.971944840703757e-06, "loss": 0.0001, "step": 13270 }, { "epoch": 9.05, "grad_norm": 0.00029090800671838224, "learning_rate": 2.932318909494373e-06, "loss": 0.0, "step": 13280 }, { "epoch": 9.05, "grad_norm": 0.0010709144407883286, "learning_rate": 2.8926929782849897e-06, "loss": 0.0, "step": 13290 }, { "epoch": 9.05, "grad_norm": 0.001289168605580926, "learning_rate": 2.8530670470756064e-06, "loss": 0.0, "step": 13300 }, { "epoch": 9.05, "grad_norm": 0.002187453443184495, "learning_rate": 2.8134411158662227e-06, "loss": 0.0, "step": 13310 }, { "epoch": 9.05, "grad_norm": 0.0007116499473340809, "learning_rate": 2.7738151846568394e-06, "loss": 0.0, "step": 13320 }, { "epoch": 9.05, "grad_norm": 0.000514859682880342, "learning_rate": 2.7341892534474565e-06, "loss": 0.0001, "step": 13330 }, { "epoch": 9.05, "grad_norm": 0.0007328620995394886, "learning_rate": 2.6945633222380727e-06, "loss": 0.0691, "step": 13340 }, { "epoch": 9.05, "grad_norm": 0.0007036814349703491, "learning_rate": 2.6549373910286894e-06, "loss": 0.0, "step": 13350 }, { "epoch": 9.05, "grad_norm": 0.001070524798706174, "learning_rate": 2.615311459819306e-06, "loss": 0.0, "step": 13360 }, { "epoch": 9.05, "grad_norm": 0.0008939993567764759, "learning_rate": 2.5756855286099224e-06, "loss": 0.0001, "step": 13370 }, { "epoch": 9.05, "grad_norm": 0.0004034818266518414, "learning_rate": 2.536059597400539e-06, "loss": 0.0206, "step": 13380 }, { "epoch": 9.05, "grad_norm": 4.411261081695557, "learning_rate": 2.4964336661911553e-06, "loss": 0.0124, "step": 13390 }, { "epoch": 9.06, "grad_norm": 0.0006528793601319194, "learning_rate": 2.456807734981772e-06, "loss": 0.0, "step": 13400 }, { "epoch": 9.06, "grad_norm": 0.0003673941537272185, "learning_rate": 2.417181803772389e-06, "loss": 0.0, "step": 13410 }, { "epoch": 9.06, "grad_norm": 0.001056182780303061, "learning_rate": 2.3775558725630054e-06, "loss": 0.0, "step": 13420 }, { "epoch": 9.06, "grad_norm": 0.0012370526092126966, "learning_rate": 2.337929941353622e-06, "loss": 0.0001, "step": 13430 }, { "epoch": 9.06, "grad_norm": 0.0015783560229465365, "learning_rate": 2.2983040101442387e-06, "loss": 0.0, "step": 13440 }, { "epoch": 9.06, "grad_norm": 0.0001985041017178446, "learning_rate": 2.258678078934855e-06, "loss": 0.0016, "step": 13450 }, { "epoch": 9.06, "grad_norm": 0.0010269788326695561, "learning_rate": 2.2190521477254717e-06, "loss": 0.1057, "step": 13460 }, { "epoch": 9.06, "grad_norm": 0.04036625847220421, "learning_rate": 2.179426216516088e-06, "loss": 0.2287, "step": 13470 }, { "epoch": 9.06, "grad_norm": 0.000473200052510947, "learning_rate": 2.1398002853067046e-06, "loss": 0.0001, "step": 13480 }, { "epoch": 9.06, "grad_norm": 0.0003723807749338448, "learning_rate": 2.1001743540973213e-06, "loss": 0.0, "step": 13490 }, { "epoch": 9.06, "grad_norm": 0.0007169354357756674, "learning_rate": 2.060548422887938e-06, "loss": 0.0008, "step": 13500 }, { "epoch": 9.06, "grad_norm": 0.00031334979576058686, "learning_rate": 2.0209224916785547e-06, "loss": 0.0, "step": 13510 }, { "epoch": 9.06, "grad_norm": 0.000616435194388032, "learning_rate": 1.9812965604691713e-06, "loss": 0.0, "step": 13520 }, { "epoch": 9.06, "grad_norm": 0.0008787320111878216, "learning_rate": 1.9416706292597876e-06, "loss": 0.0, "step": 13530 }, { "epoch": 9.07, "grad_norm": 0.0002825538394972682, "learning_rate": 1.9020446980504043e-06, "loss": 0.0021, "step": 13540 }, { "epoch": 9.07, "grad_norm": 0.002063804306089878, "learning_rate": 1.8624187668410208e-06, "loss": 0.0004, "step": 13550 }, { "epoch": 9.07, "grad_norm": 0.000512151513248682, "learning_rate": 1.8227928356316374e-06, "loss": 0.0, "step": 13560 }, { "epoch": 9.07, "grad_norm": 0.0006224968819878995, "learning_rate": 1.7831669044222541e-06, "loss": 0.0, "step": 13570 }, { "epoch": 9.07, "grad_norm": 0.00019008757953997701, "learning_rate": 1.7435409732128704e-06, "loss": 0.0, "step": 13580 }, { "epoch": 9.07, "grad_norm": 0.0002794242464005947, "learning_rate": 1.7039150420034873e-06, "loss": 0.0, "step": 13590 }, { "epoch": 9.07, "grad_norm": 0.0009566029766574502, "learning_rate": 1.664289110794104e-06, "loss": 0.0001, "step": 13600 }, { "epoch": 9.07, "grad_norm": 0.0003199617494828999, "learning_rate": 1.6246631795847202e-06, "loss": 0.0001, "step": 13610 }, { "epoch": 9.07, "grad_norm": 0.00032697312417440116, "learning_rate": 1.585037248375337e-06, "loss": 0.0, "step": 13620 }, { "epoch": 9.07, "grad_norm": 0.002565112430602312, "learning_rate": 1.5454113171659534e-06, "loss": 0.0, "step": 13630 }, { "epoch": 9.07, "grad_norm": 237.59519958496094, "learning_rate": 1.50578538595657e-06, "loss": 0.1313, "step": 13640 }, { "epoch": 9.07, "grad_norm": 0.0006662964588031173, "learning_rate": 1.4661594547471865e-06, "loss": 0.0, "step": 13650 }, { "epoch": 9.07, "grad_norm": 0.0011941486736759543, "learning_rate": 1.4265335235378032e-06, "loss": 0.0, "step": 13660 }, { "epoch": 9.07, "grad_norm": 0.0028123382944613695, "learning_rate": 1.3869075923284197e-06, "loss": 0.0, "step": 13670 }, { "epoch": 9.08, "grad_norm": 0.0008815588662400842, "learning_rate": 1.3472816611190364e-06, "loss": 0.0277, "step": 13680 }, { "epoch": 9.08, "grad_norm": 0.00045147593482397497, "learning_rate": 1.307655729909653e-06, "loss": 0.0, "step": 13690 }, { "epoch": 9.08, "grad_norm": 0.00011046286817872897, "learning_rate": 1.2680297987002695e-06, "loss": 0.1484, "step": 13700 }, { "epoch": 9.08, "grad_norm": 0.0018034332897514105, "learning_rate": 1.228403867490886e-06, "loss": 0.0027, "step": 13710 }, { "epoch": 9.08, "grad_norm": 0.000713842804543674, "learning_rate": 1.1887779362815027e-06, "loss": 0.0, "step": 13720 }, { "epoch": 9.08, "grad_norm": 0.0010389587841928005, "learning_rate": 1.1491520050721194e-06, "loss": 0.0, "step": 13730 }, { "epoch": 9.08, "grad_norm": 0.0003368295438122004, "learning_rate": 1.1095260738627358e-06, "loss": 0.0, "step": 13740 }, { "epoch": 9.08, "grad_norm": 0.000346412300132215, "learning_rate": 1.0699001426533523e-06, "loss": 0.0, "step": 13750 }, { "epoch": 9.08, "grad_norm": 0.0004677934921346605, "learning_rate": 1.030274211443969e-06, "loss": 0.0, "step": 13760 }, { "epoch": 9.08, "grad_norm": 0.0008401199011132121, "learning_rate": 9.906482802345857e-07, "loss": 0.0, "step": 13770 }, { "epoch": 9.08, "grad_norm": 0.0003339408722240478, "learning_rate": 9.510223490252021e-07, "loss": 0.0002, "step": 13780 }, { "epoch": 9.08, "grad_norm": 0.0004967558197677135, "learning_rate": 9.113964178158187e-07, "loss": 0.0, "step": 13790 }, { "epoch": 9.08, "grad_norm": 0.002963978098705411, "learning_rate": 8.717704866064352e-07, "loss": 0.0, "step": 13800 }, { "epoch": 9.08, "grad_norm": 0.001155543839558959, "learning_rate": 8.32144555397052e-07, "loss": 0.0, "step": 13810 }, { "epoch": 9.09, "grad_norm": 0.000786484801210463, "learning_rate": 7.925186241876685e-07, "loss": 0.1625, "step": 13820 }, { "epoch": 9.09, "grad_norm": 0.0002841146197170019, "learning_rate": 7.52892692978285e-07, "loss": 0.0, "step": 13830 }, { "epoch": 9.09, "grad_norm": 0.00030605948995798826, "learning_rate": 7.132667617689016e-07, "loss": 0.0, "step": 13840 }, { "epoch": 9.09, "grad_norm": 0.001265210215933621, "learning_rate": 6.736408305595182e-07, "loss": 0.0, "step": 13850 }, { "epoch": 9.09, "grad_norm": 0.00038683577440679073, "learning_rate": 6.340148993501348e-07, "loss": 0.0, "step": 13860 }, { "epoch": 9.09, "grad_norm": 0.0005034942296333611, "learning_rate": 5.943889681407513e-07, "loss": 0.0, "step": 13870 }, { "epoch": 9.09, "grad_norm": 0.0011582579463720322, "learning_rate": 5.547630369313679e-07, "loss": 0.0, "step": 13880 }, { "epoch": 9.09, "grad_norm": 0.0016904632793739438, "learning_rate": 5.151371057219845e-07, "loss": 0.0, "step": 13890 }, { "epoch": 9.09, "grad_norm": 0.00032329061650671065, "learning_rate": 4.7551117451260107e-07, "loss": 0.0, "step": 13900 }, { "epoch": 9.09, "grad_norm": 0.0003388900659047067, "learning_rate": 4.358852433032176e-07, "loss": 0.0, "step": 13910 }, { "epoch": 9.09, "grad_norm": 0.0003800652630161494, "learning_rate": 3.962593120938342e-07, "loss": 0.0, "step": 13920 }, { "epoch": 9.09, "grad_norm": 0.0009641946526244283, "learning_rate": 3.566333808844508e-07, "loss": 0.0, "step": 13930 }, { "epoch": 9.09, "grad_norm": 0.0005723941139876842, "learning_rate": 3.170074496750674e-07, "loss": 0.0001, "step": 13940 }, { "epoch": 9.09, "grad_norm": 0.0005183956818655133, "learning_rate": 2.7738151846568396e-07, "loss": 0.0, "step": 13950 }, { "epoch": 9.1, "grad_norm": 0.009076601825654507, "learning_rate": 2.3775558725630054e-07, "loss": 0.0, "step": 13960 }, { "epoch": 9.1, "grad_norm": 0.0007901808712631464, "learning_rate": 1.981296560469171e-07, "loss": 0.0, "step": 13970 }, { "epoch": 9.1, "grad_norm": 0.0005284142098389566, "learning_rate": 1.585037248375337e-07, "loss": 0.0, "step": 13980 }, { "epoch": 9.1, "grad_norm": 0.0006428571650758386, "learning_rate": 1.1887779362815027e-07, "loss": 0.0, "step": 13990 }, { "epoch": 9.1, "grad_norm": 0.0012319569941610098, "learning_rate": 7.925186241876685e-08, "loss": 0.0001, "step": 14000 }, { "epoch": 9.1, "grad_norm": 0.000267757655819878, "learning_rate": 3.962593120938342e-08, "loss": 0.0, "step": 14010 }, { "epoch": 9.1, "grad_norm": 0.0010718012927100062, "learning_rate": 0.0, "loss": 0.0, "step": 14020 }, { "epoch": 9.1, "eval_accuracy": 0.9654135338345865, "eval_loss": 0.24323464930057526, "eval_runtime": 2339.7693, "eval_samples_per_second": 0.284, "eval_steps_per_second": 0.142, "step": 14020 }, { "epoch": 9.1, "step": 14020, "total_flos": 7.1819242300007645e+19, "train_loss": 0.21922695452951727, "train_runtime": 145352.8053, "train_samples_per_second": 0.193, "train_steps_per_second": 0.096 }, { "epoch": 9.1, "eval_accuracy": 0.960960960960961, "eval_loss": 0.25779759883880615, "eval_runtime": 1196.8626, "eval_samples_per_second": 0.278, "eval_steps_per_second": 0.14, "step": 14020 }, { "epoch": 9.1, "eval_accuracy": 0.960960960960961, "eval_loss": 0.25779759883880615, "eval_runtime": 1193.7233, "eval_samples_per_second": 0.279, "eval_steps_per_second": 0.14, "step": 14020 } ], "logging_steps": 10, "max_steps": 14020, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 7.1819242300007645e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }