diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9952 @@ +{ + "best_metric": 0.9669172932330827, + "best_model_checkpoint": "YAHIA/vivit-b-16x2-collected-dataset\\checkpoint-8418", + "epoch": 9.099358059914408, + "eval_steps": 500, + "global_step": 14020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 16.260046005249023, + "learning_rate": 3.566333808844508e-07, + "loss": 1.7843, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 24.135656356811523, + "learning_rate": 7.132667617689016e-07, + "loss": 1.8164, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 22.565906524658203, + "learning_rate": 1.0699001426533523e-06, + "loss": 1.9396, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 20.68889045715332, + "learning_rate": 1.4265335235378032e-06, + "loss": 1.9576, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 22.999574661254883, + "learning_rate": 1.7831669044222541e-06, + "loss": 1.9828, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 27.69795036315918, + "learning_rate": 2.1398002853067046e-06, + "loss": 1.9381, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 25.143293380737305, + "learning_rate": 2.4964336661911553e-06, + "loss": 1.8222, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 20.670278549194336, + "learning_rate": 2.8530670470756064e-06, + "loss": 1.7593, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 20.401081085205078, + "learning_rate": 3.209700427960057e-06, + "loss": 1.7611, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 21.053512573242188, + "learning_rate": 3.5663338088445082e-06, + "loss": 1.7861, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 21.759618759155273, + "learning_rate": 3.922967189728959e-06, + "loss": 1.7698, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 22.194372177124023, + "learning_rate": 4.279600570613409e-06, + "loss": 1.7558, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 20.19968605041504, + "learning_rate": 4.63623395149786e-06, + "loss": 1.8624, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 19.7205753326416, + "learning_rate": 4.992867332382311e-06, + "loss": 1.6795, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 19.424144744873047, + "learning_rate": 5.349500713266762e-06, + "loss": 1.7443, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 19.568191528320312, + "learning_rate": 5.706134094151213e-06, + "loss": 1.7468, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 22.05777931213379, + "learning_rate": 6.062767475035663e-06, + "loss": 1.7362, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 17.77819061279297, + "learning_rate": 6.419400855920114e-06, + "loss": 1.527, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 19.465238571166992, + "learning_rate": 6.776034236804565e-06, + "loss": 1.5923, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 20.62281036376953, + "learning_rate": 7.1326676176890165e-06, + "loss": 1.697, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 16.7163028717041, + "learning_rate": 7.489300998573468e-06, + "loss": 1.4694, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 20.071901321411133, + "learning_rate": 7.845934379457918e-06, + "loss": 1.4549, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 20.55426597595215, + "learning_rate": 8.202567760342367e-06, + "loss": 1.3167, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 26.36579704284668, + "learning_rate": 8.559201141226818e-06, + "loss": 1.6743, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 17.331533432006836, + "learning_rate": 8.91583452211127e-06, + "loss": 1.4754, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 19.567764282226562, + "learning_rate": 9.27246790299572e-06, + "loss": 1.45, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 16.322946548461914, + "learning_rate": 9.629101283880172e-06, + "loss": 1.3971, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 18.62678337097168, + "learning_rate": 9.985734664764621e-06, + "loss": 1.4368, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 20.327966690063477, + "learning_rate": 1.0342368045649072e-05, + "loss": 1.5098, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 18.31135368347168, + "learning_rate": 1.0699001426533523e-05, + "loss": 1.1699, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 28.94702911376953, + "learning_rate": 1.1055634807417975e-05, + "loss": 1.5048, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 21.377225875854492, + "learning_rate": 1.1412268188302426e-05, + "loss": 1.4112, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 15.965813636779785, + "learning_rate": 1.1768901569186877e-05, + "loss": 1.5097, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 19.742080688476562, + "learning_rate": 1.2125534950071326e-05, + "loss": 1.2703, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 18.924072265625, + "learning_rate": 1.2482168330955777e-05, + "loss": 1.2194, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 18.15528106689453, + "learning_rate": 1.2838801711840228e-05, + "loss": 1.0668, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 21.82122802734375, + "learning_rate": 1.3195435092724678e-05, + "loss": 0.9053, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 20.609405517578125, + "learning_rate": 1.355206847360913e-05, + "loss": 1.2574, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 25.153718948364258, + "learning_rate": 1.390870185449358e-05, + "loss": 1.1619, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 12.118425369262695, + "learning_rate": 1.4265335235378033e-05, + "loss": 1.1514, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 25.673738479614258, + "learning_rate": 1.4621968616262482e-05, + "loss": 1.2191, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 23.109697341918945, + "learning_rate": 1.4978601997146935e-05, + "loss": 0.8535, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 15.181422233581543, + "learning_rate": 1.5335235378031385e-05, + "loss": 0.7464, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 31.820419311523438, + "learning_rate": 1.5691868758915836e-05, + "loss": 1.2202, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 24.667930603027344, + "learning_rate": 1.6048502139800287e-05, + "loss": 1.0542, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 16.041976928710938, + "learning_rate": 1.6405135520684735e-05, + "loss": 0.9541, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 9.076415061950684, + "learning_rate": 1.676176890156919e-05, + "loss": 1.0625, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 9.516477584838867, + "learning_rate": 1.7118402282453637e-05, + "loss": 1.1725, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 29.433717727661133, + "learning_rate": 1.7475035663338088e-05, + "loss": 0.9226, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 14.57030200958252, + "learning_rate": 1.783166904422254e-05, + "loss": 0.7345, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 19.724756240844727, + "learning_rate": 1.818830242510699e-05, + "loss": 1.1076, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 17.7041072845459, + "learning_rate": 1.854493580599144e-05, + "loss": 1.1412, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 4.248980522155762, + "learning_rate": 1.8901569186875892e-05, + "loss": 0.675, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 6.876579284667969, + "learning_rate": 1.9258202567760344e-05, + "loss": 0.703, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 15.930359840393066, + "learning_rate": 1.9614835948644795e-05, + "loss": 0.671, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 7.9089226722717285, + "learning_rate": 1.9971469329529242e-05, + "loss": 0.7656, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 20.674118041992188, + "learning_rate": 2.0328102710413697e-05, + "loss": 1.4598, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 33.44108581542969, + "learning_rate": 2.0684736091298145e-05, + "loss": 0.9271, + "step": 580 + }, + { + "epoch": 0.04, + "grad_norm": 9.660829544067383, + "learning_rate": 2.10413694721826e-05, + "loss": 0.9135, + "step": 590 + }, + { + "epoch": 0.04, + "grad_norm": 3.2455947399139404, + "learning_rate": 2.1398002853067047e-05, + "loss": 0.9244, + "step": 600 + }, + { + "epoch": 0.04, + "grad_norm": 16.9035587310791, + "learning_rate": 2.1754636233951498e-05, + "loss": 1.2397, + "step": 610 + }, + { + "epoch": 0.04, + "grad_norm": 12.139324188232422, + "learning_rate": 2.211126961483595e-05, + "loss": 0.9137, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 13.0936861038208, + "learning_rate": 2.24679029957204e-05, + "loss": 0.891, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 8.328577995300293, + "learning_rate": 2.282453637660485e-05, + "loss": 0.8631, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 24.814929962158203, + "learning_rate": 2.3181169757489303e-05, + "loss": 0.5948, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 15.284310340881348, + "learning_rate": 2.3537803138373754e-05, + "loss": 1.1153, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 2.1705708503723145, + "learning_rate": 2.3894436519258205e-05, + "loss": 0.6161, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 15.621281623840332, + "learning_rate": 2.4251069900142652e-05, + "loss": 0.8466, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 37.767173767089844, + "learning_rate": 2.4607703281027107e-05, + "loss": 0.7471, + "step": 690 + }, + { + "epoch": 0.05, + "grad_norm": 5.153799533843994, + "learning_rate": 2.4964336661911555e-05, + "loss": 0.5421, + "step": 700 + }, + { + "epoch": 0.05, + "grad_norm": 3.665609359741211, + "learning_rate": 2.5320970042796006e-05, + "loss": 0.4251, + "step": 710 + }, + { + "epoch": 0.05, + "grad_norm": 21.673925399780273, + "learning_rate": 2.5677603423680457e-05, + "loss": 0.8117, + "step": 720 + }, + { + "epoch": 0.05, + "grad_norm": 23.484006881713867, + "learning_rate": 2.603423680456491e-05, + "loss": 0.4761, + "step": 730 + }, + { + "epoch": 0.05, + "grad_norm": 24.750452041625977, + "learning_rate": 2.6390870185449356e-05, + "loss": 0.95, + "step": 740 + }, + { + "epoch": 0.05, + "grad_norm": 6.027065277099609, + "learning_rate": 2.674750356633381e-05, + "loss": 0.9197, + "step": 750 + }, + { + "epoch": 0.05, + "grad_norm": 33.312313079833984, + "learning_rate": 2.710413694721826e-05, + "loss": 1.033, + "step": 760 + }, + { + "epoch": 0.05, + "grad_norm": 15.621706008911133, + "learning_rate": 2.7460770328102713e-05, + "loss": 0.2779, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 10.880739212036133, + "learning_rate": 2.781740370898716e-05, + "loss": 0.5387, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 11.6985445022583, + "learning_rate": 2.8174037089871615e-05, + "loss": 0.7687, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 25.108810424804688, + "learning_rate": 2.8530670470756066e-05, + "loss": 0.4862, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 23.200624465942383, + "learning_rate": 2.8887303851640514e-05, + "loss": 0.9553, + "step": 810 + }, + { + "epoch": 0.06, + "grad_norm": 30.682540893554688, + "learning_rate": 2.9243937232524965e-05, + "loss": 0.8558, + "step": 820 + }, + { + "epoch": 0.06, + "grad_norm": 12.823701858520508, + "learning_rate": 2.9600570613409416e-05, + "loss": 0.6195, + "step": 830 + }, + { + "epoch": 0.06, + "grad_norm": 11.762367248535156, + "learning_rate": 2.995720399429387e-05, + "loss": 0.758, + "step": 840 + }, + { + "epoch": 0.06, + "grad_norm": 1.1662691831588745, + "learning_rate": 3.0313837375178318e-05, + "loss": 0.354, + "step": 850 + }, + { + "epoch": 0.06, + "grad_norm": 23.4963436126709, + "learning_rate": 3.067047075606277e-05, + "loss": 0.8267, + "step": 860 + }, + { + "epoch": 0.06, + "grad_norm": 0.13900019228458405, + "learning_rate": 3.102710413694722e-05, + "loss": 0.6618, + "step": 870 + }, + { + "epoch": 0.06, + "grad_norm": 7.843920707702637, + "learning_rate": 3.138373751783167e-05, + "loss": 0.7689, + "step": 880 + }, + { + "epoch": 0.06, + "grad_norm": 31.13179588317871, + "learning_rate": 3.174037089871612e-05, + "loss": 0.7772, + "step": 890 + }, + { + "epoch": 0.06, + "grad_norm": 44.58312225341797, + "learning_rate": 3.2097004279600574e-05, + "loss": 1.1062, + "step": 900 + }, + { + "epoch": 0.06, + "grad_norm": 18.089794158935547, + "learning_rate": 3.2453637660485025e-05, + "loss": 0.7678, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 33.472625732421875, + "learning_rate": 3.281027104136947e-05, + "loss": 1.6911, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 33.618831634521484, + "learning_rate": 3.316690442225393e-05, + "loss": 0.8881, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 1.6782217025756836, + "learning_rate": 3.352353780313838e-05, + "loss": 0.7327, + "step": 940 + }, + { + "epoch": 0.07, + "grad_norm": 2.1791036128997803, + "learning_rate": 3.388017118402282e-05, + "loss": 0.8054, + "step": 950 + }, + { + "epoch": 0.07, + "grad_norm": 6.972609043121338, + "learning_rate": 3.4236804564907274e-05, + "loss": 0.2614, + "step": 960 + }, + { + "epoch": 0.07, + "grad_norm": 24.085866928100586, + "learning_rate": 3.459343794579173e-05, + "loss": 0.4054, + "step": 970 + }, + { + "epoch": 0.07, + "grad_norm": 1.9996914863586426, + "learning_rate": 3.4950071326676176e-05, + "loss": 0.5344, + "step": 980 + }, + { + "epoch": 0.07, + "grad_norm": 0.13388022780418396, + "learning_rate": 3.530670470756063e-05, + "loss": 0.7224, + "step": 990 + }, + { + "epoch": 0.07, + "grad_norm": 30.018585205078125, + "learning_rate": 3.566333808844508e-05, + "loss": 0.6226, + "step": 1000 + }, + { + "epoch": 0.07, + "grad_norm": 14.195096015930176, + "learning_rate": 3.6019971469329536e-05, + "loss": 0.7356, + "step": 1010 + }, + { + "epoch": 0.07, + "grad_norm": 25.853748321533203, + "learning_rate": 3.637660485021398e-05, + "loss": 0.7235, + "step": 1020 + }, + { + "epoch": 0.07, + "grad_norm": 29.89474868774414, + "learning_rate": 3.673323823109843e-05, + "loss": 1.1801, + "step": 1030 + }, + { + "epoch": 0.07, + "grad_norm": 12.760407447814941, + "learning_rate": 3.708987161198288e-05, + "loss": 0.4307, + "step": 1040 + }, + { + "epoch": 0.07, + "grad_norm": 29.496700286865234, + "learning_rate": 3.7446504992867334e-05, + "loss": 0.9473, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 1.0010541677474976, + "learning_rate": 3.7803138373751785e-05, + "loss": 0.5983, + "step": 1060 + }, + { + "epoch": 0.08, + "grad_norm": 2.141446352005005, + "learning_rate": 3.8159771754636236e-05, + "loss": 0.4888, + "step": 1070 + }, + { + "epoch": 0.08, + "grad_norm": 1.1996098756790161, + "learning_rate": 3.851640513552069e-05, + "loss": 0.5292, + "step": 1080 + }, + { + "epoch": 0.08, + "grad_norm": 20.964256286621094, + "learning_rate": 3.887303851640514e-05, + "loss": 0.6905, + "step": 1090 + }, + { + "epoch": 0.08, + "grad_norm": 0.3161448538303375, + "learning_rate": 3.922967189728959e-05, + "loss": 0.7078, + "step": 1100 + }, + { + "epoch": 0.08, + "grad_norm": 13.272440910339355, + "learning_rate": 3.958630527817404e-05, + "loss": 0.9984, + "step": 1110 + }, + { + "epoch": 0.08, + "grad_norm": 0.3290501832962036, + "learning_rate": 3.9942938659058485e-05, + "loss": 0.4982, + "step": 1120 + }, + { + "epoch": 0.08, + "grad_norm": 0.7225183248519897, + "learning_rate": 4.029957203994294e-05, + "loss": 0.4532, + "step": 1130 + }, + { + "epoch": 0.08, + "grad_norm": 24.277801513671875, + "learning_rate": 4.0656205420827394e-05, + "loss": 0.5147, + "step": 1140 + }, + { + "epoch": 0.08, + "grad_norm": 9.140922546386719, + "learning_rate": 4.1012838801711845e-05, + "loss": 0.7049, + "step": 1150 + }, + { + "epoch": 0.08, + "grad_norm": 4.139643669128418, + "learning_rate": 4.136947218259629e-05, + "loss": 1.3454, + "step": 1160 + }, + { + "epoch": 0.08, + "grad_norm": 24.44458770751953, + "learning_rate": 4.172610556348075e-05, + "loss": 0.6409, + "step": 1170 + }, + { + "epoch": 0.08, + "grad_norm": 53.11198425292969, + "learning_rate": 4.20827389443652e-05, + "loss": 0.7063, + "step": 1180 + }, + { + "epoch": 0.08, + "grad_norm": 15.888784408569336, + "learning_rate": 4.243937232524964e-05, + "loss": 0.686, + "step": 1190 + }, + { + "epoch": 0.09, + "grad_norm": 29.689838409423828, + "learning_rate": 4.2796005706134094e-05, + "loss": 0.6301, + "step": 1200 + }, + { + "epoch": 0.09, + "grad_norm": 37.24555206298828, + "learning_rate": 4.3152639087018545e-05, + "loss": 0.939, + "step": 1210 + }, + { + "epoch": 0.09, + "grad_norm": 39.6224479675293, + "learning_rate": 4.3509272467902996e-05, + "loss": 0.9322, + "step": 1220 + }, + { + "epoch": 0.09, + "grad_norm": 28.799930572509766, + "learning_rate": 4.386590584878745e-05, + "loss": 1.1431, + "step": 1230 + }, + { + "epoch": 0.09, + "grad_norm": 0.9416821002960205, + "learning_rate": 4.42225392296719e-05, + "loss": 0.7622, + "step": 1240 + }, + { + "epoch": 0.09, + "grad_norm": 11.397088050842285, + "learning_rate": 4.457917261055635e-05, + "loss": 0.5302, + "step": 1250 + }, + { + "epoch": 0.09, + "grad_norm": 0.13693714141845703, + "learning_rate": 4.49358059914408e-05, + "loss": 0.9497, + "step": 1260 + }, + { + "epoch": 0.09, + "grad_norm": 22.64994239807129, + "learning_rate": 4.529243937232525e-05, + "loss": 1.4811, + "step": 1270 + }, + { + "epoch": 0.09, + "grad_norm": 63.26667404174805, + "learning_rate": 4.56490727532097e-05, + "loss": 0.7023, + "step": 1280 + }, + { + "epoch": 0.09, + "grad_norm": 24.035776138305664, + "learning_rate": 4.6005706134094154e-05, + "loss": 0.3478, + "step": 1290 + }, + { + "epoch": 0.09, + "grad_norm": 0.0889860987663269, + "learning_rate": 4.6362339514978605e-05, + "loss": 0.3317, + "step": 1300 + }, + { + "epoch": 0.09, + "grad_norm": 14.644208908081055, + "learning_rate": 4.6718972895863056e-05, + "loss": 0.6627, + "step": 1310 + }, + { + "epoch": 0.09, + "grad_norm": 16.509044647216797, + "learning_rate": 4.707560627674751e-05, + "loss": 1.3097, + "step": 1320 + }, + { + "epoch": 0.09, + "grad_norm": 23.583152770996094, + "learning_rate": 4.743223965763195e-05, + "loss": 0.9481, + "step": 1330 + }, + { + "epoch": 0.1, + "grad_norm": 50.59526443481445, + "learning_rate": 4.778887303851641e-05, + "loss": 0.7222, + "step": 1340 + }, + { + "epoch": 0.1, + "grad_norm": 18.746498107910156, + "learning_rate": 4.814550641940086e-05, + "loss": 0.7993, + "step": 1350 + }, + { + "epoch": 0.1, + "grad_norm": 14.619526863098145, + "learning_rate": 4.8502139800285305e-05, + "loss": 0.8045, + "step": 1360 + }, + { + "epoch": 0.1, + "grad_norm": 0.3897199332714081, + "learning_rate": 4.8858773181169756e-05, + "loss": 0.7668, + "step": 1370 + }, + { + "epoch": 0.1, + "grad_norm": 0.14925141632556915, + "learning_rate": 4.9215406562054214e-05, + "loss": 0.2882, + "step": 1380 + }, + { + "epoch": 0.1, + "grad_norm": 38.2923469543457, + "learning_rate": 4.9572039942938665e-05, + "loss": 0.8372, + "step": 1390 + }, + { + "epoch": 0.1, + "grad_norm": 0.04119894281029701, + "learning_rate": 4.992867332382311e-05, + "loss": 0.1001, + "step": 1400 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.7789473684210526, + "eval_loss": 0.898942768573761, + "eval_runtime": 2157.0352, + "eval_samples_per_second": 0.308, + "eval_steps_per_second": 0.154, + "step": 1403 + }, + { + "epoch": 1.0, + "grad_norm": 4.562422752380371, + "learning_rate": 4.99682992550325e-05, + "loss": 0.6604, + "step": 1410 + }, + { + "epoch": 1.0, + "grad_norm": 0.05182512477040291, + "learning_rate": 4.992867332382311e-05, + "loss": 0.7648, + "step": 1420 + }, + { + "epoch": 1.0, + "grad_norm": 10.46511459350586, + "learning_rate": 4.988904739261373e-05, + "loss": 0.7306, + "step": 1430 + }, + { + "epoch": 1.0, + "grad_norm": 26.981674194335938, + "learning_rate": 4.984942146140435e-05, + "loss": 0.3427, + "step": 1440 + }, + { + "epoch": 1.0, + "grad_norm": 38.77156066894531, + "learning_rate": 4.9809795530194966e-05, + "loss": 0.4313, + "step": 1450 + }, + { + "epoch": 1.0, + "grad_norm": 0.08875282108783722, + "learning_rate": 4.977016959898558e-05, + "loss": 0.1047, + "step": 1460 + }, + { + "epoch": 1.0, + "grad_norm": 7.9550042152404785, + "learning_rate": 4.97305436677762e-05, + "loss": 0.8428, + "step": 1470 + }, + { + "epoch": 1.01, + "grad_norm": 3.2877941131591797, + "learning_rate": 4.969091773656681e-05, + "loss": 0.5087, + "step": 1480 + }, + { + "epoch": 1.01, + "grad_norm": 27.133525848388672, + "learning_rate": 4.965129180535743e-05, + "loss": 0.4621, + "step": 1490 + }, + { + "epoch": 1.01, + "grad_norm": 24.116609573364258, + "learning_rate": 4.9611665874148046e-05, + "loss": 0.8207, + "step": 1500 + }, + { + "epoch": 1.01, + "grad_norm": 48.552242279052734, + "learning_rate": 4.9572039942938665e-05, + "loss": 1.3676, + "step": 1510 + }, + { + "epoch": 1.01, + "grad_norm": 0.1313333660364151, + "learning_rate": 4.953241401172928e-05, + "loss": 0.4705, + "step": 1520 + }, + { + "epoch": 1.01, + "grad_norm": 14.919997215270996, + "learning_rate": 4.9492788080519896e-05, + "loss": 1.6541, + "step": 1530 + }, + { + "epoch": 1.01, + "grad_norm": 2.8064146041870117, + "learning_rate": 4.945316214931051e-05, + "loss": 0.6358, + "step": 1540 + }, + { + "epoch": 1.01, + "grad_norm": 33.633766174316406, + "learning_rate": 4.941353621810113e-05, + "loss": 0.3344, + "step": 1550 + }, + { + "epoch": 1.01, + "grad_norm": 25.58049774169922, + "learning_rate": 4.9373910286891746e-05, + "loss": 0.3765, + "step": 1560 + }, + { + "epoch": 1.01, + "grad_norm": 0.5938677191734314, + "learning_rate": 4.933428435568236e-05, + "loss": 0.7599, + "step": 1570 + }, + { + "epoch": 1.01, + "grad_norm": 0.17297320067882538, + "learning_rate": 4.9294658424472976e-05, + "loss": 0.1381, + "step": 1580 + }, + { + "epoch": 1.01, + "grad_norm": 1.1137043237686157, + "learning_rate": 4.9255032493263595e-05, + "loss": 0.5216, + "step": 1590 + }, + { + "epoch": 1.01, + "grad_norm": 11.281981468200684, + "learning_rate": 4.9215406562054214e-05, + "loss": 0.9504, + "step": 1600 + }, + { + "epoch": 1.01, + "grad_norm": 35.159671783447266, + "learning_rate": 4.9175780630844826e-05, + "loss": 0.6236, + "step": 1610 + }, + { + "epoch": 1.02, + "grad_norm": 15.732198715209961, + "learning_rate": 4.9136154699635445e-05, + "loss": 0.3037, + "step": 1620 + }, + { + "epoch": 1.02, + "grad_norm": 4.352818965911865, + "learning_rate": 4.909652876842606e-05, + "loss": 0.5294, + "step": 1630 + }, + { + "epoch": 1.02, + "grad_norm": 27.470956802368164, + "learning_rate": 4.9056902837216676e-05, + "loss": 0.5626, + "step": 1640 + }, + { + "epoch": 1.02, + "grad_norm": 33.91129684448242, + "learning_rate": 4.9017276906007294e-05, + "loss": 0.2882, + "step": 1650 + }, + { + "epoch": 1.02, + "grad_norm": 0.26898714900016785, + "learning_rate": 4.897765097479791e-05, + "loss": 0.4265, + "step": 1660 + }, + { + "epoch": 1.02, + "grad_norm": 0.8199774622917175, + "learning_rate": 4.8938025043588525e-05, + "loss": 0.5277, + "step": 1670 + }, + { + "epoch": 1.02, + "grad_norm": 0.020548412576317787, + "learning_rate": 4.8898399112379144e-05, + "loss": 0.889, + "step": 1680 + }, + { + "epoch": 1.02, + "grad_norm": 0.048078641295433044, + "learning_rate": 4.8858773181169756e-05, + "loss": 0.8639, + "step": 1690 + }, + { + "epoch": 1.02, + "grad_norm": 20.957611083984375, + "learning_rate": 4.881914724996038e-05, + "loss": 0.3244, + "step": 1700 + }, + { + "epoch": 1.02, + "grad_norm": 0.15246962010860443, + "learning_rate": 4.8779521318750994e-05, + "loss": 0.258, + "step": 1710 + }, + { + "epoch": 1.02, + "grad_norm": 0.09393693506717682, + "learning_rate": 4.873989538754161e-05, + "loss": 0.9169, + "step": 1720 + }, + { + "epoch": 1.02, + "grad_norm": 0.9115855097770691, + "learning_rate": 4.8700269456332225e-05, + "loss": 0.5618, + "step": 1730 + }, + { + "epoch": 1.02, + "grad_norm": 6.85861873626709, + "learning_rate": 4.866064352512284e-05, + "loss": 0.6588, + "step": 1740 + }, + { + "epoch": 1.02, + "grad_norm": 33.108909606933594, + "learning_rate": 4.862101759391346e-05, + "loss": 0.3317, + "step": 1750 + }, + { + "epoch": 1.03, + "grad_norm": 2.785113573074341, + "learning_rate": 4.8581391662704074e-05, + "loss": 0.1886, + "step": 1760 + }, + { + "epoch": 1.03, + "grad_norm": 0.07260994613170624, + "learning_rate": 4.854176573149469e-05, + "loss": 0.4813, + "step": 1770 + }, + { + "epoch": 1.03, + "grad_norm": 1.9168213605880737, + "learning_rate": 4.8502139800285305e-05, + "loss": 0.6163, + "step": 1780 + }, + { + "epoch": 1.03, + "grad_norm": 32.30327224731445, + "learning_rate": 4.8462513869075924e-05, + "loss": 0.5272, + "step": 1790 + }, + { + "epoch": 1.03, + "grad_norm": 0.09055186808109283, + "learning_rate": 4.842288793786654e-05, + "loss": 0.58, + "step": 1800 + }, + { + "epoch": 1.03, + "grad_norm": 0.15506546199321747, + "learning_rate": 4.838326200665716e-05, + "loss": 0.1171, + "step": 1810 + }, + { + "epoch": 1.03, + "grad_norm": 32.89198303222656, + "learning_rate": 4.8343636075447773e-05, + "loss": 0.4713, + "step": 1820 + }, + { + "epoch": 1.03, + "grad_norm": 0.009175814688205719, + "learning_rate": 4.830401014423839e-05, + "loss": 0.6139, + "step": 1830 + }, + { + "epoch": 1.03, + "grad_norm": 32.81629943847656, + "learning_rate": 4.8264384213029004e-05, + "loss": 0.6131, + "step": 1840 + }, + { + "epoch": 1.03, + "grad_norm": 62.49550247192383, + "learning_rate": 4.822475828181962e-05, + "loss": 0.5677, + "step": 1850 + }, + { + "epoch": 1.03, + "grad_norm": 2.451925754547119, + "learning_rate": 4.818513235061024e-05, + "loss": 1.4171, + "step": 1860 + }, + { + "epoch": 1.03, + "grad_norm": 0.2953392267227173, + "learning_rate": 4.814550641940086e-05, + "loss": 0.3838, + "step": 1870 + }, + { + "epoch": 1.03, + "grad_norm": 53.240325927734375, + "learning_rate": 4.810588048819147e-05, + "loss": 0.1432, + "step": 1880 + }, + { + "epoch": 1.03, + "grad_norm": 0.1574406921863556, + "learning_rate": 4.806625455698209e-05, + "loss": 0.318, + "step": 1890 + }, + { + "epoch": 1.04, + "grad_norm": 35.6072998046875, + "learning_rate": 4.802662862577271e-05, + "loss": 0.6746, + "step": 1900 + }, + { + "epoch": 1.04, + "grad_norm": 0.012536413036286831, + "learning_rate": 4.798700269456333e-05, + "loss": 0.7181, + "step": 1910 + }, + { + "epoch": 1.04, + "grad_norm": 0.05652592331171036, + "learning_rate": 4.794737676335394e-05, + "loss": 0.0939, + "step": 1920 + }, + { + "epoch": 1.04, + "grad_norm": 2.5210182666778564, + "learning_rate": 4.790775083214456e-05, + "loss": 1.1188, + "step": 1930 + }, + { + "epoch": 1.04, + "grad_norm": 0.16478443145751953, + "learning_rate": 4.786812490093517e-05, + "loss": 0.4504, + "step": 1940 + }, + { + "epoch": 1.04, + "grad_norm": 26.002525329589844, + "learning_rate": 4.782849896972579e-05, + "loss": 0.8049, + "step": 1950 + }, + { + "epoch": 1.04, + "grad_norm": 37.67827606201172, + "learning_rate": 4.778887303851641e-05, + "loss": 0.4756, + "step": 1960 + }, + { + "epoch": 1.04, + "grad_norm": 36.84476852416992, + "learning_rate": 4.774924710730702e-05, + "loss": 0.9565, + "step": 1970 + }, + { + "epoch": 1.04, + "grad_norm": 0.18539421260356903, + "learning_rate": 4.770962117609764e-05, + "loss": 0.3279, + "step": 1980 + }, + { + "epoch": 1.04, + "grad_norm": 3.1605958938598633, + "learning_rate": 4.766999524488825e-05, + "loss": 0.1506, + "step": 1990 + }, + { + "epoch": 1.04, + "grad_norm": 0.08869732171297073, + "learning_rate": 4.763036931367887e-05, + "loss": 0.5895, + "step": 2000 + }, + { + "epoch": 1.04, + "grad_norm": 1.4320793151855469, + "learning_rate": 4.759074338246949e-05, + "loss": 0.3709, + "step": 2010 + }, + { + "epoch": 1.04, + "grad_norm": 0.013628893531858921, + "learning_rate": 4.755111745126011e-05, + "loss": 0.4708, + "step": 2020 + }, + { + "epoch": 1.04, + "grad_norm": 0.008771849796175957, + "learning_rate": 4.751149152005072e-05, + "loss": 0.2291, + "step": 2030 + }, + { + "epoch": 1.05, + "grad_norm": 0.034172721207141876, + "learning_rate": 4.747186558884134e-05, + "loss": 0.3005, + "step": 2040 + }, + { + "epoch": 1.05, + "grad_norm": 2.04589581489563, + "learning_rate": 4.743223965763195e-05, + "loss": 0.7397, + "step": 2050 + }, + { + "epoch": 1.05, + "grad_norm": 7.416354656219482, + "learning_rate": 4.739261372642258e-05, + "loss": 0.5153, + "step": 2060 + }, + { + "epoch": 1.05, + "grad_norm": 59.661014556884766, + "learning_rate": 4.735298779521319e-05, + "loss": 0.7417, + "step": 2070 + }, + { + "epoch": 1.05, + "grad_norm": 38.264408111572266, + "learning_rate": 4.731336186400381e-05, + "loss": 0.4673, + "step": 2080 + }, + { + "epoch": 1.05, + "grad_norm": 0.010330034419894218, + "learning_rate": 4.727373593279442e-05, + "loss": 0.7802, + "step": 2090 + }, + { + "epoch": 1.05, + "grad_norm": 0.009081050753593445, + "learning_rate": 4.723411000158504e-05, + "loss": 0.5108, + "step": 2100 + }, + { + "epoch": 1.05, + "grad_norm": 0.9804019331932068, + "learning_rate": 4.719448407037566e-05, + "loss": 0.1208, + "step": 2110 + }, + { + "epoch": 1.05, + "grad_norm": 17.980236053466797, + "learning_rate": 4.7154858139166276e-05, + "loss": 0.3317, + "step": 2120 + }, + { + "epoch": 1.05, + "grad_norm": 0.03598076477646828, + "learning_rate": 4.711523220795689e-05, + "loss": 0.4675, + "step": 2130 + }, + { + "epoch": 1.05, + "grad_norm": 28.66923713684082, + "learning_rate": 4.707560627674751e-05, + "loss": 0.7232, + "step": 2140 + }, + { + "epoch": 1.05, + "grad_norm": 0.05090579390525818, + "learning_rate": 4.703598034553812e-05, + "loss": 0.2404, + "step": 2150 + }, + { + "epoch": 1.05, + "grad_norm": 39.81483840942383, + "learning_rate": 4.6996354414328745e-05, + "loss": 0.382, + "step": 2160 + }, + { + "epoch": 1.05, + "grad_norm": 0.010164987295866013, + "learning_rate": 4.695672848311936e-05, + "loss": 0.4326, + "step": 2170 + }, + { + "epoch": 1.06, + "grad_norm": 0.007712522987276316, + "learning_rate": 4.691710255190997e-05, + "loss": 0.1419, + "step": 2180 + }, + { + "epoch": 1.06, + "grad_norm": 0.03637077286839485, + "learning_rate": 4.687747662070059e-05, + "loss": 0.0087, + "step": 2190 + }, + { + "epoch": 1.06, + "grad_norm": 0.2942463755607605, + "learning_rate": 4.68378506894912e-05, + "loss": 0.5895, + "step": 2200 + }, + { + "epoch": 1.06, + "grad_norm": 11.139754295349121, + "learning_rate": 4.6798224758281825e-05, + "loss": 0.2236, + "step": 2210 + }, + { + "epoch": 1.06, + "grad_norm": 0.1554485708475113, + "learning_rate": 4.675859882707244e-05, + "loss": 0.3646, + "step": 2220 + }, + { + "epoch": 1.06, + "grad_norm": 0.10144095867872238, + "learning_rate": 4.6718972895863056e-05, + "loss": 0.2301, + "step": 2230 + }, + { + "epoch": 1.06, + "grad_norm": 34.84081268310547, + "learning_rate": 4.667934696465367e-05, + "loss": 0.7788, + "step": 2240 + }, + { + "epoch": 1.06, + "grad_norm": 0.31722915172576904, + "learning_rate": 4.663972103344429e-05, + "loss": 0.1405, + "step": 2250 + }, + { + "epoch": 1.06, + "grad_norm": 32.6774787902832, + "learning_rate": 4.6600095102234906e-05, + "loss": 0.4079, + "step": 2260 + }, + { + "epoch": 1.06, + "grad_norm": 0.04206797853112221, + "learning_rate": 4.6560469171025525e-05, + "loss": 0.8566, + "step": 2270 + }, + { + "epoch": 1.06, + "grad_norm": 0.007674456108361483, + "learning_rate": 4.6520843239816137e-05, + "loss": 0.6745, + "step": 2280 + }, + { + "epoch": 1.06, + "grad_norm": 4.521772384643555, + "learning_rate": 4.6481217308606755e-05, + "loss": 0.2311, + "step": 2290 + }, + { + "epoch": 1.06, + "grad_norm": 0.43009287118911743, + "learning_rate": 4.644159137739737e-05, + "loss": 0.9246, + "step": 2300 + }, + { + "epoch": 1.06, + "grad_norm": 0.108181893825531, + "learning_rate": 4.6401965446187986e-05, + "loss": 1.0581, + "step": 2310 + }, + { + "epoch": 1.07, + "grad_norm": 0.0881645604968071, + "learning_rate": 4.6362339514978605e-05, + "loss": 0.431, + "step": 2320 + }, + { + "epoch": 1.07, + "grad_norm": 0.03945665806531906, + "learning_rate": 4.6322713583769224e-05, + "loss": 0.3784, + "step": 2330 + }, + { + "epoch": 1.07, + "grad_norm": 0.01008934061974287, + "learning_rate": 4.6283087652559836e-05, + "loss": 0.4657, + "step": 2340 + }, + { + "epoch": 1.07, + "grad_norm": 1.8987274169921875, + "learning_rate": 4.6243461721350455e-05, + "loss": 0.3025, + "step": 2350 + }, + { + "epoch": 1.07, + "grad_norm": 52.36662292480469, + "learning_rate": 4.6203835790141073e-05, + "loss": 0.1047, + "step": 2360 + }, + { + "epoch": 1.07, + "grad_norm": 30.82796287536621, + "learning_rate": 4.616420985893169e-05, + "loss": 0.3864, + "step": 2370 + }, + { + "epoch": 1.07, + "grad_norm": 28.43499755859375, + "learning_rate": 4.6124583927722304e-05, + "loss": 0.5029, + "step": 2380 + }, + { + "epoch": 1.07, + "grad_norm": 36.37118911743164, + "learning_rate": 4.608495799651292e-05, + "loss": 0.3412, + "step": 2390 + }, + { + "epoch": 1.07, + "grad_norm": 0.03059449978172779, + "learning_rate": 4.6045332065303535e-05, + "loss": 1.6759, + "step": 2400 + }, + { + "epoch": 1.07, + "grad_norm": 0.028554683551192284, + "learning_rate": 4.6005706134094154e-05, + "loss": 0.4232, + "step": 2410 + }, + { + "epoch": 1.07, + "grad_norm": 0.3136725425720215, + "learning_rate": 4.596608020288477e-05, + "loss": 0.4531, + "step": 2420 + }, + { + "epoch": 1.07, + "grad_norm": 9.164505004882812, + "learning_rate": 4.5926454271675385e-05, + "loss": 1.4434, + "step": 2430 + }, + { + "epoch": 1.07, + "grad_norm": 26.755535125732422, + "learning_rate": 4.5886828340466004e-05, + "loss": 0.5678, + "step": 2440 + }, + { + "epoch": 1.07, + "grad_norm": 0.027405157685279846, + "learning_rate": 4.5847202409256616e-05, + "loss": 0.1547, + "step": 2450 + }, + { + "epoch": 1.08, + "grad_norm": 7.302061080932617, + "learning_rate": 4.5807576478047234e-05, + "loss": 0.1004, + "step": 2460 + }, + { + "epoch": 1.08, + "grad_norm": 36.72040557861328, + "learning_rate": 4.576795054683785e-05, + "loss": 0.4913, + "step": 2470 + }, + { + "epoch": 1.08, + "grad_norm": 0.10275045782327652, + "learning_rate": 4.572832461562847e-05, + "loss": 0.0076, + "step": 2480 + }, + { + "epoch": 1.08, + "grad_norm": 0.008994188159704208, + "learning_rate": 4.5688698684419084e-05, + "loss": 0.0018, + "step": 2490 + }, + { + "epoch": 1.08, + "grad_norm": 0.02831762284040451, + "learning_rate": 4.56490727532097e-05, + "loss": 0.086, + "step": 2500 + }, + { + "epoch": 1.08, + "grad_norm": 0.6608620285987854, + "learning_rate": 4.5609446822000315e-05, + "loss": 0.0874, + "step": 2510 + }, + { + "epoch": 1.08, + "grad_norm": 1.8176270723342896, + "learning_rate": 4.556982089079094e-05, + "loss": 0.459, + "step": 2520 + }, + { + "epoch": 1.08, + "grad_norm": 28.674335479736328, + "learning_rate": 4.553019495958155e-05, + "loss": 0.4304, + "step": 2530 + }, + { + "epoch": 1.08, + "grad_norm": 0.06465455144643784, + "learning_rate": 4.549056902837217e-05, + "loss": 0.6094, + "step": 2540 + }, + { + "epoch": 1.08, + "grad_norm": 0.011676542460918427, + "learning_rate": 4.545094309716278e-05, + "loss": 0.333, + "step": 2550 + }, + { + "epoch": 1.08, + "grad_norm": 4.420731544494629, + "learning_rate": 4.54113171659534e-05, + "loss": 0.1013, + "step": 2560 + }, + { + "epoch": 1.08, + "grad_norm": 0.1971130520105362, + "learning_rate": 4.537169123474402e-05, + "loss": 0.0122, + "step": 2570 + }, + { + "epoch": 1.08, + "grad_norm": 2.309307813644409, + "learning_rate": 4.533206530353464e-05, + "loss": 0.0247, + "step": 2580 + }, + { + "epoch": 1.08, + "grad_norm": 0.010364987887442112, + "learning_rate": 4.529243937232525e-05, + "loss": 0.0751, + "step": 2590 + }, + { + "epoch": 1.09, + "grad_norm": 30.956512451171875, + "learning_rate": 4.525281344111587e-05, + "loss": 0.7706, + "step": 2600 + }, + { + "epoch": 1.09, + "grad_norm": 21.555742263793945, + "learning_rate": 4.521318750990648e-05, + "loss": 0.0501, + "step": 2610 + }, + { + "epoch": 1.09, + "grad_norm": 0.02271176129579544, + "learning_rate": 4.51735615786971e-05, + "loss": 0.219, + "step": 2620 + }, + { + "epoch": 1.09, + "grad_norm": 0.10638172179460526, + "learning_rate": 4.513393564748772e-05, + "loss": 0.4009, + "step": 2630 + }, + { + "epoch": 1.09, + "grad_norm": 0.0012674570316448808, + "learning_rate": 4.509430971627833e-05, + "loss": 0.0365, + "step": 2640 + }, + { + "epoch": 1.09, + "grad_norm": 0.01860959082841873, + "learning_rate": 4.505468378506895e-05, + "loss": 0.064, + "step": 2650 + }, + { + "epoch": 1.09, + "grad_norm": 0.029620472341775894, + "learning_rate": 4.501505785385956e-05, + "loss": 0.5788, + "step": 2660 + }, + { + "epoch": 1.09, + "grad_norm": 69.32429504394531, + "learning_rate": 4.497543192265019e-05, + "loss": 0.5635, + "step": 2670 + }, + { + "epoch": 1.09, + "grad_norm": 0.004012781195342541, + "learning_rate": 4.49358059914408e-05, + "loss": 0.8204, + "step": 2680 + }, + { + "epoch": 1.09, + "grad_norm": 0.007074211724102497, + "learning_rate": 4.489618006023142e-05, + "loss": 0.2872, + "step": 2690 + }, + { + "epoch": 1.09, + "grad_norm": 2.3659746646881104, + "learning_rate": 4.485655412902203e-05, + "loss": 0.6915, + "step": 2700 + }, + { + "epoch": 1.09, + "grad_norm": 0.2181590050458908, + "learning_rate": 4.481692819781265e-05, + "loss": 0.0489, + "step": 2710 + }, + { + "epoch": 1.09, + "grad_norm": 0.00418996112421155, + "learning_rate": 4.477730226660327e-05, + "loss": 0.4435, + "step": 2720 + }, + { + "epoch": 1.09, + "grad_norm": 0.011325598694384098, + "learning_rate": 4.473767633539389e-05, + "loss": 0.0595, + "step": 2730 + }, + { + "epoch": 1.1, + "grad_norm": 6.933524131774902, + "learning_rate": 4.46980504041845e-05, + "loss": 0.6284, + "step": 2740 + }, + { + "epoch": 1.1, + "grad_norm": 0.04216031730175018, + "learning_rate": 4.465842447297512e-05, + "loss": 0.1847, + "step": 2750 + }, + { + "epoch": 1.1, + "grad_norm": 49.92095184326172, + "learning_rate": 4.461879854176573e-05, + "loss": 0.1075, + "step": 2760 + }, + { + "epoch": 1.1, + "grad_norm": 0.0068773203529417515, + "learning_rate": 4.457917261055635e-05, + "loss": 0.7253, + "step": 2770 + }, + { + "epoch": 1.1, + "grad_norm": 49.53322219848633, + "learning_rate": 4.453954667934697e-05, + "loss": 0.3937, + "step": 2780 + }, + { + "epoch": 1.1, + "grad_norm": 0.0059694708324968815, + "learning_rate": 4.449992074813759e-05, + "loss": 0.3752, + "step": 2790 + }, + { + "epoch": 1.1, + "grad_norm": 0.006113003473728895, + "learning_rate": 4.44602948169282e-05, + "loss": 0.2646, + "step": 2800 + }, + { + "epoch": 1.1, + "eval_accuracy": 0.8857142857142857, + "eval_loss": 0.5655186772346497, + "eval_runtime": 2204.4149, + "eval_samples_per_second": 0.302, + "eval_steps_per_second": 0.151, + "step": 2806 + }, + { + "epoch": 2.0, + "grad_norm": 0.030710767954587936, + "learning_rate": 4.442066888571882e-05, + "loss": 0.2584, + "step": 2810 + }, + { + "epoch": 2.0, + "grad_norm": 0.961925208568573, + "learning_rate": 4.438104295450943e-05, + "loss": 0.0208, + "step": 2820 + }, + { + "epoch": 2.0, + "grad_norm": 0.034155942499637604, + "learning_rate": 4.434141702330005e-05, + "loss": 0.0855, + "step": 2830 + }, + { + "epoch": 2.0, + "grad_norm": 0.023880697786808014, + "learning_rate": 4.430179109209067e-05, + "loss": 0.0589, + "step": 2840 + }, + { + "epoch": 2.0, + "grad_norm": 0.4049380421638489, + "learning_rate": 4.426216516088128e-05, + "loss": 0.0529, + "step": 2850 + }, + { + "epoch": 2.0, + "grad_norm": 0.3483090102672577, + "learning_rate": 4.42225392296719e-05, + "loss": 0.6943, + "step": 2860 + }, + { + "epoch": 2.0, + "grad_norm": 24.336814880371094, + "learning_rate": 4.418291329846252e-05, + "loss": 0.5243, + "step": 2870 + }, + { + "epoch": 2.01, + "grad_norm": 0.0032091333996504545, + "learning_rate": 4.4143287367253136e-05, + "loss": 0.1202, + "step": 2880 + }, + { + "epoch": 2.01, + "grad_norm": 0.047493454068899155, + "learning_rate": 4.410366143604375e-05, + "loss": 0.1007, + "step": 2890 + }, + { + "epoch": 2.01, + "grad_norm": 0.017678333446383476, + "learning_rate": 4.406403550483437e-05, + "loss": 0.0008, + "step": 2900 + }, + { + "epoch": 2.01, + "grad_norm": 0.9050219058990479, + "learning_rate": 4.402440957362498e-05, + "loss": 0.3121, + "step": 2910 + }, + { + "epoch": 2.01, + "grad_norm": 0.03626730665564537, + "learning_rate": 4.39847836424156e-05, + "loss": 0.1179, + "step": 2920 + }, + { + "epoch": 2.01, + "grad_norm": 0.011950280517339706, + "learning_rate": 4.3945157711206216e-05, + "loss": 0.4693, + "step": 2930 + }, + { + "epoch": 2.01, + "grad_norm": 0.02412373013794422, + "learning_rate": 4.3905531779996835e-05, + "loss": 0.8616, + "step": 2940 + }, + { + "epoch": 2.01, + "grad_norm": 30.13632583618164, + "learning_rate": 4.386590584878745e-05, + "loss": 0.3553, + "step": 2950 + }, + { + "epoch": 2.01, + "grad_norm": 0.0061193606816232204, + "learning_rate": 4.3826279917578066e-05, + "loss": 0.0383, + "step": 2960 + }, + { + "epoch": 2.01, + "grad_norm": 0.2041795253753662, + "learning_rate": 4.378665398636868e-05, + "loss": 0.6114, + "step": 2970 + }, + { + "epoch": 2.01, + "grad_norm": 0.26872244477272034, + "learning_rate": 4.3747028055159304e-05, + "loss": 0.5494, + "step": 2980 + }, + { + "epoch": 2.01, + "grad_norm": 0.06702332943677902, + "learning_rate": 4.3707402123949916e-05, + "loss": 0.0144, + "step": 2990 + }, + { + "epoch": 2.01, + "grad_norm": 0.0052034310065209866, + "learning_rate": 4.3667776192740534e-05, + "loss": 0.0128, + "step": 3000 + }, + { + "epoch": 2.01, + "grad_norm": 2.874134063720703, + "learning_rate": 4.3628150261531146e-05, + "loss": 0.6844, + "step": 3010 + }, + { + "epoch": 2.02, + "grad_norm": 1.1829482316970825, + "learning_rate": 4.3588524330321765e-05, + "loss": 0.1866, + "step": 3020 + }, + { + "epoch": 2.02, + "grad_norm": 41.475399017333984, + "learning_rate": 4.3548898399112384e-05, + "loss": 0.156, + "step": 3030 + }, + { + "epoch": 2.02, + "grad_norm": 0.07054935395717621, + "learning_rate": 4.3509272467902996e-05, + "loss": 0.3379, + "step": 3040 + }, + { + "epoch": 2.02, + "grad_norm": 0.44977086782455444, + "learning_rate": 4.3469646536693615e-05, + "loss": 0.5629, + "step": 3050 + }, + { + "epoch": 2.02, + "grad_norm": 0.06215721368789673, + "learning_rate": 4.343002060548423e-05, + "loss": 0.0019, + "step": 3060 + }, + { + "epoch": 2.02, + "grad_norm": 43.73810958862305, + "learning_rate": 4.3390394674274846e-05, + "loss": 0.3984, + "step": 3070 + }, + { + "epoch": 2.02, + "grad_norm": 0.9523270130157471, + "learning_rate": 4.3350768743065464e-05, + "loss": 0.0888, + "step": 3080 + }, + { + "epoch": 2.02, + "grad_norm": 0.005942572373896837, + "learning_rate": 4.331114281185608e-05, + "loss": 0.1961, + "step": 3090 + }, + { + "epoch": 2.02, + "grad_norm": 0.022418642416596413, + "learning_rate": 4.3271516880646695e-05, + "loss": 0.0597, + "step": 3100 + }, + { + "epoch": 2.02, + "grad_norm": 0.04196101427078247, + "learning_rate": 4.3231890949437314e-05, + "loss": 0.1147, + "step": 3110 + }, + { + "epoch": 2.02, + "grad_norm": 0.003765852889046073, + "learning_rate": 4.3192265018227926e-05, + "loss": 0.0416, + "step": 3120 + }, + { + "epoch": 2.02, + "grad_norm": 0.053471703082323074, + "learning_rate": 4.3152639087018545e-05, + "loss": 0.3866, + "step": 3130 + }, + { + "epoch": 2.02, + "grad_norm": 51.969120025634766, + "learning_rate": 4.3113013155809164e-05, + "loss": 0.0616, + "step": 3140 + }, + { + "epoch": 2.02, + "grad_norm": 0.005074084736406803, + "learning_rate": 4.307338722459978e-05, + "loss": 0.1907, + "step": 3150 + }, + { + "epoch": 2.03, + "grad_norm": 0.0045975870452821255, + "learning_rate": 4.3033761293390395e-05, + "loss": 0.0072, + "step": 3160 + }, + { + "epoch": 2.03, + "grad_norm": 0.0040842327289283276, + "learning_rate": 4.299413536218101e-05, + "loss": 0.2306, + "step": 3170 + }, + { + "epoch": 2.03, + "grad_norm": 0.008049121126532555, + "learning_rate": 4.295450943097163e-05, + "loss": 0.0058, + "step": 3180 + }, + { + "epoch": 2.03, + "grad_norm": 21.22314453125, + "learning_rate": 4.291488349976225e-05, + "loss": 0.5502, + "step": 3190 + }, + { + "epoch": 2.03, + "grad_norm": 0.036627013236284256, + "learning_rate": 4.287525756855286e-05, + "loss": 0.1419, + "step": 3200 + }, + { + "epoch": 2.03, + "grad_norm": 2.3564202785491943, + "learning_rate": 4.283563163734348e-05, + "loss": 0.0279, + "step": 3210 + }, + { + "epoch": 2.03, + "grad_norm": 0.0108193913474679, + "learning_rate": 4.2796005706134094e-05, + "loss": 0.0004, + "step": 3220 + }, + { + "epoch": 2.03, + "grad_norm": 0.0201814454048872, + "learning_rate": 4.275637977492471e-05, + "loss": 0.3249, + "step": 3230 + }, + { + "epoch": 2.03, + "grad_norm": 0.03389296308159828, + "learning_rate": 4.271675384371533e-05, + "loss": 0.3386, + "step": 3240 + }, + { + "epoch": 2.03, + "grad_norm": 0.01544855535030365, + "learning_rate": 4.267712791250595e-05, + "loss": 0.3866, + "step": 3250 + }, + { + "epoch": 2.03, + "grad_norm": 0.025016358122229576, + "learning_rate": 4.263750198129656e-05, + "loss": 0.0013, + "step": 3260 + }, + { + "epoch": 2.03, + "grad_norm": 0.0923624038696289, + "learning_rate": 4.2597876050087174e-05, + "loss": 0.2532, + "step": 3270 + }, + { + "epoch": 2.03, + "grad_norm": 23.150659561157227, + "learning_rate": 4.255825011887779e-05, + "loss": 0.3962, + "step": 3280 + }, + { + "epoch": 2.03, + "grad_norm": 0.015515293926000595, + "learning_rate": 4.251862418766841e-05, + "loss": 0.0004, + "step": 3290 + }, + { + "epoch": 2.04, + "grad_norm": 0.003917529247701168, + "learning_rate": 4.247899825645903e-05, + "loss": 0.2484, + "step": 3300 + }, + { + "epoch": 2.04, + "grad_norm": 28.370773315429688, + "learning_rate": 4.243937232524964e-05, + "loss": 0.0367, + "step": 3310 + }, + { + "epoch": 2.04, + "grad_norm": 0.026205556467175484, + "learning_rate": 4.239974639404026e-05, + "loss": 0.3111, + "step": 3320 + }, + { + "epoch": 2.04, + "grad_norm": 0.01336923148483038, + "learning_rate": 4.2360120462830874e-05, + "loss": 0.0047, + "step": 3330 + }, + { + "epoch": 2.04, + "grad_norm": 0.011190090328454971, + "learning_rate": 4.23204945316215e-05, + "loss": 0.3007, + "step": 3340 + }, + { + "epoch": 2.04, + "grad_norm": 0.003655917476862669, + "learning_rate": 4.228086860041211e-05, + "loss": 0.5659, + "step": 3350 + }, + { + "epoch": 2.04, + "grad_norm": 0.017216026782989502, + "learning_rate": 4.224124266920273e-05, + "loss": 0.0933, + "step": 3360 + }, + { + "epoch": 2.04, + "grad_norm": 0.007373865228146315, + "learning_rate": 4.220161673799334e-05, + "loss": 0.0298, + "step": 3370 + }, + { + "epoch": 2.04, + "grad_norm": 0.035991325974464417, + "learning_rate": 4.216199080678396e-05, + "loss": 0.0916, + "step": 3380 + }, + { + "epoch": 2.04, + "grad_norm": 0.007277372293174267, + "learning_rate": 4.212236487557458e-05, + "loss": 0.0008, + "step": 3390 + }, + { + "epoch": 2.04, + "grad_norm": 0.0012711473973467946, + "learning_rate": 4.20827389443652e-05, + "loss": 0.049, + "step": 3400 + }, + { + "epoch": 2.04, + "grad_norm": 0.004262813366949558, + "learning_rate": 4.204311301315581e-05, + "loss": 0.3255, + "step": 3410 + }, + { + "epoch": 2.04, + "grad_norm": 0.6016376614570618, + "learning_rate": 4.200348708194643e-05, + "loss": 0.0016, + "step": 3420 + }, + { + "epoch": 2.04, + "grad_norm": 0.027856985107064247, + "learning_rate": 4.196386115073704e-05, + "loss": 0.1706, + "step": 3430 + }, + { + "epoch": 2.05, + "grad_norm": 36.658660888671875, + "learning_rate": 4.192423521952766e-05, + "loss": 0.393, + "step": 3440 + }, + { + "epoch": 2.05, + "grad_norm": 4.459847927093506, + "learning_rate": 4.188460928831828e-05, + "loss": 0.2113, + "step": 3450 + }, + { + "epoch": 2.05, + "grad_norm": 0.003763306187465787, + "learning_rate": 4.18449833571089e-05, + "loss": 0.0946, + "step": 3460 + }, + { + "epoch": 2.05, + "grad_norm": 0.9358043670654297, + "learning_rate": 4.180535742589951e-05, + "loss": 0.1248, + "step": 3470 + }, + { + "epoch": 2.05, + "grad_norm": 5.325794219970703, + "learning_rate": 4.176573149469012e-05, + "loss": 0.1882, + "step": 3480 + }, + { + "epoch": 2.05, + "grad_norm": 0.01291597355157137, + "learning_rate": 4.172610556348075e-05, + "loss": 0.5989, + "step": 3490 + }, + { + "epoch": 2.05, + "grad_norm": 0.05552150309085846, + "learning_rate": 4.168647963227136e-05, + "loss": 0.1317, + "step": 3500 + }, + { + "epoch": 2.05, + "grad_norm": 0.0046797278337180614, + "learning_rate": 4.164685370106198e-05, + "loss": 0.9324, + "step": 3510 + }, + { + "epoch": 2.05, + "grad_norm": 0.19884918630123138, + "learning_rate": 4.160722776985259e-05, + "loss": 0.0027, + "step": 3520 + }, + { + "epoch": 2.05, + "grad_norm": 0.06361120939254761, + "learning_rate": 4.156760183864321e-05, + "loss": 0.1294, + "step": 3530 + }, + { + "epoch": 2.05, + "grad_norm": 0.025872783735394478, + "learning_rate": 4.152797590743383e-05, + "loss": 0.3453, + "step": 3540 + }, + { + "epoch": 2.05, + "grad_norm": 0.44598618149757385, + "learning_rate": 4.1488349976224446e-05, + "loss": 0.0445, + "step": 3550 + }, + { + "epoch": 2.05, + "grad_norm": 0.00139313330873847, + "learning_rate": 4.144872404501506e-05, + "loss": 0.4126, + "step": 3560 + }, + { + "epoch": 2.05, + "grad_norm": 0.004861112684011459, + "learning_rate": 4.140909811380568e-05, + "loss": 0.3162, + "step": 3570 + }, + { + "epoch": 2.06, + "grad_norm": 37.97075653076172, + "learning_rate": 4.136947218259629e-05, + "loss": 0.0275, + "step": 3580 + }, + { + "epoch": 2.06, + "grad_norm": 0.006260779220610857, + "learning_rate": 4.132984625138691e-05, + "loss": 0.5518, + "step": 3590 + }, + { + "epoch": 2.06, + "grad_norm": 10.439234733581543, + "learning_rate": 4.129022032017753e-05, + "loss": 0.0489, + "step": 3600 + }, + { + "epoch": 2.06, + "grad_norm": 0.009267416782677174, + "learning_rate": 4.1250594388968146e-05, + "loss": 0.2203, + "step": 3610 + }, + { + "epoch": 2.06, + "grad_norm": 0.003436572849750519, + "learning_rate": 4.121096845775876e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 2.06, + "grad_norm": 0.02378927730023861, + "learning_rate": 4.1171342526549377e-05, + "loss": 0.3647, + "step": 3630 + }, + { + "epoch": 2.06, + "grad_norm": 0.04053608328104019, + "learning_rate": 4.113171659533999e-05, + "loss": 0.4505, + "step": 3640 + }, + { + "epoch": 2.06, + "grad_norm": 0.8833039402961731, + "learning_rate": 4.1092090664130614e-05, + "loss": 0.0626, + "step": 3650 + }, + { + "epoch": 2.06, + "grad_norm": 11.919655799865723, + "learning_rate": 4.1052464732921226e-05, + "loss": 0.0989, + "step": 3660 + }, + { + "epoch": 2.06, + "grad_norm": 0.03586142137646675, + "learning_rate": 4.1012838801711845e-05, + "loss": 0.2583, + "step": 3670 + }, + { + "epoch": 2.06, + "grad_norm": 0.1854490488767624, + "learning_rate": 4.097321287050246e-05, + "loss": 0.1368, + "step": 3680 + }, + { + "epoch": 2.06, + "grad_norm": 0.057375673204660416, + "learning_rate": 4.0933586939293076e-05, + "loss": 0.345, + "step": 3690 + }, + { + "epoch": 2.06, + "grad_norm": 0.015717756003141403, + "learning_rate": 4.0893961008083695e-05, + "loss": 0.0065, + "step": 3700 + }, + { + "epoch": 2.06, + "grad_norm": 0.02194334752857685, + "learning_rate": 4.085433507687431e-05, + "loss": 0.0021, + "step": 3710 + }, + { + "epoch": 2.07, + "grad_norm": 16.584745407104492, + "learning_rate": 4.0814709145664925e-05, + "loss": 0.0147, + "step": 3720 + }, + { + "epoch": 2.07, + "grad_norm": 0.0053609260357916355, + "learning_rate": 4.077508321445554e-05, + "loss": 0.3077, + "step": 3730 + }, + { + "epoch": 2.07, + "grad_norm": 0.002716219983994961, + "learning_rate": 4.0735457283246156e-05, + "loss": 0.4135, + "step": 3740 + }, + { + "epoch": 2.07, + "grad_norm": 0.8324286937713623, + "learning_rate": 4.0695831352036775e-05, + "loss": 0.3889, + "step": 3750 + }, + { + "epoch": 2.07, + "grad_norm": 0.004210811574012041, + "learning_rate": 4.0656205420827394e-05, + "loss": 0.0015, + "step": 3760 + }, + { + "epoch": 2.07, + "grad_norm": 0.0026693022809922695, + "learning_rate": 4.0616579489618006e-05, + "loss": 0.566, + "step": 3770 + }, + { + "epoch": 2.07, + "grad_norm": 0.02392963133752346, + "learning_rate": 4.0576953558408625e-05, + "loss": 0.0124, + "step": 3780 + }, + { + "epoch": 2.07, + "grad_norm": 0.020079661160707474, + "learning_rate": 4.053732762719924e-05, + "loss": 0.0152, + "step": 3790 + }, + { + "epoch": 2.07, + "grad_norm": 0.0014172615483403206, + "learning_rate": 4.049770169598986e-05, + "loss": 0.0932, + "step": 3800 + }, + { + "epoch": 2.07, + "grad_norm": 5.266864776611328, + "learning_rate": 4.0458075764780474e-05, + "loss": 0.5559, + "step": 3810 + }, + { + "epoch": 2.07, + "grad_norm": 0.009865287691354752, + "learning_rate": 4.041844983357109e-05, + "loss": 0.2131, + "step": 3820 + }, + { + "epoch": 2.07, + "grad_norm": 0.0027454691007733345, + "learning_rate": 4.0378823902361705e-05, + "loss": 0.0006, + "step": 3830 + }, + { + "epoch": 2.07, + "grad_norm": 0.15348024666309357, + "learning_rate": 4.0339197971152324e-05, + "loss": 1.0736, + "step": 3840 + }, + { + "epoch": 2.07, + "grad_norm": 0.002026822417974472, + "learning_rate": 4.029957203994294e-05, + "loss": 0.4398, + "step": 3850 + }, + { + "epoch": 2.08, + "grad_norm": 0.023032035678625107, + "learning_rate": 4.025994610873356e-05, + "loss": 0.3724, + "step": 3860 + }, + { + "epoch": 2.08, + "grad_norm": 110.459716796875, + "learning_rate": 4.0220320177524174e-05, + "loss": 0.4695, + "step": 3870 + }, + { + "epoch": 2.08, + "grad_norm": 0.026824643835425377, + "learning_rate": 4.018069424631479e-05, + "loss": 0.4355, + "step": 3880 + }, + { + "epoch": 2.08, + "grad_norm": 0.007477205712348223, + "learning_rate": 4.0141068315105404e-05, + "loss": 0.3392, + "step": 3890 + }, + { + "epoch": 2.08, + "grad_norm": 0.0020925672724843025, + "learning_rate": 4.010144238389602e-05, + "loss": 0.228, + "step": 3900 + }, + { + "epoch": 2.08, + "grad_norm": 0.003810058580711484, + "learning_rate": 4.006181645268664e-05, + "loss": 0.2477, + "step": 3910 + }, + { + "epoch": 2.08, + "grad_norm": 0.0076815299689769745, + "learning_rate": 4.0022190521477254e-05, + "loss": 0.4539, + "step": 3920 + }, + { + "epoch": 2.08, + "grad_norm": 0.005379770882427692, + "learning_rate": 3.998256459026787e-05, + "loss": 0.0341, + "step": 3930 + }, + { + "epoch": 2.08, + "grad_norm": 0.003831785172224045, + "learning_rate": 3.9942938659058485e-05, + "loss": 0.5954, + "step": 3940 + }, + { + "epoch": 2.08, + "grad_norm": 0.7539482116699219, + "learning_rate": 3.9903312727849104e-05, + "loss": 0.3885, + "step": 3950 + }, + { + "epoch": 2.08, + "grad_norm": 0.005533235147595406, + "learning_rate": 3.986368679663972e-05, + "loss": 0.0053, + "step": 3960 + }, + { + "epoch": 2.08, + "grad_norm": 0.008866420015692711, + "learning_rate": 3.982406086543034e-05, + "loss": 0.0453, + "step": 3970 + }, + { + "epoch": 2.08, + "grad_norm": 0.014108781702816486, + "learning_rate": 3.978443493422095e-05, + "loss": 0.2954, + "step": 3980 + }, + { + "epoch": 2.08, + "grad_norm": 0.016585228964686394, + "learning_rate": 3.974480900301157e-05, + "loss": 0.0076, + "step": 3990 + }, + { + "epoch": 2.09, + "grad_norm": 3.2773778438568115, + "learning_rate": 3.970518307180219e-05, + "loss": 0.1556, + "step": 4000 + }, + { + "epoch": 2.09, + "grad_norm": 7.254385948181152, + "learning_rate": 3.966555714059281e-05, + "loss": 0.1696, + "step": 4010 + }, + { + "epoch": 2.09, + "grad_norm": 0.0035600659903138876, + "learning_rate": 3.962593120938342e-05, + "loss": 0.0074, + "step": 4020 + }, + { + "epoch": 2.09, + "grad_norm": 8.71975040435791, + "learning_rate": 3.958630527817404e-05, + "loss": 0.3048, + "step": 4030 + }, + { + "epoch": 2.09, + "grad_norm": 0.0020627696067094803, + "learning_rate": 3.954667934696465e-05, + "loss": 0.7165, + "step": 4040 + }, + { + "epoch": 2.09, + "grad_norm": 0.007494701538234949, + "learning_rate": 3.950705341575527e-05, + "loss": 0.5529, + "step": 4050 + }, + { + "epoch": 2.09, + "grad_norm": 0.016065679490566254, + "learning_rate": 3.946742748454589e-05, + "loss": 0.0128, + "step": 4060 + }, + { + "epoch": 2.09, + "grad_norm": 0.322768896818161, + "learning_rate": 3.942780155333651e-05, + "loss": 0.0628, + "step": 4070 + }, + { + "epoch": 2.09, + "grad_norm": 0.023394137620925903, + "learning_rate": 3.938817562212712e-05, + "loss": 0.4743, + "step": 4080 + }, + { + "epoch": 2.09, + "grad_norm": 0.0229184590280056, + "learning_rate": 3.934854969091774e-05, + "loss": 0.2818, + "step": 4090 + }, + { + "epoch": 2.09, + "grad_norm": 0.006755081005394459, + "learning_rate": 3.930892375970835e-05, + "loss": 0.3635, + "step": 4100 + }, + { + "epoch": 2.09, + "grad_norm": 0.004403649363666773, + "learning_rate": 3.926929782849898e-05, + "loss": 0.0568, + "step": 4110 + }, + { + "epoch": 2.09, + "grad_norm": 0.0034377514384686947, + "learning_rate": 3.922967189728959e-05, + "loss": 0.1624, + "step": 4120 + }, + { + "epoch": 2.09, + "grad_norm": 0.005851461086422205, + "learning_rate": 3.91900459660802e-05, + "loss": 0.6674, + "step": 4130 + }, + { + "epoch": 2.1, + "grad_norm": 0.004862835630774498, + "learning_rate": 3.915042003487082e-05, + "loss": 0.5013, + "step": 4140 + }, + { + "epoch": 2.1, + "grad_norm": 42.758365631103516, + "learning_rate": 3.911079410366143e-05, + "loss": 0.4173, + "step": 4150 + }, + { + "epoch": 2.1, + "grad_norm": 0.004606719594448805, + "learning_rate": 3.907116817245206e-05, + "loss": 0.5328, + "step": 4160 + }, + { + "epoch": 2.1, + "grad_norm": 78.40693664550781, + "learning_rate": 3.903154224124267e-05, + "loss": 0.4105, + "step": 4170 + }, + { + "epoch": 2.1, + "grad_norm": 23.919864654541016, + "learning_rate": 3.899191631003329e-05, + "loss": 1.4324, + "step": 4180 + }, + { + "epoch": 2.1, + "grad_norm": 0.00816379301249981, + "learning_rate": 3.89522903788239e-05, + "loss": 0.3002, + "step": 4190 + }, + { + "epoch": 2.1, + "grad_norm": 0.022155677899718285, + "learning_rate": 3.891266444761452e-05, + "loss": 0.0785, + "step": 4200 + }, + { + "epoch": 2.1, + "eval_accuracy": 0.9052631578947369, + "eval_loss": 0.4806475341320038, + "eval_runtime": 2299.8623, + "eval_samples_per_second": 0.289, + "eval_steps_per_second": 0.145, + "step": 4209 + }, + { + "epoch": 3.0, + "grad_norm": 0.0499810166656971, + "learning_rate": 3.887303851640514e-05, + "loss": 0.1101, + "step": 4210 + }, + { + "epoch": 3.0, + "grad_norm": 0.00219921232201159, + "learning_rate": 3.883341258519576e-05, + "loss": 0.0019, + "step": 4220 + }, + { + "epoch": 3.0, + "grad_norm": 0.05688053369522095, + "learning_rate": 3.879378665398637e-05, + "loss": 0.4249, + "step": 4230 + }, + { + "epoch": 3.0, + "grad_norm": 0.004060626961290836, + "learning_rate": 3.875416072277699e-05, + "loss": 0.329, + "step": 4240 + }, + { + "epoch": 3.0, + "grad_norm": 0.4057186245918274, + "learning_rate": 3.87145347915676e-05, + "loss": 0.0089, + "step": 4250 + }, + { + "epoch": 3.0, + "grad_norm": 0.0447358600795269, + "learning_rate": 3.8674908860358226e-05, + "loss": 0.0034, + "step": 4260 + }, + { + "epoch": 3.0, + "grad_norm": 0.003750765696167946, + "learning_rate": 3.863528292914884e-05, + "loss": 0.0953, + "step": 4270 + }, + { + "epoch": 3.01, + "grad_norm": 6.533902168273926, + "learning_rate": 3.8595656997939456e-05, + "loss": 0.0106, + "step": 4280 + }, + { + "epoch": 3.01, + "grad_norm": 0.001664067734964192, + "learning_rate": 3.855603106673007e-05, + "loss": 0.0162, + "step": 4290 + }, + { + "epoch": 3.01, + "grad_norm": 0.8516010046005249, + "learning_rate": 3.851640513552069e-05, + "loss": 0.4751, + "step": 4300 + }, + { + "epoch": 3.01, + "grad_norm": 0.03567550331354141, + "learning_rate": 3.8476779204311306e-05, + "loss": 0.161, + "step": 4310 + }, + { + "epoch": 3.01, + "grad_norm": 0.0029626258183270693, + "learning_rate": 3.8437153273101925e-05, + "loss": 0.1448, + "step": 4320 + }, + { + "epoch": 3.01, + "grad_norm": 0.017234837636351585, + "learning_rate": 3.839752734189254e-05, + "loss": 0.0052, + "step": 4330 + }, + { + "epoch": 3.01, + "grad_norm": 0.14725999534130096, + "learning_rate": 3.835790141068315e-05, + "loss": 0.0645, + "step": 4340 + }, + { + "epoch": 3.01, + "grad_norm": 0.002782195108011365, + "learning_rate": 3.831827547947377e-05, + "loss": 0.0004, + "step": 4350 + }, + { + "epoch": 3.01, + "grad_norm": 34.547061920166016, + "learning_rate": 3.8278649548264386e-05, + "loss": 0.4315, + "step": 4360 + }, + { + "epoch": 3.01, + "grad_norm": 0.0030270384158939123, + "learning_rate": 3.8239023617055005e-05, + "loss": 0.3151, + "step": 4370 + }, + { + "epoch": 3.01, + "grad_norm": 0.008927990682423115, + "learning_rate": 3.819939768584562e-05, + "loss": 0.0002, + "step": 4380 + }, + { + "epoch": 3.01, + "grad_norm": 0.11368348449468613, + "learning_rate": 3.8159771754636236e-05, + "loss": 0.0077, + "step": 4390 + }, + { + "epoch": 3.01, + "grad_norm": 0.10815131664276123, + "learning_rate": 3.812014582342685e-05, + "loss": 0.0182, + "step": 4400 + }, + { + "epoch": 3.01, + "grad_norm": 0.020075034350156784, + "learning_rate": 3.808051989221747e-05, + "loss": 0.0007, + "step": 4410 + }, + { + "epoch": 3.02, + "grad_norm": 0.001029517618007958, + "learning_rate": 3.8040893961008086e-05, + "loss": 0.0397, + "step": 4420 + }, + { + "epoch": 3.02, + "grad_norm": 0.003121725283563137, + "learning_rate": 3.8001268029798704e-05, + "loss": 0.0001, + "step": 4430 + }, + { + "epoch": 3.02, + "grad_norm": 5.35357141494751, + "learning_rate": 3.7961642098589316e-05, + "loss": 0.0069, + "step": 4440 + }, + { + "epoch": 3.02, + "grad_norm": 0.013706800527870655, + "learning_rate": 3.7922016167379935e-05, + "loss": 0.0007, + "step": 4450 + }, + { + "epoch": 3.02, + "grad_norm": 0.09196832776069641, + "learning_rate": 3.7882390236170554e-05, + "loss": 0.0003, + "step": 4460 + }, + { + "epoch": 3.02, + "grad_norm": 0.003602321958169341, + "learning_rate": 3.784276430496117e-05, + "loss": 0.2969, + "step": 4470 + }, + { + "epoch": 3.02, + "grad_norm": 20.944992065429688, + "learning_rate": 3.7803138373751785e-05, + "loss": 0.0272, + "step": 4480 + }, + { + "epoch": 3.02, + "grad_norm": 0.002105366438627243, + "learning_rate": 3.7763512442542404e-05, + "loss": 0.0002, + "step": 4490 + }, + { + "epoch": 3.02, + "grad_norm": 0.004411764442920685, + "learning_rate": 3.7723886511333016e-05, + "loss": 0.0076, + "step": 4500 + }, + { + "epoch": 3.02, + "grad_norm": 0.005865162704139948, + "learning_rate": 3.7684260580123635e-05, + "loss": 0.0059, + "step": 4510 + }, + { + "epoch": 3.02, + "grad_norm": 0.011046779341995716, + "learning_rate": 3.764463464891425e-05, + "loss": 0.0043, + "step": 4520 + }, + { + "epoch": 3.02, + "grad_norm": 0.023666031658649445, + "learning_rate": 3.760500871770487e-05, + "loss": 0.0009, + "step": 4530 + }, + { + "epoch": 3.02, + "grad_norm": 53.04268264770508, + "learning_rate": 3.7565382786495484e-05, + "loss": 0.3129, + "step": 4540 + }, + { + "epoch": 3.02, + "grad_norm": 16.536462783813477, + "learning_rate": 3.7525756855286096e-05, + "loss": 0.0079, + "step": 4550 + }, + { + "epoch": 3.03, + "grad_norm": 0.004224766045808792, + "learning_rate": 3.7486130924076715e-05, + "loss": 0.073, + "step": 4560 + }, + { + "epoch": 3.03, + "grad_norm": 0.005598429590463638, + "learning_rate": 3.7446504992867334e-05, + "loss": 0.2742, + "step": 4570 + }, + { + "epoch": 3.03, + "grad_norm": 0.0030881077982485294, + "learning_rate": 3.740687906165795e-05, + "loss": 0.2948, + "step": 4580 + }, + { + "epoch": 3.03, + "grad_norm": 0.019152648746967316, + "learning_rate": 3.7367253130448565e-05, + "loss": 0.0047, + "step": 4590 + }, + { + "epoch": 3.03, + "grad_norm": 0.001949524856172502, + "learning_rate": 3.7327627199239183e-05, + "loss": 0.0003, + "step": 4600 + }, + { + "epoch": 3.03, + "grad_norm": 0.001139726140536368, + "learning_rate": 3.7288001268029795e-05, + "loss": 0.0033, + "step": 4610 + }, + { + "epoch": 3.03, + "grad_norm": 0.0009636884205974638, + "learning_rate": 3.724837533682042e-05, + "loss": 0.5236, + "step": 4620 + }, + { + "epoch": 3.03, + "grad_norm": 0.0024904939346015453, + "learning_rate": 3.720874940561103e-05, + "loss": 0.0005, + "step": 4630 + }, + { + "epoch": 3.03, + "grad_norm": 0.004561484791338444, + "learning_rate": 3.716912347440165e-05, + "loss": 0.4394, + "step": 4640 + }, + { + "epoch": 3.03, + "grad_norm": 21.228055953979492, + "learning_rate": 3.7129497543192264e-05, + "loss": 0.5905, + "step": 4650 + }, + { + "epoch": 3.03, + "grad_norm": 38.67287063598633, + "learning_rate": 3.708987161198288e-05, + "loss": 0.0304, + "step": 4660 + }, + { + "epoch": 3.03, + "grad_norm": 0.002863664412871003, + "learning_rate": 3.70502456807735e-05, + "loss": 0.4688, + "step": 4670 + }, + { + "epoch": 3.03, + "grad_norm": 0.0070022111758589745, + "learning_rate": 3.701061974956412e-05, + "loss": 0.0044, + "step": 4680 + }, + { + "epoch": 3.03, + "grad_norm": 98.50983428955078, + "learning_rate": 3.697099381835473e-05, + "loss": 0.2539, + "step": 4690 + }, + { + "epoch": 3.04, + "grad_norm": 0.044561292976140976, + "learning_rate": 3.693136788714535e-05, + "loss": 0.0002, + "step": 4700 + }, + { + "epoch": 3.04, + "grad_norm": 2.370043992996216, + "learning_rate": 3.689174195593596e-05, + "loss": 0.0055, + "step": 4710 + }, + { + "epoch": 3.04, + "grad_norm": 12.61652660369873, + "learning_rate": 3.685211602472658e-05, + "loss": 0.456, + "step": 4720 + }, + { + "epoch": 3.04, + "grad_norm": 0.020174263045191765, + "learning_rate": 3.68124900935172e-05, + "loss": 0.0023, + "step": 4730 + }, + { + "epoch": 3.04, + "grad_norm": 0.032532501965761185, + "learning_rate": 3.677286416230782e-05, + "loss": 0.0004, + "step": 4740 + }, + { + "epoch": 3.04, + "grad_norm": 39.96610641479492, + "learning_rate": 3.673323823109843e-05, + "loss": 0.5033, + "step": 4750 + }, + { + "epoch": 3.04, + "grad_norm": 0.006895292084664106, + "learning_rate": 3.669361229988905e-05, + "loss": 0.2369, + "step": 4760 + }, + { + "epoch": 3.04, + "grad_norm": 0.0018528720829635859, + "learning_rate": 3.665398636867967e-05, + "loss": 0.0005, + "step": 4770 + }, + { + "epoch": 3.04, + "grad_norm": 57.440799713134766, + "learning_rate": 3.661436043747028e-05, + "loss": 0.7416, + "step": 4780 + }, + { + "epoch": 3.04, + "grad_norm": 0.15606503188610077, + "learning_rate": 3.65747345062609e-05, + "loss": 0.0005, + "step": 4790 + }, + { + "epoch": 3.04, + "grad_norm": 0.06342484056949615, + "learning_rate": 3.653510857505151e-05, + "loss": 0.0008, + "step": 4800 + }, + { + "epoch": 3.04, + "grad_norm": 0.0007686218596063554, + "learning_rate": 3.649548264384213e-05, + "loss": 0.0083, + "step": 4810 + }, + { + "epoch": 3.04, + "grad_norm": 0.007868933491408825, + "learning_rate": 3.645585671263275e-05, + "loss": 0.0002, + "step": 4820 + }, + { + "epoch": 3.04, + "grad_norm": 0.0038664869498461485, + "learning_rate": 3.641623078142337e-05, + "loss": 0.5171, + "step": 4830 + }, + { + "epoch": 3.05, + "grad_norm": 139.34559631347656, + "learning_rate": 3.637660485021398e-05, + "loss": 0.1279, + "step": 4840 + }, + { + "epoch": 3.05, + "grad_norm": 0.01549526583403349, + "learning_rate": 3.63369789190046e-05, + "loss": 0.0024, + "step": 4850 + }, + { + "epoch": 3.05, + "grad_norm": 0.009506451897323132, + "learning_rate": 3.629735298779521e-05, + "loss": 0.0322, + "step": 4860 + }, + { + "epoch": 3.05, + "grad_norm": 0.01199623104184866, + "learning_rate": 3.625772705658583e-05, + "loss": 0.5853, + "step": 4870 + }, + { + "epoch": 3.05, + "grad_norm": 0.023425359278917313, + "learning_rate": 3.621810112537645e-05, + "loss": 0.0074, + "step": 4880 + }, + { + "epoch": 3.05, + "grad_norm": 0.0029667699709534645, + "learning_rate": 3.617847519416707e-05, + "loss": 0.0003, + "step": 4890 + }, + { + "epoch": 3.05, + "grad_norm": 0.0053655593656003475, + "learning_rate": 3.613884926295768e-05, + "loss": 0.0022, + "step": 4900 + }, + { + "epoch": 3.05, + "grad_norm": 0.002650972455739975, + "learning_rate": 3.60992233317483e-05, + "loss": 0.0286, + "step": 4910 + }, + { + "epoch": 3.05, + "grad_norm": 0.029404861852526665, + "learning_rate": 3.605959740053891e-05, + "loss": 0.355, + "step": 4920 + }, + { + "epoch": 3.05, + "grad_norm": 0.0018920317525044084, + "learning_rate": 3.6019971469329536e-05, + "loss": 0.0105, + "step": 4930 + }, + { + "epoch": 3.05, + "grad_norm": 0.2511395812034607, + "learning_rate": 3.598034553812015e-05, + "loss": 0.0721, + "step": 4940 + }, + { + "epoch": 3.05, + "grad_norm": 0.0022750215139240026, + "learning_rate": 3.594071960691077e-05, + "loss": 0.0011, + "step": 4950 + }, + { + "epoch": 3.05, + "grad_norm": 0.0004970223526470363, + "learning_rate": 3.590109367570138e-05, + "loss": 0.0493, + "step": 4960 + }, + { + "epoch": 3.05, + "grad_norm": 0.0018257640767842531, + "learning_rate": 3.5861467744492e-05, + "loss": 0.0041, + "step": 4970 + }, + { + "epoch": 3.05, + "grad_norm": 0.002615696983411908, + "learning_rate": 3.5821841813282617e-05, + "loss": 0.0002, + "step": 4980 + }, + { + "epoch": 3.06, + "grad_norm": 0.0015312007162719965, + "learning_rate": 3.578221588207323e-05, + "loss": 0.0048, + "step": 4990 + }, + { + "epoch": 3.06, + "grad_norm": 0.003842801321297884, + "learning_rate": 3.574258995086385e-05, + "loss": 0.0001, + "step": 5000 + }, + { + "epoch": 3.06, + "grad_norm": 0.003976788371801376, + "learning_rate": 3.570296401965446e-05, + "loss": 0.0002, + "step": 5010 + }, + { + "epoch": 3.06, + "grad_norm": 0.0027057684492319822, + "learning_rate": 3.566333808844508e-05, + "loss": 0.0001, + "step": 5020 + }, + { + "epoch": 3.06, + "grad_norm": 0.0013581090606749058, + "learning_rate": 3.56237121572357e-05, + "loss": 0.0643, + "step": 5030 + }, + { + "epoch": 3.06, + "grad_norm": 73.48147583007812, + "learning_rate": 3.5584086226026316e-05, + "loss": 0.1412, + "step": 5040 + }, + { + "epoch": 3.06, + "grad_norm": 0.05521896854043007, + "learning_rate": 3.554446029481693e-05, + "loss": 0.8526, + "step": 5050 + }, + { + "epoch": 3.06, + "grad_norm": 0.01980687491595745, + "learning_rate": 3.550483436360755e-05, + "loss": 0.3712, + "step": 5060 + }, + { + "epoch": 3.06, + "grad_norm": 0.0016443756176158786, + "learning_rate": 3.546520843239816e-05, + "loss": 0.0004, + "step": 5070 + }, + { + "epoch": 3.06, + "grad_norm": 2.7786030769348145, + "learning_rate": 3.5425582501188784e-05, + "loss": 0.0024, + "step": 5080 + }, + { + "epoch": 3.06, + "grad_norm": 0.002752843778580427, + "learning_rate": 3.5385956569979396e-05, + "loss": 0.0989, + "step": 5090 + }, + { + "epoch": 3.06, + "grad_norm": 0.07084832340478897, + "learning_rate": 3.5346330638770015e-05, + "loss": 0.006, + "step": 5100 + }, + { + "epoch": 3.06, + "grad_norm": 0.0023438192438334227, + "learning_rate": 3.530670470756063e-05, + "loss": 0.0631, + "step": 5110 + }, + { + "epoch": 3.06, + "grad_norm": 0.023146087303757668, + "learning_rate": 3.5267078776351246e-05, + "loss": 0.3281, + "step": 5120 + }, + { + "epoch": 3.07, + "grad_norm": 0.0026536276564002037, + "learning_rate": 3.5227452845141865e-05, + "loss": 0.2622, + "step": 5130 + }, + { + "epoch": 3.07, + "grad_norm": 25.2746639251709, + "learning_rate": 3.5187826913932483e-05, + "loss": 0.9971, + "step": 5140 + }, + { + "epoch": 3.07, + "grad_norm": 2.3518447875976562, + "learning_rate": 3.5148200982723095e-05, + "loss": 0.0101, + "step": 5150 + }, + { + "epoch": 3.07, + "grad_norm": 0.004120847675949335, + "learning_rate": 3.5108575051513714e-05, + "loss": 0.2435, + "step": 5160 + }, + { + "epoch": 3.07, + "grad_norm": 0.0018114675767719746, + "learning_rate": 3.5068949120304326e-05, + "loss": 0.0002, + "step": 5170 + }, + { + "epoch": 3.07, + "grad_norm": 0.003768153488636017, + "learning_rate": 3.5029323189094945e-05, + "loss": 0.0041, + "step": 5180 + }, + { + "epoch": 3.07, + "grad_norm": 0.07695072889328003, + "learning_rate": 3.4989697257885564e-05, + "loss": 0.0025, + "step": 5190 + }, + { + "epoch": 3.07, + "grad_norm": 57.52178955078125, + "learning_rate": 3.4950071326676176e-05, + "loss": 0.0468, + "step": 5200 + }, + { + "epoch": 3.07, + "grad_norm": 0.001811747089959681, + "learning_rate": 3.4910445395466795e-05, + "loss": 0.0007, + "step": 5210 + }, + { + "epoch": 3.07, + "grad_norm": 0.0032631447538733482, + "learning_rate": 3.487081946425741e-05, + "loss": 0.0962, + "step": 5220 + }, + { + "epoch": 3.07, + "grad_norm": 0.0040063695050776005, + "learning_rate": 3.4831193533048026e-05, + "loss": 0.0718, + "step": 5230 + }, + { + "epoch": 3.07, + "grad_norm": 0.042804840952157974, + "learning_rate": 3.4791567601838644e-05, + "loss": 0.0038, + "step": 5240 + }, + { + "epoch": 3.07, + "grad_norm": 0.0023616242688149214, + "learning_rate": 3.475194167062926e-05, + "loss": 0.0002, + "step": 5250 + }, + { + "epoch": 3.07, + "grad_norm": 0.002275130245834589, + "learning_rate": 3.4712315739419875e-05, + "loss": 0.6309, + "step": 5260 + }, + { + "epoch": 3.08, + "grad_norm": 0.011256784200668335, + "learning_rate": 3.4672689808210494e-05, + "loss": 0.0002, + "step": 5270 + }, + { + "epoch": 3.08, + "grad_norm": 0.0045999023132026196, + "learning_rate": 3.463306387700111e-05, + "loss": 0.1079, + "step": 5280 + }, + { + "epoch": 3.08, + "grad_norm": 0.001873884117230773, + "learning_rate": 3.459343794579173e-05, + "loss": 0.0004, + "step": 5290 + }, + { + "epoch": 3.08, + "grad_norm": 0.003349520266056061, + "learning_rate": 3.4553812014582344e-05, + "loss": 0.0392, + "step": 5300 + }, + { + "epoch": 3.08, + "grad_norm": 0.07745254039764404, + "learning_rate": 3.451418608337296e-05, + "loss": 0.1259, + "step": 5310 + }, + { + "epoch": 3.08, + "grad_norm": 0.001707065268419683, + "learning_rate": 3.4474560152163574e-05, + "loss": 0.2405, + "step": 5320 + }, + { + "epoch": 3.08, + "grad_norm": 0.00230118609033525, + "learning_rate": 3.443493422095419e-05, + "loss": 0.0026, + "step": 5330 + }, + { + "epoch": 3.08, + "grad_norm": 0.03412836417555809, + "learning_rate": 3.439530828974481e-05, + "loss": 0.1592, + "step": 5340 + }, + { + "epoch": 3.08, + "grad_norm": 0.006646712776273489, + "learning_rate": 3.435568235853543e-05, + "loss": 0.0253, + "step": 5350 + }, + { + "epoch": 3.08, + "grad_norm": 0.10694713890552521, + "learning_rate": 3.431605642732604e-05, + "loss": 0.3725, + "step": 5360 + }, + { + "epoch": 3.08, + "grad_norm": 0.14875371754169464, + "learning_rate": 3.427643049611666e-05, + "loss": 0.7354, + "step": 5370 + }, + { + "epoch": 3.08, + "grad_norm": 0.06602335721254349, + "learning_rate": 3.4236804564907274e-05, + "loss": 0.3752, + "step": 5380 + }, + { + "epoch": 3.08, + "grad_norm": 0.002122233621776104, + "learning_rate": 3.41971786336979e-05, + "loss": 0.0014, + "step": 5390 + }, + { + "epoch": 3.08, + "grad_norm": 0.020870821550488472, + "learning_rate": 3.415755270248851e-05, + "loss": 0.001, + "step": 5400 + }, + { + "epoch": 3.09, + "grad_norm": 51.36176681518555, + "learning_rate": 3.411792677127912e-05, + "loss": 0.0521, + "step": 5410 + }, + { + "epoch": 3.09, + "grad_norm": 0.002612057374790311, + "learning_rate": 3.407830084006974e-05, + "loss": 0.3004, + "step": 5420 + }, + { + "epoch": 3.09, + "grad_norm": 0.006323930341750383, + "learning_rate": 3.4038674908860354e-05, + "loss": 0.0007, + "step": 5430 + }, + { + "epoch": 3.09, + "grad_norm": 0.010717890225350857, + "learning_rate": 3.399904897765098e-05, + "loss": 0.0003, + "step": 5440 + }, + { + "epoch": 3.09, + "grad_norm": 0.20171226561069489, + "learning_rate": 3.395942304644159e-05, + "loss": 0.0007, + "step": 5450 + }, + { + "epoch": 3.09, + "grad_norm": 0.0015600892947986722, + "learning_rate": 3.391979711523221e-05, + "loss": 0.0011, + "step": 5460 + }, + { + "epoch": 3.09, + "grad_norm": 1.7858773469924927, + "learning_rate": 3.388017118402282e-05, + "loss": 0.0032, + "step": 5470 + }, + { + "epoch": 3.09, + "grad_norm": 0.0012454432435333729, + "learning_rate": 3.384054525281344e-05, + "loss": 0.0002, + "step": 5480 + }, + { + "epoch": 3.09, + "grad_norm": 0.0015952313551679254, + "learning_rate": 3.380091932160406e-05, + "loss": 0.001, + "step": 5490 + }, + { + "epoch": 3.09, + "grad_norm": 0.002770837862044573, + "learning_rate": 3.376129339039468e-05, + "loss": 0.0008, + "step": 5500 + }, + { + "epoch": 3.09, + "grad_norm": 1.3940935134887695, + "learning_rate": 3.372166745918529e-05, + "loss": 0.0037, + "step": 5510 + }, + { + "epoch": 3.09, + "grad_norm": 0.1070881336927414, + "learning_rate": 3.368204152797591e-05, + "loss": 0.0003, + "step": 5520 + }, + { + "epoch": 3.09, + "grad_norm": 60.142276763916016, + "learning_rate": 3.364241559676652e-05, + "loss": 0.4676, + "step": 5530 + }, + { + "epoch": 3.09, + "grad_norm": 0.0014883485855534673, + "learning_rate": 3.360278966555714e-05, + "loss": 0.0003, + "step": 5540 + }, + { + "epoch": 3.1, + "grad_norm": 0.002981774276122451, + "learning_rate": 3.356316373434776e-05, + "loss": 0.0002, + "step": 5550 + }, + { + "epoch": 3.1, + "grad_norm": 0.000889226037543267, + "learning_rate": 3.352353780313838e-05, + "loss": 0.0001, + "step": 5560 + }, + { + "epoch": 3.1, + "grad_norm": 0.006324201822280884, + "learning_rate": 3.348391187192899e-05, + "loss": 0.0002, + "step": 5570 + }, + { + "epoch": 3.1, + "grad_norm": 0.013741032220423222, + "learning_rate": 3.344428594071961e-05, + "loss": 0.0038, + "step": 5580 + }, + { + "epoch": 3.1, + "grad_norm": 0.0982193648815155, + "learning_rate": 3.340466000951023e-05, + "loss": 0.0002, + "step": 5590 + }, + { + "epoch": 3.1, + "grad_norm": 0.0023921611718833447, + "learning_rate": 3.336503407830085e-05, + "loss": 0.0001, + "step": 5600 + }, + { + "epoch": 3.1, + "grad_norm": 0.004157126881182194, + "learning_rate": 3.332540814709146e-05, + "loss": 0.0001, + "step": 5610 + }, + { + "epoch": 3.1, + "eval_accuracy": 0.9398496240601504, + "eval_loss": 0.3705739974975586, + "eval_runtime": 2358.2538, + "eval_samples_per_second": 0.282, + "eval_steps_per_second": 0.141, + "step": 5612 + }, + { + "epoch": 4.0, + "grad_norm": 0.0009104391792789102, + "learning_rate": 3.328578221588208e-05, + "loss": 0.3095, + "step": 5620 + }, + { + "epoch": 4.0, + "grad_norm": 0.002629748312756419, + "learning_rate": 3.324615628467269e-05, + "loss": 0.1543, + "step": 5630 + }, + { + "epoch": 4.0, + "grad_norm": 0.005159564781934023, + "learning_rate": 3.320653035346331e-05, + "loss": 0.0003, + "step": 5640 + }, + { + "epoch": 4.0, + "grad_norm": 0.000841008557472378, + "learning_rate": 3.316690442225393e-05, + "loss": 0.0004, + "step": 5650 + }, + { + "epoch": 4.0, + "grad_norm": 0.004792694002389908, + "learning_rate": 3.312727849104454e-05, + "loss": 0.0017, + "step": 5660 + }, + { + "epoch": 4.0, + "grad_norm": 0.0014269945677369833, + "learning_rate": 3.308765255983516e-05, + "loss": 0.0002, + "step": 5670 + }, + { + "epoch": 4.0, + "grad_norm": 0.0021025442983955145, + "learning_rate": 3.304802662862577e-05, + "loss": 0.115, + "step": 5680 + }, + { + "epoch": 4.01, + "grad_norm": 0.0010108908172696829, + "learning_rate": 3.300840069741639e-05, + "loss": 0.0034, + "step": 5690 + }, + { + "epoch": 4.01, + "grad_norm": 0.012116851285099983, + "learning_rate": 3.296877476620701e-05, + "loss": 0.001, + "step": 5700 + }, + { + "epoch": 4.01, + "grad_norm": 28.641616821289062, + "learning_rate": 3.2929148834997626e-05, + "loss": 0.5096, + "step": 5710 + }, + { + "epoch": 4.01, + "grad_norm": 21.132633209228516, + "learning_rate": 3.288952290378824e-05, + "loss": 0.0347, + "step": 5720 + }, + { + "epoch": 4.01, + "grad_norm": 0.0044413842260837555, + "learning_rate": 3.284989697257886e-05, + "loss": 0.0145, + "step": 5730 + }, + { + "epoch": 4.01, + "grad_norm": 48.15180206298828, + "learning_rate": 3.281027104136947e-05, + "loss": 0.0224, + "step": 5740 + }, + { + "epoch": 4.01, + "grad_norm": 0.003202601335942745, + "learning_rate": 3.2770645110160095e-05, + "loss": 0.1978, + "step": 5750 + }, + { + "epoch": 4.01, + "grad_norm": 20.207809448242188, + "learning_rate": 3.273101917895071e-05, + "loss": 0.0856, + "step": 5760 + }, + { + "epoch": 4.01, + "grad_norm": 0.0013485507806763053, + "learning_rate": 3.2691393247741326e-05, + "loss": 0.0013, + "step": 5770 + }, + { + "epoch": 4.01, + "grad_norm": 0.0005685106734745204, + "learning_rate": 3.265176731653194e-05, + "loss": 0.4924, + "step": 5780 + }, + { + "epoch": 4.01, + "grad_norm": 0.0010126458946615458, + "learning_rate": 3.2612141385322556e-05, + "loss": 0.0096, + "step": 5790 + }, + { + "epoch": 4.01, + "grad_norm": 95.05339813232422, + "learning_rate": 3.2572515454113175e-05, + "loss": 0.1921, + "step": 5800 + }, + { + "epoch": 4.01, + "grad_norm": 0.002471966203302145, + "learning_rate": 3.2532889522903794e-05, + "loss": 0.0008, + "step": 5810 + }, + { + "epoch": 4.01, + "grad_norm": 0.001297266804613173, + "learning_rate": 3.2493263591694406e-05, + "loss": 0.0002, + "step": 5820 + }, + { + "epoch": 4.02, + "grad_norm": 19.474943161010742, + "learning_rate": 3.2453637660485025e-05, + "loss": 0.3726, + "step": 5830 + }, + { + "epoch": 4.02, + "grad_norm": 0.015567510388791561, + "learning_rate": 3.241401172927564e-05, + "loss": 0.0003, + "step": 5840 + }, + { + "epoch": 4.02, + "grad_norm": 0.002833213657140732, + "learning_rate": 3.2374385798066256e-05, + "loss": 0.6715, + "step": 5850 + }, + { + "epoch": 4.02, + "grad_norm": 1.4238035678863525, + "learning_rate": 3.2334759866856875e-05, + "loss": 0.0008, + "step": 5860 + }, + { + "epoch": 4.02, + "grad_norm": 0.0025125148240476847, + "learning_rate": 3.2295133935647487e-05, + "loss": 0.055, + "step": 5870 + }, + { + "epoch": 4.02, + "grad_norm": 0.001612589810974896, + "learning_rate": 3.2255508004438105e-05, + "loss": 0.1114, + "step": 5880 + }, + { + "epoch": 4.02, + "grad_norm": 0.0115219596773386, + "learning_rate": 3.221588207322872e-05, + "loss": 0.0002, + "step": 5890 + }, + { + "epoch": 4.02, + "grad_norm": 0.002122466452419758, + "learning_rate": 3.217625614201934e-05, + "loss": 0.0463, + "step": 5900 + }, + { + "epoch": 4.02, + "grad_norm": 0.012210741639137268, + "learning_rate": 3.2136630210809955e-05, + "loss": 0.0038, + "step": 5910 + }, + { + "epoch": 4.02, + "grad_norm": 0.006696117110550404, + "learning_rate": 3.2097004279600574e-05, + "loss": 0.0006, + "step": 5920 + }, + { + "epoch": 4.02, + "grad_norm": 0.003299353178590536, + "learning_rate": 3.2057378348391186e-05, + "loss": 0.4329, + "step": 5930 + }, + { + "epoch": 4.02, + "grad_norm": 0.017031671479344368, + "learning_rate": 3.2017752417181805e-05, + "loss": 0.0014, + "step": 5940 + }, + { + "epoch": 4.02, + "grad_norm": 0.008915259502828121, + "learning_rate": 3.197812648597242e-05, + "loss": 0.0003, + "step": 5950 + }, + { + "epoch": 4.02, + "grad_norm": 0.0033541598822921515, + "learning_rate": 3.193850055476304e-05, + "loss": 0.0261, + "step": 5960 + }, + { + "epoch": 4.03, + "grad_norm": 0.0039758519269526005, + "learning_rate": 3.1898874623553654e-05, + "loss": 0.0701, + "step": 5970 + }, + { + "epoch": 4.03, + "grad_norm": 0.0859561562538147, + "learning_rate": 3.185924869234427e-05, + "loss": 0.0012, + "step": 5980 + }, + { + "epoch": 4.03, + "grad_norm": 0.0011740931076928973, + "learning_rate": 3.1819622761134885e-05, + "loss": 0.0009, + "step": 5990 + }, + { + "epoch": 4.03, + "grad_norm": 0.0011881589889526367, + "learning_rate": 3.1779996829925504e-05, + "loss": 0.1377, + "step": 6000 + }, + { + "epoch": 4.03, + "grad_norm": 0.3393727242946625, + "learning_rate": 3.174037089871612e-05, + "loss": 0.0085, + "step": 6010 + }, + { + "epoch": 4.03, + "grad_norm": 0.26653632521629333, + "learning_rate": 3.170074496750674e-05, + "loss": 0.5985, + "step": 6020 + }, + { + "epoch": 4.03, + "grad_norm": 71.80652618408203, + "learning_rate": 3.1661119036297353e-05, + "loss": 0.1081, + "step": 6030 + }, + { + "epoch": 4.03, + "grad_norm": 0.3816182017326355, + "learning_rate": 3.162149310508797e-05, + "loss": 0.0015, + "step": 6040 + }, + { + "epoch": 4.03, + "grad_norm": 3.339017629623413, + "learning_rate": 3.1581867173878584e-05, + "loss": 0.0737, + "step": 6050 + }, + { + "epoch": 4.03, + "grad_norm": 0.0013679158873856068, + "learning_rate": 3.15422412426692e-05, + "loss": 0.2465, + "step": 6060 + }, + { + "epoch": 4.03, + "grad_norm": 0.003948609344661236, + "learning_rate": 3.150261531145982e-05, + "loss": 0.2111, + "step": 6070 + }, + { + "epoch": 4.03, + "grad_norm": 0.005721264984458685, + "learning_rate": 3.1462989380250434e-05, + "loss": 0.0024, + "step": 6080 + }, + { + "epoch": 4.03, + "grad_norm": 0.0018883657176047564, + "learning_rate": 3.142336344904105e-05, + "loss": 0.2687, + "step": 6090 + }, + { + "epoch": 4.03, + "grad_norm": 0.008159175515174866, + "learning_rate": 3.138373751783167e-05, + "loss": 0.1406, + "step": 6100 + }, + { + "epoch": 4.04, + "grad_norm": 0.009790794923901558, + "learning_rate": 3.134411158662229e-05, + "loss": 0.2122, + "step": 6110 + }, + { + "epoch": 4.04, + "grad_norm": 0.645839273929596, + "learning_rate": 3.13044856554129e-05, + "loss": 0.0223, + "step": 6120 + }, + { + "epoch": 4.04, + "grad_norm": 0.0012109485687687993, + "learning_rate": 3.126485972420352e-05, + "loss": 0.2131, + "step": 6130 + }, + { + "epoch": 4.04, + "grad_norm": 0.005074062384665012, + "learning_rate": 3.122523379299413e-05, + "loss": 0.4669, + "step": 6140 + }, + { + "epoch": 4.04, + "grad_norm": 0.04010836407542229, + "learning_rate": 3.118560786178475e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 4.04, + "grad_norm": 0.018426967784762383, + "learning_rate": 3.114598193057537e-05, + "loss": 0.0008, + "step": 6160 + }, + { + "epoch": 4.04, + "grad_norm": 0.0035447929985821247, + "learning_rate": 3.110635599936599e-05, + "loss": 0.1271, + "step": 6170 + }, + { + "epoch": 4.04, + "grad_norm": 0.012344791553914547, + "learning_rate": 3.10667300681566e-05, + "loss": 0.0002, + "step": 6180 + }, + { + "epoch": 4.04, + "grad_norm": 0.0015085155609995127, + "learning_rate": 3.102710413694722e-05, + "loss": 0.0064, + "step": 6190 + }, + { + "epoch": 4.04, + "grad_norm": 0.0013396035647019744, + "learning_rate": 3.098747820573783e-05, + "loss": 0.0003, + "step": 6200 + }, + { + "epoch": 4.04, + "grad_norm": 0.007324972189962864, + "learning_rate": 3.094785227452846e-05, + "loss": 0.0001, + "step": 6210 + }, + { + "epoch": 4.04, + "grad_norm": 0.029165761545300484, + "learning_rate": 3.090822634331907e-05, + "loss": 0.0002, + "step": 6220 + }, + { + "epoch": 4.04, + "grad_norm": 0.006251147948205471, + "learning_rate": 3.086860041210969e-05, + "loss": 0.0001, + "step": 6230 + }, + { + "epoch": 4.04, + "grad_norm": 0.0033136485144495964, + "learning_rate": 3.08289744809003e-05, + "loss": 0.1959, + "step": 6240 + }, + { + "epoch": 4.05, + "grad_norm": 15.712539672851562, + "learning_rate": 3.078934854969092e-05, + "loss": 0.0053, + "step": 6250 + }, + { + "epoch": 4.05, + "grad_norm": 0.004770079627633095, + "learning_rate": 3.074972261848154e-05, + "loss": 0.2429, + "step": 6260 + }, + { + "epoch": 4.05, + "grad_norm": 0.001170918345451355, + "learning_rate": 3.071009668727215e-05, + "loss": 0.4537, + "step": 6270 + }, + { + "epoch": 4.05, + "grad_norm": 0.003140375716611743, + "learning_rate": 3.067047075606277e-05, + "loss": 0.0003, + "step": 6280 + }, + { + "epoch": 4.05, + "grad_norm": 0.005154268350452185, + "learning_rate": 3.063084482485338e-05, + "loss": 0.0002, + "step": 6290 + }, + { + "epoch": 4.05, + "grad_norm": 0.718346357345581, + "learning_rate": 3.0591218893644e-05, + "loss": 0.0039, + "step": 6300 + }, + { + "epoch": 4.05, + "grad_norm": 0.29760679602622986, + "learning_rate": 3.055159296243462e-05, + "loss": 0.0325, + "step": 6310 + }, + { + "epoch": 4.05, + "grad_norm": 0.0015770556638017297, + "learning_rate": 3.0511967031225234e-05, + "loss": 0.1031, + "step": 6320 + }, + { + "epoch": 4.05, + "grad_norm": 14.039325714111328, + "learning_rate": 3.047234110001585e-05, + "loss": 0.0254, + "step": 6330 + }, + { + "epoch": 4.05, + "grad_norm": 12.89113998413086, + "learning_rate": 3.043271516880647e-05, + "loss": 0.0182, + "step": 6340 + }, + { + "epoch": 4.05, + "grad_norm": 0.0020349326077848673, + "learning_rate": 3.0393089237597084e-05, + "loss": 0.0047, + "step": 6350 + }, + { + "epoch": 4.05, + "grad_norm": 0.0006648111157119274, + "learning_rate": 3.0353463306387703e-05, + "loss": 0.0111, + "step": 6360 + }, + { + "epoch": 4.05, + "grad_norm": 0.00324794533662498, + "learning_rate": 3.0313837375178318e-05, + "loss": 0.001, + "step": 6370 + }, + { + "epoch": 4.05, + "grad_norm": 0.002352567156776786, + "learning_rate": 3.0274211443968937e-05, + "loss": 0.5155, + "step": 6380 + }, + { + "epoch": 4.06, + "grad_norm": 0.0007183744455687702, + "learning_rate": 3.023458551275955e-05, + "loss": 0.1828, + "step": 6390 + }, + { + "epoch": 4.06, + "grad_norm": 0.0010205942671746016, + "learning_rate": 3.019495958155017e-05, + "loss": 0.0004, + "step": 6400 + }, + { + "epoch": 4.06, + "grad_norm": 0.0007507322006858885, + "learning_rate": 3.0155333650340783e-05, + "loss": 0.0078, + "step": 6410 + }, + { + "epoch": 4.06, + "grad_norm": 0.0010719618294388056, + "learning_rate": 3.0115707719131402e-05, + "loss": 0.0024, + "step": 6420 + }, + { + "epoch": 4.06, + "grad_norm": 0.004630456678569317, + "learning_rate": 3.0076081787922017e-05, + "loss": 0.0001, + "step": 6430 + }, + { + "epoch": 4.06, + "grad_norm": 110.9379653930664, + "learning_rate": 3.0036455856712636e-05, + "loss": 0.2711, + "step": 6440 + }, + { + "epoch": 4.06, + "grad_norm": 0.0028752069920301437, + "learning_rate": 2.999682992550325e-05, + "loss": 0.1684, + "step": 6450 + }, + { + "epoch": 4.06, + "grad_norm": 0.00176974234636873, + "learning_rate": 2.995720399429387e-05, + "loss": 0.1832, + "step": 6460 + }, + { + "epoch": 4.06, + "grad_norm": 0.0004082988016307354, + "learning_rate": 2.9917578063084482e-05, + "loss": 0.1391, + "step": 6470 + }, + { + "epoch": 4.06, + "grad_norm": 91.996337890625, + "learning_rate": 2.9877952131875105e-05, + "loss": 0.0717, + "step": 6480 + }, + { + "epoch": 4.06, + "grad_norm": 0.3914591372013092, + "learning_rate": 2.9838326200665717e-05, + "loss": 0.0006, + "step": 6490 + }, + { + "epoch": 4.06, + "grad_norm": 0.0014606121694669127, + "learning_rate": 2.9798700269456332e-05, + "loss": 0.0002, + "step": 6500 + }, + { + "epoch": 4.06, + "grad_norm": 0.002047004410997033, + "learning_rate": 2.975907433824695e-05, + "loss": 0.243, + "step": 6510 + }, + { + "epoch": 4.06, + "grad_norm": 0.0009985043434426188, + "learning_rate": 2.9719448407037563e-05, + "loss": 0.0339, + "step": 6520 + }, + { + "epoch": 4.07, + "grad_norm": 0.0007074224413372576, + "learning_rate": 2.9679822475828185e-05, + "loss": 0.0015, + "step": 6530 + }, + { + "epoch": 4.07, + "grad_norm": 0.004130239132791758, + "learning_rate": 2.9640196544618797e-05, + "loss": 0.0001, + "step": 6540 + }, + { + "epoch": 4.07, + "grad_norm": 0.004487643018364906, + "learning_rate": 2.9600570613409416e-05, + "loss": 0.0002, + "step": 6550 + }, + { + "epoch": 4.07, + "grad_norm": 0.001936771790497005, + "learning_rate": 2.956094468220003e-05, + "loss": 0.0001, + "step": 6560 + }, + { + "epoch": 4.07, + "grad_norm": 0.004075042437762022, + "learning_rate": 2.952131875099065e-05, + "loss": 0.3415, + "step": 6570 + }, + { + "epoch": 4.07, + "grad_norm": 0.05164702981710434, + "learning_rate": 2.9481692819781266e-05, + "loss": 0.0001, + "step": 6580 + }, + { + "epoch": 4.07, + "grad_norm": 0.0014617941342294216, + "learning_rate": 2.9442066888571884e-05, + "loss": 0.0001, + "step": 6590 + }, + { + "epoch": 4.07, + "grad_norm": 0.0017148368060588837, + "learning_rate": 2.94024409573625e-05, + "loss": 0.0753, + "step": 6600 + }, + { + "epoch": 4.07, + "grad_norm": 0.003370764898136258, + "learning_rate": 2.936281502615312e-05, + "loss": 0.0072, + "step": 6610 + }, + { + "epoch": 4.07, + "grad_norm": 0.003846656298264861, + "learning_rate": 2.932318909494373e-05, + "loss": 0.0001, + "step": 6620 + }, + { + "epoch": 4.07, + "grad_norm": 0.002365513239055872, + "learning_rate": 2.928356316373435e-05, + "loss": 0.0008, + "step": 6630 + }, + { + "epoch": 4.07, + "grad_norm": 0.0008402117528021336, + "learning_rate": 2.9243937232524965e-05, + "loss": 0.0001, + "step": 6640 + }, + { + "epoch": 4.07, + "grad_norm": 0.004054752178490162, + "learning_rate": 2.9204311301315584e-05, + "loss": 0.0548, + "step": 6650 + }, + { + "epoch": 4.07, + "grad_norm": 0.0017859063809737563, + "learning_rate": 2.91646853701062e-05, + "loss": 0.0001, + "step": 6660 + }, + { + "epoch": 4.08, + "grad_norm": 3.045167922973633, + "learning_rate": 2.9125059438896818e-05, + "loss": 0.0106, + "step": 6670 + }, + { + "epoch": 4.08, + "grad_norm": 0.034478865563869476, + "learning_rate": 2.9085433507687433e-05, + "loss": 0.001, + "step": 6680 + }, + { + "epoch": 4.08, + "grad_norm": 0.0020598298870027065, + "learning_rate": 2.9045807576478052e-05, + "loss": 0.9616, + "step": 6690 + }, + { + "epoch": 4.08, + "grad_norm": 0.009513617493212223, + "learning_rate": 2.9006181645268664e-05, + "loss": 0.0601, + "step": 6700 + }, + { + "epoch": 4.08, + "grad_norm": 0.10755365341901779, + "learning_rate": 2.896655571405928e-05, + "loss": 0.4313, + "step": 6710 + }, + { + "epoch": 4.08, + "grad_norm": 89.41072845458984, + "learning_rate": 2.8926929782849898e-05, + "loss": 0.7753, + "step": 6720 + }, + { + "epoch": 4.08, + "grad_norm": 0.005557952914386988, + "learning_rate": 2.8887303851640514e-05, + "loss": 0.022, + "step": 6730 + }, + { + "epoch": 4.08, + "grad_norm": 0.07544991374015808, + "learning_rate": 2.8847677920431132e-05, + "loss": 0.1043, + "step": 6740 + }, + { + "epoch": 4.08, + "grad_norm": 0.004230760037899017, + "learning_rate": 2.8808051989221744e-05, + "loss": 0.2305, + "step": 6750 + }, + { + "epoch": 4.08, + "grad_norm": 0.0005384175456129014, + "learning_rate": 2.8768426058012367e-05, + "loss": 0.0003, + "step": 6760 + }, + { + "epoch": 4.08, + "grad_norm": 0.0020217718556523323, + "learning_rate": 2.872880012680298e-05, + "loss": 0.0008, + "step": 6770 + }, + { + "epoch": 4.08, + "grad_norm": 0.001009553438052535, + "learning_rate": 2.8689174195593598e-05, + "loss": 0.3165, + "step": 6780 + }, + { + "epoch": 4.08, + "grad_norm": 0.002491355175152421, + "learning_rate": 2.8649548264384213e-05, + "loss": 0.5646, + "step": 6790 + }, + { + "epoch": 4.08, + "grad_norm": 0.002977263880893588, + "learning_rate": 2.8609922333174832e-05, + "loss": 0.0001, + "step": 6800 + }, + { + "epoch": 4.09, + "grad_norm": 0.0012742202961817384, + "learning_rate": 2.8570296401965447e-05, + "loss": 0.2827, + "step": 6810 + }, + { + "epoch": 4.09, + "grad_norm": 0.0030132278334349394, + "learning_rate": 2.8530670470756066e-05, + "loss": 0.0006, + "step": 6820 + }, + { + "epoch": 4.09, + "grad_norm": 0.06876442581415176, + "learning_rate": 2.8491044539546678e-05, + "loss": 0.0062, + "step": 6830 + }, + { + "epoch": 4.09, + "grad_norm": 0.008195163682103157, + "learning_rate": 2.84514186083373e-05, + "loss": 0.0145, + "step": 6840 + }, + { + "epoch": 4.09, + "grad_norm": 0.0023913795594125986, + "learning_rate": 2.8411792677127912e-05, + "loss": 0.022, + "step": 6850 + }, + { + "epoch": 4.09, + "grad_norm": 0.0004799796442966908, + "learning_rate": 2.837216674591853e-05, + "loss": 0.2252, + "step": 6860 + }, + { + "epoch": 4.09, + "grad_norm": 0.11730991303920746, + "learning_rate": 2.8332540814709146e-05, + "loss": 0.0033, + "step": 6870 + }, + { + "epoch": 4.09, + "grad_norm": 0.01119227148592472, + "learning_rate": 2.8292914883499765e-05, + "loss": 0.0764, + "step": 6880 + }, + { + "epoch": 4.09, + "grad_norm": 1.4117075204849243, + "learning_rate": 2.825328895229038e-05, + "loss": 0.0313, + "step": 6890 + }, + { + "epoch": 4.09, + "grad_norm": 0.9569471478462219, + "learning_rate": 2.8213663021081e-05, + "loss": 0.0037, + "step": 6900 + }, + { + "epoch": 4.09, + "grad_norm": 0.001442433800548315, + "learning_rate": 2.8174037089871615e-05, + "loss": 0.0001, + "step": 6910 + }, + { + "epoch": 4.09, + "grad_norm": 0.0015686535043641925, + "learning_rate": 2.8134411158662227e-05, + "loss": 0.2835, + "step": 6920 + }, + { + "epoch": 4.09, + "grad_norm": 0.00151319510769099, + "learning_rate": 2.8094785227452846e-05, + "loss": 0.0004, + "step": 6930 + }, + { + "epoch": 4.09, + "grad_norm": 0.00654405914247036, + "learning_rate": 2.805515929624346e-05, + "loss": 0.0011, + "step": 6940 + }, + { + "epoch": 4.1, + "grad_norm": 0.0008709866087883711, + "learning_rate": 2.801553336503408e-05, + "loss": 0.0128, + "step": 6950 + }, + { + "epoch": 4.1, + "grad_norm": 0.00043904109043069184, + "learning_rate": 2.7975907433824695e-05, + "loss": 0.8107, + "step": 6960 + }, + { + "epoch": 4.1, + "grad_norm": 0.00742549542337656, + "learning_rate": 2.7936281502615314e-05, + "loss": 0.3098, + "step": 6970 + }, + { + "epoch": 4.1, + "grad_norm": 0.0007969782454892993, + "learning_rate": 2.7896655571405926e-05, + "loss": 0.0001, + "step": 6980 + }, + { + "epoch": 4.1, + "grad_norm": 52.47648620605469, + "learning_rate": 2.7857029640196548e-05, + "loss": 0.4694, + "step": 6990 + }, + { + "epoch": 4.1, + "grad_norm": 0.0088576078414917, + "learning_rate": 2.781740370898716e-05, + "loss": 0.0016, + "step": 7000 + }, + { + "epoch": 4.1, + "grad_norm": 3.6093878746032715, + "learning_rate": 2.777777777777778e-05, + "loss": 0.054, + "step": 7010 + }, + { + "epoch": 4.1, + "eval_accuracy": 0.9368421052631579, + "eval_loss": 0.4006503224372864, + "eval_runtime": 2328.3346, + "eval_samples_per_second": 0.286, + "eval_steps_per_second": 0.143, + "step": 7015 + }, + { + "epoch": 5.0, + "grad_norm": 0.0007481848588213325, + "learning_rate": 2.7738151846568395e-05, + "loss": 0.0007, + "step": 7020 + }, + { + "epoch": 5.0, + "grad_norm": 162.9553680419922, + "learning_rate": 2.7698525915359013e-05, + "loss": 0.4298, + "step": 7030 + }, + { + "epoch": 5.0, + "grad_norm": 0.0006780726835131645, + "learning_rate": 2.765889998414963e-05, + "loss": 0.4696, + "step": 7040 + }, + { + "epoch": 5.0, + "grad_norm": 0.0014015401247888803, + "learning_rate": 2.7619274052940248e-05, + "loss": 0.028, + "step": 7050 + }, + { + "epoch": 5.0, + "grad_norm": 0.00443660095334053, + "learning_rate": 2.757964812173086e-05, + "loss": 0.0078, + "step": 7060 + }, + { + "epoch": 5.0, + "grad_norm": 4.740740776062012, + "learning_rate": 2.7540022190521482e-05, + "loss": 0.1712, + "step": 7070 + }, + { + "epoch": 5.0, + "grad_norm": 0.0052452534437179565, + "learning_rate": 2.7500396259312094e-05, + "loss": 0.0001, + "step": 7080 + }, + { + "epoch": 5.01, + "grad_norm": 0.0006377240642905235, + "learning_rate": 2.7460770328102713e-05, + "loss": 0.045, + "step": 7090 + }, + { + "epoch": 5.01, + "grad_norm": 0.0011151348007842898, + "learning_rate": 2.7421144396893328e-05, + "loss": 0.0001, + "step": 7100 + }, + { + "epoch": 5.01, + "grad_norm": 0.0006300232489593327, + "learning_rate": 2.7381518465683947e-05, + "loss": 0.0004, + "step": 7110 + }, + { + "epoch": 5.01, + "grad_norm": 0.008152210153639317, + "learning_rate": 2.7341892534474562e-05, + "loss": 0.0002, + "step": 7120 + }, + { + "epoch": 5.01, + "grad_norm": 0.0016102171503007412, + "learning_rate": 2.7302266603265174e-05, + "loss": 0.0302, + "step": 7130 + }, + { + "epoch": 5.01, + "grad_norm": 0.0014644188340753317, + "learning_rate": 2.7262640672055796e-05, + "loss": 0.0, + "step": 7140 + }, + { + "epoch": 5.01, + "grad_norm": 0.0012343927519395947, + "learning_rate": 2.722301474084641e-05, + "loss": 0.001, + "step": 7150 + }, + { + "epoch": 5.01, + "grad_norm": 0.002109797904267907, + "learning_rate": 2.7183388809637027e-05, + "loss": 0.0003, + "step": 7160 + }, + { + "epoch": 5.01, + "grad_norm": 0.0012583807110786438, + "learning_rate": 2.7143762878427643e-05, + "loss": 0.0001, + "step": 7170 + }, + { + "epoch": 5.01, + "grad_norm": 0.0009702452807687223, + "learning_rate": 2.710413694721826e-05, + "loss": 0.1802, + "step": 7180 + }, + { + "epoch": 5.01, + "grad_norm": 0.004518999718129635, + "learning_rate": 2.7064511016008877e-05, + "loss": 0.0001, + "step": 7190 + }, + { + "epoch": 5.01, + "grad_norm": 0.0008531950297765434, + "learning_rate": 2.7024885084799496e-05, + "loss": 0.0001, + "step": 7200 + }, + { + "epoch": 5.01, + "grad_norm": 0.003954921383410692, + "learning_rate": 2.6985259153590108e-05, + "loss": 0.0001, + "step": 7210 + }, + { + "epoch": 5.01, + "grad_norm": 0.0006554504507221282, + "learning_rate": 2.694563322238073e-05, + "loss": 0.0002, + "step": 7220 + }, + { + "epoch": 5.02, + "grad_norm": 0.0011577644618228078, + "learning_rate": 2.6906007291171342e-05, + "loss": 0.006, + "step": 7230 + }, + { + "epoch": 5.02, + "grad_norm": 0.0004994067130610347, + "learning_rate": 2.686638135996196e-05, + "loss": 0.0001, + "step": 7240 + }, + { + "epoch": 5.02, + "grad_norm": 0.006224981974810362, + "learning_rate": 2.6826755428752576e-05, + "loss": 0.4425, + "step": 7250 + }, + { + "epoch": 5.02, + "grad_norm": 0.00843863096088171, + "learning_rate": 2.6787129497543195e-05, + "loss": 0.1975, + "step": 7260 + }, + { + "epoch": 5.02, + "grad_norm": 0.0011182260932400823, + "learning_rate": 2.674750356633381e-05, + "loss": 0.0002, + "step": 7270 + }, + { + "epoch": 5.02, + "grad_norm": 0.0012028939090669155, + "learning_rate": 2.670787763512443e-05, + "loss": 0.0001, + "step": 7280 + }, + { + "epoch": 5.02, + "grad_norm": 0.0064741140231490135, + "learning_rate": 2.666825170391504e-05, + "loss": 0.0066, + "step": 7290 + }, + { + "epoch": 5.02, + "grad_norm": 0.0013653126079589128, + "learning_rate": 2.6628625772705663e-05, + "loss": 0.0802, + "step": 7300 + }, + { + "epoch": 5.02, + "grad_norm": 0.0032840375788509846, + "learning_rate": 2.6588999841496275e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 5.02, + "grad_norm": 0.006207801401615143, + "learning_rate": 2.6549373910286894e-05, + "loss": 0.0001, + "step": 7320 + }, + { + "epoch": 5.02, + "grad_norm": 0.0015818944666534662, + "learning_rate": 2.650974797907751e-05, + "loss": 0.0935, + "step": 7330 + }, + { + "epoch": 5.02, + "grad_norm": 0.0013846838846802711, + "learning_rate": 2.647012204786813e-05, + "loss": 0.0101, + "step": 7340 + }, + { + "epoch": 5.02, + "grad_norm": 0.0015213302103802562, + "learning_rate": 2.6430496116658744e-05, + "loss": 0.0001, + "step": 7350 + }, + { + "epoch": 5.02, + "grad_norm": 0.0016765915788710117, + "learning_rate": 2.6390870185449356e-05, + "loss": 0.0008, + "step": 7360 + }, + { + "epoch": 5.03, + "grad_norm": 0.0029850786086171865, + "learning_rate": 2.6351244254239975e-05, + "loss": 0.5417, + "step": 7370 + }, + { + "epoch": 5.03, + "grad_norm": 0.0028296930249780416, + "learning_rate": 2.631161832303059e-05, + "loss": 0.0029, + "step": 7380 + }, + { + "epoch": 5.03, + "grad_norm": 0.19774562120437622, + "learning_rate": 2.627199239182121e-05, + "loss": 0.0424, + "step": 7390 + }, + { + "epoch": 5.03, + "grad_norm": 0.20521485805511475, + "learning_rate": 2.6232366460611824e-05, + "loss": 0.0003, + "step": 7400 + }, + { + "epoch": 5.03, + "grad_norm": 3.243302822113037, + "learning_rate": 2.6192740529402443e-05, + "loss": 0.0033, + "step": 7410 + }, + { + "epoch": 5.03, + "grad_norm": 0.002176284557208419, + "learning_rate": 2.615311459819306e-05, + "loss": 0.0905, + "step": 7420 + }, + { + "epoch": 5.03, + "grad_norm": 0.00346784177236259, + "learning_rate": 2.6113488666983677e-05, + "loss": 0.0058, + "step": 7430 + }, + { + "epoch": 5.03, + "grad_norm": 0.0022136277984827757, + "learning_rate": 2.607386273577429e-05, + "loss": 0.0145, + "step": 7440 + }, + { + "epoch": 5.03, + "grad_norm": 0.0054547772742807865, + "learning_rate": 2.603423680456491e-05, + "loss": 0.0001, + "step": 7450 + }, + { + "epoch": 5.03, + "grad_norm": 0.0017041038954630494, + "learning_rate": 2.5994610873355524e-05, + "loss": 0.0043, + "step": 7460 + }, + { + "epoch": 5.03, + "grad_norm": 0.00526059465482831, + "learning_rate": 2.5954984942146142e-05, + "loss": 0.0001, + "step": 7470 + }, + { + "epoch": 5.03, + "grad_norm": 0.0015646722167730331, + "learning_rate": 2.5915359010936758e-05, + "loss": 0.0001, + "step": 7480 + }, + { + "epoch": 5.03, + "grad_norm": 0.0014299266040325165, + "learning_rate": 2.5875733079727377e-05, + "loss": 0.0001, + "step": 7490 + }, + { + "epoch": 5.03, + "grad_norm": 0.857555627822876, + "learning_rate": 2.5836107148517992e-05, + "loss": 0.0494, + "step": 7500 + }, + { + "epoch": 5.04, + "grad_norm": 0.0019163636025041342, + "learning_rate": 2.579648121730861e-05, + "loss": 0.0001, + "step": 7510 + }, + { + "epoch": 5.04, + "grad_norm": 0.001081604859791696, + "learning_rate": 2.5756855286099223e-05, + "loss": 0.0001, + "step": 7520 + }, + { + "epoch": 5.04, + "grad_norm": 0.002402815269306302, + "learning_rate": 2.5717229354889845e-05, + "loss": 0.0001, + "step": 7530 + }, + { + "epoch": 5.04, + "grad_norm": 0.0032065189443528652, + "learning_rate": 2.5677603423680457e-05, + "loss": 0.5271, + "step": 7540 + }, + { + "epoch": 5.04, + "grad_norm": 0.0037377572152763605, + "learning_rate": 2.5637977492471076e-05, + "loss": 0.0001, + "step": 7550 + }, + { + "epoch": 5.04, + "grad_norm": 0.0010730663780122995, + "learning_rate": 2.559835156126169e-05, + "loss": 0.0001, + "step": 7560 + }, + { + "epoch": 5.04, + "grad_norm": 0.018039198592305183, + "learning_rate": 2.5558725630052303e-05, + "loss": 0.1574, + "step": 7570 + }, + { + "epoch": 5.04, + "grad_norm": 0.0008627079077996314, + "learning_rate": 2.5519099698842925e-05, + "loss": 0.0004, + "step": 7580 + }, + { + "epoch": 5.04, + "grad_norm": 0.00304847932420671, + "learning_rate": 2.5479473767633537e-05, + "loss": 0.0002, + "step": 7590 + }, + { + "epoch": 5.04, + "grad_norm": 56.73731231689453, + "learning_rate": 2.5439847836424156e-05, + "loss": 0.2908, + "step": 7600 + }, + { + "epoch": 5.04, + "grad_norm": 0.0014052072074264288, + "learning_rate": 2.540022190521477e-05, + "loss": 0.0001, + "step": 7610 + }, + { + "epoch": 5.04, + "grad_norm": 0.0024271756410598755, + "learning_rate": 2.536059597400539e-05, + "loss": 0.0363, + "step": 7620 + }, + { + "epoch": 5.04, + "grad_norm": 0.0011607712367549539, + "learning_rate": 2.5320970042796006e-05, + "loss": 0.0703, + "step": 7630 + }, + { + "epoch": 5.04, + "grad_norm": 0.0010089229326695204, + "learning_rate": 2.5281344111586625e-05, + "loss": 0.0001, + "step": 7640 + }, + { + "epoch": 5.05, + "grad_norm": 0.0012477770214900374, + "learning_rate": 2.524171818037724e-05, + "loss": 0.471, + "step": 7650 + }, + { + "epoch": 5.05, + "grad_norm": 0.0015396666713058949, + "learning_rate": 2.520209224916786e-05, + "loss": 0.2129, + "step": 7660 + }, + { + "epoch": 5.05, + "grad_norm": 0.000801810878328979, + "learning_rate": 2.516246631795847e-05, + "loss": 0.0314, + "step": 7670 + }, + { + "epoch": 5.05, + "grad_norm": 0.0009846306638792157, + "learning_rate": 2.512284038674909e-05, + "loss": 0.0003, + "step": 7680 + }, + { + "epoch": 5.05, + "grad_norm": 0.03625110909342766, + "learning_rate": 2.5083214455539705e-05, + "loss": 0.0016, + "step": 7690 + }, + { + "epoch": 5.05, + "grad_norm": 0.14931851625442505, + "learning_rate": 2.5043588524330324e-05, + "loss": 0.4488, + "step": 7700 + }, + { + "epoch": 5.05, + "grad_norm": 0.007826775312423706, + "learning_rate": 2.500396259312094e-05, + "loss": 0.0002, + "step": 7710 + }, + { + "epoch": 5.05, + "grad_norm": 0.00988730974495411, + "learning_rate": 2.4964336661911555e-05, + "loss": 0.0001, + "step": 7720 + }, + { + "epoch": 5.05, + "grad_norm": 0.0005387517157942057, + "learning_rate": 2.4924710730702174e-05, + "loss": 0.7611, + "step": 7730 + }, + { + "epoch": 5.05, + "grad_norm": 0.0011877217330038548, + "learning_rate": 2.488508479949279e-05, + "loss": 0.0001, + "step": 7740 + }, + { + "epoch": 5.05, + "grad_norm": 0.019128194078803062, + "learning_rate": 2.4845458868283404e-05, + "loss": 0.517, + "step": 7750 + }, + { + "epoch": 5.05, + "grad_norm": 1.5278313159942627, + "learning_rate": 2.4805832937074023e-05, + "loss": 0.0012, + "step": 7760 + }, + { + "epoch": 5.05, + "grad_norm": 0.0027036985848098993, + "learning_rate": 2.476620700586464e-05, + "loss": 0.5882, + "step": 7770 + }, + { + "epoch": 5.05, + "grad_norm": 0.002757065463811159, + "learning_rate": 2.4726581074655254e-05, + "loss": 0.0155, + "step": 7780 + }, + { + "epoch": 5.06, + "grad_norm": 0.004905847366899252, + "learning_rate": 2.4686955143445873e-05, + "loss": 0.0102, + "step": 7790 + }, + { + "epoch": 5.06, + "grad_norm": 0.0014356361934915185, + "learning_rate": 2.4647329212236488e-05, + "loss": 0.0001, + "step": 7800 + }, + { + "epoch": 5.06, + "grad_norm": 3.6968801021575928, + "learning_rate": 2.4607703281027107e-05, + "loss": 0.2234, + "step": 7810 + }, + { + "epoch": 5.06, + "grad_norm": 66.777099609375, + "learning_rate": 2.4568077349817722e-05, + "loss": 0.3216, + "step": 7820 + }, + { + "epoch": 5.06, + "grad_norm": 0.001242569531314075, + "learning_rate": 2.4528451418608338e-05, + "loss": 0.0003, + "step": 7830 + }, + { + "epoch": 5.06, + "grad_norm": 0.0016161628300324082, + "learning_rate": 2.4488825487398957e-05, + "loss": 0.024, + "step": 7840 + }, + { + "epoch": 5.06, + "grad_norm": 0.06756754219532013, + "learning_rate": 2.4449199556189572e-05, + "loss": 0.0614, + "step": 7850 + }, + { + "epoch": 5.06, + "grad_norm": 0.0006389593472704291, + "learning_rate": 2.440957362498019e-05, + "loss": 0.1259, + "step": 7860 + }, + { + "epoch": 5.06, + "grad_norm": 0.004206878133118153, + "learning_rate": 2.4369947693770806e-05, + "loss": 0.2293, + "step": 7870 + }, + { + "epoch": 5.06, + "grad_norm": 0.0025491828564554453, + "learning_rate": 2.433032176256142e-05, + "loss": 0.0004, + "step": 7880 + }, + { + "epoch": 5.06, + "grad_norm": 0.0015132069820538163, + "learning_rate": 2.4290695831352037e-05, + "loss": 0.0453, + "step": 7890 + }, + { + "epoch": 5.06, + "grad_norm": 0.0013023455394431949, + "learning_rate": 2.4251069900142652e-05, + "loss": 0.515, + "step": 7900 + }, + { + "epoch": 5.06, + "grad_norm": 0.0006147758103907108, + "learning_rate": 2.421144396893327e-05, + "loss": 0.0004, + "step": 7910 + }, + { + "epoch": 5.06, + "grad_norm": 0.0013257160317152739, + "learning_rate": 2.4171818037723887e-05, + "loss": 0.0004, + "step": 7920 + }, + { + "epoch": 5.07, + "grad_norm": 0.0010351515375077724, + "learning_rate": 2.4132192106514502e-05, + "loss": 0.2861, + "step": 7930 + }, + { + "epoch": 5.07, + "grad_norm": 0.004010920412838459, + "learning_rate": 2.409256617530512e-05, + "loss": 0.144, + "step": 7940 + }, + { + "epoch": 5.07, + "grad_norm": 0.002655152464285493, + "learning_rate": 2.4052940244095736e-05, + "loss": 0.5804, + "step": 7950 + }, + { + "epoch": 5.07, + "grad_norm": 0.009208135306835175, + "learning_rate": 2.4013314312886355e-05, + "loss": 0.0008, + "step": 7960 + }, + { + "epoch": 5.07, + "grad_norm": 124.37940979003906, + "learning_rate": 2.397368838167697e-05, + "loss": 0.1359, + "step": 7970 + }, + { + "epoch": 5.07, + "grad_norm": 0.0007841124897822738, + "learning_rate": 2.3934062450467586e-05, + "loss": 0.0073, + "step": 7980 + }, + { + "epoch": 5.07, + "grad_norm": 14.345431327819824, + "learning_rate": 2.3894436519258205e-05, + "loss": 0.009, + "step": 7990 + }, + { + "epoch": 5.07, + "grad_norm": 0.0012639712076634169, + "learning_rate": 2.385481058804882e-05, + "loss": 0.0001, + "step": 8000 + }, + { + "epoch": 5.07, + "grad_norm": 0.004882665816694498, + "learning_rate": 2.3815184656839436e-05, + "loss": 0.765, + "step": 8010 + }, + { + "epoch": 5.07, + "grad_norm": 1.992924690246582, + "learning_rate": 2.3775558725630054e-05, + "loss": 0.0199, + "step": 8020 + }, + { + "epoch": 5.07, + "grad_norm": 0.008574814535677433, + "learning_rate": 2.373593279442067e-05, + "loss": 0.0121, + "step": 8030 + }, + { + "epoch": 5.07, + "grad_norm": 0.0031569607090204954, + "learning_rate": 2.369630686321129e-05, + "loss": 0.0105, + "step": 8040 + }, + { + "epoch": 5.07, + "grad_norm": 0.005381352733820677, + "learning_rate": 2.3656680932001904e-05, + "loss": 0.0002, + "step": 8050 + }, + { + "epoch": 5.07, + "grad_norm": 0.0014025687705725431, + "learning_rate": 2.361705500079252e-05, + "loss": 0.1309, + "step": 8060 + }, + { + "epoch": 5.08, + "grad_norm": 0.00232652947306633, + "learning_rate": 2.3577429069583138e-05, + "loss": 0.0253, + "step": 8070 + }, + { + "epoch": 5.08, + "grad_norm": 0.004494811408221722, + "learning_rate": 2.3537803138373754e-05, + "loss": 0.0004, + "step": 8080 + }, + { + "epoch": 5.08, + "grad_norm": 0.007132168859243393, + "learning_rate": 2.3498177207164372e-05, + "loss": 0.0002, + "step": 8090 + }, + { + "epoch": 5.08, + "grad_norm": 0.002315562916919589, + "learning_rate": 2.3458551275954984e-05, + "loss": 0.0048, + "step": 8100 + }, + { + "epoch": 5.08, + "grad_norm": 0.0011102244025096297, + "learning_rate": 2.34189253447456e-05, + "loss": 0.1166, + "step": 8110 + }, + { + "epoch": 5.08, + "grad_norm": 0.0011376317124813795, + "learning_rate": 2.337929941353622e-05, + "loss": 0.0001, + "step": 8120 + }, + { + "epoch": 5.08, + "grad_norm": 0.009772238321602345, + "learning_rate": 2.3339673482326834e-05, + "loss": 0.1212, + "step": 8130 + }, + { + "epoch": 5.08, + "grad_norm": 0.0009250590810552239, + "learning_rate": 2.3300047551117453e-05, + "loss": 0.0077, + "step": 8140 + }, + { + "epoch": 5.08, + "grad_norm": 0.0008343447698280215, + "learning_rate": 2.3260421619908068e-05, + "loss": 0.0001, + "step": 8150 + }, + { + "epoch": 5.08, + "grad_norm": 0.005889697000384331, + "learning_rate": 2.3220795688698684e-05, + "loss": 0.2522, + "step": 8160 + }, + { + "epoch": 5.08, + "grad_norm": 0.004577580373734236, + "learning_rate": 2.3181169757489303e-05, + "loss": 0.0055, + "step": 8170 + }, + { + "epoch": 5.08, + "grad_norm": 0.0006038689170964062, + "learning_rate": 2.3141543826279918e-05, + "loss": 0.22, + "step": 8180 + }, + { + "epoch": 5.08, + "grad_norm": 119.69172668457031, + "learning_rate": 2.3101917895070537e-05, + "loss": 0.2874, + "step": 8190 + }, + { + "epoch": 5.08, + "grad_norm": 0.01207007933408022, + "learning_rate": 2.3062291963861152e-05, + "loss": 0.0003, + "step": 8200 + }, + { + "epoch": 5.09, + "grad_norm": 0.005133229307830334, + "learning_rate": 2.3022666032651768e-05, + "loss": 0.0002, + "step": 8210 + }, + { + "epoch": 5.09, + "grad_norm": 0.0014045186107978225, + "learning_rate": 2.2983040101442386e-05, + "loss": 0.0003, + "step": 8220 + }, + { + "epoch": 5.09, + "grad_norm": 0.005631518550217152, + "learning_rate": 2.2943414170233002e-05, + "loss": 0.0002, + "step": 8230 + }, + { + "epoch": 5.09, + "grad_norm": 0.0011396125191822648, + "learning_rate": 2.2903788239023617e-05, + "loss": 0.0004, + "step": 8240 + }, + { + "epoch": 5.09, + "grad_norm": 0.16508010029792786, + "learning_rate": 2.2864162307814236e-05, + "loss": 0.0002, + "step": 8250 + }, + { + "epoch": 5.09, + "grad_norm": 0.005040541756898165, + "learning_rate": 2.282453637660485e-05, + "loss": 0.016, + "step": 8260 + }, + { + "epoch": 5.09, + "grad_norm": 0.0026673241518437862, + "learning_rate": 2.278491044539547e-05, + "loss": 0.0024, + "step": 8270 + }, + { + "epoch": 5.09, + "grad_norm": 0.0025323168374598026, + "learning_rate": 2.2745284514186086e-05, + "loss": 0.0001, + "step": 8280 + }, + { + "epoch": 5.09, + "grad_norm": 0.002470273757353425, + "learning_rate": 2.27056585829767e-05, + "loss": 0.0001, + "step": 8290 + }, + { + "epoch": 5.09, + "grad_norm": 0.0011150416685268283, + "learning_rate": 2.266603265176732e-05, + "loss": 0.0027, + "step": 8300 + }, + { + "epoch": 5.09, + "grad_norm": 0.0062728519551455975, + "learning_rate": 2.2626406720557935e-05, + "loss": 0.0006, + "step": 8310 + }, + { + "epoch": 5.09, + "grad_norm": 0.001863997895270586, + "learning_rate": 2.258678078934855e-05, + "loss": 0.0001, + "step": 8320 + }, + { + "epoch": 5.09, + "grad_norm": 0.0009478493593633175, + "learning_rate": 2.2547154858139166e-05, + "loss": 0.0179, + "step": 8330 + }, + { + "epoch": 5.09, + "grad_norm": 0.0012072144309058785, + "learning_rate": 2.250752892692978e-05, + "loss": 0.2482, + "step": 8340 + }, + { + "epoch": 5.1, + "grad_norm": 0.0013612033799290657, + "learning_rate": 2.24679029957204e-05, + "loss": 0.0001, + "step": 8350 + }, + { + "epoch": 5.1, + "grad_norm": 0.001653852523304522, + "learning_rate": 2.2428277064511016e-05, + "loss": 0.0001, + "step": 8360 + }, + { + "epoch": 5.1, + "grad_norm": 0.004468216095119715, + "learning_rate": 2.2388651133301634e-05, + "loss": 0.0003, + "step": 8370 + }, + { + "epoch": 5.1, + "grad_norm": 0.21759329736232758, + "learning_rate": 2.234902520209225e-05, + "loss": 0.0004, + "step": 8380 + }, + { + "epoch": 5.1, + "grad_norm": 0.002769963815808296, + "learning_rate": 2.2309399270882865e-05, + "loss": 0.0002, + "step": 8390 + }, + { + "epoch": 5.1, + "grad_norm": 0.0010608519660308957, + "learning_rate": 2.2269773339673484e-05, + "loss": 0.1718, + "step": 8400 + }, + { + "epoch": 5.1, + "grad_norm": 0.0008747797110117972, + "learning_rate": 2.22301474084641e-05, + "loss": 0.0003, + "step": 8410 + }, + { + "epoch": 5.1, + "eval_accuracy": 0.9669172932330827, + "eval_loss": 0.23544873297214508, + "eval_runtime": 2342.8874, + "eval_samples_per_second": 0.284, + "eval_steps_per_second": 0.142, + "step": 8418 + }, + { + "epoch": 6.0, + "grad_norm": 0.0007297981064766645, + "learning_rate": 2.2190521477254715e-05, + "loss": 0.0001, + "step": 8420 + }, + { + "epoch": 6.0, + "grad_norm": 0.007375821936875582, + "learning_rate": 2.2150895546045334e-05, + "loss": 0.0001, + "step": 8430 + }, + { + "epoch": 6.0, + "grad_norm": 0.0019510581623762846, + "learning_rate": 2.211126961483595e-05, + "loss": 0.0001, + "step": 8440 + }, + { + "epoch": 6.0, + "grad_norm": 0.009307813830673695, + "learning_rate": 2.2071643683626568e-05, + "loss": 0.0002, + "step": 8450 + }, + { + "epoch": 6.0, + "grad_norm": 0.07272663712501526, + "learning_rate": 2.2032017752417183e-05, + "loss": 0.0002, + "step": 8460 + }, + { + "epoch": 6.0, + "grad_norm": 0.004176029469817877, + "learning_rate": 2.19923918212078e-05, + "loss": 0.058, + "step": 8470 + }, + { + "epoch": 6.0, + "grad_norm": 0.0019298582337796688, + "learning_rate": 2.1952765889998418e-05, + "loss": 0.0001, + "step": 8480 + }, + { + "epoch": 6.01, + "grad_norm": 137.64112854003906, + "learning_rate": 2.1913139958789033e-05, + "loss": 0.144, + "step": 8490 + }, + { + "epoch": 6.01, + "grad_norm": 0.0035788225941359997, + "learning_rate": 2.1873514027579652e-05, + "loss": 0.1903, + "step": 8500 + }, + { + "epoch": 6.01, + "grad_norm": 0.00900218915194273, + "learning_rate": 2.1833888096370267e-05, + "loss": 0.3162, + "step": 8510 + }, + { + "epoch": 6.01, + "grad_norm": 0.006812531501054764, + "learning_rate": 2.1794262165160883e-05, + "loss": 0.0001, + "step": 8520 + }, + { + "epoch": 6.01, + "grad_norm": 0.011043643578886986, + "learning_rate": 2.1754636233951498e-05, + "loss": 0.0001, + "step": 8530 + }, + { + "epoch": 6.01, + "grad_norm": 0.0009386019664816558, + "learning_rate": 2.1715010302742113e-05, + "loss": 0.0132, + "step": 8540 + }, + { + "epoch": 6.01, + "grad_norm": 0.0009653670713305473, + "learning_rate": 2.1675384371532732e-05, + "loss": 0.0001, + "step": 8550 + }, + { + "epoch": 6.01, + "grad_norm": 0.000631912553217262, + "learning_rate": 2.1635758440323348e-05, + "loss": 0.0046, + "step": 8560 + }, + { + "epoch": 6.01, + "grad_norm": 0.005377355497330427, + "learning_rate": 2.1596132509113963e-05, + "loss": 0.0001, + "step": 8570 + }, + { + "epoch": 6.01, + "grad_norm": 0.0015233962330967188, + "learning_rate": 2.1556506577904582e-05, + "loss": 0.26, + "step": 8580 + }, + { + "epoch": 6.01, + "grad_norm": 0.003712683217599988, + "learning_rate": 2.1516880646695197e-05, + "loss": 0.0103, + "step": 8590 + }, + { + "epoch": 6.01, + "grad_norm": 0.002746036509051919, + "learning_rate": 2.1477254715485816e-05, + "loss": 0.0001, + "step": 8600 + }, + { + "epoch": 6.01, + "grad_norm": 0.001353266416117549, + "learning_rate": 2.143762878427643e-05, + "loss": 0.0001, + "step": 8610 + }, + { + "epoch": 6.01, + "grad_norm": 0.05317896232008934, + "learning_rate": 2.1398002853067047e-05, + "loss": 0.0002, + "step": 8620 + }, + { + "epoch": 6.02, + "grad_norm": 0.002108694287016988, + "learning_rate": 2.1358376921857666e-05, + "loss": 0.0001, + "step": 8630 + }, + { + "epoch": 6.02, + "grad_norm": 0.0015535557176917791, + "learning_rate": 2.131875099064828e-05, + "loss": 0.2856, + "step": 8640 + }, + { + "epoch": 6.02, + "grad_norm": 0.0007479583146050572, + "learning_rate": 2.1279125059438897e-05, + "loss": 0.0001, + "step": 8650 + }, + { + "epoch": 6.02, + "grad_norm": 0.0013678164687007666, + "learning_rate": 2.1239499128229515e-05, + "loss": 0.0001, + "step": 8660 + }, + { + "epoch": 6.02, + "grad_norm": 0.0011460609966889024, + "learning_rate": 2.119987319702013e-05, + "loss": 0.1748, + "step": 8670 + }, + { + "epoch": 6.02, + "grad_norm": 0.005598797462880611, + "learning_rate": 2.116024726581075e-05, + "loss": 0.0005, + "step": 8680 + }, + { + "epoch": 6.02, + "grad_norm": 0.0058416505344212055, + "learning_rate": 2.1120621334601365e-05, + "loss": 0.003, + "step": 8690 + }, + { + "epoch": 6.02, + "grad_norm": 0.0018327133730053902, + "learning_rate": 2.108099540339198e-05, + "loss": 0.0001, + "step": 8700 + }, + { + "epoch": 6.02, + "grad_norm": 0.0008349318522959948, + "learning_rate": 2.10413694721826e-05, + "loss": 0.0014, + "step": 8710 + }, + { + "epoch": 6.02, + "grad_norm": 0.0007587561849504709, + "learning_rate": 2.1001743540973215e-05, + "loss": 0.0012, + "step": 8720 + }, + { + "epoch": 6.02, + "grad_norm": 0.003939191345125437, + "learning_rate": 2.096211760976383e-05, + "loss": 0.0001, + "step": 8730 + }, + { + "epoch": 6.02, + "grad_norm": 0.007147368974983692, + "learning_rate": 2.092249167855445e-05, + "loss": 0.0003, + "step": 8740 + }, + { + "epoch": 6.02, + "grad_norm": 0.0007460744236595929, + "learning_rate": 2.088286574734506e-05, + "loss": 0.0, + "step": 8750 + }, + { + "epoch": 6.02, + "grad_norm": 0.005187608767300844, + "learning_rate": 2.084323981613568e-05, + "loss": 0.0001, + "step": 8760 + }, + { + "epoch": 6.03, + "grad_norm": 0.0012044048635289073, + "learning_rate": 2.0803613884926295e-05, + "loss": 0.0003, + "step": 8770 + }, + { + "epoch": 6.03, + "grad_norm": 0.005269182845950127, + "learning_rate": 2.0763987953716914e-05, + "loss": 0.1424, + "step": 8780 + }, + { + "epoch": 6.03, + "grad_norm": 0.0014458984369412065, + "learning_rate": 2.072436202250753e-05, + "loss": 0.1836, + "step": 8790 + }, + { + "epoch": 6.03, + "grad_norm": 0.003018228802829981, + "learning_rate": 2.0684736091298145e-05, + "loss": 0.0002, + "step": 8800 + }, + { + "epoch": 6.03, + "grad_norm": 0.0005208718357607722, + "learning_rate": 2.0645110160088763e-05, + "loss": 0.276, + "step": 8810 + }, + { + "epoch": 6.03, + "grad_norm": 0.0005419257213361561, + "learning_rate": 2.060548422887938e-05, + "loss": 0.0, + "step": 8820 + }, + { + "epoch": 6.03, + "grad_norm": 0.0056818630546331406, + "learning_rate": 2.0565858297669994e-05, + "loss": 0.0003, + "step": 8830 + }, + { + "epoch": 6.03, + "grad_norm": 0.0021387594752013683, + "learning_rate": 2.0526232366460613e-05, + "loss": 0.0001, + "step": 8840 + }, + { + "epoch": 6.03, + "grad_norm": 0.0017361573409289122, + "learning_rate": 2.048660643525123e-05, + "loss": 0.0235, + "step": 8850 + }, + { + "epoch": 6.03, + "grad_norm": 0.0031765319872647524, + "learning_rate": 2.0446980504041847e-05, + "loss": 0.0002, + "step": 8860 + }, + { + "epoch": 6.03, + "grad_norm": 0.0006492682150565088, + "learning_rate": 2.0407354572832463e-05, + "loss": 0.0, + "step": 8870 + }, + { + "epoch": 6.03, + "grad_norm": 0.009603900834918022, + "learning_rate": 2.0367728641623078e-05, + "loss": 0.0116, + "step": 8880 + }, + { + "epoch": 6.03, + "grad_norm": 0.0014260296011343598, + "learning_rate": 2.0328102710413697e-05, + "loss": 0.0272, + "step": 8890 + }, + { + "epoch": 6.03, + "grad_norm": 0.001238304190337658, + "learning_rate": 2.0288476779204312e-05, + "loss": 0.0001, + "step": 8900 + }, + { + "epoch": 6.04, + "grad_norm": 0.004389143083244562, + "learning_rate": 2.024885084799493e-05, + "loss": 0.0001, + "step": 8910 + }, + { + "epoch": 6.04, + "grad_norm": 0.0006919855368323624, + "learning_rate": 2.0209224916785547e-05, + "loss": 0.0015, + "step": 8920 + }, + { + "epoch": 6.04, + "grad_norm": 0.0013250050833448768, + "learning_rate": 2.0169598985576162e-05, + "loss": 0.0, + "step": 8930 + }, + { + "epoch": 6.04, + "grad_norm": 0.0006862548179924488, + "learning_rate": 2.012997305436678e-05, + "loss": 0.005, + "step": 8940 + }, + { + "epoch": 6.04, + "grad_norm": 0.0006481676246039569, + "learning_rate": 2.0090347123157396e-05, + "loss": 0.0002, + "step": 8950 + }, + { + "epoch": 6.04, + "grad_norm": 0.0009765150607563555, + "learning_rate": 2.005072119194801e-05, + "loss": 0.3095, + "step": 8960 + }, + { + "epoch": 6.04, + "grad_norm": 0.0008786149555817246, + "learning_rate": 2.0011095260738627e-05, + "loss": 0.1903, + "step": 8970 + }, + { + "epoch": 6.04, + "grad_norm": 0.00043602605001069605, + "learning_rate": 1.9971469329529242e-05, + "loss": 0.0002, + "step": 8980 + }, + { + "epoch": 6.04, + "grad_norm": 0.0006052827229723334, + "learning_rate": 1.993184339831986e-05, + "loss": 0.0028, + "step": 8990 + }, + { + "epoch": 6.04, + "grad_norm": 0.0027263278607279062, + "learning_rate": 1.9892217467110477e-05, + "loss": 0.0988, + "step": 9000 + }, + { + "epoch": 6.04, + "grad_norm": 0.0004901738138869405, + "learning_rate": 1.9852591535901095e-05, + "loss": 0.0008, + "step": 9010 + }, + { + "epoch": 6.04, + "grad_norm": 0.004134719260036945, + "learning_rate": 1.981296560469171e-05, + "loss": 0.0001, + "step": 9020 + }, + { + "epoch": 6.04, + "grad_norm": 6.425068378448486, + "learning_rate": 1.9773339673482326e-05, + "loss": 0.0009, + "step": 9030 + }, + { + "epoch": 6.04, + "grad_norm": 0.0021010099444538355, + "learning_rate": 1.9733713742272945e-05, + "loss": 0.0139, + "step": 9040 + }, + { + "epoch": 6.05, + "grad_norm": 0.0003429889620747417, + "learning_rate": 1.969408781106356e-05, + "loss": 0.0001, + "step": 9050 + }, + { + "epoch": 6.05, + "grad_norm": 0.00465469341725111, + "learning_rate": 1.9654461879854176e-05, + "loss": 0.0048, + "step": 9060 + }, + { + "epoch": 6.05, + "grad_norm": 0.0007626991719007492, + "learning_rate": 1.9614835948644795e-05, + "loss": 0.0694, + "step": 9070 + }, + { + "epoch": 6.05, + "grad_norm": 0.0005379422218538821, + "learning_rate": 1.957521001743541e-05, + "loss": 0.0001, + "step": 9080 + }, + { + "epoch": 6.05, + "grad_norm": 0.0018008677288889885, + "learning_rate": 1.953558408622603e-05, + "loss": 0.1537, + "step": 9090 + }, + { + "epoch": 6.05, + "grad_norm": 0.005486232694238424, + "learning_rate": 1.9495958155016644e-05, + "loss": 0.0001, + "step": 9100 + }, + { + "epoch": 6.05, + "grad_norm": 0.0016153625911101699, + "learning_rate": 1.945633222380726e-05, + "loss": 0.1517, + "step": 9110 + }, + { + "epoch": 6.05, + "grad_norm": 0.00048393840552307665, + "learning_rate": 1.941670629259788e-05, + "loss": 0.0515, + "step": 9120 + }, + { + "epoch": 6.05, + "grad_norm": 0.00044351426186040044, + "learning_rate": 1.9377080361388494e-05, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 6.05, + "grad_norm": 0.003928069956600666, + "learning_rate": 1.9337454430179113e-05, + "loss": 0.0001, + "step": 9140 + }, + { + "epoch": 6.05, + "grad_norm": 0.0009555955766700208, + "learning_rate": 1.9297828498969728e-05, + "loss": 0.0001, + "step": 9150 + }, + { + "epoch": 6.05, + "grad_norm": 0.003042226191610098, + "learning_rate": 1.9258202567760344e-05, + "loss": 0.0, + "step": 9160 + }, + { + "epoch": 6.05, + "grad_norm": 0.0003893129760399461, + "learning_rate": 1.9218576636550962e-05, + "loss": 0.0, + "step": 9170 + }, + { + "epoch": 6.05, + "grad_norm": 0.0008289095130749047, + "learning_rate": 1.9178950705341574e-05, + "loss": 0.0, + "step": 9180 + }, + { + "epoch": 6.06, + "grad_norm": 0.0010318297427147627, + "learning_rate": 1.9139324774132193e-05, + "loss": 0.0, + "step": 9190 + }, + { + "epoch": 6.06, + "grad_norm": 0.0007037441828288138, + "learning_rate": 1.909969884292281e-05, + "loss": 0.0837, + "step": 9200 + }, + { + "epoch": 6.06, + "grad_norm": 44.11083221435547, + "learning_rate": 1.9060072911713424e-05, + "loss": 0.0226, + "step": 9210 + }, + { + "epoch": 6.06, + "grad_norm": 0.0010193975176662207, + "learning_rate": 1.9020446980504043e-05, + "loss": 0.0018, + "step": 9220 + }, + { + "epoch": 6.06, + "grad_norm": 0.0026808753609657288, + "learning_rate": 1.8980821049294658e-05, + "loss": 0.0001, + "step": 9230 + }, + { + "epoch": 6.06, + "grad_norm": 0.0013365477789193392, + "learning_rate": 1.8941195118085277e-05, + "loss": 0.0, + "step": 9240 + }, + { + "epoch": 6.06, + "grad_norm": 33.180870056152344, + "learning_rate": 1.8901569186875892e-05, + "loss": 0.3046, + "step": 9250 + }, + { + "epoch": 6.06, + "grad_norm": 0.001624317723326385, + "learning_rate": 1.8861943255666508e-05, + "loss": 0.2476, + "step": 9260 + }, + { + "epoch": 6.06, + "grad_norm": 0.002660261234268546, + "learning_rate": 1.8822317324457127e-05, + "loss": 0.329, + "step": 9270 + }, + { + "epoch": 6.06, + "grad_norm": 0.001928847748786211, + "learning_rate": 1.8782691393247742e-05, + "loss": 0.0116, + "step": 9280 + }, + { + "epoch": 6.06, + "grad_norm": 0.0004771009262185544, + "learning_rate": 1.8743065462038357e-05, + "loss": 0.0, + "step": 9290 + }, + { + "epoch": 6.06, + "grad_norm": 0.0006694819312542677, + "learning_rate": 1.8703439530828976e-05, + "loss": 0.0743, + "step": 9300 + }, + { + "epoch": 6.06, + "grad_norm": 0.010220357216894627, + "learning_rate": 1.8663813599619592e-05, + "loss": 0.0001, + "step": 9310 + }, + { + "epoch": 6.06, + "grad_norm": 0.0014199281577020884, + "learning_rate": 1.862418766841021e-05, + "loss": 0.0103, + "step": 9320 + }, + { + "epoch": 6.07, + "grad_norm": 0.001806290470995009, + "learning_rate": 1.8584561737200826e-05, + "loss": 0.0006, + "step": 9330 + }, + { + "epoch": 6.07, + "grad_norm": 0.0005750704440288246, + "learning_rate": 1.854493580599144e-05, + "loss": 0.0003, + "step": 9340 + }, + { + "epoch": 6.07, + "grad_norm": 0.0009846296161413193, + "learning_rate": 1.850530987478206e-05, + "loss": 0.0013, + "step": 9350 + }, + { + "epoch": 6.07, + "grad_norm": 0.0016641179099678993, + "learning_rate": 1.8465683943572676e-05, + "loss": 0.06, + "step": 9360 + }, + { + "epoch": 6.07, + "grad_norm": 0.0014823460951447487, + "learning_rate": 1.842605801236329e-05, + "loss": 0.0008, + "step": 9370 + }, + { + "epoch": 6.07, + "grad_norm": 0.0026860409416258335, + "learning_rate": 1.838643208115391e-05, + "loss": 0.0005, + "step": 9380 + }, + { + "epoch": 6.07, + "grad_norm": 0.0014451199676841497, + "learning_rate": 1.8346806149944525e-05, + "loss": 0.0001, + "step": 9390 + }, + { + "epoch": 6.07, + "grad_norm": 0.004795750603079796, + "learning_rate": 1.830718021873514e-05, + "loss": 0.0003, + "step": 9400 + }, + { + "epoch": 6.07, + "grad_norm": 0.0025767534971237183, + "learning_rate": 1.8267554287525756e-05, + "loss": 0.0002, + "step": 9410 + }, + { + "epoch": 6.07, + "grad_norm": 0.0006194358575157821, + "learning_rate": 1.8227928356316375e-05, + "loss": 0.0738, + "step": 9420 + }, + { + "epoch": 6.07, + "grad_norm": 0.007454677484929562, + "learning_rate": 1.818830242510699e-05, + "loss": 0.0048, + "step": 9430 + }, + { + "epoch": 6.07, + "grad_norm": 0.0012314959894865751, + "learning_rate": 1.8148676493897606e-05, + "loss": 0.0045, + "step": 9440 + }, + { + "epoch": 6.07, + "grad_norm": 0.0007009029504843056, + "learning_rate": 1.8109050562688224e-05, + "loss": 0.2806, + "step": 9450 + }, + { + "epoch": 6.07, + "grad_norm": 0.0005554054514504969, + "learning_rate": 1.806942463147884e-05, + "loss": 0.0001, + "step": 9460 + }, + { + "epoch": 6.08, + "grad_norm": 0.00048346296534873545, + "learning_rate": 1.8029798700269455e-05, + "loss": 0.0458, + "step": 9470 + }, + { + "epoch": 6.08, + "grad_norm": 0.0011084218276664615, + "learning_rate": 1.7990172769060074e-05, + "loss": 0.0, + "step": 9480 + }, + { + "epoch": 6.08, + "grad_norm": 0.0003880435542669147, + "learning_rate": 1.795054683785069e-05, + "loss": 0.0142, + "step": 9490 + }, + { + "epoch": 6.08, + "grad_norm": 0.0006134477443993092, + "learning_rate": 1.7910920906641308e-05, + "loss": 0.0, + "step": 9500 + }, + { + "epoch": 6.08, + "grad_norm": 0.0005665639764629304, + "learning_rate": 1.7871294975431924e-05, + "loss": 0.0, + "step": 9510 + }, + { + "epoch": 6.08, + "grad_norm": 0.0003921152965631336, + "learning_rate": 1.783166904422254e-05, + "loss": 0.0, + "step": 9520 + }, + { + "epoch": 6.08, + "grad_norm": 0.001750220195390284, + "learning_rate": 1.7792043113013158e-05, + "loss": 0.0437, + "step": 9530 + }, + { + "epoch": 6.08, + "grad_norm": 0.0012650929857045412, + "learning_rate": 1.7752417181803773e-05, + "loss": 0.3903, + "step": 9540 + }, + { + "epoch": 6.08, + "grad_norm": 57.58509063720703, + "learning_rate": 1.7712791250594392e-05, + "loss": 0.0546, + "step": 9550 + }, + { + "epoch": 6.08, + "grad_norm": 0.00026887169224210083, + "learning_rate": 1.7673165319385008e-05, + "loss": 0.0001, + "step": 9560 + }, + { + "epoch": 6.08, + "grad_norm": 0.0019770157523453236, + "learning_rate": 1.7633539388175623e-05, + "loss": 0.1362, + "step": 9570 + }, + { + "epoch": 6.08, + "grad_norm": 0.0007267651380971074, + "learning_rate": 1.7593913456966242e-05, + "loss": 0.0006, + "step": 9580 + }, + { + "epoch": 6.08, + "grad_norm": 0.001434961101040244, + "learning_rate": 1.7554287525756857e-05, + "loss": 0.0263, + "step": 9590 + }, + { + "epoch": 6.08, + "grad_norm": 0.00044755820999853313, + "learning_rate": 1.7514661594547473e-05, + "loss": 0.0, + "step": 9600 + }, + { + "epoch": 6.09, + "grad_norm": 0.000376471463823691, + "learning_rate": 1.7475035663338088e-05, + "loss": 0.0207, + "step": 9610 + }, + { + "epoch": 6.09, + "grad_norm": 0.014877337031066418, + "learning_rate": 1.7435409732128703e-05, + "loss": 0.0, + "step": 9620 + }, + { + "epoch": 6.09, + "grad_norm": 0.0012328572338446975, + "learning_rate": 1.7395783800919322e-05, + "loss": 0.0259, + "step": 9630 + }, + { + "epoch": 6.09, + "grad_norm": 0.0011149095371365547, + "learning_rate": 1.7356157869709938e-05, + "loss": 0.0, + "step": 9640 + }, + { + "epoch": 6.09, + "grad_norm": 0.000868526753038168, + "learning_rate": 1.7316531938500556e-05, + "loss": 0.0107, + "step": 9650 + }, + { + "epoch": 6.09, + "grad_norm": 0.0003520081809256226, + "learning_rate": 1.7276906007291172e-05, + "loss": 0.0, + "step": 9660 + }, + { + "epoch": 6.09, + "grad_norm": 0.00045317449257709086, + "learning_rate": 1.7237280076081787e-05, + "loss": 0.1845, + "step": 9670 + }, + { + "epoch": 6.09, + "grad_norm": 0.00035488023422658443, + "learning_rate": 1.7197654144872406e-05, + "loss": 0.0, + "step": 9680 + }, + { + "epoch": 6.09, + "grad_norm": 0.0007327714120037854, + "learning_rate": 1.715802821366302e-05, + "loss": 0.0001, + "step": 9690 + }, + { + "epoch": 6.09, + "grad_norm": 0.0025048046372830868, + "learning_rate": 1.7118402282453637e-05, + "loss": 0.0001, + "step": 9700 + }, + { + "epoch": 6.09, + "grad_norm": 0.00043331715278327465, + "learning_rate": 1.7078776351244256e-05, + "loss": 0.0167, + "step": 9710 + }, + { + "epoch": 6.09, + "grad_norm": 0.0004680192796513438, + "learning_rate": 1.703915042003487e-05, + "loss": 0.0, + "step": 9720 + }, + { + "epoch": 6.09, + "grad_norm": 0.0005406651180237532, + "learning_rate": 1.699952448882549e-05, + "loss": 0.0, + "step": 9730 + }, + { + "epoch": 6.09, + "grad_norm": 1.6138055324554443, + "learning_rate": 1.6959898557616105e-05, + "loss": 0.0005, + "step": 9740 + }, + { + "epoch": 6.1, + "grad_norm": 0.0005159827414900064, + "learning_rate": 1.692027262640672e-05, + "loss": 0.0001, + "step": 9750 + }, + { + "epoch": 6.1, + "grad_norm": 0.001156438491307199, + "learning_rate": 1.688064669519734e-05, + "loss": 0.0, + "step": 9760 + }, + { + "epoch": 6.1, + "grad_norm": 0.00034518956090323627, + "learning_rate": 1.6841020763987955e-05, + "loss": 0.0316, + "step": 9770 + }, + { + "epoch": 6.1, + "grad_norm": 0.0007839056779630482, + "learning_rate": 1.680139483277857e-05, + "loss": 0.0016, + "step": 9780 + }, + { + "epoch": 6.1, + "grad_norm": 0.000456125068012625, + "learning_rate": 1.676176890156919e-05, + "loss": 0.0, + "step": 9790 + }, + { + "epoch": 6.1, + "grad_norm": 0.0007673576474189758, + "learning_rate": 1.6722142970359805e-05, + "loss": 0.0, + "step": 9800 + }, + { + "epoch": 6.1, + "grad_norm": 0.000683379708789289, + "learning_rate": 1.6682517039150423e-05, + "loss": 0.0, + "step": 9810 + }, + { + "epoch": 6.1, + "grad_norm": 0.0009253775351680815, + "learning_rate": 1.664289110794104e-05, + "loss": 0.0001, + "step": 9820 + }, + { + "epoch": 6.1, + "eval_accuracy": 0.9473684210526315, + "eval_loss": 0.3900492191314697, + "eval_runtime": 2421.9145, + "eval_samples_per_second": 0.275, + "eval_steps_per_second": 0.137, + "step": 9821 + }, + { + "epoch": 7.0, + "grad_norm": 0.0004204540455248207, + "learning_rate": 1.6603265176731654e-05, + "loss": 0.5604, + "step": 9830 + }, + { + "epoch": 7.0, + "grad_norm": 0.00036831918987445533, + "learning_rate": 1.656363924552227e-05, + "loss": 0.0001, + "step": 9840 + }, + { + "epoch": 7.0, + "grad_norm": 0.00044371382682584226, + "learning_rate": 1.6524013314312885e-05, + "loss": 0.0, + "step": 9850 + }, + { + "epoch": 7.0, + "grad_norm": 0.0005366410478018224, + "learning_rate": 1.6484387383103504e-05, + "loss": 0.0, + "step": 9860 + }, + { + "epoch": 7.0, + "grad_norm": 0.0006946607609279454, + "learning_rate": 1.644476145189412e-05, + "loss": 0.0, + "step": 9870 + }, + { + "epoch": 7.0, + "grad_norm": 0.00034042325569316745, + "learning_rate": 1.6405135520684735e-05, + "loss": 0.0001, + "step": 9880 + }, + { + "epoch": 7.0, + "grad_norm": 0.00025543957599438727, + "learning_rate": 1.6365509589475353e-05, + "loss": 0.0001, + "step": 9890 + }, + { + "epoch": 7.01, + "grad_norm": 0.0005577169358730316, + "learning_rate": 1.632588365826597e-05, + "loss": 0.0001, + "step": 9900 + }, + { + "epoch": 7.01, + "grad_norm": 0.0007238492253236473, + "learning_rate": 1.6286257727056588e-05, + "loss": 0.0001, + "step": 9910 + }, + { + "epoch": 7.01, + "grad_norm": 0.00047818326856940985, + "learning_rate": 1.6246631795847203e-05, + "loss": 0.2815, + "step": 9920 + }, + { + "epoch": 7.01, + "grad_norm": 0.004355975892394781, + "learning_rate": 1.620700586463782e-05, + "loss": 0.0, + "step": 9930 + }, + { + "epoch": 7.01, + "grad_norm": 0.0002552367513999343, + "learning_rate": 1.6167379933428437e-05, + "loss": 0.0, + "step": 9940 + }, + { + "epoch": 7.01, + "grad_norm": 0.0011531308991834521, + "learning_rate": 1.6127754002219053e-05, + "loss": 0.0, + "step": 9950 + }, + { + "epoch": 7.01, + "grad_norm": 0.0009820818668231368, + "learning_rate": 1.608812807100967e-05, + "loss": 0.0, + "step": 9960 + }, + { + "epoch": 7.01, + "grad_norm": 0.0006331288604997098, + "learning_rate": 1.6048502139800287e-05, + "loss": 0.0001, + "step": 9970 + }, + { + "epoch": 7.01, + "grad_norm": 23.247167587280273, + "learning_rate": 1.6008876208590902e-05, + "loss": 0.4045, + "step": 9980 + }, + { + "epoch": 7.01, + "grad_norm": 0.0005796991754323244, + "learning_rate": 1.596925027738152e-05, + "loss": 0.2264, + "step": 9990 + }, + { + "epoch": 7.01, + "grad_norm": 0.0002432822366245091, + "learning_rate": 1.5929624346172137e-05, + "loss": 0.0, + "step": 10000 + }, + { + "epoch": 7.01, + "grad_norm": 0.00044970333692617714, + "learning_rate": 1.5889998414962752e-05, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 7.01, + "grad_norm": 0.00043129053665325046, + "learning_rate": 1.585037248375337e-05, + "loss": 0.0, + "step": 10020 + }, + { + "epoch": 7.01, + "grad_norm": 0.0009400318958796561, + "learning_rate": 1.5810746552543986e-05, + "loss": 0.0001, + "step": 10030 + }, + { + "epoch": 7.02, + "grad_norm": 0.001612617983482778, + "learning_rate": 1.57711206213346e-05, + "loss": 0.184, + "step": 10040 + }, + { + "epoch": 7.02, + "grad_norm": 0.003397996537387371, + "learning_rate": 1.5731494690125217e-05, + "loss": 0.0001, + "step": 10050 + }, + { + "epoch": 7.02, + "grad_norm": 0.003118938999250531, + "learning_rate": 1.5691868758915836e-05, + "loss": 0.0118, + "step": 10060 + }, + { + "epoch": 7.02, + "grad_norm": 0.0016245280858129263, + "learning_rate": 1.565224282770645e-05, + "loss": 0.0002, + "step": 10070 + }, + { + "epoch": 7.02, + "grad_norm": 0.003330792533233762, + "learning_rate": 1.5612616896497067e-05, + "loss": 0.0001, + "step": 10080 + }, + { + "epoch": 7.02, + "grad_norm": 0.01675890013575554, + "learning_rate": 1.5572990965287685e-05, + "loss": 0.1361, + "step": 10090 + }, + { + "epoch": 7.02, + "grad_norm": 0.0016782371094450355, + "learning_rate": 1.55333650340783e-05, + "loss": 0.0001, + "step": 10100 + }, + { + "epoch": 7.02, + "grad_norm": 0.0006982790655456483, + "learning_rate": 1.5493739102868916e-05, + "loss": 0.0, + "step": 10110 + }, + { + "epoch": 7.02, + "grad_norm": 0.004016962368041277, + "learning_rate": 1.5454113171659535e-05, + "loss": 0.0002, + "step": 10120 + }, + { + "epoch": 7.02, + "grad_norm": 0.0016343995230272412, + "learning_rate": 1.541448724045015e-05, + "loss": 0.0001, + "step": 10130 + }, + { + "epoch": 7.02, + "grad_norm": 0.0003891861706506461, + "learning_rate": 1.537486130924077e-05, + "loss": 0.0002, + "step": 10140 + }, + { + "epoch": 7.02, + "grad_norm": 0.0005568304331973195, + "learning_rate": 1.5335235378031385e-05, + "loss": 0.0, + "step": 10150 + }, + { + "epoch": 7.02, + "grad_norm": 0.0009192074066959321, + "learning_rate": 1.5295609446822e-05, + "loss": 0.0001, + "step": 10160 + }, + { + "epoch": 7.02, + "grad_norm": 0.00041831223643384874, + "learning_rate": 1.5255983515612617e-05, + "loss": 0.0, + "step": 10170 + }, + { + "epoch": 7.03, + "grad_norm": 0.002276873914524913, + "learning_rate": 1.5216357584403234e-05, + "loss": 0.0001, + "step": 10180 + }, + { + "epoch": 7.03, + "grad_norm": 0.0021974798291921616, + "learning_rate": 1.5176731653193851e-05, + "loss": 0.0, + "step": 10190 + }, + { + "epoch": 7.03, + "grad_norm": 0.03672347217798233, + "learning_rate": 1.5137105721984468e-05, + "loss": 0.0001, + "step": 10200 + }, + { + "epoch": 7.03, + "grad_norm": 0.0004960880614817142, + "learning_rate": 1.5097479790775086e-05, + "loss": 0.0001, + "step": 10210 + }, + { + "epoch": 7.03, + "grad_norm": 0.0003698187356349081, + "learning_rate": 1.5057853859565701e-05, + "loss": 0.0, + "step": 10220 + }, + { + "epoch": 7.03, + "grad_norm": 30.79059600830078, + "learning_rate": 1.5018227928356318e-05, + "loss": 0.013, + "step": 10230 + }, + { + "epoch": 7.03, + "grad_norm": 0.00040281921974383295, + "learning_rate": 1.4978601997146935e-05, + "loss": 0.0001, + "step": 10240 + }, + { + "epoch": 7.03, + "grad_norm": 0.001930213999003172, + "learning_rate": 1.4938976065937552e-05, + "loss": 0.0005, + "step": 10250 + }, + { + "epoch": 7.03, + "grad_norm": 0.0016294418601319194, + "learning_rate": 1.4899350134728166e-05, + "loss": 0.0001, + "step": 10260 + }, + { + "epoch": 7.03, + "grad_norm": 0.0029418901540338993, + "learning_rate": 1.4859724203518781e-05, + "loss": 0.0001, + "step": 10270 + }, + { + "epoch": 7.03, + "grad_norm": 0.0005179463187232614, + "learning_rate": 1.4820098272309399e-05, + "loss": 0.0, + "step": 10280 + }, + { + "epoch": 7.03, + "grad_norm": 117.77789306640625, + "learning_rate": 1.4780472341100016e-05, + "loss": 0.0485, + "step": 10290 + }, + { + "epoch": 7.03, + "grad_norm": 0.0006345610017888248, + "learning_rate": 1.4740846409890633e-05, + "loss": 0.0072, + "step": 10300 + }, + { + "epoch": 7.03, + "grad_norm": 0.004750640131533146, + "learning_rate": 1.470122047868125e-05, + "loss": 0.0001, + "step": 10310 + }, + { + "epoch": 7.04, + "grad_norm": 0.0016635819338262081, + "learning_rate": 1.4661594547471865e-05, + "loss": 0.5727, + "step": 10320 + }, + { + "epoch": 7.04, + "grad_norm": 0.0009257107740268111, + "learning_rate": 1.4621968616262482e-05, + "loss": 0.019, + "step": 10330 + }, + { + "epoch": 7.04, + "grad_norm": 0.0004995065974071622, + "learning_rate": 1.45823426850531e-05, + "loss": 0.0, + "step": 10340 + }, + { + "epoch": 7.04, + "grad_norm": 0.003641214920207858, + "learning_rate": 1.4542716753843717e-05, + "loss": 0.3737, + "step": 10350 + }, + { + "epoch": 7.04, + "grad_norm": 0.0005538859404623508, + "learning_rate": 1.4503090822634332e-05, + "loss": 0.1959, + "step": 10360 + }, + { + "epoch": 7.04, + "grad_norm": 0.0024865760933607817, + "learning_rate": 1.4463464891424949e-05, + "loss": 0.0, + "step": 10370 + }, + { + "epoch": 7.04, + "grad_norm": 0.0012498443247750401, + "learning_rate": 1.4423838960215566e-05, + "loss": 0.0003, + "step": 10380 + }, + { + "epoch": 7.04, + "grad_norm": 0.003093864070251584, + "learning_rate": 1.4384213029006183e-05, + "loss": 0.0001, + "step": 10390 + }, + { + "epoch": 7.04, + "grad_norm": 0.0016697756946086884, + "learning_rate": 1.4344587097796799e-05, + "loss": 0.0001, + "step": 10400 + }, + { + "epoch": 7.04, + "grad_norm": 0.0024545500054955482, + "learning_rate": 1.4304961166587416e-05, + "loss": 0.0, + "step": 10410 + }, + { + "epoch": 7.04, + "grad_norm": 0.0010031814454123378, + "learning_rate": 1.4265335235378033e-05, + "loss": 0.0, + "step": 10420 + }, + { + "epoch": 7.04, + "grad_norm": 0.000834242207929492, + "learning_rate": 1.422570930416865e-05, + "loss": 0.214, + "step": 10430 + }, + { + "epoch": 7.04, + "grad_norm": 0.0008862247341312468, + "learning_rate": 1.4186083372959265e-05, + "loss": 0.0001, + "step": 10440 + }, + { + "epoch": 7.04, + "grad_norm": 0.0010633807396516204, + "learning_rate": 1.4146457441749883e-05, + "loss": 0.0104, + "step": 10450 + }, + { + "epoch": 7.05, + "grad_norm": 24.041336059570312, + "learning_rate": 1.41068315105405e-05, + "loss": 0.0049, + "step": 10460 + }, + { + "epoch": 7.05, + "grad_norm": 0.0011859643273055553, + "learning_rate": 1.4067205579331113e-05, + "loss": 0.0001, + "step": 10470 + }, + { + "epoch": 7.05, + "grad_norm": 0.0006510709063149989, + "learning_rate": 1.402757964812173e-05, + "loss": 0.001, + "step": 10480 + }, + { + "epoch": 7.05, + "grad_norm": 0.000353335402905941, + "learning_rate": 1.3987953716912348e-05, + "loss": 0.0002, + "step": 10490 + }, + { + "epoch": 7.05, + "grad_norm": 0.0005472557386383414, + "learning_rate": 1.3948327785702963e-05, + "loss": 0.0001, + "step": 10500 + }, + { + "epoch": 7.05, + "grad_norm": 0.0006235065520741045, + "learning_rate": 1.390870185449358e-05, + "loss": 0.0004, + "step": 10510 + }, + { + "epoch": 7.05, + "grad_norm": 0.00039498330443166196, + "learning_rate": 1.3869075923284197e-05, + "loss": 0.0, + "step": 10520 + }, + { + "epoch": 7.05, + "grad_norm": 0.0009459428838454187, + "learning_rate": 1.3829449992074814e-05, + "loss": 0.125, + "step": 10530 + }, + { + "epoch": 7.05, + "grad_norm": 0.000288288458250463, + "learning_rate": 1.378982406086543e-05, + "loss": 0.0005, + "step": 10540 + }, + { + "epoch": 7.05, + "grad_norm": 0.0010236125672236085, + "learning_rate": 1.3750198129656047e-05, + "loss": 0.0001, + "step": 10550 + }, + { + "epoch": 7.05, + "grad_norm": 0.0005923935095779598, + "learning_rate": 1.3710572198446664e-05, + "loss": 0.0, + "step": 10560 + }, + { + "epoch": 7.05, + "grad_norm": 0.001925037824548781, + "learning_rate": 1.3670946267237281e-05, + "loss": 0.0, + "step": 10570 + }, + { + "epoch": 7.05, + "grad_norm": 0.0010172594338655472, + "learning_rate": 1.3631320336027898e-05, + "loss": 0.0, + "step": 10580 + }, + { + "epoch": 7.05, + "grad_norm": 0.00022477912716567516, + "learning_rate": 1.3591694404818514e-05, + "loss": 0.4117, + "step": 10590 + }, + { + "epoch": 7.06, + "grad_norm": 0.001964542781934142, + "learning_rate": 1.355206847360913e-05, + "loss": 0.0, + "step": 10600 + }, + { + "epoch": 7.06, + "grad_norm": 0.0008729117107577622, + "learning_rate": 1.3512442542399748e-05, + "loss": 0.005, + "step": 10610 + }, + { + "epoch": 7.06, + "grad_norm": 0.0013933833688497543, + "learning_rate": 1.3472816611190365e-05, + "loss": 0.0, + "step": 10620 + }, + { + "epoch": 7.06, + "grad_norm": 0.720376193523407, + "learning_rate": 1.343319067998098e-05, + "loss": 0.0005, + "step": 10630 + }, + { + "epoch": 7.06, + "grad_norm": 0.0024294324684888124, + "learning_rate": 1.3393564748771597e-05, + "loss": 0.5075, + "step": 10640 + }, + { + "epoch": 7.06, + "grad_norm": 0.00034565231180749834, + "learning_rate": 1.3353938817562215e-05, + "loss": 0.0017, + "step": 10650 + }, + { + "epoch": 7.06, + "grad_norm": 0.0005883481935597956, + "learning_rate": 1.3314312886352832e-05, + "loss": 0.0, + "step": 10660 + }, + { + "epoch": 7.06, + "grad_norm": 0.001018638489767909, + "learning_rate": 1.3274686955143447e-05, + "loss": 0.0005, + "step": 10670 + }, + { + "epoch": 7.06, + "grad_norm": 0.000567563867662102, + "learning_rate": 1.3235061023934064e-05, + "loss": 0.0071, + "step": 10680 + }, + { + "epoch": 7.06, + "grad_norm": 0.0006969044334255159, + "learning_rate": 1.3195435092724678e-05, + "loss": 0.2151, + "step": 10690 + }, + { + "epoch": 7.06, + "grad_norm": 0.000248556025326252, + "learning_rate": 1.3155809161515295e-05, + "loss": 0.0, + "step": 10700 + }, + { + "epoch": 7.06, + "grad_norm": 0.0008631858509033918, + "learning_rate": 1.3116183230305912e-05, + "loss": 0.0001, + "step": 10710 + }, + { + "epoch": 7.06, + "grad_norm": 0.001508180401287973, + "learning_rate": 1.307655729909653e-05, + "loss": 0.0001, + "step": 10720 + }, + { + "epoch": 7.06, + "grad_norm": 0.0005554750678129494, + "learning_rate": 1.3036931367887145e-05, + "loss": 0.0, + "step": 10730 + }, + { + "epoch": 7.07, + "grad_norm": 0.0003934628330171108, + "learning_rate": 1.2997305436677762e-05, + "loss": 0.0, + "step": 10740 + }, + { + "epoch": 7.07, + "grad_norm": 0.001727793482132256, + "learning_rate": 1.2957679505468379e-05, + "loss": 0.0, + "step": 10750 + }, + { + "epoch": 7.07, + "grad_norm": 0.002404275583103299, + "learning_rate": 1.2918053574258996e-05, + "loss": 0.0, + "step": 10760 + }, + { + "epoch": 7.07, + "grad_norm": 0.0008175792172551155, + "learning_rate": 1.2878427643049611e-05, + "loss": 0.0, + "step": 10770 + }, + { + "epoch": 7.07, + "grad_norm": 0.0022247559390962124, + "learning_rate": 1.2838801711840228e-05, + "loss": 0.0001, + "step": 10780 + }, + { + "epoch": 7.07, + "grad_norm": 0.0014646403724327683, + "learning_rate": 1.2799175780630846e-05, + "loss": 0.001, + "step": 10790 + }, + { + "epoch": 7.07, + "grad_norm": 0.0020718346349895, + "learning_rate": 1.2759549849421463e-05, + "loss": 0.0001, + "step": 10800 + }, + { + "epoch": 7.07, + "grad_norm": 0.0008824109099805355, + "learning_rate": 1.2719923918212078e-05, + "loss": 0.5802, + "step": 10810 + }, + { + "epoch": 7.07, + "grad_norm": 0.0007525623659603298, + "learning_rate": 1.2680297987002695e-05, + "loss": 0.0, + "step": 10820 + }, + { + "epoch": 7.07, + "grad_norm": 64.19054412841797, + "learning_rate": 1.2640672055793312e-05, + "loss": 0.4105, + "step": 10830 + }, + { + "epoch": 7.07, + "grad_norm": 0.0019008672097697854, + "learning_rate": 1.260104612458393e-05, + "loss": 0.0005, + "step": 10840 + }, + { + "epoch": 7.07, + "grad_norm": 0.0010036254534497857, + "learning_rate": 1.2561420193374545e-05, + "loss": 0.0006, + "step": 10850 + }, + { + "epoch": 7.07, + "grad_norm": 0.0006118649616837502, + "learning_rate": 1.2521794262165162e-05, + "loss": 0.0, + "step": 10860 + }, + { + "epoch": 7.07, + "grad_norm": 0.003166553797200322, + "learning_rate": 1.2482168330955777e-05, + "loss": 0.0001, + "step": 10870 + }, + { + "epoch": 7.08, + "grad_norm": 0.001118882093578577, + "learning_rate": 1.2442542399746394e-05, + "loss": 0.0, + "step": 10880 + }, + { + "epoch": 7.08, + "grad_norm": 0.0010632872581481934, + "learning_rate": 1.2402916468537012e-05, + "loss": 0.0001, + "step": 10890 + }, + { + "epoch": 7.08, + "grad_norm": 0.001252860063686967, + "learning_rate": 1.2363290537327627e-05, + "loss": 0.0001, + "step": 10900 + }, + { + "epoch": 7.08, + "grad_norm": 0.003005104372277856, + "learning_rate": 1.2323664606118244e-05, + "loss": 0.0001, + "step": 10910 + }, + { + "epoch": 7.08, + "grad_norm": 0.004219905007630587, + "learning_rate": 1.2284038674908861e-05, + "loss": 0.0006, + "step": 10920 + }, + { + "epoch": 7.08, + "grad_norm": 0.0003512962721288204, + "learning_rate": 1.2244412743699478e-05, + "loss": 0.0, + "step": 10930 + }, + { + "epoch": 7.08, + "grad_norm": 0.0026769828982651234, + "learning_rate": 1.2204786812490095e-05, + "loss": 0.0001, + "step": 10940 + }, + { + "epoch": 7.08, + "grad_norm": 0.0003416830440983176, + "learning_rate": 1.216516088128071e-05, + "loss": 0.0, + "step": 10950 + }, + { + "epoch": 7.08, + "grad_norm": 0.0010573529871180654, + "learning_rate": 1.2125534950071326e-05, + "loss": 0.0, + "step": 10960 + }, + { + "epoch": 7.08, + "grad_norm": 0.0013822006294503808, + "learning_rate": 1.2085909018861943e-05, + "loss": 0.2014, + "step": 10970 + }, + { + "epoch": 7.08, + "grad_norm": 0.0003184191882610321, + "learning_rate": 1.204628308765256e-05, + "loss": 0.0, + "step": 10980 + }, + { + "epoch": 7.08, + "grad_norm": 0.004402919672429562, + "learning_rate": 1.2006657156443178e-05, + "loss": 0.006, + "step": 10990 + }, + { + "epoch": 7.08, + "grad_norm": 4.32048225402832, + "learning_rate": 1.1967031225233793e-05, + "loss": 0.0019, + "step": 11000 + }, + { + "epoch": 7.08, + "grad_norm": 0.0006507772486656904, + "learning_rate": 1.192740529402441e-05, + "loss": 0.0, + "step": 11010 + }, + { + "epoch": 7.09, + "grad_norm": 0.0004223829018883407, + "learning_rate": 1.1887779362815027e-05, + "loss": 0.0001, + "step": 11020 + }, + { + "epoch": 7.09, + "grad_norm": 0.0006153634749352932, + "learning_rate": 1.1848153431605644e-05, + "loss": 0.0001, + "step": 11030 + }, + { + "epoch": 7.09, + "grad_norm": 0.0007072246517054737, + "learning_rate": 1.180852750039626e-05, + "loss": 0.0, + "step": 11040 + }, + { + "epoch": 7.09, + "grad_norm": 0.0007087733829393983, + "learning_rate": 1.1768901569186877e-05, + "loss": 0.0591, + "step": 11050 + }, + { + "epoch": 7.09, + "grad_norm": 0.00040967803215608, + "learning_rate": 1.1729275637977492e-05, + "loss": 0.0, + "step": 11060 + }, + { + "epoch": 7.09, + "grad_norm": 0.0018612256972119212, + "learning_rate": 1.168964970676811e-05, + "loss": 0.0001, + "step": 11070 + }, + { + "epoch": 7.09, + "grad_norm": 0.0016640513204038143, + "learning_rate": 1.1650023775558726e-05, + "loss": 0.0001, + "step": 11080 + }, + { + "epoch": 7.09, + "grad_norm": 0.004190579988062382, + "learning_rate": 1.1610397844349342e-05, + "loss": 0.0, + "step": 11090 + }, + { + "epoch": 7.09, + "grad_norm": 0.001836647279560566, + "learning_rate": 1.1570771913139959e-05, + "loss": 0.0, + "step": 11100 + }, + { + "epoch": 7.09, + "grad_norm": 0.0005556776304729283, + "learning_rate": 1.1531145981930576e-05, + "loss": 0.0001, + "step": 11110 + }, + { + "epoch": 7.09, + "grad_norm": 0.0008808193379081786, + "learning_rate": 1.1491520050721193e-05, + "loss": 0.0001, + "step": 11120 + }, + { + "epoch": 7.09, + "grad_norm": 0.0001681848953012377, + "learning_rate": 1.1451894119511809e-05, + "loss": 0.2524, + "step": 11130 + }, + { + "epoch": 7.09, + "grad_norm": 0.0008354588062502444, + "learning_rate": 1.1412268188302426e-05, + "loss": 0.0, + "step": 11140 + }, + { + "epoch": 7.09, + "grad_norm": 0.00051628437358886, + "learning_rate": 1.1372642257093043e-05, + "loss": 0.0, + "step": 11150 + }, + { + "epoch": 7.1, + "grad_norm": 0.0008145067258737981, + "learning_rate": 1.133301632588366e-05, + "loss": 0.0001, + "step": 11160 + }, + { + "epoch": 7.1, + "grad_norm": 0.0004701870202552527, + "learning_rate": 1.1293390394674275e-05, + "loss": 0.1907, + "step": 11170 + }, + { + "epoch": 7.1, + "grad_norm": 0.0011952131753787398, + "learning_rate": 1.125376446346489e-05, + "loss": 0.0, + "step": 11180 + }, + { + "epoch": 7.1, + "grad_norm": 0.00032050846493802965, + "learning_rate": 1.1214138532255508e-05, + "loss": 0.0, + "step": 11190 + }, + { + "epoch": 7.1, + "grad_norm": 0.0006612459546886384, + "learning_rate": 1.1174512601046125e-05, + "loss": 0.0001, + "step": 11200 + }, + { + "epoch": 7.1, + "grad_norm": 0.0030058922711759806, + "learning_rate": 1.1134886669836742e-05, + "loss": 0.0, + "step": 11210 + }, + { + "epoch": 7.1, + "grad_norm": 0.0034754828084260225, + "learning_rate": 1.1095260738627357e-05, + "loss": 0.0003, + "step": 11220 + }, + { + "epoch": 7.1, + "eval_accuracy": 0.9578947368421052, + "eval_loss": 0.2666740119457245, + "eval_runtime": 2322.4119, + "eval_samples_per_second": 0.286, + "eval_steps_per_second": 0.143, + "step": 11224 + }, + { + "epoch": 8.0, + "grad_norm": 0.004194905515760183, + "learning_rate": 1.1055634807417975e-05, + "loss": 0.0001, + "step": 11230 + }, + { + "epoch": 8.0, + "grad_norm": 0.0024937952402979136, + "learning_rate": 1.1016008876208592e-05, + "loss": 0.0, + "step": 11240 + }, + { + "epoch": 8.0, + "grad_norm": 0.00039031429332681, + "learning_rate": 1.0976382944999209e-05, + "loss": 0.0, + "step": 11250 + }, + { + "epoch": 8.0, + "grad_norm": 0.005691041238605976, + "learning_rate": 1.0936757013789826e-05, + "loss": 0.0001, + "step": 11260 + }, + { + "epoch": 8.0, + "grad_norm": 0.00017179737915284932, + "learning_rate": 1.0897131082580441e-05, + "loss": 0.0001, + "step": 11270 + }, + { + "epoch": 8.0, + "grad_norm": 0.000949267705436796, + "learning_rate": 1.0857505151371057e-05, + "loss": 0.0001, + "step": 11280 + }, + { + "epoch": 8.0, + "grad_norm": 0.0003036385169252753, + "learning_rate": 1.0817879220161674e-05, + "loss": 0.0001, + "step": 11290 + }, + { + "epoch": 8.01, + "grad_norm": 0.004243906121701002, + "learning_rate": 1.0778253288952291e-05, + "loss": 0.0005, + "step": 11300 + }, + { + "epoch": 8.01, + "grad_norm": 0.0010142240207642317, + "learning_rate": 1.0738627357742908e-05, + "loss": 0.0, + "step": 11310 + }, + { + "epoch": 8.01, + "grad_norm": 0.0010380720486864448, + "learning_rate": 1.0699001426533523e-05, + "loss": 0.0001, + "step": 11320 + }, + { + "epoch": 8.01, + "grad_norm": 0.0005737761966884136, + "learning_rate": 1.065937549532414e-05, + "loss": 0.0001, + "step": 11330 + }, + { + "epoch": 8.01, + "grad_norm": 0.001465731067582965, + "learning_rate": 1.0619749564114758e-05, + "loss": 0.0, + "step": 11340 + }, + { + "epoch": 8.01, + "grad_norm": 0.002500841859728098, + "learning_rate": 1.0580123632905375e-05, + "loss": 0.0, + "step": 11350 + }, + { + "epoch": 8.01, + "grad_norm": 0.00024287942505907267, + "learning_rate": 1.054049770169599e-05, + "loss": 0.0, + "step": 11360 + }, + { + "epoch": 8.01, + "grad_norm": 0.0006320082466118038, + "learning_rate": 1.0500871770486607e-05, + "loss": 0.0001, + "step": 11370 + }, + { + "epoch": 8.01, + "grad_norm": 0.00030024844454601407, + "learning_rate": 1.0461245839277224e-05, + "loss": 0.0097, + "step": 11380 + }, + { + "epoch": 8.01, + "grad_norm": 0.00043432554230093956, + "learning_rate": 1.042161990806784e-05, + "loss": 0.0421, + "step": 11390 + }, + { + "epoch": 8.01, + "grad_norm": 0.002737953094765544, + "learning_rate": 1.0381993976858457e-05, + "loss": 0.0, + "step": 11400 + }, + { + "epoch": 8.01, + "grad_norm": 0.000816858431790024, + "learning_rate": 1.0342368045649072e-05, + "loss": 0.0, + "step": 11410 + }, + { + "epoch": 8.01, + "grad_norm": 0.00036986047052778304, + "learning_rate": 1.030274211443969e-05, + "loss": 0.0, + "step": 11420 + }, + { + "epoch": 8.01, + "grad_norm": 0.0004323932225815952, + "learning_rate": 1.0263116183230307e-05, + "loss": 0.0002, + "step": 11430 + }, + { + "epoch": 8.02, + "grad_norm": 0.0004024511144962162, + "learning_rate": 1.0223490252020924e-05, + "loss": 0.421, + "step": 11440 + }, + { + "epoch": 8.02, + "grad_norm": 0.0024430027697235346, + "learning_rate": 1.0183864320811539e-05, + "loss": 0.0, + "step": 11450 + }, + { + "epoch": 8.02, + "grad_norm": 0.001345345051959157, + "learning_rate": 1.0144238389602156e-05, + "loss": 0.0, + "step": 11460 + }, + { + "epoch": 8.02, + "grad_norm": 0.0006153620779514313, + "learning_rate": 1.0104612458392773e-05, + "loss": 0.0001, + "step": 11470 + }, + { + "epoch": 8.02, + "grad_norm": 0.0015972702531144023, + "learning_rate": 1.006498652718339e-05, + "loss": 0.0, + "step": 11480 + }, + { + "epoch": 8.02, + "grad_norm": 0.0008706132066436112, + "learning_rate": 1.0025360595974006e-05, + "loss": 0.0001, + "step": 11490 + }, + { + "epoch": 8.02, + "grad_norm": 0.001384895178489387, + "learning_rate": 9.985734664764621e-06, + "loss": 0.0001, + "step": 11500 + }, + { + "epoch": 8.02, + "grad_norm": 0.0010631100740283728, + "learning_rate": 9.946108733555238e-06, + "loss": 0.0002, + "step": 11510 + }, + { + "epoch": 8.02, + "grad_norm": 0.0007243629661388695, + "learning_rate": 9.906482802345855e-06, + "loss": 0.0001, + "step": 11520 + }, + { + "epoch": 8.02, + "grad_norm": 74.74536895751953, + "learning_rate": 9.866856871136473e-06, + "loss": 0.0797, + "step": 11530 + }, + { + "epoch": 8.02, + "grad_norm": 0.005114846862852573, + "learning_rate": 9.827230939927088e-06, + "loss": 0.0001, + "step": 11540 + }, + { + "epoch": 8.02, + "grad_norm": 0.0024818070232868195, + "learning_rate": 9.787605008717705e-06, + "loss": 0.0001, + "step": 11550 + }, + { + "epoch": 8.02, + "grad_norm": 0.00041646783938631415, + "learning_rate": 9.747979077508322e-06, + "loss": 0.0, + "step": 11560 + }, + { + "epoch": 8.02, + "grad_norm": 0.0007332797977142036, + "learning_rate": 9.70835314629894e-06, + "loss": 0.0116, + "step": 11570 + }, + { + "epoch": 8.03, + "grad_norm": 0.0007879806798882782, + "learning_rate": 9.668727215089556e-06, + "loss": 0.0, + "step": 11580 + }, + { + "epoch": 8.03, + "grad_norm": 0.0009714935440570116, + "learning_rate": 9.629101283880172e-06, + "loss": 0.0001, + "step": 11590 + }, + { + "epoch": 8.03, + "grad_norm": 0.0009343309211544693, + "learning_rate": 9.589475352670787e-06, + "loss": 0.5311, + "step": 11600 + }, + { + "epoch": 8.03, + "grad_norm": 0.00037891563260927796, + "learning_rate": 9.549849421461404e-06, + "loss": 0.002, + "step": 11610 + }, + { + "epoch": 8.03, + "grad_norm": 0.001986218150705099, + "learning_rate": 9.510223490252021e-06, + "loss": 0.0111, + "step": 11620 + }, + { + "epoch": 8.03, + "grad_norm": 0.0015318701043725014, + "learning_rate": 9.470597559042639e-06, + "loss": 0.0001, + "step": 11630 + }, + { + "epoch": 8.03, + "grad_norm": 0.0006765589932911098, + "learning_rate": 9.430971627833254e-06, + "loss": 0.0, + "step": 11640 + }, + { + "epoch": 8.03, + "grad_norm": 0.0005000099190510809, + "learning_rate": 9.391345696623871e-06, + "loss": 0.0, + "step": 11650 + }, + { + "epoch": 8.03, + "grad_norm": 0.0017080691177397966, + "learning_rate": 9.351719765414488e-06, + "loss": 0.0002, + "step": 11660 + }, + { + "epoch": 8.03, + "grad_norm": 0.0017356324242427945, + "learning_rate": 9.312093834205105e-06, + "loss": 0.0001, + "step": 11670 + }, + { + "epoch": 8.03, + "grad_norm": 0.0010568661382421851, + "learning_rate": 9.27246790299572e-06, + "loss": 0.0, + "step": 11680 + }, + { + "epoch": 8.03, + "grad_norm": 0.0014095234218984842, + "learning_rate": 9.232841971786338e-06, + "loss": 0.0021, + "step": 11690 + }, + { + "epoch": 8.03, + "grad_norm": 0.00167833489831537, + "learning_rate": 9.193216040576955e-06, + "loss": 0.0001, + "step": 11700 + }, + { + "epoch": 8.03, + "grad_norm": 0.0016795884585008025, + "learning_rate": 9.15359010936757e-06, + "loss": 0.0002, + "step": 11710 + }, + { + "epoch": 8.04, + "grad_norm": 0.0003502909676171839, + "learning_rate": 9.113964178158187e-06, + "loss": 0.0, + "step": 11720 + }, + { + "epoch": 8.04, + "grad_norm": 0.10941363871097565, + "learning_rate": 9.074338246948803e-06, + "loss": 0.0001, + "step": 11730 + }, + { + "epoch": 8.04, + "grad_norm": 0.0014083647402003407, + "learning_rate": 9.03471231573942e-06, + "loss": 0.3081, + "step": 11740 + }, + { + "epoch": 8.04, + "grad_norm": 0.0014537267852574587, + "learning_rate": 8.995086384530037e-06, + "loss": 0.0, + "step": 11750 + }, + { + "epoch": 8.04, + "grad_norm": 0.0005781695363111794, + "learning_rate": 8.955460453320654e-06, + "loss": 0.0, + "step": 11760 + }, + { + "epoch": 8.04, + "grad_norm": 0.0007176825893111527, + "learning_rate": 8.91583452211127e-06, + "loss": 0.0, + "step": 11770 + }, + { + "epoch": 8.04, + "grad_norm": 0.000545515853445977, + "learning_rate": 8.876208590901887e-06, + "loss": 0.0, + "step": 11780 + }, + { + "epoch": 8.04, + "grad_norm": 0.0025596795603632927, + "learning_rate": 8.836582659692504e-06, + "loss": 0.0002, + "step": 11790 + }, + { + "epoch": 8.04, + "grad_norm": 0.030005350708961487, + "learning_rate": 8.796956728483121e-06, + "loss": 0.0001, + "step": 11800 + }, + { + "epoch": 8.04, + "grad_norm": 0.00035480278893373907, + "learning_rate": 8.757330797273736e-06, + "loss": 0.0018, + "step": 11810 + }, + { + "epoch": 8.04, + "grad_norm": 0.004515402484685183, + "learning_rate": 8.717704866064352e-06, + "loss": 0.0, + "step": 11820 + }, + { + "epoch": 8.04, + "grad_norm": 0.0032044288236647844, + "learning_rate": 8.678078934854969e-06, + "loss": 0.0036, + "step": 11830 + }, + { + "epoch": 8.04, + "grad_norm": 0.0009629257838241756, + "learning_rate": 8.638453003645586e-06, + "loss": 0.149, + "step": 11840 + }, + { + "epoch": 8.04, + "grad_norm": 0.0024080132134258747, + "learning_rate": 8.598827072436203e-06, + "loss": 0.0003, + "step": 11850 + }, + { + "epoch": 8.05, + "grad_norm": 0.0015089749358594418, + "learning_rate": 8.559201141226818e-06, + "loss": 0.0, + "step": 11860 + }, + { + "epoch": 8.05, + "grad_norm": 0.0019321951549500227, + "learning_rate": 8.519575210017436e-06, + "loss": 0.0, + "step": 11870 + }, + { + "epoch": 8.05, + "grad_norm": 0.005924368277192116, + "learning_rate": 8.479949278808053e-06, + "loss": 0.0, + "step": 11880 + }, + { + "epoch": 8.05, + "grad_norm": 0.0007942487136460841, + "learning_rate": 8.44032334759867e-06, + "loss": 0.0, + "step": 11890 + }, + { + "epoch": 8.05, + "grad_norm": 0.0022497123572975397, + "learning_rate": 8.400697416389285e-06, + "loss": 0.1055, + "step": 11900 + }, + { + "epoch": 8.05, + "grad_norm": 0.0006818937254138291, + "learning_rate": 8.361071485179902e-06, + "loss": 0.0001, + "step": 11910 + }, + { + "epoch": 8.05, + "grad_norm": 0.0004379069432616234, + "learning_rate": 8.32144555397052e-06, + "loss": 0.0, + "step": 11920 + }, + { + "epoch": 8.05, + "grad_norm": 0.00047276023542508483, + "learning_rate": 8.281819622761135e-06, + "loss": 0.0, + "step": 11930 + }, + { + "epoch": 8.05, + "grad_norm": 0.0004771367821376771, + "learning_rate": 8.242193691551752e-06, + "loss": 0.0, + "step": 11940 + }, + { + "epoch": 8.05, + "grad_norm": 0.0005501986015588045, + "learning_rate": 8.202567760342367e-06, + "loss": 0.0001, + "step": 11950 + }, + { + "epoch": 8.05, + "grad_norm": 0.0011177220148965716, + "learning_rate": 8.162941829132984e-06, + "loss": 0.0703, + "step": 11960 + }, + { + "epoch": 8.05, + "grad_norm": 0.0004951581358909607, + "learning_rate": 8.123315897923602e-06, + "loss": 0.0, + "step": 11970 + }, + { + "epoch": 8.05, + "grad_norm": 0.0008309069671668112, + "learning_rate": 8.083689966714219e-06, + "loss": 0.0, + "step": 11980 + }, + { + "epoch": 8.05, + "grad_norm": 0.000472767511382699, + "learning_rate": 8.044064035504836e-06, + "loss": 0.5261, + "step": 11990 + }, + { + "epoch": 8.06, + "grad_norm": 0.00044904148671776056, + "learning_rate": 8.004438104295451e-06, + "loss": 0.0, + "step": 12000 + }, + { + "epoch": 8.06, + "grad_norm": 0.0004107660206500441, + "learning_rate": 7.964812173086068e-06, + "loss": 0.0, + "step": 12010 + }, + { + "epoch": 8.06, + "grad_norm": 0.00042746157851070166, + "learning_rate": 7.925186241876685e-06, + "loss": 0.0, + "step": 12020 + }, + { + "epoch": 8.06, + "grad_norm": 0.0007110532023943961, + "learning_rate": 7.8855603106673e-06, + "loss": 0.0, + "step": 12030 + }, + { + "epoch": 8.06, + "grad_norm": 0.0007705994066782296, + "learning_rate": 7.845934379457918e-06, + "loss": 0.0, + "step": 12040 + }, + { + "epoch": 8.06, + "grad_norm": 0.0006966418586671352, + "learning_rate": 7.806308448248533e-06, + "loss": 0.0, + "step": 12050 + }, + { + "epoch": 8.06, + "grad_norm": 0.020446307957172394, + "learning_rate": 7.76668251703915e-06, + "loss": 0.0001, + "step": 12060 + }, + { + "epoch": 8.06, + "grad_norm": 0.0004377638688310981, + "learning_rate": 7.727056585829768e-06, + "loss": 0.0, + "step": 12070 + }, + { + "epoch": 8.06, + "grad_norm": 0.00036184967029839754, + "learning_rate": 7.687430654620385e-06, + "loss": 0.0002, + "step": 12080 + }, + { + "epoch": 8.06, + "grad_norm": 0.00029569625621661544, + "learning_rate": 7.647804723411e-06, + "loss": 0.0001, + "step": 12090 + }, + { + "epoch": 8.06, + "grad_norm": 0.0003205812827218324, + "learning_rate": 7.608178792201617e-06, + "loss": 0.0236, + "step": 12100 + }, + { + "epoch": 8.06, + "grad_norm": 0.00043995011947117746, + "learning_rate": 7.568552860992234e-06, + "loss": 0.0001, + "step": 12110 + }, + { + "epoch": 8.06, + "grad_norm": 0.0021792801562696695, + "learning_rate": 7.5289269297828505e-06, + "loss": 0.0, + "step": 12120 + }, + { + "epoch": 8.06, + "grad_norm": 0.003733986523002386, + "learning_rate": 7.489300998573468e-06, + "loss": 0.0, + "step": 12130 + }, + { + "epoch": 8.07, + "grad_norm": 0.001138021470978856, + "learning_rate": 7.449675067364083e-06, + "loss": 0.0001, + "step": 12140 + }, + { + "epoch": 8.07, + "grad_norm": 0.0003544053470250219, + "learning_rate": 7.410049136154699e-06, + "loss": 0.0, + "step": 12150 + }, + { + "epoch": 8.07, + "grad_norm": 0.0007718518027104437, + "learning_rate": 7.370423204945316e-06, + "loss": 0.0, + "step": 12160 + }, + { + "epoch": 8.07, + "grad_norm": 0.000794577703345567, + "learning_rate": 7.330797273735933e-06, + "loss": 0.0, + "step": 12170 + }, + { + "epoch": 8.07, + "grad_norm": 0.0007835258147679269, + "learning_rate": 7.29117134252655e-06, + "loss": 0.0, + "step": 12180 + }, + { + "epoch": 8.07, + "grad_norm": 0.0008351559517905116, + "learning_rate": 7.251545411317166e-06, + "loss": 0.0, + "step": 12190 + }, + { + "epoch": 8.07, + "grad_norm": 0.001067393459379673, + "learning_rate": 7.211919480107783e-06, + "loss": 0.0, + "step": 12200 + }, + { + "epoch": 8.07, + "grad_norm": 0.0005535806412808597, + "learning_rate": 7.172293548898399e-06, + "loss": 0.0, + "step": 12210 + }, + { + "epoch": 8.07, + "grad_norm": 0.0013392162509262562, + "learning_rate": 7.1326676176890165e-06, + "loss": 0.0, + "step": 12220 + }, + { + "epoch": 8.07, + "grad_norm": 0.011801800690591335, + "learning_rate": 7.093041686479633e-06, + "loss": 0.0001, + "step": 12230 + }, + { + "epoch": 8.07, + "grad_norm": 0.0003349117760080844, + "learning_rate": 7.05341575527025e-06, + "loss": 0.0, + "step": 12240 + }, + { + "epoch": 8.07, + "grad_norm": 0.0009791525080800056, + "learning_rate": 7.013789824060865e-06, + "loss": 0.0, + "step": 12250 + }, + { + "epoch": 8.07, + "grad_norm": 0.0003134564030915499, + "learning_rate": 6.9741638928514815e-06, + "loss": 0.0, + "step": 12260 + }, + { + "epoch": 8.07, + "grad_norm": 0.0011281302431598306, + "learning_rate": 6.934537961642099e-06, + "loss": 0.0003, + "step": 12270 + }, + { + "epoch": 8.08, + "grad_norm": 0.0004596656945068389, + "learning_rate": 6.894912030432715e-06, + "loss": 0.0, + "step": 12280 + }, + { + "epoch": 8.08, + "grad_norm": 0.017007848247885704, + "learning_rate": 6.855286099223332e-06, + "loss": 0.1894, + "step": 12290 + }, + { + "epoch": 8.08, + "grad_norm": 0.0009561624028719962, + "learning_rate": 6.815660168013949e-06, + "loss": 0.0001, + "step": 12300 + }, + { + "epoch": 8.08, + "grad_norm": 0.0006208363920450211, + "learning_rate": 6.776034236804565e-06, + "loss": 0.0, + "step": 12310 + }, + { + "epoch": 8.08, + "grad_norm": 0.00040551909478381276, + "learning_rate": 6.7364083055951825e-06, + "loss": 0.0, + "step": 12320 + }, + { + "epoch": 8.08, + "grad_norm": 0.0010045063681900501, + "learning_rate": 6.696782374385799e-06, + "loss": 0.0001, + "step": 12330 + }, + { + "epoch": 8.08, + "grad_norm": 0.001559635391458869, + "learning_rate": 6.657156443176416e-06, + "loss": 0.1317, + "step": 12340 + }, + { + "epoch": 8.08, + "grad_norm": 0.00036661443300545216, + "learning_rate": 6.617530511967032e-06, + "loss": 0.0, + "step": 12350 + }, + { + "epoch": 8.08, + "grad_norm": 0.0022761470172554255, + "learning_rate": 6.5779045807576475e-06, + "loss": 0.0, + "step": 12360 + }, + { + "epoch": 8.08, + "grad_norm": 0.0002771701547317207, + "learning_rate": 6.538278649548265e-06, + "loss": 0.0, + "step": 12370 + }, + { + "epoch": 8.08, + "grad_norm": 0.0009405760793015361, + "learning_rate": 6.498652718338881e-06, + "loss": 0.0, + "step": 12380 + }, + { + "epoch": 8.08, + "grad_norm": 0.0011777085019275546, + "learning_rate": 6.459026787129498e-06, + "loss": 0.0, + "step": 12390 + }, + { + "epoch": 8.08, + "grad_norm": 0.0007950080907903612, + "learning_rate": 6.419400855920114e-06, + "loss": 0.0023, + "step": 12400 + }, + { + "epoch": 8.08, + "grad_norm": 0.000328573863953352, + "learning_rate": 6.379774924710731e-06, + "loss": 0.0001, + "step": 12410 + }, + { + "epoch": 8.09, + "grad_norm": 0.000489677709992975, + "learning_rate": 6.340148993501348e-06, + "loss": 0.0, + "step": 12420 + }, + { + "epoch": 8.09, + "grad_norm": 19.678516387939453, + "learning_rate": 6.300523062291965e-06, + "loss": 0.2121, + "step": 12430 + }, + { + "epoch": 8.09, + "grad_norm": 0.001576061244122684, + "learning_rate": 6.260897131082581e-06, + "loss": 0.2006, + "step": 12440 + }, + { + "epoch": 8.09, + "grad_norm": 0.0010969837894663215, + "learning_rate": 6.221271199873197e-06, + "loss": 0.0089, + "step": 12450 + }, + { + "epoch": 8.09, + "grad_norm": 0.0006820796988904476, + "learning_rate": 6.1816452686638135e-06, + "loss": 0.0001, + "step": 12460 + }, + { + "epoch": 8.09, + "grad_norm": 0.0039375657215714455, + "learning_rate": 6.142019337454431e-06, + "loss": 0.0, + "step": 12470 + }, + { + "epoch": 8.09, + "grad_norm": 0.00018676265608519316, + "learning_rate": 6.102393406245048e-06, + "loss": 0.0002, + "step": 12480 + }, + { + "epoch": 8.09, + "grad_norm": 0.0015864548040553927, + "learning_rate": 6.062767475035663e-06, + "loss": 0.0, + "step": 12490 + }, + { + "epoch": 8.09, + "grad_norm": 0.0005812132731080055, + "learning_rate": 6.02314154382628e-06, + "loss": 0.0001, + "step": 12500 + }, + { + "epoch": 8.09, + "grad_norm": 0.0015394919319078326, + "learning_rate": 5.9835156126168965e-06, + "loss": 0.0, + "step": 12510 + }, + { + "epoch": 8.09, + "grad_norm": 0.5876509547233582, + "learning_rate": 5.943889681407514e-06, + "loss": 0.0002, + "step": 12520 + }, + { + "epoch": 8.09, + "grad_norm": 0.001257477910257876, + "learning_rate": 5.90426375019813e-06, + "loss": 0.0, + "step": 12530 + }, + { + "epoch": 8.09, + "grad_norm": 0.007748996838927269, + "learning_rate": 5.864637818988746e-06, + "loss": 0.0002, + "step": 12540 + }, + { + "epoch": 8.09, + "grad_norm": 0.0004220679693389684, + "learning_rate": 5.825011887779363e-06, + "loss": 0.0001, + "step": 12550 + }, + { + "epoch": 8.1, + "grad_norm": 0.0003514468262437731, + "learning_rate": 5.7853859565699795e-06, + "loss": 0.0062, + "step": 12560 + }, + { + "epoch": 8.1, + "grad_norm": 0.0004685299936681986, + "learning_rate": 5.745760025360597e-06, + "loss": 0.0016, + "step": 12570 + }, + { + "epoch": 8.1, + "grad_norm": 0.0002851441968232393, + "learning_rate": 5.706134094151213e-06, + "loss": 0.0004, + "step": 12580 + }, + { + "epoch": 8.1, + "grad_norm": 0.0006324647110886872, + "learning_rate": 5.66650816294183e-06, + "loss": 0.0, + "step": 12590 + }, + { + "epoch": 8.1, + "grad_norm": 0.000717841787263751, + "learning_rate": 5.626882231732445e-06, + "loss": 0.0, + "step": 12600 + }, + { + "epoch": 8.1, + "grad_norm": 0.001114896615035832, + "learning_rate": 5.5872563005230625e-06, + "loss": 0.0, + "step": 12610 + }, + { + "epoch": 8.1, + "grad_norm": 0.0011514411307871342, + "learning_rate": 5.547630369313679e-06, + "loss": 0.0001, + "step": 12620 + }, + { + "epoch": 8.1, + "eval_accuracy": 0.9654135338345865, + "eval_loss": 0.2435862421989441, + "eval_runtime": 2357.1776, + "eval_samples_per_second": 0.282, + "eval_steps_per_second": 0.141, + "step": 12627 + }, + { + "epoch": 9.0, + "grad_norm": 0.00044704281026497483, + "learning_rate": 5.508004438104296e-06, + "loss": 0.0, + "step": 12630 + }, + { + "epoch": 9.0, + "grad_norm": 0.00041269470239058137, + "learning_rate": 5.468378506894913e-06, + "loss": 0.0, + "step": 12640 + }, + { + "epoch": 9.0, + "grad_norm": 0.0003670216246973723, + "learning_rate": 5.428752575685528e-06, + "loss": 0.0, + "step": 12650 + }, + { + "epoch": 9.0, + "grad_norm": 0.003106119344010949, + "learning_rate": 5.3891266444761455e-06, + "loss": 0.0, + "step": 12660 + }, + { + "epoch": 9.0, + "grad_norm": 0.00040537622408010066, + "learning_rate": 5.349500713266762e-06, + "loss": 0.0, + "step": 12670 + }, + { + "epoch": 9.0, + "grad_norm": 0.00037262984551489353, + "learning_rate": 5.309874782057379e-06, + "loss": 0.0, + "step": 12680 + }, + { + "epoch": 9.0, + "grad_norm": 0.000418797048041597, + "learning_rate": 5.270248850847995e-06, + "loss": 0.0, + "step": 12690 + }, + { + "epoch": 9.01, + "grad_norm": 0.0015914670657366514, + "learning_rate": 5.230622919638612e-06, + "loss": 0.0, + "step": 12700 + }, + { + "epoch": 9.01, + "grad_norm": 0.005690779071301222, + "learning_rate": 5.1909969884292285e-06, + "loss": 0.0976, + "step": 12710 + }, + { + "epoch": 9.01, + "grad_norm": 0.001181070227175951, + "learning_rate": 5.151371057219845e-06, + "loss": 0.0, + "step": 12720 + }, + { + "epoch": 9.01, + "grad_norm": 0.0007823907653801143, + "learning_rate": 5.111745126010462e-06, + "loss": 0.0, + "step": 12730 + }, + { + "epoch": 9.01, + "grad_norm": 0.0010620895773172379, + "learning_rate": 5.072119194801078e-06, + "loss": 0.0, + "step": 12740 + }, + { + "epoch": 9.01, + "grad_norm": 0.00028126072720624506, + "learning_rate": 5.032493263591695e-06, + "loss": 0.0052, + "step": 12750 + }, + { + "epoch": 9.01, + "grad_norm": 0.0005754511221311986, + "learning_rate": 4.992867332382311e-06, + "loss": 0.0, + "step": 12760 + }, + { + "epoch": 9.01, + "grad_norm": 0.000247256743023172, + "learning_rate": 4.953241401172928e-06, + "loss": 0.0, + "step": 12770 + }, + { + "epoch": 9.01, + "grad_norm": 0.0017203809693455696, + "learning_rate": 4.913615469963544e-06, + "loss": 0.0001, + "step": 12780 + }, + { + "epoch": 9.01, + "grad_norm": 0.0005222621257416904, + "learning_rate": 4.873989538754161e-06, + "loss": 0.0, + "step": 12790 + }, + { + "epoch": 9.01, + "grad_norm": 0.00047639888362027705, + "learning_rate": 4.834363607544778e-06, + "loss": 0.0001, + "step": 12800 + }, + { + "epoch": 9.01, + "grad_norm": 0.0015658453339710832, + "learning_rate": 4.794737676335394e-06, + "loss": 0.0, + "step": 12810 + }, + { + "epoch": 9.01, + "grad_norm": 0.0002700120967347175, + "learning_rate": 4.755111745126011e-06, + "loss": 0.0, + "step": 12820 + }, + { + "epoch": 9.01, + "grad_norm": 0.00036174681736156344, + "learning_rate": 4.715485813916627e-06, + "loss": 0.0, + "step": 12830 + }, + { + "epoch": 9.02, + "grad_norm": 0.00048193742986768484, + "learning_rate": 4.675859882707244e-06, + "loss": 0.0001, + "step": 12840 + }, + { + "epoch": 9.02, + "grad_norm": 0.00021181856573093683, + "learning_rate": 4.63623395149786e-06, + "loss": 0.0, + "step": 12850 + }, + { + "epoch": 9.02, + "grad_norm": 0.0007221151608973742, + "learning_rate": 4.5966080202884774e-06, + "loss": 0.0001, + "step": 12860 + }, + { + "epoch": 9.02, + "grad_norm": 0.0008499003597535193, + "learning_rate": 4.556982089079094e-06, + "loss": 0.0, + "step": 12870 + }, + { + "epoch": 9.02, + "grad_norm": 0.00024478594423271716, + "learning_rate": 4.51735615786971e-06, + "loss": 0.0, + "step": 12880 + }, + { + "epoch": 9.02, + "grad_norm": 0.000799850036855787, + "learning_rate": 4.477730226660327e-06, + "loss": 0.0, + "step": 12890 + }, + { + "epoch": 9.02, + "grad_norm": 0.0012479170691221952, + "learning_rate": 4.438104295450943e-06, + "loss": 0.0007, + "step": 12900 + }, + { + "epoch": 9.02, + "grad_norm": 0.0008572249207645655, + "learning_rate": 4.3984783642415604e-06, + "loss": 0.0, + "step": 12910 + }, + { + "epoch": 9.02, + "grad_norm": 0.00028230881434865296, + "learning_rate": 4.358852433032176e-06, + "loss": 0.0773, + "step": 12920 + }, + { + "epoch": 9.02, + "grad_norm": 0.0003641119983512908, + "learning_rate": 4.319226501822793e-06, + "loss": 0.0, + "step": 12930 + }, + { + "epoch": 9.02, + "grad_norm": 0.0009531981777399778, + "learning_rate": 4.279600570613409e-06, + "loss": 0.0, + "step": 12940 + }, + { + "epoch": 9.02, + "grad_norm": 0.00067020149435848, + "learning_rate": 4.239974639404026e-06, + "loss": 0.0, + "step": 12950 + }, + { + "epoch": 9.02, + "grad_norm": 0.0001659138360992074, + "learning_rate": 4.200348708194643e-06, + "loss": 0.0, + "step": 12960 + }, + { + "epoch": 9.02, + "grad_norm": 0.0005148449563421309, + "learning_rate": 4.16072277698526e-06, + "loss": 0.0107, + "step": 12970 + }, + { + "epoch": 9.03, + "grad_norm": 0.000638917728792876, + "learning_rate": 4.121096845775876e-06, + "loss": 0.0, + "step": 12980 + }, + { + "epoch": 9.03, + "grad_norm": 0.00047383896890096366, + "learning_rate": 4.081470914566492e-06, + "loss": 0.0, + "step": 12990 + }, + { + "epoch": 9.03, + "grad_norm": 0.0007675238302908838, + "learning_rate": 4.041844983357109e-06, + "loss": 0.0, + "step": 13000 + }, + { + "epoch": 9.03, + "grad_norm": 0.001697351224720478, + "learning_rate": 4.0022190521477256e-06, + "loss": 0.0, + "step": 13010 + }, + { + "epoch": 9.03, + "grad_norm": 0.00020665867486968637, + "learning_rate": 3.962593120938343e-06, + "loss": 0.0, + "step": 13020 + }, + { + "epoch": 9.03, + "grad_norm": 0.001027750549837947, + "learning_rate": 3.922967189728959e-06, + "loss": 0.2632, + "step": 13030 + }, + { + "epoch": 9.03, + "grad_norm": 0.003146632807329297, + "learning_rate": 3.883341258519575e-06, + "loss": 0.0, + "step": 13040 + }, + { + "epoch": 9.03, + "grad_norm": 0.0007864089566282928, + "learning_rate": 3.843715327310192e-06, + "loss": 0.0044, + "step": 13050 + }, + { + "epoch": 9.03, + "grad_norm": 0.00022077991161495447, + "learning_rate": 3.8040893961008086e-06, + "loss": 0.0, + "step": 13060 + }, + { + "epoch": 9.03, + "grad_norm": 0.0005595972179435194, + "learning_rate": 3.7644634648914252e-06, + "loss": 0.0, + "step": 13070 + }, + { + "epoch": 9.03, + "grad_norm": 0.0005725977243855596, + "learning_rate": 3.7248375336820415e-06, + "loss": 0.0, + "step": 13080 + }, + { + "epoch": 9.03, + "grad_norm": 0.0011127095203846693, + "learning_rate": 3.685211602472658e-06, + "loss": 0.0, + "step": 13090 + }, + { + "epoch": 9.03, + "grad_norm": 0.001887647551484406, + "learning_rate": 3.645585671263275e-06, + "loss": 0.0, + "step": 13100 + }, + { + "epoch": 9.03, + "grad_norm": 0.0005976618267595768, + "learning_rate": 3.6059597400538916e-06, + "loss": 0.0003, + "step": 13110 + }, + { + "epoch": 9.04, + "grad_norm": 0.0006656855694018304, + "learning_rate": 3.5663338088445082e-06, + "loss": 0.0, + "step": 13120 + }, + { + "epoch": 9.04, + "grad_norm": 0.003439901163801551, + "learning_rate": 3.526707877635125e-06, + "loss": 0.0, + "step": 13130 + }, + { + "epoch": 9.04, + "grad_norm": 0.00043997442116960883, + "learning_rate": 3.4870819464257408e-06, + "loss": 0.0, + "step": 13140 + }, + { + "epoch": 9.04, + "grad_norm": 0.0005484743160195649, + "learning_rate": 3.4474560152163574e-06, + "loss": 0.0, + "step": 13150 + }, + { + "epoch": 9.04, + "grad_norm": 0.00040827819611877203, + "learning_rate": 3.4078300840069746e-06, + "loss": 0.0, + "step": 13160 + }, + { + "epoch": 9.04, + "grad_norm": 0.005499335937201977, + "learning_rate": 3.3682041527975912e-06, + "loss": 0.0001, + "step": 13170 + }, + { + "epoch": 9.04, + "grad_norm": 0.001736334292218089, + "learning_rate": 3.328578221588208e-06, + "loss": 0.0, + "step": 13180 + }, + { + "epoch": 9.04, + "grad_norm": 0.0006113905692473054, + "learning_rate": 3.2889522903788238e-06, + "loss": 0.0, + "step": 13190 + }, + { + "epoch": 9.04, + "grad_norm": 0.001001613331027329, + "learning_rate": 3.2493263591694404e-06, + "loss": 0.1631, + "step": 13200 + }, + { + "epoch": 9.04, + "grad_norm": 0.0003023295139428228, + "learning_rate": 3.209700427960057e-06, + "loss": 0.0, + "step": 13210 + }, + { + "epoch": 9.04, + "grad_norm": 0.0009469907963648438, + "learning_rate": 3.170074496750674e-06, + "loss": 0.0, + "step": 13220 + }, + { + "epoch": 9.04, + "grad_norm": 0.0007909215637482703, + "learning_rate": 3.1304485655412905e-06, + "loss": 0.0, + "step": 13230 + }, + { + "epoch": 9.04, + "grad_norm": 0.001787104643881321, + "learning_rate": 3.0908226343319067e-06, + "loss": 0.0, + "step": 13240 + }, + { + "epoch": 9.04, + "grad_norm": 0.0008837388013489544, + "learning_rate": 3.051196703122524e-06, + "loss": 0.0, + "step": 13250 + }, + { + "epoch": 9.05, + "grad_norm": 0.0007934242021292448, + "learning_rate": 3.01157077191314e-06, + "loss": 0.0004, + "step": 13260 + }, + { + "epoch": 9.05, + "grad_norm": 0.0011570610804483294, + "learning_rate": 2.971944840703757e-06, + "loss": 0.0001, + "step": 13270 + }, + { + "epoch": 9.05, + "grad_norm": 0.00029090800671838224, + "learning_rate": 2.932318909494373e-06, + "loss": 0.0, + "step": 13280 + }, + { + "epoch": 9.05, + "grad_norm": 0.0010709144407883286, + "learning_rate": 2.8926929782849897e-06, + "loss": 0.0, + "step": 13290 + }, + { + "epoch": 9.05, + "grad_norm": 0.001289168605580926, + "learning_rate": 2.8530670470756064e-06, + "loss": 0.0, + "step": 13300 + }, + { + "epoch": 9.05, + "grad_norm": 0.002187453443184495, + "learning_rate": 2.8134411158662227e-06, + "loss": 0.0, + "step": 13310 + }, + { + "epoch": 9.05, + "grad_norm": 0.0007116499473340809, + "learning_rate": 2.7738151846568394e-06, + "loss": 0.0, + "step": 13320 + }, + { + "epoch": 9.05, + "grad_norm": 0.000514859682880342, + "learning_rate": 2.7341892534474565e-06, + "loss": 0.0001, + "step": 13330 + }, + { + "epoch": 9.05, + "grad_norm": 0.0007328620995394886, + "learning_rate": 2.6945633222380727e-06, + "loss": 0.0691, + "step": 13340 + }, + { + "epoch": 9.05, + "grad_norm": 0.0007036814349703491, + "learning_rate": 2.6549373910286894e-06, + "loss": 0.0, + "step": 13350 + }, + { + "epoch": 9.05, + "grad_norm": 0.001070524798706174, + "learning_rate": 2.615311459819306e-06, + "loss": 0.0, + "step": 13360 + }, + { + "epoch": 9.05, + "grad_norm": 0.0008939993567764759, + "learning_rate": 2.5756855286099224e-06, + "loss": 0.0001, + "step": 13370 + }, + { + "epoch": 9.05, + "grad_norm": 0.0004034818266518414, + "learning_rate": 2.536059597400539e-06, + "loss": 0.0206, + "step": 13380 + }, + { + "epoch": 9.05, + "grad_norm": 4.411261081695557, + "learning_rate": 2.4964336661911553e-06, + "loss": 0.0124, + "step": 13390 + }, + { + "epoch": 9.06, + "grad_norm": 0.0006528793601319194, + "learning_rate": 2.456807734981772e-06, + "loss": 0.0, + "step": 13400 + }, + { + "epoch": 9.06, + "grad_norm": 0.0003673941537272185, + "learning_rate": 2.417181803772389e-06, + "loss": 0.0, + "step": 13410 + }, + { + "epoch": 9.06, + "grad_norm": 0.001056182780303061, + "learning_rate": 2.3775558725630054e-06, + "loss": 0.0, + "step": 13420 + }, + { + "epoch": 9.06, + "grad_norm": 0.0012370526092126966, + "learning_rate": 2.337929941353622e-06, + "loss": 0.0001, + "step": 13430 + }, + { + "epoch": 9.06, + "grad_norm": 0.0015783560229465365, + "learning_rate": 2.2983040101442387e-06, + "loss": 0.0, + "step": 13440 + }, + { + "epoch": 9.06, + "grad_norm": 0.0001985041017178446, + "learning_rate": 2.258678078934855e-06, + "loss": 0.0016, + "step": 13450 + }, + { + "epoch": 9.06, + "grad_norm": 0.0010269788326695561, + "learning_rate": 2.2190521477254717e-06, + "loss": 0.1057, + "step": 13460 + }, + { + "epoch": 9.06, + "grad_norm": 0.04036625847220421, + "learning_rate": 2.179426216516088e-06, + "loss": 0.2287, + "step": 13470 + }, + { + "epoch": 9.06, + "grad_norm": 0.000473200052510947, + "learning_rate": 2.1398002853067046e-06, + "loss": 0.0001, + "step": 13480 + }, + { + "epoch": 9.06, + "grad_norm": 0.0003723807749338448, + "learning_rate": 2.1001743540973213e-06, + "loss": 0.0, + "step": 13490 + }, + { + "epoch": 9.06, + "grad_norm": 0.0007169354357756674, + "learning_rate": 2.060548422887938e-06, + "loss": 0.0008, + "step": 13500 + }, + { + "epoch": 9.06, + "grad_norm": 0.00031334979576058686, + "learning_rate": 2.0209224916785547e-06, + "loss": 0.0, + "step": 13510 + }, + { + "epoch": 9.06, + "grad_norm": 0.000616435194388032, + "learning_rate": 1.9812965604691713e-06, + "loss": 0.0, + "step": 13520 + }, + { + "epoch": 9.06, + "grad_norm": 0.0008787320111878216, + "learning_rate": 1.9416706292597876e-06, + "loss": 0.0, + "step": 13530 + }, + { + "epoch": 9.07, + "grad_norm": 0.0002825538394972682, + "learning_rate": 1.9020446980504043e-06, + "loss": 0.0021, + "step": 13540 + }, + { + "epoch": 9.07, + "grad_norm": 0.002063804306089878, + "learning_rate": 1.8624187668410208e-06, + "loss": 0.0004, + "step": 13550 + }, + { + "epoch": 9.07, + "grad_norm": 0.000512151513248682, + "learning_rate": 1.8227928356316374e-06, + "loss": 0.0, + "step": 13560 + }, + { + "epoch": 9.07, + "grad_norm": 0.0006224968819878995, + "learning_rate": 1.7831669044222541e-06, + "loss": 0.0, + "step": 13570 + }, + { + "epoch": 9.07, + "grad_norm": 0.00019008757953997701, + "learning_rate": 1.7435409732128704e-06, + "loss": 0.0, + "step": 13580 + }, + { + "epoch": 9.07, + "grad_norm": 0.0002794242464005947, + "learning_rate": 1.7039150420034873e-06, + "loss": 0.0, + "step": 13590 + }, + { + "epoch": 9.07, + "grad_norm": 0.0009566029766574502, + "learning_rate": 1.664289110794104e-06, + "loss": 0.0001, + "step": 13600 + }, + { + "epoch": 9.07, + "grad_norm": 0.0003199617494828999, + "learning_rate": 1.6246631795847202e-06, + "loss": 0.0001, + "step": 13610 + }, + { + "epoch": 9.07, + "grad_norm": 0.00032697312417440116, + "learning_rate": 1.585037248375337e-06, + "loss": 0.0, + "step": 13620 + }, + { + "epoch": 9.07, + "grad_norm": 0.002565112430602312, + "learning_rate": 1.5454113171659534e-06, + "loss": 0.0, + "step": 13630 + }, + { + "epoch": 9.07, + "grad_norm": 237.59519958496094, + "learning_rate": 1.50578538595657e-06, + "loss": 0.1313, + "step": 13640 + }, + { + "epoch": 9.07, + "grad_norm": 0.0006662964588031173, + "learning_rate": 1.4661594547471865e-06, + "loss": 0.0, + "step": 13650 + }, + { + "epoch": 9.07, + "grad_norm": 0.0011941486736759543, + "learning_rate": 1.4265335235378032e-06, + "loss": 0.0, + "step": 13660 + }, + { + "epoch": 9.07, + "grad_norm": 0.0028123382944613695, + "learning_rate": 1.3869075923284197e-06, + "loss": 0.0, + "step": 13670 + }, + { + "epoch": 9.08, + "grad_norm": 0.0008815588662400842, + "learning_rate": 1.3472816611190364e-06, + "loss": 0.0277, + "step": 13680 + }, + { + "epoch": 9.08, + "grad_norm": 0.00045147593482397497, + "learning_rate": 1.307655729909653e-06, + "loss": 0.0, + "step": 13690 + }, + { + "epoch": 9.08, + "grad_norm": 0.00011046286817872897, + "learning_rate": 1.2680297987002695e-06, + "loss": 0.1484, + "step": 13700 + }, + { + "epoch": 9.08, + "grad_norm": 0.0018034332897514105, + "learning_rate": 1.228403867490886e-06, + "loss": 0.0027, + "step": 13710 + }, + { + "epoch": 9.08, + "grad_norm": 0.000713842804543674, + "learning_rate": 1.1887779362815027e-06, + "loss": 0.0, + "step": 13720 + }, + { + "epoch": 9.08, + "grad_norm": 0.0010389587841928005, + "learning_rate": 1.1491520050721194e-06, + "loss": 0.0, + "step": 13730 + }, + { + "epoch": 9.08, + "grad_norm": 0.0003368295438122004, + "learning_rate": 1.1095260738627358e-06, + "loss": 0.0, + "step": 13740 + }, + { + "epoch": 9.08, + "grad_norm": 0.000346412300132215, + "learning_rate": 1.0699001426533523e-06, + "loss": 0.0, + "step": 13750 + }, + { + "epoch": 9.08, + "grad_norm": 0.0004677934921346605, + "learning_rate": 1.030274211443969e-06, + "loss": 0.0, + "step": 13760 + }, + { + "epoch": 9.08, + "grad_norm": 0.0008401199011132121, + "learning_rate": 9.906482802345857e-07, + "loss": 0.0, + "step": 13770 + }, + { + "epoch": 9.08, + "grad_norm": 0.0003339408722240478, + "learning_rate": 9.510223490252021e-07, + "loss": 0.0002, + "step": 13780 + }, + { + "epoch": 9.08, + "grad_norm": 0.0004967558197677135, + "learning_rate": 9.113964178158187e-07, + "loss": 0.0, + "step": 13790 + }, + { + "epoch": 9.08, + "grad_norm": 0.002963978098705411, + "learning_rate": 8.717704866064352e-07, + "loss": 0.0, + "step": 13800 + }, + { + "epoch": 9.08, + "grad_norm": 0.001155543839558959, + "learning_rate": 8.32144555397052e-07, + "loss": 0.0, + "step": 13810 + }, + { + "epoch": 9.09, + "grad_norm": 0.000786484801210463, + "learning_rate": 7.925186241876685e-07, + "loss": 0.1625, + "step": 13820 + }, + { + "epoch": 9.09, + "grad_norm": 0.0002841146197170019, + "learning_rate": 7.52892692978285e-07, + "loss": 0.0, + "step": 13830 + }, + { + "epoch": 9.09, + "grad_norm": 0.00030605948995798826, + "learning_rate": 7.132667617689016e-07, + "loss": 0.0, + "step": 13840 + }, + { + "epoch": 9.09, + "grad_norm": 0.001265210215933621, + "learning_rate": 6.736408305595182e-07, + "loss": 0.0, + "step": 13850 + }, + { + "epoch": 9.09, + "grad_norm": 0.00038683577440679073, + "learning_rate": 6.340148993501348e-07, + "loss": 0.0, + "step": 13860 + }, + { + "epoch": 9.09, + "grad_norm": 0.0005034942296333611, + "learning_rate": 5.943889681407513e-07, + "loss": 0.0, + "step": 13870 + }, + { + "epoch": 9.09, + "grad_norm": 0.0011582579463720322, + "learning_rate": 5.547630369313679e-07, + "loss": 0.0, + "step": 13880 + }, + { + "epoch": 9.09, + "grad_norm": 0.0016904632793739438, + "learning_rate": 5.151371057219845e-07, + "loss": 0.0, + "step": 13890 + }, + { + "epoch": 9.09, + "grad_norm": 0.00032329061650671065, + "learning_rate": 4.7551117451260107e-07, + "loss": 0.0, + "step": 13900 + }, + { + "epoch": 9.09, + "grad_norm": 0.0003388900659047067, + "learning_rate": 4.358852433032176e-07, + "loss": 0.0, + "step": 13910 + }, + { + "epoch": 9.09, + "grad_norm": 0.0003800652630161494, + "learning_rate": 3.962593120938342e-07, + "loss": 0.0, + "step": 13920 + }, + { + "epoch": 9.09, + "grad_norm": 0.0009641946526244283, + "learning_rate": 3.566333808844508e-07, + "loss": 0.0, + "step": 13930 + }, + { + "epoch": 9.09, + "grad_norm": 0.0005723941139876842, + "learning_rate": 3.170074496750674e-07, + "loss": 0.0001, + "step": 13940 + }, + { + "epoch": 9.09, + "grad_norm": 0.0005183956818655133, + "learning_rate": 2.7738151846568396e-07, + "loss": 0.0, + "step": 13950 + }, + { + "epoch": 9.1, + "grad_norm": 0.009076601825654507, + "learning_rate": 2.3775558725630054e-07, + "loss": 0.0, + "step": 13960 + }, + { + "epoch": 9.1, + "grad_norm": 0.0007901808712631464, + "learning_rate": 1.981296560469171e-07, + "loss": 0.0, + "step": 13970 + }, + { + "epoch": 9.1, + "grad_norm": 0.0005284142098389566, + "learning_rate": 1.585037248375337e-07, + "loss": 0.0, + "step": 13980 + }, + { + "epoch": 9.1, + "grad_norm": 0.0006428571650758386, + "learning_rate": 1.1887779362815027e-07, + "loss": 0.0, + "step": 13990 + }, + { + "epoch": 9.1, + "grad_norm": 0.0012319569941610098, + "learning_rate": 7.925186241876685e-08, + "loss": 0.0001, + "step": 14000 + }, + { + "epoch": 9.1, + "grad_norm": 0.000267757655819878, + "learning_rate": 3.962593120938342e-08, + "loss": 0.0, + "step": 14010 + }, + { + "epoch": 9.1, + "grad_norm": 0.0010718012927100062, + "learning_rate": 0.0, + "loss": 0.0, + "step": 14020 + }, + { + "epoch": 9.1, + "eval_accuracy": 0.9654135338345865, + "eval_loss": 0.24323464930057526, + "eval_runtime": 2339.7693, + "eval_samples_per_second": 0.284, + "eval_steps_per_second": 0.142, + "step": 14020 + }, + { + "epoch": 9.1, + "step": 14020, + "total_flos": 7.1819242300007645e+19, + "train_loss": 0.21922695452951727, + "train_runtime": 145352.8053, + "train_samples_per_second": 0.193, + "train_steps_per_second": 0.096 + }, + { + "epoch": 9.1, + "eval_accuracy": 0.960960960960961, + "eval_loss": 0.25779759883880615, + "eval_runtime": 1196.8626, + "eval_samples_per_second": 0.278, + "eval_steps_per_second": 0.14, + "step": 14020 + }, + { + "epoch": 9.1, + "eval_accuracy": 0.960960960960961, + "eval_loss": 0.25779759883880615, + "eval_runtime": 1193.7233, + "eval_samples_per_second": 0.279, + "eval_steps_per_second": 0.14, + "step": 14020 + } + ], + "logging_steps": 10, + "max_steps": 14020, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "total_flos": 7.1819242300007645e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}