diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10657 @@ +{ + "best_metric": 0.5269582909460834, + "best_model_checkpoint": "videomae-base-finetuned-Cheh2_ucf_light_demo-10epochs/checkpoint-13518", + "epoch": 9.099400399733511, + "eval_steps": 500, + "global_step": 15010, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006662225183211193, + "grad_norm": 7.63810396194458, + "learning_rate": 3.3311125916055966e-07, + "loss": 2.8842, + "step": 10 + }, + { + "epoch": 0.0013324450366422385, + "grad_norm": 7.92678689956665, + "learning_rate": 6.662225183211193e-07, + "loss": 2.7855, + "step": 20 + }, + { + "epoch": 0.001998667554963358, + "grad_norm": 9.115898132324219, + "learning_rate": 9.99333777481679e-07, + "loss": 2.876, + "step": 30 + }, + { + "epoch": 0.002664890073284477, + "grad_norm": 7.452608108520508, + "learning_rate": 1.3324450366422386e-06, + "loss": 2.8942, + "step": 40 + }, + { + "epoch": 0.0033311125916055963, + "grad_norm": 8.414969444274902, + "learning_rate": 1.6655562958027983e-06, + "loss": 2.8683, + "step": 50 + }, + { + "epoch": 0.003997335109926716, + "grad_norm": 9.669262886047363, + "learning_rate": 1.998667554963358e-06, + "loss": 2.9063, + "step": 60 + }, + { + "epoch": 0.004663557628247834, + "grad_norm": 8.873148918151855, + "learning_rate": 2.3317788141239174e-06, + "loss": 2.9122, + "step": 70 + }, + { + "epoch": 0.005329780146568954, + "grad_norm": 7.010265827178955, + "learning_rate": 2.6648900732844773e-06, + "loss": 2.7938, + "step": 80 + }, + { + "epoch": 0.005996002664890073, + "grad_norm": 7.868851661682129, + "learning_rate": 2.9980013324450367e-06, + "loss": 2.8432, + "step": 90 + }, + { + "epoch": 0.006662225183211193, + "grad_norm": 13.779504776000977, + "learning_rate": 3.3311125916055966e-06, + "loss": 2.8557, + "step": 100 + }, + { + "epoch": 0.0073284477015323115, + "grad_norm": 18.025705337524414, + "learning_rate": 3.664223850766156e-06, + "loss": 2.833, + "step": 110 + }, + { + "epoch": 0.007994670219853431, + "grad_norm": 10.714227676391602, + "learning_rate": 3.997335109926716e-06, + "loss": 2.819, + "step": 120 + }, + { + "epoch": 0.008660892738174551, + "grad_norm": 12.980474472045898, + "learning_rate": 4.330446369087275e-06, + "loss": 2.7673, + "step": 130 + }, + { + "epoch": 0.009327115256495669, + "grad_norm": 9.94898796081543, + "learning_rate": 4.663557628247835e-06, + "loss": 2.8013, + "step": 140 + }, + { + "epoch": 0.009993337774816789, + "grad_norm": 11.145480155944824, + "learning_rate": 4.996668887408395e-06, + "loss": 2.7966, + "step": 150 + }, + { + "epoch": 0.010659560293137908, + "grad_norm": 8.752339363098145, + "learning_rate": 5.3297801465689545e-06, + "loss": 2.8563, + "step": 160 + }, + { + "epoch": 0.011325782811459028, + "grad_norm": 10.844465255737305, + "learning_rate": 5.6628914057295136e-06, + "loss": 2.818, + "step": 170 + }, + { + "epoch": 0.011992005329780146, + "grad_norm": 8.572322845458984, + "learning_rate": 5.9960026648900734e-06, + "loss": 2.7323, + "step": 180 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 9.468083381652832, + "learning_rate": 6.329113924050633e-06, + "loss": 2.8321, + "step": 190 + }, + { + "epoch": 0.013324450366422385, + "grad_norm": 9.758465766906738, + "learning_rate": 6.662225183211193e-06, + "loss": 2.7838, + "step": 200 + }, + { + "epoch": 0.013990672884743505, + "grad_norm": 7.190845489501953, + 
"learning_rate": 6.995336442371753e-06, + "loss": 2.7511, + "step": 210 + }, + { + "epoch": 0.014656895403064623, + "grad_norm": 9.061192512512207, + "learning_rate": 7.328447701532312e-06, + "loss": 2.7792, + "step": 220 + }, + { + "epoch": 0.015323117921385743, + "grad_norm": 7.029029369354248, + "learning_rate": 7.661558960692871e-06, + "loss": 2.8318, + "step": 230 + }, + { + "epoch": 0.015989340439706862, + "grad_norm": 8.73095417022705, + "learning_rate": 7.994670219853432e-06, + "loss": 2.6969, + "step": 240 + }, + { + "epoch": 0.016655562958027982, + "grad_norm": 8.359384536743164, + "learning_rate": 8.32778147901399e-06, + "loss": 2.8081, + "step": 250 + }, + { + "epoch": 0.017321785476349102, + "grad_norm": 7.900940895080566, + "learning_rate": 8.66089273817455e-06, + "loss": 2.813, + "step": 260 + }, + { + "epoch": 0.01798800799467022, + "grad_norm": 7.423415184020996, + "learning_rate": 8.99400399733511e-06, + "loss": 2.7452, + "step": 270 + }, + { + "epoch": 0.018654230512991338, + "grad_norm": 7.782347679138184, + "learning_rate": 9.32711525649567e-06, + "loss": 2.7722, + "step": 280 + }, + { + "epoch": 0.019320453031312457, + "grad_norm": 8.399079322814941, + "learning_rate": 9.66022651565623e-06, + "loss": 2.7515, + "step": 290 + }, + { + "epoch": 0.019986675549633577, + "grad_norm": 8.545032501220703, + "learning_rate": 9.99333777481679e-06, + "loss": 2.7359, + "step": 300 + }, + { + "epoch": 0.020652898067954697, + "grad_norm": 8.589181900024414, + "learning_rate": 1.032644903397735e-05, + "loss": 2.725, + "step": 310 + }, + { + "epoch": 0.021319120586275817, + "grad_norm": 8.402008056640625, + "learning_rate": 1.0659560293137909e-05, + "loss": 2.771, + "step": 320 + }, + { + "epoch": 0.021985343104596936, + "grad_norm": 7.227911949157715, + "learning_rate": 1.0992671552298468e-05, + "loss": 2.7502, + "step": 330 + }, + { + "epoch": 0.022651565622918056, + "grad_norm": 7.783188343048096, + "learning_rate": 1.1325782811459027e-05, + "loss": 2.7395, + "step": 340 + }, + { + "epoch": 0.023317788141239172, + "grad_norm": 8.657325744628906, + "learning_rate": 1.1658894070619588e-05, + "loss": 2.7669, + "step": 350 + }, + { + "epoch": 0.023984010659560292, + "grad_norm": 8.297825813293457, + "learning_rate": 1.1992005329780147e-05, + "loss": 2.7388, + "step": 360 + }, + { + "epoch": 0.02465023317788141, + "grad_norm": 9.336974143981934, + "learning_rate": 1.2325116588940706e-05, + "loss": 2.745, + "step": 370 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 8.059700965881348, + "learning_rate": 1.2658227848101267e-05, + "loss": 2.7364, + "step": 380 + }, + { + "epoch": 0.02598267821452365, + "grad_norm": 7.693248748779297, + "learning_rate": 1.2991339107261827e-05, + "loss": 2.6444, + "step": 390 + }, + { + "epoch": 0.02664890073284477, + "grad_norm": 9.74440860748291, + "learning_rate": 1.3324450366422386e-05, + "loss": 2.7298, + "step": 400 + }, + { + "epoch": 0.02731512325116589, + "grad_norm": 7.748172283172607, + "learning_rate": 1.3657561625582945e-05, + "loss": 2.7692, + "step": 410 + }, + { + "epoch": 0.02798134576948701, + "grad_norm": 7.931158542633057, + "learning_rate": 1.3990672884743506e-05, + "loss": 2.7054, + "step": 420 + }, + { + "epoch": 0.028647568287808126, + "grad_norm": 8.028225898742676, + "learning_rate": 1.4323784143904065e-05, + "loss": 2.7396, + "step": 430 + }, + { + "epoch": 0.029313790806129246, + "grad_norm": 8.826229095458984, + "learning_rate": 1.4656895403064624e-05, + "loss": 2.6486, + "step": 440 + }, + { + "epoch": 
0.029980013324450366, + "grad_norm": 7.829557418823242, + "learning_rate": 1.4990006662225183e-05, + "loss": 2.7363, + "step": 450 + }, + { + "epoch": 0.030646235842771485, + "grad_norm": 8.903072357177734, + "learning_rate": 1.5323117921385742e-05, + "loss": 2.625, + "step": 460 + }, + { + "epoch": 0.0313124583610926, + "grad_norm": 7.610354900360107, + "learning_rate": 1.5656229180546303e-05, + "loss": 2.7232, + "step": 470 + }, + { + "epoch": 0.031978680879413725, + "grad_norm": 7.955471515655518, + "learning_rate": 1.5989340439706864e-05, + "loss": 2.727, + "step": 480 + }, + { + "epoch": 0.03264490339773484, + "grad_norm": 8.275619506835938, + "learning_rate": 1.632245169886742e-05, + "loss": 2.6826, + "step": 490 + }, + { + "epoch": 0.033311125916055964, + "grad_norm": 8.786977767944336, + "learning_rate": 1.665556295802798e-05, + "loss": 2.7416, + "step": 500 + }, + { + "epoch": 0.03397734843437708, + "grad_norm": 9.151405334472656, + "learning_rate": 1.6988674217188542e-05, + "loss": 2.6067, + "step": 510 + }, + { + "epoch": 0.034643570952698204, + "grad_norm": 8.654767036437988, + "learning_rate": 1.73217854763491e-05, + "loss": 2.6428, + "step": 520 + }, + { + "epoch": 0.03530979347101932, + "grad_norm": 8.000008583068848, + "learning_rate": 1.765489673550966e-05, + "loss": 2.6457, + "step": 530 + }, + { + "epoch": 0.03597601598934044, + "grad_norm": 8.926780700683594, + "learning_rate": 1.798800799467022e-05, + "loss": 2.6917, + "step": 540 + }, + { + "epoch": 0.03664223850766156, + "grad_norm": 9.491806983947754, + "learning_rate": 1.832111925383078e-05, + "loss": 2.5757, + "step": 550 + }, + { + "epoch": 0.037308461025982675, + "grad_norm": 11.303333282470703, + "learning_rate": 1.865423051299134e-05, + "loss": 2.6069, + "step": 560 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 9.33780574798584, + "learning_rate": 1.89873417721519e-05, + "loss": 2.5802, + "step": 570 + }, + { + "epoch": 0.038640906062624915, + "grad_norm": 10.358466148376465, + "learning_rate": 1.932045303131246e-05, + "loss": 2.6637, + "step": 580 + }, + { + "epoch": 0.03930712858094604, + "grad_norm": 9.639898300170898, + "learning_rate": 1.965356429047302e-05, + "loss": 2.6555, + "step": 590 + }, + { + "epoch": 0.039973351099267154, + "grad_norm": 9.707630157470703, + "learning_rate": 1.998667554963358e-05, + "loss": 2.6017, + "step": 600 + }, + { + "epoch": 0.04063957361758828, + "grad_norm": 9.97517204284668, + "learning_rate": 2.031978680879414e-05, + "loss": 2.5346, + "step": 610 + }, + { + "epoch": 0.041305796135909394, + "grad_norm": 9.079474449157715, + "learning_rate": 2.06528980679547e-05, + "loss": 2.6604, + "step": 620 + }, + { + "epoch": 0.04197201865423051, + "grad_norm": 8.746962547302246, + "learning_rate": 2.0986009327115257e-05, + "loss": 2.615, + "step": 630 + }, + { + "epoch": 0.04263824117255163, + "grad_norm": 9.94512939453125, + "learning_rate": 2.1319120586275818e-05, + "loss": 2.6732, + "step": 640 + }, + { + "epoch": 0.04330446369087275, + "grad_norm": 9.626450538635254, + "learning_rate": 2.1652231845436375e-05, + "loss": 2.4743, + "step": 650 + }, + { + "epoch": 0.04397068620919387, + "grad_norm": 8.614361763000488, + "learning_rate": 2.1985343104596936e-05, + "loss": 2.6106, + "step": 660 + }, + { + "epoch": 0.04463690872751499, + "grad_norm": 9.120862007141113, + "learning_rate": 2.2318454363757497e-05, + "loss": 2.5029, + "step": 670 + }, + { + "epoch": 0.04530313124583611, + "grad_norm": 11.924278259277344, + "learning_rate": 2.2651565622918054e-05, + "loss": 2.5521, 
+ "step": 680 + }, + { + "epoch": 0.04596935376415723, + "grad_norm": 10.966530799865723, + "learning_rate": 2.2984676882078615e-05, + "loss": 2.5866, + "step": 690 + }, + { + "epoch": 0.046635576282478344, + "grad_norm": 9.447751998901367, + "learning_rate": 2.3317788141239176e-05, + "loss": 2.4353, + "step": 700 + }, + { + "epoch": 0.04730179880079947, + "grad_norm": 11.329916954040527, + "learning_rate": 2.3650899400399733e-05, + "loss": 2.508, + "step": 710 + }, + { + "epoch": 0.047968021319120584, + "grad_norm": 13.041949272155762, + "learning_rate": 2.3984010659560294e-05, + "loss": 2.4598, + "step": 720 + }, + { + "epoch": 0.04863424383744171, + "grad_norm": 10.894869804382324, + "learning_rate": 2.4317121918720854e-05, + "loss": 2.3893, + "step": 730 + }, + { + "epoch": 0.04930046635576282, + "grad_norm": 9.811487197875977, + "learning_rate": 2.4650233177881412e-05, + "loss": 2.4279, + "step": 740 + }, + { + "epoch": 0.049966688874083946, + "grad_norm": 12.670271873474121, + "learning_rate": 2.4983344437041972e-05, + "loss": 2.5177, + "step": 750 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 12.06639575958252, + "learning_rate": 2.5316455696202533e-05, + "loss": 2.378, + "step": 760 + }, + { + "epoch": 0.051299133910726186, + "grad_norm": 11.615519523620605, + "learning_rate": 2.564956695536309e-05, + "loss": 2.4431, + "step": 770 + }, + { + "epoch": 0.0519653564290473, + "grad_norm": 11.830503463745117, + "learning_rate": 2.5982678214523655e-05, + "loss": 2.5298, + "step": 780 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 13.209222793579102, + "learning_rate": 2.6315789473684212e-05, + "loss": 2.5268, + "step": 790 + }, + { + "epoch": 0.05329780146568954, + "grad_norm": 11.294581413269043, + "learning_rate": 2.6648900732844773e-05, + "loss": 2.4935, + "step": 800 + }, + { + "epoch": 0.05396402398401066, + "grad_norm": 10.616040229797363, + "learning_rate": 2.698201199200533e-05, + "loss": 2.2216, + "step": 810 + }, + { + "epoch": 0.05463024650233178, + "grad_norm": 10.795992851257324, + "learning_rate": 2.731512325116589e-05, + "loss": 2.49, + "step": 820 + }, + { + "epoch": 0.0552964690206529, + "grad_norm": 9.622115135192871, + "learning_rate": 2.7648234510326448e-05, + "loss": 2.459, + "step": 830 + }, + { + "epoch": 0.05596269153897402, + "grad_norm": 8.9458646774292, + "learning_rate": 2.7981345769487012e-05, + "loss": 2.2603, + "step": 840 + }, + { + "epoch": 0.056628914057295136, + "grad_norm": 9.720725059509277, + "learning_rate": 2.8314457028647566e-05, + "loss": 2.4335, + "step": 850 + }, + { + "epoch": 0.05729513657561625, + "grad_norm": 8.741358757019043, + "learning_rate": 2.864756828780813e-05, + "loss": 2.5508, + "step": 860 + }, + { + "epoch": 0.057961359093937376, + "grad_norm": 10.076268196105957, + "learning_rate": 2.8980679546968688e-05, + "loss": 2.4548, + "step": 870 + }, + { + "epoch": 0.05862758161225849, + "grad_norm": 11.318229675292969, + "learning_rate": 2.9313790806129248e-05, + "loss": 2.5112, + "step": 880 + }, + { + "epoch": 0.059293804130579615, + "grad_norm": 11.918413162231445, + "learning_rate": 2.9646902065289806e-05, + "loss": 2.3348, + "step": 890 + }, + { + "epoch": 0.05996002664890073, + "grad_norm": 10.639049530029297, + "learning_rate": 2.9980013324450366e-05, + "loss": 2.4893, + "step": 900 + }, + { + "epoch": 0.060626249167221855, + "grad_norm": 9.080975532531738, + "learning_rate": 3.031312458361093e-05, + "loss": 2.275, + "step": 910 + }, + { + "epoch": 0.06129247168554297, + "grad_norm": 10.406045913696289, + 
"learning_rate": 3.0646235842771484e-05, + "loss": 2.3774, + "step": 920 + }, + { + "epoch": 0.061958694203864094, + "grad_norm": 21.67115020751953, + "learning_rate": 3.097934710193205e-05, + "loss": 2.5484, + "step": 930 + }, + { + "epoch": 0.0626249167221852, + "grad_norm": 12.380255699157715, + "learning_rate": 3.1312458361092606e-05, + "loss": 2.4908, + "step": 940 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 8.405176162719727, + "learning_rate": 3.1645569620253167e-05, + "loss": 2.3972, + "step": 950 + }, + { + "epoch": 0.06395736175882745, + "grad_norm": 9.598671913146973, + "learning_rate": 3.197868087941373e-05, + "loss": 2.4328, + "step": 960 + }, + { + "epoch": 0.06462358427714857, + "grad_norm": 9.85934066772461, + "learning_rate": 3.231179213857429e-05, + "loss": 2.3984, + "step": 970 + }, + { + "epoch": 0.06528980679546968, + "grad_norm": 9.67108154296875, + "learning_rate": 3.264490339773484e-05, + "loss": 2.3469, + "step": 980 + }, + { + "epoch": 0.06595602931379081, + "grad_norm": 14.794262886047363, + "learning_rate": 3.297801465689541e-05, + "loss": 2.3402, + "step": 990 + }, + { + "epoch": 0.06662225183211193, + "grad_norm": 11.501021385192871, + "learning_rate": 3.331112591605596e-05, + "loss": 2.3241, + "step": 1000 + }, + { + "epoch": 0.06728847435043304, + "grad_norm": 11.72728443145752, + "learning_rate": 3.3644237175216524e-05, + "loss": 2.236, + "step": 1010 + }, + { + "epoch": 0.06795469686875416, + "grad_norm": 10.66474723815918, + "learning_rate": 3.3977348434377085e-05, + "loss": 2.4494, + "step": 1020 + }, + { + "epoch": 0.06862091938707528, + "grad_norm": 11.998071670532227, + "learning_rate": 3.4310459693537645e-05, + "loss": 2.4495, + "step": 1030 + }, + { + "epoch": 0.06928714190539641, + "grad_norm": 10.421819686889648, + "learning_rate": 3.46435709526982e-05, + "loss": 2.3546, + "step": 1040 + }, + { + "epoch": 0.06995336442371752, + "grad_norm": 9.24685001373291, + "learning_rate": 3.497668221185876e-05, + "loss": 2.2899, + "step": 1050 + }, + { + "epoch": 0.07061958694203864, + "grad_norm": 10.867138862609863, + "learning_rate": 3.530979347101932e-05, + "loss": 2.3759, + "step": 1060 + }, + { + "epoch": 0.07128580946035976, + "grad_norm": 18.8386173248291, + "learning_rate": 3.564290473017988e-05, + "loss": 2.353, + "step": 1070 + }, + { + "epoch": 0.07195203197868089, + "grad_norm": 13.193479537963867, + "learning_rate": 3.597601598934044e-05, + "loss": 2.3598, + "step": 1080 + }, + { + "epoch": 0.072618254497002, + "grad_norm": 11.135821342468262, + "learning_rate": 3.6309127248501e-05, + "loss": 2.0379, + "step": 1090 + }, + { + "epoch": 0.07328447701532312, + "grad_norm": 11.465619087219238, + "learning_rate": 3.664223850766156e-05, + "loss": 2.0553, + "step": 1100 + }, + { + "epoch": 0.07395069953364423, + "grad_norm": 9.789112091064453, + "learning_rate": 3.697534976682212e-05, + "loss": 2.4473, + "step": 1110 + }, + { + "epoch": 0.07461692205196535, + "grad_norm": 10.19888687133789, + "learning_rate": 3.730846102598268e-05, + "loss": 2.179, + "step": 1120 + }, + { + "epoch": 0.07528314457028648, + "grad_norm": 9.195366859436035, + "learning_rate": 3.764157228514324e-05, + "loss": 2.1104, + "step": 1130 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 11.891404151916504, + "learning_rate": 3.79746835443038e-05, + "loss": 2.1634, + "step": 1140 + }, + { + "epoch": 0.07661558960692871, + "grad_norm": 12.181055068969727, + "learning_rate": 3.830779480346436e-05, + "loss": 2.3321, + "step": 1150 + }, + { + "epoch": 
0.07728181212524983, + "grad_norm": 10.981636047363281, + "learning_rate": 3.864090606262492e-05, + "loss": 2.237, + "step": 1160 + }, + { + "epoch": 0.07794803464357095, + "grad_norm": 10.422065734863281, + "learning_rate": 3.8974017321785475e-05, + "loss": 2.4123, + "step": 1170 + }, + { + "epoch": 0.07861425716189208, + "grad_norm": 14.429110527038574, + "learning_rate": 3.930712858094604e-05, + "loss": 2.389, + "step": 1180 + }, + { + "epoch": 0.07928047968021319, + "grad_norm": 8.421404838562012, + "learning_rate": 3.96402398401066e-05, + "loss": 2.3829, + "step": 1190 + }, + { + "epoch": 0.07994670219853431, + "grad_norm": 10.366717338562012, + "learning_rate": 3.997335109926716e-05, + "loss": 2.2898, + "step": 1200 + }, + { + "epoch": 0.08061292471685542, + "grad_norm": 8.243749618530273, + "learning_rate": 4.030646235842772e-05, + "loss": 2.4065, + "step": 1210 + }, + { + "epoch": 0.08127914723517655, + "grad_norm": 10.87509822845459, + "learning_rate": 4.063957361758828e-05, + "loss": 2.2885, + "step": 1220 + }, + { + "epoch": 0.08194536975349767, + "grad_norm": 11.025871276855469, + "learning_rate": 4.097268487674883e-05, + "loss": 2.3276, + "step": 1230 + }, + { + "epoch": 0.08261159227181879, + "grad_norm": 12.098799705505371, + "learning_rate": 4.13057961359094e-05, + "loss": 2.2235, + "step": 1240 + }, + { + "epoch": 0.0832778147901399, + "grad_norm": 12.713061332702637, + "learning_rate": 4.1638907395069954e-05, + "loss": 2.3264, + "step": 1250 + }, + { + "epoch": 0.08394403730846102, + "grad_norm": 11.777823448181152, + "learning_rate": 4.1972018654230515e-05, + "loss": 2.4952, + "step": 1260 + }, + { + "epoch": 0.08461025982678215, + "grad_norm": 8.089425086975098, + "learning_rate": 4.2305129913391076e-05, + "loss": 2.3291, + "step": 1270 + }, + { + "epoch": 0.08527648234510327, + "grad_norm": 14.143279075622559, + "learning_rate": 4.2638241172551636e-05, + "loss": 2.2375, + "step": 1280 + }, + { + "epoch": 0.08594270486342438, + "grad_norm": 11.689330101013184, + "learning_rate": 4.297135243171219e-05, + "loss": 2.3163, + "step": 1290 + }, + { + "epoch": 0.0866089273817455, + "grad_norm": 8.694790840148926, + "learning_rate": 4.330446369087275e-05, + "loss": 2.3953, + "step": 1300 + }, + { + "epoch": 0.08727514990006663, + "grad_norm": 8.265359878540039, + "learning_rate": 4.363757495003331e-05, + "loss": 2.1699, + "step": 1310 + }, + { + "epoch": 0.08794137241838774, + "grad_norm": 9.13792610168457, + "learning_rate": 4.397068620919387e-05, + "loss": 2.2424, + "step": 1320 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 10.141348838806152, + "learning_rate": 4.430379746835443e-05, + "loss": 2.0845, + "step": 1330 + }, + { + "epoch": 0.08927381745502998, + "grad_norm": 8.910552978515625, + "learning_rate": 4.4636908727514994e-05, + "loss": 2.111, + "step": 1340 + }, + { + "epoch": 0.0899400399733511, + "grad_norm": 11.548059463500977, + "learning_rate": 4.4970019986675555e-05, + "loss": 2.2492, + "step": 1350 + }, + { + "epoch": 0.09060626249167222, + "grad_norm": 11.858217239379883, + "learning_rate": 4.530313124583611e-05, + "loss": 2.2024, + "step": 1360 + }, + { + "epoch": 0.09127248500999334, + "grad_norm": 9.182195663452148, + "learning_rate": 4.5636242504996676e-05, + "loss": 2.258, + "step": 1370 + }, + { + "epoch": 0.09193870752831446, + "grad_norm": 9.36535930633545, + "learning_rate": 4.596935376415723e-05, + "loss": 2.3666, + "step": 1380 + }, + { + "epoch": 0.09260493004663557, + "grad_norm": 13.052865028381348, + "learning_rate": 
4.630246502331779e-05, + "loss": 2.0462, + "step": 1390 + }, + { + "epoch": 0.09327115256495669, + "grad_norm": 15.416093826293945, + "learning_rate": 4.663557628247835e-05, + "loss": 2.1078, + "step": 1400 + }, + { + "epoch": 0.09393737508327782, + "grad_norm": 9.860511779785156, + "learning_rate": 4.696868754163891e-05, + "loss": 2.2692, + "step": 1410 + }, + { + "epoch": 0.09460359760159893, + "grad_norm": 8.188815116882324, + "learning_rate": 4.7301798800799466e-05, + "loss": 2.2741, + "step": 1420 + }, + { + "epoch": 0.09526982011992005, + "grad_norm": 9.153392791748047, + "learning_rate": 4.7634910059960034e-05, + "loss": 2.1341, + "step": 1430 + }, + { + "epoch": 0.09593604263824117, + "grad_norm": 15.457286834716797, + "learning_rate": 4.796802131912059e-05, + "loss": 2.2775, + "step": 1440 + }, + { + "epoch": 0.0966022651565623, + "grad_norm": 14.855579376220703, + "learning_rate": 4.830113257828115e-05, + "loss": 2.1782, + "step": 1450 + }, + { + "epoch": 0.09726848767488341, + "grad_norm": 10.549091339111328, + "learning_rate": 4.863424383744171e-05, + "loss": 2.2352, + "step": 1460 + }, + { + "epoch": 0.09793471019320453, + "grad_norm": 12.650402069091797, + "learning_rate": 4.896735509660227e-05, + "loss": 2.1561, + "step": 1470 + }, + { + "epoch": 0.09860093271152565, + "grad_norm": 10.362966537475586, + "learning_rate": 4.9300466355762824e-05, + "loss": 2.2463, + "step": 1480 + }, + { + "epoch": 0.09926715522984676, + "grad_norm": 12.717658042907715, + "learning_rate": 4.963357761492339e-05, + "loss": 2.2419, + "step": 1490 + }, + { + "epoch": 0.09993337774816789, + "grad_norm": 8.998979568481445, + "learning_rate": 4.9966688874083945e-05, + "loss": 2.3441, + "step": 1500 + }, + { + "epoch": 0.10006662225183212, + "eval_accuracy": 0.24686334350627331, + "eval_loss": 2.3922977447509766, + "eval_runtime": 940.5833, + "eval_samples_per_second": 3.135, + "eval_steps_per_second": 0.392, + "step": 1502 + }, + { + "epoch": 1.0005329780146568, + "grad_norm": 13.393614768981934, + "learning_rate": 4.9966688874083945e-05, + "loss": 2.3347, + "step": 1510 + }, + { + "epoch": 1.001199200532978, + "grad_norm": 12.568906784057617, + "learning_rate": 4.9929676511955e-05, + "loss": 1.9773, + "step": 1520 + }, + { + "epoch": 1.001865423051299, + "grad_norm": 8.994260787963867, + "learning_rate": 4.9892664149826046e-05, + "loss": 2.1767, + "step": 1530 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 9.295744895935059, + "learning_rate": 4.9855651787697094e-05, + "loss": 2.2339, + "step": 1540 + }, + { + "epoch": 1.0031978680879414, + "grad_norm": 10.39157772064209, + "learning_rate": 4.981863942556814e-05, + "loss": 1.9992, + "step": 1550 + }, + { + "epoch": 1.0038640906062626, + "grad_norm": 11.607149124145508, + "learning_rate": 4.978162706343919e-05, + "loss": 2.086, + "step": 1560 + }, + { + "epoch": 1.0045303131245835, + "grad_norm": 12.5213623046875, + "learning_rate": 4.9744614701310236e-05, + "loss": 2.0738, + "step": 1570 + }, + { + "epoch": 1.0051965356429047, + "grad_norm": 11.493557929992676, + "learning_rate": 4.970760233918128e-05, + "loss": 1.9712, + "step": 1580 + }, + { + "epoch": 1.0058627581612258, + "grad_norm": 8.833171844482422, + "learning_rate": 4.967058997705234e-05, + "loss": 2.0976, + "step": 1590 + }, + { + "epoch": 1.006528980679547, + "grad_norm": 15.026812553405762, + "learning_rate": 4.963357761492339e-05, + "loss": 2.0485, + "step": 1600 + }, + { + "epoch": 1.0071952031978681, + "grad_norm": 8.762751579284668, + "learning_rate": 4.959656525279444e-05, + 
"loss": 2.0246, + "step": 1610 + }, + { + "epoch": 1.0078614257161893, + "grad_norm": 14.288644790649414, + "learning_rate": 4.9559552890665486e-05, + "loss": 2.086, + "step": 1620 + }, + { + "epoch": 1.0085276482345102, + "grad_norm": 6.759267807006836, + "learning_rate": 4.952254052853653e-05, + "loss": 2.2369, + "step": 1630 + }, + { + "epoch": 1.0091938707528314, + "grad_norm": 14.177397727966309, + "learning_rate": 4.948552816640758e-05, + "loss": 2.1651, + "step": 1640 + }, + { + "epoch": 1.0098600932711526, + "grad_norm": 10.87800121307373, + "learning_rate": 4.9448515804278634e-05, + "loss": 2.1072, + "step": 1650 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 10.101408004760742, + "learning_rate": 4.941150344214968e-05, + "loss": 2.3265, + "step": 1660 + }, + { + "epoch": 1.0111925383077949, + "grad_norm": 10.579983711242676, + "learning_rate": 4.937449108002073e-05, + "loss": 2.0223, + "step": 1670 + }, + { + "epoch": 1.011858760826116, + "grad_norm": 9.264245986938477, + "learning_rate": 4.9337478717891776e-05, + "loss": 1.9954, + "step": 1680 + }, + { + "epoch": 1.012524983344437, + "grad_norm": 13.724347114562988, + "learning_rate": 4.9300466355762824e-05, + "loss": 2.1786, + "step": 1690 + }, + { + "epoch": 1.0131912058627581, + "grad_norm": 11.379825592041016, + "learning_rate": 4.926345399363387e-05, + "loss": 2.27, + "step": 1700 + }, + { + "epoch": 1.0138574283810793, + "grad_norm": 11.224685668945312, + "learning_rate": 4.9226441631504925e-05, + "loss": 2.3964, + "step": 1710 + }, + { + "epoch": 1.0145236508994004, + "grad_norm": 9.328446388244629, + "learning_rate": 4.918942926937597e-05, + "loss": 2.2096, + "step": 1720 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 9.708972930908203, + "learning_rate": 4.9152416907247026e-05, + "loss": 2.3261, + "step": 1730 + }, + { + "epoch": 1.0158560959360425, + "grad_norm": 13.25235652923584, + "learning_rate": 4.9115404545118074e-05, + "loss": 2.2181, + "step": 1740 + }, + { + "epoch": 1.0165223184543637, + "grad_norm": 9.051215171813965, + "learning_rate": 4.907839218298912e-05, + "loss": 1.9097, + "step": 1750 + }, + { + "epoch": 1.0171885409726849, + "grad_norm": 9.303827285766602, + "learning_rate": 4.904137982086017e-05, + "loss": 2.2561, + "step": 1760 + }, + { + "epoch": 1.017854763491006, + "grad_norm": 8.126070022583008, + "learning_rate": 4.900436745873122e-05, + "loss": 2.0199, + "step": 1770 + }, + { + "epoch": 1.0185209860093272, + "grad_norm": 14.028191566467285, + "learning_rate": 4.896735509660227e-05, + "loss": 2.1409, + "step": 1780 + }, + { + "epoch": 1.0191872085276483, + "grad_norm": 11.869588851928711, + "learning_rate": 4.893034273447332e-05, + "loss": 1.9559, + "step": 1790 + }, + { + "epoch": 1.0198534310459693, + "grad_norm": 11.795266151428223, + "learning_rate": 4.8893330372344364e-05, + "loss": 2.0419, + "step": 1800 + }, + { + "epoch": 1.0205196535642904, + "grad_norm": 11.018795013427734, + "learning_rate": 4.885631801021541e-05, + "loss": 2.043, + "step": 1810 + }, + { + "epoch": 1.0211858760826116, + "grad_norm": 7.332172870635986, + "learning_rate": 4.881930564808646e-05, + "loss": 1.9727, + "step": 1820 + }, + { + "epoch": 1.0218520986009327, + "grad_norm": 13.432587623596191, + "learning_rate": 4.878229328595751e-05, + "loss": 2.0114, + "step": 1830 + }, + { + "epoch": 1.022518321119254, + "grad_norm": 8.070444107055664, + "learning_rate": 4.874528092382856e-05, + "loss": 2.062, + "step": 1840 + }, + { + "epoch": 1.023184543637575, + "grad_norm": 15.205826759338379, + 
"learning_rate": 4.870826856169961e-05, + "loss": 1.933, + "step": 1850 + }, + { + "epoch": 1.023850766155896, + "grad_norm": 12.374849319458008, + "learning_rate": 4.867125619957066e-05, + "loss": 2.1134, + "step": 1860 + }, + { + "epoch": 1.0245169886742171, + "grad_norm": 10.898399353027344, + "learning_rate": 4.863424383744171e-05, + "loss": 2.1247, + "step": 1870 + }, + { + "epoch": 1.0251832111925383, + "grad_norm": 9.324128150939941, + "learning_rate": 4.8597231475312756e-05, + "loss": 1.8143, + "step": 1880 + }, + { + "epoch": 1.0258494337108595, + "grad_norm": 8.686003684997559, + "learning_rate": 4.856021911318381e-05, + "loss": 2.1711, + "step": 1890 + }, + { + "epoch": 1.0265156562291806, + "grad_norm": 8.357766151428223, + "learning_rate": 4.852320675105486e-05, + "loss": 2.1686, + "step": 1900 + }, + { + "epoch": 1.0271818787475016, + "grad_norm": 14.981986045837402, + "learning_rate": 4.8486194388925905e-05, + "loss": 2.1455, + "step": 1910 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 11.987198829650879, + "learning_rate": 4.844918202679695e-05, + "loss": 2.115, + "step": 1920 + }, + { + "epoch": 1.0285143237841439, + "grad_norm": 10.56569766998291, + "learning_rate": 4.8412169664668e-05, + "loss": 2.1391, + "step": 1930 + }, + { + "epoch": 1.029180546302465, + "grad_norm": 14.438934326171875, + "learning_rate": 4.837515730253905e-05, + "loss": 2.2382, + "step": 1940 + }, + { + "epoch": 1.0298467688207862, + "grad_norm": 7.349361896514893, + "learning_rate": 4.83381449404101e-05, + "loss": 2.0428, + "step": 1950 + }, + { + "epoch": 1.0305129913391073, + "grad_norm": 10.253539085388184, + "learning_rate": 4.830113257828115e-05, + "loss": 2.3541, + "step": 1960 + }, + { + "epoch": 1.0311792138574283, + "grad_norm": 10.772522926330566, + "learning_rate": 4.8264120216152195e-05, + "loss": 1.9612, + "step": 1970 + }, + { + "epoch": 1.0318454363757494, + "grad_norm": 9.996562957763672, + "learning_rate": 4.822710785402324e-05, + "loss": 1.9021, + "step": 1980 + }, + { + "epoch": 1.0325116588940706, + "grad_norm": 10.950971603393555, + "learning_rate": 4.81900954918943e-05, + "loss": 2.1543, + "step": 1990 + }, + { + "epoch": 1.0331778814123918, + "grad_norm": 10.033283233642578, + "learning_rate": 4.8153083129765344e-05, + "loss": 2.1107, + "step": 2000 + }, + { + "epoch": 1.033844103930713, + "grad_norm": 14.631372451782227, + "learning_rate": 4.81160707676364e-05, + "loss": 2.1159, + "step": 2010 + }, + { + "epoch": 1.034510326449034, + "grad_norm": 12.627449989318848, + "learning_rate": 4.8079058405507446e-05, + "loss": 2.0263, + "step": 2020 + }, + { + "epoch": 1.035176548967355, + "grad_norm": 7.914418697357178, + "learning_rate": 4.804204604337849e-05, + "loss": 1.9982, + "step": 2030 + }, + { + "epoch": 1.0358427714856762, + "grad_norm": 10.436437606811523, + "learning_rate": 4.800503368124954e-05, + "loss": 2.0046, + "step": 2040 + }, + { + "epoch": 1.0365089940039973, + "grad_norm": 21.002273559570312, + "learning_rate": 4.796802131912059e-05, + "loss": 1.9071, + "step": 2050 + }, + { + "epoch": 1.0371752165223185, + "grad_norm": 10.877198219299316, + "learning_rate": 4.7931008956991635e-05, + "loss": 1.9091, + "step": 2060 + }, + { + "epoch": 1.0378414390406396, + "grad_norm": 8.712760925292969, + "learning_rate": 4.789399659486269e-05, + "loss": 2.0149, + "step": 2070 + }, + { + "epoch": 1.0385076615589608, + "grad_norm": 10.912163734436035, + "learning_rate": 4.7856984232733736e-05, + "loss": 1.964, + "step": 2080 + }, + { + "epoch": 1.0391738840772817, + 
"grad_norm": 12.805818557739258, + "learning_rate": 4.7819971870604783e-05, + "loss": 2.1557, + "step": 2090 + }, + { + "epoch": 1.0398401065956029, + "grad_norm": 8.292597770690918, + "learning_rate": 4.778295950847583e-05, + "loss": 1.9859, + "step": 2100 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 12.738317489624023, + "learning_rate": 4.774594714634688e-05, + "loss": 2.0058, + "step": 2110 + }, + { + "epoch": 1.0411725516322452, + "grad_norm": 10.040546417236328, + "learning_rate": 4.770893478421793e-05, + "loss": 2.0221, + "step": 2120 + }, + { + "epoch": 1.0418387741505664, + "grad_norm": 9.450180053710938, + "learning_rate": 4.767192242208898e-05, + "loss": 2.2751, + "step": 2130 + }, + { + "epoch": 1.0425049966688875, + "grad_norm": 9.594880104064941, + "learning_rate": 4.7634910059960034e-05, + "loss": 2.0262, + "step": 2140 + }, + { + "epoch": 1.0431712191872085, + "grad_norm": 15.776046752929688, + "learning_rate": 4.759789769783108e-05, + "loss": 1.8402, + "step": 2150 + }, + { + "epoch": 1.0438374417055296, + "grad_norm": 9.773028373718262, + "learning_rate": 4.756088533570213e-05, + "loss": 2.1311, + "step": 2160 + }, + { + "epoch": 1.0445036642238508, + "grad_norm": 9.812409400939941, + "learning_rate": 4.7523872973573175e-05, + "loss": 2.033, + "step": 2170 + }, + { + "epoch": 1.045169886742172, + "grad_norm": 15.476219177246094, + "learning_rate": 4.748686061144422e-05, + "loss": 2.0528, + "step": 2180 + }, + { + "epoch": 1.045836109260493, + "grad_norm": 10.705337524414062, + "learning_rate": 4.744984824931528e-05, + "loss": 2.1299, + "step": 2190 + }, + { + "epoch": 1.046502331778814, + "grad_norm": 10.608670234680176, + "learning_rate": 4.7412835887186324e-05, + "loss": 2.0263, + "step": 2200 + }, + { + "epoch": 1.0471685542971352, + "grad_norm": 10.196287155151367, + "learning_rate": 4.737582352505737e-05, + "loss": 2.1621, + "step": 2210 + }, + { + "epoch": 1.0478347768154563, + "grad_norm": 9.682682037353516, + "learning_rate": 4.733881116292842e-05, + "loss": 2.1107, + "step": 2220 + }, + { + "epoch": 1.0485009993337775, + "grad_norm": 11.595756530761719, + "learning_rate": 4.7301798800799466e-05, + "loss": 2.0216, + "step": 2230 + }, + { + "epoch": 1.0491672218520987, + "grad_norm": 11.568273544311523, + "learning_rate": 4.726478643867051e-05, + "loss": 2.0292, + "step": 2240 + }, + { + "epoch": 1.0498334443704198, + "grad_norm": 14.324235916137695, + "learning_rate": 4.722777407654157e-05, + "loss": 1.7203, + "step": 2250 + }, + { + "epoch": 1.0504996668887407, + "grad_norm": 14.842239379882812, + "learning_rate": 4.7190761714412615e-05, + "loss": 2.0435, + "step": 2260 + }, + { + "epoch": 1.051165889407062, + "grad_norm": 16.512113571166992, + "learning_rate": 4.715374935228367e-05, + "loss": 1.9707, + "step": 2270 + }, + { + "epoch": 1.051832111925383, + "grad_norm": 13.837310791015625, + "learning_rate": 4.7116736990154716e-05, + "loss": 1.9517, + "step": 2280 + }, + { + "epoch": 1.0524983344437042, + "grad_norm": 11.001923561096191, + "learning_rate": 4.707972462802576e-05, + "loss": 1.882, + "step": 2290 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 8.219276428222656, + "learning_rate": 4.704271226589681e-05, + "loss": 1.723, + "step": 2300 + }, + { + "epoch": 1.0538307794803465, + "grad_norm": 11.656583786010742, + "learning_rate": 4.700569990376786e-05, + "loss": 2.1287, + "step": 2310 + }, + { + "epoch": 1.0544970019986675, + "grad_norm": 11.53858470916748, + "learning_rate": 4.696868754163891e-05, + "loss": 1.9843, + "step": 2320 + }, + 
{ + "epoch": 1.0551632245169886, + "grad_norm": 14.099578857421875, + "learning_rate": 4.693167517950996e-05, + "loss": 1.8246, + "step": 2330 + }, + { + "epoch": 1.0558294470353098, + "grad_norm": 11.870223045349121, + "learning_rate": 4.689466281738101e-05, + "loss": 1.8476, + "step": 2340 + }, + { + "epoch": 1.056495669553631, + "grad_norm": 10.513201713562012, + "learning_rate": 4.6857650455252054e-05, + "loss": 2.0768, + "step": 2350 + }, + { + "epoch": 1.057161892071952, + "grad_norm": 10.243853569030762, + "learning_rate": 4.68206380931231e-05, + "loss": 1.7207, + "step": 2360 + }, + { + "epoch": 1.057828114590273, + "grad_norm": 9.696374893188477, + "learning_rate": 4.678362573099415e-05, + "loss": 1.9356, + "step": 2370 + }, + { + "epoch": 1.0584943371085942, + "grad_norm": 6.808454990386963, + "learning_rate": 4.67466133688652e-05, + "loss": 1.686, + "step": 2380 + }, + { + "epoch": 1.0591605596269154, + "grad_norm": 11.773763656616211, + "learning_rate": 4.670960100673625e-05, + "loss": 1.6973, + "step": 2390 + }, + { + "epoch": 1.0598267821452365, + "grad_norm": 9.969675064086914, + "learning_rate": 4.6672588644607304e-05, + "loss": 2.0548, + "step": 2400 + }, + { + "epoch": 1.0604930046635577, + "grad_norm": 8.728965759277344, + "learning_rate": 4.663557628247835e-05, + "loss": 2.0585, + "step": 2410 + }, + { + "epoch": 1.0611592271818788, + "grad_norm": 9.217435836791992, + "learning_rate": 4.65985639203494e-05, + "loss": 1.9508, + "step": 2420 + }, + { + "epoch": 1.0618254497001998, + "grad_norm": 11.786916732788086, + "learning_rate": 4.6561551558220446e-05, + "loss": 1.896, + "step": 2430 + }, + { + "epoch": 1.062491672218521, + "grad_norm": 10.680403709411621, + "learning_rate": 4.65245391960915e-05, + "loss": 1.9341, + "step": 2440 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 10.756195068359375, + "learning_rate": 4.648752683396255e-05, + "loss": 1.8754, + "step": 2450 + }, + { + "epoch": 1.0638241172551632, + "grad_norm": 11.283404350280762, + "learning_rate": 4.6450514471833595e-05, + "loss": 1.7211, + "step": 2460 + }, + { + "epoch": 1.0644903397734844, + "grad_norm": 14.51810073852539, + "learning_rate": 4.641350210970464e-05, + "loss": 1.8242, + "step": 2470 + }, + { + "epoch": 1.0651565622918056, + "grad_norm": 9.010109901428223, + "learning_rate": 4.637648974757569e-05, + "loss": 1.7251, + "step": 2480 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 9.78903865814209, + "learning_rate": 4.6339477385446737e-05, + "loss": 2.0208, + "step": 2490 + }, + { + "epoch": 1.0664890073284476, + "grad_norm": 15.573025703430176, + "learning_rate": 4.630246502331779e-05, + "loss": 1.9475, + "step": 2500 + }, + { + "epoch": 1.0671552298467688, + "grad_norm": 14.990290641784668, + "learning_rate": 4.626545266118884e-05, + "loss": 2.0785, + "step": 2510 + }, + { + "epoch": 1.06782145236509, + "grad_norm": 15.643211364746094, + "learning_rate": 4.6228440299059885e-05, + "loss": 1.9327, + "step": 2520 + }, + { + "epoch": 1.0684876748834111, + "grad_norm": 16.963171005249023, + "learning_rate": 4.619142793693094e-05, + "loss": 2.0569, + "step": 2530 + }, + { + "epoch": 1.0691538974017323, + "grad_norm": 14.99976634979248, + "learning_rate": 4.615441557480199e-05, + "loss": 1.9347, + "step": 2540 + }, + { + "epoch": 1.0698201199200532, + "grad_norm": 9.576680183410645, + "learning_rate": 4.6117403212673034e-05, + "loss": 2.1985, + "step": 2550 + }, + { + "epoch": 1.0704863424383744, + "grad_norm": 13.33770751953125, + "learning_rate": 4.608039085054409e-05, + "loss": 
1.8209, + "step": 2560 + }, + { + "epoch": 1.0711525649566955, + "grad_norm": 12.083077430725098, + "learning_rate": 4.6043378488415135e-05, + "loss": 1.964, + "step": 2570 + }, + { + "epoch": 1.0718187874750167, + "grad_norm": 12.412477493286133, + "learning_rate": 4.600636612628618e-05, + "loss": 2.0988, + "step": 2580 + }, + { + "epoch": 1.0724850099933378, + "grad_norm": 10.35688304901123, + "learning_rate": 4.596935376415723e-05, + "loss": 1.8418, + "step": 2590 + }, + { + "epoch": 1.073151232511659, + "grad_norm": 9.166183471679688, + "learning_rate": 4.593234140202828e-05, + "loss": 2.0682, + "step": 2600 + }, + { + "epoch": 1.07381745502998, + "grad_norm": 14.883468627929688, + "learning_rate": 4.5895329039899325e-05, + "loss": 1.8077, + "step": 2610 + }, + { + "epoch": 1.074483677548301, + "grad_norm": 9.008862495422363, + "learning_rate": 4.585831667777038e-05, + "loss": 1.8592, + "step": 2620 + }, + { + "epoch": 1.0751499000666223, + "grad_norm": 10.294509887695312, + "learning_rate": 4.5821304315641426e-05, + "loss": 1.8459, + "step": 2630 + }, + { + "epoch": 1.0758161225849434, + "grad_norm": 13.311019897460938, + "learning_rate": 4.578429195351247e-05, + "loss": 2.1663, + "step": 2640 + }, + { + "epoch": 1.0764823451032646, + "grad_norm": 10.994257926940918, + "learning_rate": 4.574727959138353e-05, + "loss": 1.8598, + "step": 2650 + }, + { + "epoch": 1.0771485676215855, + "grad_norm": 17.064979553222656, + "learning_rate": 4.5710267229254575e-05, + "loss": 1.8871, + "step": 2660 + }, + { + "epoch": 1.0778147901399067, + "grad_norm": 14.703117370605469, + "learning_rate": 4.567325486712562e-05, + "loss": 1.8012, + "step": 2670 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 9.350732803344727, + "learning_rate": 4.5636242504996676e-05, + "loss": 1.898, + "step": 2680 + }, + { + "epoch": 1.079147235176549, + "grad_norm": 16.0025577545166, + "learning_rate": 4.559923014286772e-05, + "loss": 1.8303, + "step": 2690 + }, + { + "epoch": 1.0798134576948701, + "grad_norm": 10.28544807434082, + "learning_rate": 4.556221778073877e-05, + "loss": 1.836, + "step": 2700 + }, + { + "epoch": 1.0804796802131913, + "grad_norm": 16.86279296875, + "learning_rate": 4.552520541860982e-05, + "loss": 2.1423, + "step": 2710 + }, + { + "epoch": 1.0811459027315122, + "grad_norm": 11.462324142456055, + "learning_rate": 4.5488193056480865e-05, + "loss": 1.7931, + "step": 2720 + }, + { + "epoch": 1.0818121252498334, + "grad_norm": 8.896488189697266, + "learning_rate": 4.545118069435191e-05, + "loss": 1.8083, + "step": 2730 + }, + { + "epoch": 1.0824783477681545, + "grad_norm": 12.387425422668457, + "learning_rate": 4.5414168332222967e-05, + "loss": 1.8485, + "step": 2740 + }, + { + "epoch": 1.0831445702864757, + "grad_norm": 12.504982948303223, + "learning_rate": 4.5377155970094014e-05, + "loss": 1.9298, + "step": 2750 + }, + { + "epoch": 1.0838107928047969, + "grad_norm": 8.38635540008545, + "learning_rate": 4.534014360796506e-05, + "loss": 1.811, + "step": 2760 + }, + { + "epoch": 1.084477015323118, + "grad_norm": 12.249164581298828, + "learning_rate": 4.530313124583611e-05, + "loss": 1.7072, + "step": 2770 + }, + { + "epoch": 1.085143237841439, + "grad_norm": 13.315929412841797, + "learning_rate": 4.526611888370716e-05, + "loss": 1.9771, + "step": 2780 + }, + { + "epoch": 1.0858094603597601, + "grad_norm": 10.914670944213867, + "learning_rate": 4.522910652157821e-05, + "loss": 1.8456, + "step": 2790 + }, + { + "epoch": 1.0864756828780813, + "grad_norm": 10.898269653320312, + "learning_rate": 
4.5192094159449264e-05, + "loss": 1.9248, + "step": 2800 + }, + { + "epoch": 1.0871419053964024, + "grad_norm": 14.258195877075195, + "learning_rate": 4.515508179732031e-05, + "loss": 1.8424, + "step": 2810 + }, + { + "epoch": 1.0878081279147236, + "grad_norm": 12.83182430267334, + "learning_rate": 4.511806943519136e-05, + "loss": 2.1138, + "step": 2820 + }, + { + "epoch": 1.0884743504330445, + "grad_norm": 11.202397346496582, + "learning_rate": 4.5081057073062406e-05, + "loss": 1.9372, + "step": 2830 + }, + { + "epoch": 1.0891405729513657, + "grad_norm": 15.81797981262207, + "learning_rate": 4.504404471093345e-05, + "loss": 1.6765, + "step": 2840 + }, + { + "epoch": 1.0898067954696868, + "grad_norm": 8.608427047729492, + "learning_rate": 4.50070323488045e-05, + "loss": 1.6121, + "step": 2850 + }, + { + "epoch": 1.090473017988008, + "grad_norm": 16.029726028442383, + "learning_rate": 4.4970019986675555e-05, + "loss": 1.7724, + "step": 2860 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 11.284930229187012, + "learning_rate": 4.49330076245466e-05, + "loss": 1.8123, + "step": 2870 + }, + { + "epoch": 1.0918054630246503, + "grad_norm": 12.826201438903809, + "learning_rate": 4.489599526241765e-05, + "loss": 1.8504, + "step": 2880 + }, + { + "epoch": 1.0924716855429712, + "grad_norm": 14.259387016296387, + "learning_rate": 4.4858982900288696e-05, + "loss": 1.998, + "step": 2890 + }, + { + "epoch": 1.0931379080612924, + "grad_norm": 6.000631809234619, + "learning_rate": 4.4821970538159744e-05, + "loss": 1.7464, + "step": 2900 + }, + { + "epoch": 1.0938041305796136, + "grad_norm": 12.826776504516602, + "learning_rate": 4.47849581760308e-05, + "loss": 1.7891, + "step": 2910 + }, + { + "epoch": 1.0944703530979347, + "grad_norm": 20.789466857910156, + "learning_rate": 4.4747945813901845e-05, + "loss": 1.654, + "step": 2920 + }, + { + "epoch": 1.0951365756162559, + "grad_norm": 17.070629119873047, + "learning_rate": 4.47109334517729e-05, + "loss": 1.9719, + "step": 2930 + }, + { + "epoch": 1.095802798134577, + "grad_norm": 13.203139305114746, + "learning_rate": 4.4673921089643947e-05, + "loss": 1.9438, + "step": 2940 + }, + { + "epoch": 1.096469020652898, + "grad_norm": 12.207772254943848, + "learning_rate": 4.4636908727514994e-05, + "loss": 1.9524, + "step": 2950 + }, + { + "epoch": 1.0971352431712191, + "grad_norm": 10.32587718963623, + "learning_rate": 4.459989636538604e-05, + "loss": 1.7491, + "step": 2960 + }, + { + "epoch": 1.0978014656895403, + "grad_norm": 16.482036590576172, + "learning_rate": 4.456288400325709e-05, + "loss": 1.7864, + "step": 2970 + }, + { + "epoch": 1.0984676882078614, + "grad_norm": 9.45310115814209, + "learning_rate": 4.4525871641128136e-05, + "loss": 2.1425, + "step": 2980 + }, + { + "epoch": 1.0991339107261826, + "grad_norm": 11.884702682495117, + "learning_rate": 4.448885927899919e-05, + "loss": 1.9594, + "step": 2990 + }, + { + "epoch": 1.0998001332445038, + "grad_norm": 11.340782165527344, + "learning_rate": 4.445184691687024e-05, + "loss": 1.6506, + "step": 3000 + }, + { + "epoch": 1.100066622251832, + "eval_accuracy": 0.35130552729738895, + "eval_loss": 2.059617280960083, + "eval_runtime": 936.2938, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.394, + "step": 3004 + }, + { + "epoch": 2.0003997335109927, + "grad_norm": 10.887309074401855, + "learning_rate": 4.4414834554741284e-05, + "loss": 1.7786, + "step": 3010 + }, + { + "epoch": 2.0010659560293136, + "grad_norm": 9.394381523132324, + "learning_rate": 4.437782219261233e-05, + "loss": 
1.8451, + "step": 3020 + }, + { + "epoch": 2.001732178547635, + "grad_norm": 13.808993339538574, + "learning_rate": 4.434080983048338e-05, + "loss": 1.6152, + "step": 3030 + }, + { + "epoch": 2.002398401065956, + "grad_norm": 14.144484519958496, + "learning_rate": 4.430379746835443e-05, + "loss": 1.2937, + "step": 3040 + }, + { + "epoch": 2.0030646235842773, + "grad_norm": 14.640848159790039, + "learning_rate": 4.426678510622548e-05, + "loss": 1.8098, + "step": 3050 + }, + { + "epoch": 2.003730846102598, + "grad_norm": 21.34695053100586, + "learning_rate": 4.4229772744096534e-05, + "loss": 1.7346, + "step": 3060 + }, + { + "epoch": 2.0043970686209196, + "grad_norm": 12.26648235321045, + "learning_rate": 4.419276038196758e-05, + "loss": 1.6038, + "step": 3070 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 10.25528621673584, + "learning_rate": 4.415574801983863e-05, + "loss": 1.8983, + "step": 3080 + }, + { + "epoch": 2.0057295136575615, + "grad_norm": 9.814477920532227, + "learning_rate": 4.4118735657709676e-05, + "loss": 1.8758, + "step": 3090 + }, + { + "epoch": 2.006395736175883, + "grad_norm": 15.37265396118164, + "learning_rate": 4.4081723295580724e-05, + "loss": 1.9502, + "step": 3100 + }, + { + "epoch": 2.007061958694204, + "grad_norm": 9.403654098510742, + "learning_rate": 4.404471093345178e-05, + "loss": 1.7639, + "step": 3110 + }, + { + "epoch": 2.007728181212525, + "grad_norm": 12.723804473876953, + "learning_rate": 4.4007698571322825e-05, + "loss": 1.7549, + "step": 3120 + }, + { + "epoch": 2.008394403730846, + "grad_norm": 11.941670417785645, + "learning_rate": 4.397068620919387e-05, + "loss": 1.7834, + "step": 3130 + }, + { + "epoch": 2.009060626249167, + "grad_norm": 15.871787071228027, + "learning_rate": 4.393367384706492e-05, + "loss": 2.0504, + "step": 3140 + }, + { + "epoch": 2.0097268487674884, + "grad_norm": 11.952261924743652, + "learning_rate": 4.389666148493597e-05, + "loss": 1.5951, + "step": 3150 + }, + { + "epoch": 2.0103930712858094, + "grad_norm": 10.877362251281738, + "learning_rate": 4.3859649122807014e-05, + "loss": 1.9144, + "step": 3160 + }, + { + "epoch": 2.0110592938041307, + "grad_norm": 10.613754272460938, + "learning_rate": 4.382263676067807e-05, + "loss": 1.746, + "step": 3170 + }, + { + "epoch": 2.0117255163224517, + "grad_norm": 8.767799377441406, + "learning_rate": 4.3785624398549116e-05, + "loss": 1.6818, + "step": 3180 + }, + { + "epoch": 2.0123917388407726, + "grad_norm": 11.285937309265137, + "learning_rate": 4.374861203642017e-05, + "loss": 1.6261, + "step": 3190 + }, + { + "epoch": 2.013057961359094, + "grad_norm": 12.250618934631348, + "learning_rate": 4.371159967429122e-05, + "loss": 1.6106, + "step": 3200 + }, + { + "epoch": 2.013724183877415, + "grad_norm": 15.090853691101074, + "learning_rate": 4.3674587312162264e-05, + "loss": 1.7971, + "step": 3210 + }, + { + "epoch": 2.0143904063957363, + "grad_norm": 12.513516426086426, + "learning_rate": 4.363757495003331e-05, + "loss": 1.8177, + "step": 3220 + }, + { + "epoch": 2.0150566289140572, + "grad_norm": 13.689520835876465, + "learning_rate": 4.3600562587904366e-05, + "loss": 1.6382, + "step": 3230 + }, + { + "epoch": 2.0157228514323786, + "grad_norm": 10.838946342468262, + "learning_rate": 4.356355022577541e-05, + "loss": 1.4649, + "step": 3240 + }, + { + "epoch": 2.0163890739506996, + "grad_norm": 9.496235847473145, + "learning_rate": 4.352653786364646e-05, + "loss": 1.6198, + "step": 3250 + }, + { + "epoch": 2.0170552964690205, + "grad_norm": 5.283502101898193, + "learning_rate": 
4.348952550151751e-05, + "loss": 1.9689, + "step": 3260 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 12.338292121887207, + "learning_rate": 4.3452513139388555e-05, + "loss": 2.0582, + "step": 3270 + }, + { + "epoch": 2.018387741505663, + "grad_norm": 11.439964294433594, + "learning_rate": 4.34155007772596e-05, + "loss": 1.6131, + "step": 3280 + }, + { + "epoch": 2.019053964023984, + "grad_norm": 9.407355308532715, + "learning_rate": 4.3378488415130656e-05, + "loss": 1.5869, + "step": 3290 + }, + { + "epoch": 2.019720186542305, + "grad_norm": 10.538002967834473, + "learning_rate": 4.3341476053001704e-05, + "loss": 1.8029, + "step": 3300 + }, + { + "epoch": 2.020386409060626, + "grad_norm": 6.820994853973389, + "learning_rate": 4.330446369087275e-05, + "loss": 1.4661, + "step": 3310 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 14.21889591217041, + "learning_rate": 4.3267451328743805e-05, + "loss": 1.7658, + "step": 3320 + }, + { + "epoch": 2.0217188540972684, + "grad_norm": 16.270326614379883, + "learning_rate": 4.323043896661485e-05, + "loss": 1.761, + "step": 3330 + }, + { + "epoch": 2.0223850766155897, + "grad_norm": 9.896149635314941, + "learning_rate": 4.31934266044859e-05, + "loss": 1.8644, + "step": 3340 + }, + { + "epoch": 2.0230512991339107, + "grad_norm": 18.067331314086914, + "learning_rate": 4.3156414242356954e-05, + "loss": 1.6767, + "step": 3350 + }, + { + "epoch": 2.023717521652232, + "grad_norm": 12.747305870056152, + "learning_rate": 4.3119401880228e-05, + "loss": 1.5904, + "step": 3360 + }, + { + "epoch": 2.024383744170553, + "grad_norm": 13.039362907409668, + "learning_rate": 4.308238951809905e-05, + "loss": 1.7391, + "step": 3370 + }, + { + "epoch": 2.025049966688874, + "grad_norm": 10.833274841308594, + "learning_rate": 4.3045377155970096e-05, + "loss": 1.7857, + "step": 3380 + }, + { + "epoch": 2.0257161892071953, + "grad_norm": 24.027122497558594, + "learning_rate": 4.300836479384114e-05, + "loss": 1.5298, + "step": 3390 + }, + { + "epoch": 2.0263824117255163, + "grad_norm": 7.5531415939331055, + "learning_rate": 4.297135243171219e-05, + "loss": 1.5457, + "step": 3400 + }, + { + "epoch": 2.0270486342438376, + "grad_norm": 12.391172409057617, + "learning_rate": 4.2934340069583244e-05, + "loss": 1.6695, + "step": 3410 + }, + { + "epoch": 2.0277148567621586, + "grad_norm": 11.258078575134277, + "learning_rate": 4.289732770745429e-05, + "loss": 1.7929, + "step": 3420 + }, + { + "epoch": 2.0283810792804795, + "grad_norm": 11.439624786376953, + "learning_rate": 4.286031534532534e-05, + "loss": 1.7668, + "step": 3430 + }, + { + "epoch": 2.029047301798801, + "grad_norm": 9.160229682922363, + "learning_rate": 4.2823302983196386e-05, + "loss": 1.8471, + "step": 3440 + }, + { + "epoch": 2.029713524317122, + "grad_norm": 13.424290657043457, + "learning_rate": 4.278629062106744e-05, + "loss": 1.7342, + "step": 3450 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 16.457414627075195, + "learning_rate": 4.274927825893849e-05, + "loss": 1.6163, + "step": 3460 + }, + { + "epoch": 2.031045969353764, + "grad_norm": 15.836133003234863, + "learning_rate": 4.271226589680954e-05, + "loss": 1.8883, + "step": 3470 + }, + { + "epoch": 2.031712191872085, + "grad_norm": 11.612725257873535, + "learning_rate": 4.267525353468059e-05, + "loss": 1.6857, + "step": 3480 + }, + { + "epoch": 2.0323784143904065, + "grad_norm": 13.235568046569824, + "learning_rate": 4.2638241172551636e-05, + "loss": 1.9679, + "step": 3490 + }, + { + "epoch": 2.0330446369087274, + "grad_norm": 
11.35844898223877, + "learning_rate": 4.2601228810422684e-05, + "loss": 1.7182, + "step": 3500 + }, + { + "epoch": 2.0337108594270488, + "grad_norm": 15.999759674072266, + "learning_rate": 4.256421644829373e-05, + "loss": 1.8757, + "step": 3510 + }, + { + "epoch": 2.0343770819453697, + "grad_norm": 11.543108940124512, + "learning_rate": 4.252720408616478e-05, + "loss": 1.6129, + "step": 3520 + }, + { + "epoch": 2.035043304463691, + "grad_norm": 15.207947731018066, + "learning_rate": 4.249019172403583e-05, + "loss": 1.4091, + "step": 3530 + }, + { + "epoch": 2.035709526982012, + "grad_norm": 14.73616886138916, + "learning_rate": 4.245317936190688e-05, + "loss": 1.9264, + "step": 3540 + }, + { + "epoch": 2.036375749500333, + "grad_norm": 14.512042999267578, + "learning_rate": 4.241616699977793e-05, + "loss": 1.6243, + "step": 3550 + }, + { + "epoch": 2.0370419720186543, + "grad_norm": 8.635575294494629, + "learning_rate": 4.2379154637648974e-05, + "loss": 1.7434, + "step": 3560 + }, + { + "epoch": 2.0377081945369753, + "grad_norm": 10.63775634765625, + "learning_rate": 4.234214227552002e-05, + "loss": 1.7813, + "step": 3570 + }, + { + "epoch": 2.0383744170552967, + "grad_norm": 17.920299530029297, + "learning_rate": 4.2305129913391076e-05, + "loss": 1.6561, + "step": 3580 + }, + { + "epoch": 2.0390406395736176, + "grad_norm": 12.075530052185059, + "learning_rate": 4.226811755126212e-05, + "loss": 1.5315, + "step": 3590 + }, + { + "epoch": 2.0397068620919385, + "grad_norm": 14.980879783630371, + "learning_rate": 4.223110518913318e-05, + "loss": 1.5218, + "step": 3600 + }, + { + "epoch": 2.04037308461026, + "grad_norm": 13.485072135925293, + "learning_rate": 4.2194092827004224e-05, + "loss": 1.3468, + "step": 3610 + }, + { + "epoch": 2.041039307128581, + "grad_norm": 16.32023811340332, + "learning_rate": 4.215708046487527e-05, + "loss": 1.6911, + "step": 3620 + }, + { + "epoch": 2.041705529646902, + "grad_norm": 15.975449562072754, + "learning_rate": 4.212006810274632e-05, + "loss": 1.7328, + "step": 3630 + }, + { + "epoch": 2.042371752165223, + "grad_norm": 8.385498046875, + "learning_rate": 4.2083055740617366e-05, + "loss": 1.4539, + "step": 3640 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 19.339635848999023, + "learning_rate": 4.2046043378488413e-05, + "loss": 1.5604, + "step": 3650 + }, + { + "epoch": 2.0437041972018655, + "grad_norm": 8.827022552490234, + "learning_rate": 4.200903101635947e-05, + "loss": 1.7373, + "step": 3660 + }, + { + "epoch": 2.0443704197201864, + "grad_norm": 15.012313842773438, + "learning_rate": 4.1972018654230515e-05, + "loss": 1.6688, + "step": 3670 + }, + { + "epoch": 2.045036642238508, + "grad_norm": 10.896596908569336, + "learning_rate": 4.193500629210156e-05, + "loss": 1.6011, + "step": 3680 + }, + { + "epoch": 2.0457028647568287, + "grad_norm": 14.602241516113281, + "learning_rate": 4.189799392997261e-05, + "loss": 2.0345, + "step": 3690 + }, + { + "epoch": 2.04636908727515, + "grad_norm": 15.97205638885498, + "learning_rate": 4.186098156784366e-05, + "loss": 1.5237, + "step": 3700 + }, + { + "epoch": 2.047035309793471, + "grad_norm": 9.577689170837402, + "learning_rate": 4.182396920571471e-05, + "loss": 1.5308, + "step": 3710 + }, + { + "epoch": 2.047701532311792, + "grad_norm": 13.969609260559082, + "learning_rate": 4.1786956843585765e-05, + "loss": 1.67, + "step": 3720 + }, + { + "epoch": 2.0483677548301134, + "grad_norm": 22.1742000579834, + "learning_rate": 4.174994448145681e-05, + "loss": 1.3937, + "step": 3730 + }, + { + "epoch": 
2.0490339773484343, + "grad_norm": 19.15953254699707, + "learning_rate": 4.171293211932786e-05, + "loss": 1.629, + "step": 3740 + }, + { + "epoch": 2.0497001998667557, + "grad_norm": 20.59035301208496, + "learning_rate": 4.167591975719891e-05, + "loss": 1.4002, + "step": 3750 + }, + { + "epoch": 2.0503664223850766, + "grad_norm": 18.7655086517334, + "learning_rate": 4.1638907395069954e-05, + "loss": 1.6007, + "step": 3760 + }, + { + "epoch": 2.0510326449033975, + "grad_norm": 8.854756355285645, + "learning_rate": 4.1601895032941e-05, + "loss": 1.5676, + "step": 3770 + }, + { + "epoch": 2.051698867421719, + "grad_norm": 11.866414070129395, + "learning_rate": 4.1564882670812056e-05, + "loss": 1.878, + "step": 3780 + }, + { + "epoch": 2.05236508994004, + "grad_norm": 18.629728317260742, + "learning_rate": 4.15278703086831e-05, + "loss": 1.7127, + "step": 3790 + }, + { + "epoch": 2.0530313124583612, + "grad_norm": 10.55181884765625, + "learning_rate": 4.149085794655415e-05, + "loss": 1.6997, + "step": 3800 + }, + { + "epoch": 2.053697534976682, + "grad_norm": 11.000639915466309, + "learning_rate": 4.14538455844252e-05, + "loss": 1.3128, + "step": 3810 + }, + { + "epoch": 2.054363757495003, + "grad_norm": 15.632569313049316, + "learning_rate": 4.1416833222296245e-05, + "loss": 1.9045, + "step": 3820 + }, + { + "epoch": 2.0550299800133245, + "grad_norm": 17.97886085510254, + "learning_rate": 4.137982086016729e-05, + "loss": 1.7551, + "step": 3830 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 12.146591186523438, + "learning_rate": 4.1342808498038346e-05, + "loss": 1.6294, + "step": 3840 + }, + { + "epoch": 2.056362425049967, + "grad_norm": 14.966313362121582, + "learning_rate": 4.13057961359094e-05, + "loss": 1.7238, + "step": 3850 + }, + { + "epoch": 2.0570286475682877, + "grad_norm": 16.597681045532227, + "learning_rate": 4.126878377378045e-05, + "loss": 1.9472, + "step": 3860 + }, + { + "epoch": 2.057694870086609, + "grad_norm": 15.325579643249512, + "learning_rate": 4.1231771411651495e-05, + "loss": 1.5235, + "step": 3870 + }, + { + "epoch": 2.05836109260493, + "grad_norm": 11.925553321838379, + "learning_rate": 4.119475904952254e-05, + "loss": 1.7154, + "step": 3880 + }, + { + "epoch": 2.059027315123251, + "grad_norm": 9.478686332702637, + "learning_rate": 4.115774668739359e-05, + "loss": 1.9233, + "step": 3890 + }, + { + "epoch": 2.0596935376415724, + "grad_norm": 13.057063102722168, + "learning_rate": 4.1120734325264643e-05, + "loss": 1.7613, + "step": 3900 + }, + { + "epoch": 2.0603597601598933, + "grad_norm": 8.994803428649902, + "learning_rate": 4.108372196313569e-05, + "loss": 1.495, + "step": 3910 + }, + { + "epoch": 2.0610259826782147, + "grad_norm": 10.930026054382324, + "learning_rate": 4.104670960100674e-05, + "loss": 1.7005, + "step": 3920 + }, + { + "epoch": 2.0616922051965356, + "grad_norm": 13.663028717041016, + "learning_rate": 4.1009697238877785e-05, + "loss": 1.5341, + "step": 3930 + }, + { + "epoch": 2.0623584277148566, + "grad_norm": 11.480274200439453, + "learning_rate": 4.097268487674883e-05, + "loss": 1.9438, + "step": 3940 + }, + { + "epoch": 2.063024650233178, + "grad_norm": 15.040125846862793, + "learning_rate": 4.093567251461988e-05, + "loss": 1.973, + "step": 3950 + }, + { + "epoch": 2.063690872751499, + "grad_norm": 12.20592212677002, + "learning_rate": 4.0898660152490934e-05, + "loss": 1.7557, + "step": 3960 + }, + { + "epoch": 2.0643570952698203, + "grad_norm": 17.329959869384766, + "learning_rate": 4.086164779036198e-05, + "loss": 1.6554, + "step": 
3970 + }, + { + "epoch": 2.065023317788141, + "grad_norm": 11.793479919433594, + "learning_rate": 4.0824635428233035e-05, + "loss": 1.7979, + "step": 3980 + }, + { + "epoch": 2.0656895403064626, + "grad_norm": 7.690437316894531, + "learning_rate": 4.078762306610408e-05, + "loss": 1.9708, + "step": 3990 + }, + { + "epoch": 2.0663557628247835, + "grad_norm": 13.500343322753906, + "learning_rate": 4.075061070397513e-05, + "loss": 1.589, + "step": 4000 + }, + { + "epoch": 2.0670219853431044, + "grad_norm": 11.21178913116455, + "learning_rate": 4.071359834184618e-05, + "loss": 1.4662, + "step": 4010 + }, + { + "epoch": 2.067688207861426, + "grad_norm": 9.275733947753906, + "learning_rate": 4.067658597971723e-05, + "loss": 1.8705, + "step": 4020 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 10.865612030029297, + "learning_rate": 4.063957361758828e-05, + "loss": 1.5537, + "step": 4030 + }, + { + "epoch": 2.069020652898068, + "grad_norm": 15.506858825683594, + "learning_rate": 4.0602561255459326e-05, + "loss": 1.4576, + "step": 4040 + }, + { + "epoch": 2.069686875416389, + "grad_norm": 11.702934265136719, + "learning_rate": 4.056554889333037e-05, + "loss": 1.8315, + "step": 4050 + }, + { + "epoch": 2.07035309793471, + "grad_norm": 16.20012664794922, + "learning_rate": 4.052853653120142e-05, + "loss": 2.0521, + "step": 4060 + }, + { + "epoch": 2.0710193204530314, + "grad_norm": 16.11864471435547, + "learning_rate": 4.049152416907247e-05, + "loss": 1.6164, + "step": 4070 + }, + { + "epoch": 2.0716855429713523, + "grad_norm": 24.665590286254883, + "learning_rate": 4.045451180694352e-05, + "loss": 1.6976, + "step": 4080 + }, + { + "epoch": 2.0723517654896737, + "grad_norm": 14.924704551696777, + "learning_rate": 4.041749944481457e-05, + "loss": 1.8273, + "step": 4090 + }, + { + "epoch": 2.0730179880079946, + "grad_norm": 9.342179298400879, + "learning_rate": 4.038048708268562e-05, + "loss": 1.6265, + "step": 4100 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 11.691144943237305, + "learning_rate": 4.034347472055667e-05, + "loss": 1.6187, + "step": 4110 + }, + { + "epoch": 2.074350433044637, + "grad_norm": 8.620841979980469, + "learning_rate": 4.030646235842772e-05, + "loss": 1.1902, + "step": 4120 + }, + { + "epoch": 2.075016655562958, + "grad_norm": 19.810626983642578, + "learning_rate": 4.0269449996298765e-05, + "loss": 1.9392, + "step": 4130 + }, + { + "epoch": 2.0756828780812793, + "grad_norm": 8.55063533782959, + "learning_rate": 4.023243763416982e-05, + "loss": 1.5349, + "step": 4140 + }, + { + "epoch": 2.0763491005996, + "grad_norm": 13.146772384643555, + "learning_rate": 4.019542527204087e-05, + "loss": 1.5791, + "step": 4150 + }, + { + "epoch": 2.0770153231179216, + "grad_norm": 11.435802459716797, + "learning_rate": 4.0158412909911914e-05, + "loss": 1.7232, + "step": 4160 + }, + { + "epoch": 2.0776815456362425, + "grad_norm": 12.72943115234375, + "learning_rate": 4.012140054778296e-05, + "loss": 1.7808, + "step": 4170 + }, + { + "epoch": 2.0783477681545635, + "grad_norm": 16.26506805419922, + "learning_rate": 4.008438818565401e-05, + "loss": 1.5783, + "step": 4180 + }, + { + "epoch": 2.079013990672885, + "grad_norm": 10.844205856323242, + "learning_rate": 4.0047375823525056e-05, + "loss": 1.5336, + "step": 4190 + }, + { + "epoch": 2.0796802131912058, + "grad_norm": 12.086185455322266, + "learning_rate": 4.001036346139611e-05, + "loss": 1.6591, + "step": 4200 + }, + { + "epoch": 2.080346435709527, + "grad_norm": 13.10291576385498, + "learning_rate": 3.997335109926716e-05, + 
"loss": 1.5033, + "step": 4210 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 9.898117065429688, + "learning_rate": 3.9936338737138205e-05, + "loss": 1.3535, + "step": 4220 + }, + { + "epoch": 2.081678880746169, + "grad_norm": 15.68769645690918, + "learning_rate": 3.989932637500925e-05, + "loss": 1.6216, + "step": 4230 + }, + { + "epoch": 2.0823451032644904, + "grad_norm": 19.074623107910156, + "learning_rate": 3.9862314012880306e-05, + "loss": 1.882, + "step": 4240 + }, + { + "epoch": 2.0830113257828113, + "grad_norm": 15.581271171569824, + "learning_rate": 3.982530165075135e-05, + "loss": 1.5994, + "step": 4250 + }, + { + "epoch": 2.0836775483011327, + "grad_norm": 10.840824127197266, + "learning_rate": 3.97882892886224e-05, + "loss": 1.5197, + "step": 4260 + }, + { + "epoch": 2.0843437708194537, + "grad_norm": 13.594049453735352, + "learning_rate": 3.9751276926493455e-05, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.085009993337775, + "grad_norm": 9.110240936279297, + "learning_rate": 3.97142645643645e-05, + "loss": 1.672, + "step": 4280 + }, + { + "epoch": 2.085676215856096, + "grad_norm": 11.509053230285645, + "learning_rate": 3.967725220223555e-05, + "loss": 1.5686, + "step": 4290 + }, + { + "epoch": 2.086342438374417, + "grad_norm": 10.843743324279785, + "learning_rate": 3.96402398401066e-05, + "loss": 1.5938, + "step": 4300 + }, + { + "epoch": 2.0870086608927383, + "grad_norm": 9.984175682067871, + "learning_rate": 3.9603227477977644e-05, + "loss": 1.5438, + "step": 4310 + }, + { + "epoch": 2.0876748834110592, + "grad_norm": 11.339105606079102, + "learning_rate": 3.956621511584869e-05, + "loss": 1.4527, + "step": 4320 + }, + { + "epoch": 2.0883411059293806, + "grad_norm": 14.779313087463379, + "learning_rate": 3.9529202753719745e-05, + "loss": 1.8612, + "step": 4330 + }, + { + "epoch": 2.0890073284477015, + "grad_norm": 15.02027416229248, + "learning_rate": 3.949219039159079e-05, + "loss": 1.776, + "step": 4340 + }, + { + "epoch": 2.0896735509660225, + "grad_norm": 13.097738265991211, + "learning_rate": 3.945517802946184e-05, + "loss": 1.6092, + "step": 4350 + }, + { + "epoch": 2.090339773484344, + "grad_norm": 7.93465518951416, + "learning_rate": 3.941816566733289e-05, + "loss": 1.6091, + "step": 4360 + }, + { + "epoch": 2.091005996002665, + "grad_norm": 15.468151092529297, + "learning_rate": 3.938115330520394e-05, + "loss": 1.8504, + "step": 4370 + }, + { + "epoch": 2.091672218520986, + "grad_norm": 11.207691192626953, + "learning_rate": 3.934414094307499e-05, + "loss": 1.4335, + "step": 4380 + }, + { + "epoch": 2.092338441039307, + "grad_norm": 10.797505378723145, + "learning_rate": 3.930712858094604e-05, + "loss": 1.7997, + "step": 4390 + }, + { + "epoch": 2.093004663557628, + "grad_norm": 13.08703899383545, + "learning_rate": 3.927011621881709e-05, + "loss": 1.5593, + "step": 4400 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 10.595236778259277, + "learning_rate": 3.923310385668814e-05, + "loss": 1.5135, + "step": 4410 + }, + { + "epoch": 2.0943371085942704, + "grad_norm": 16.976335525512695, + "learning_rate": 3.9196091494559185e-05, + "loss": 1.5569, + "step": 4420 + }, + { + "epoch": 2.0950033311125917, + "grad_norm": 12.817399024963379, + "learning_rate": 3.915907913243023e-05, + "loss": 1.8435, + "step": 4430 + }, + { + "epoch": 2.0956695536309127, + "grad_norm": 10.048445701599121, + "learning_rate": 3.912206677030128e-05, + "loss": 1.7525, + "step": 4440 + }, + { + "epoch": 2.096335776149234, + "grad_norm": 16.87804412841797, + "learning_rate": 
3.908505440817233e-05, + "loss": 1.5535, + "step": 4450 + }, + { + "epoch": 2.097001998667555, + "grad_norm": 13.084789276123047, + "learning_rate": 3.904804204604338e-05, + "loss": 1.7752, + "step": 4460 + }, + { + "epoch": 2.097668221185876, + "grad_norm": 16.734256744384766, + "learning_rate": 3.901102968391443e-05, + "loss": 1.7388, + "step": 4470 + }, + { + "epoch": 2.0983344437041973, + "grad_norm": 15.996969223022461, + "learning_rate": 3.8974017321785475e-05, + "loss": 1.707, + "step": 4480 + }, + { + "epoch": 2.0990006662225182, + "grad_norm": 13.369511604309082, + "learning_rate": 3.893700495965652e-05, + "loss": 1.5391, + "step": 4490 + }, + { + "epoch": 2.0996668887408396, + "grad_norm": 12.97499942779541, + "learning_rate": 3.8899992597527577e-05, + "loss": 1.1963, + "step": 4500 + }, + { + "epoch": 2.1000666222518323, + "eval_accuracy": 0.40352661919294674, + "eval_loss": 2.0147666931152344, + "eval_runtime": 933.7877, + "eval_samples_per_second": 3.158, + "eval_steps_per_second": 0.395, + "step": 4506 + }, + { + "epoch": 3.0002664890073283, + "grad_norm": 15.215970039367676, + "learning_rate": 3.8862980235398624e-05, + "loss": 1.4756, + "step": 4510 + }, + { + "epoch": 3.0009327115256497, + "grad_norm": 16.958261489868164, + "learning_rate": 3.882596787326968e-05, + "loss": 1.563, + "step": 4520 + }, + { + "epoch": 3.0015989340439706, + "grad_norm": 8.306703567504883, + "learning_rate": 3.8788955511140725e-05, + "loss": 1.1522, + "step": 4530 + }, + { + "epoch": 3.002265156562292, + "grad_norm": 13.249198913574219, + "learning_rate": 3.875194314901177e-05, + "loss": 1.3661, + "step": 4540 + }, + { + "epoch": 3.002931379080613, + "grad_norm": 18.432411193847656, + "learning_rate": 3.871493078688282e-05, + "loss": 1.5291, + "step": 4550 + }, + { + "epoch": 3.003597601598934, + "grad_norm": 18.05307960510254, + "learning_rate": 3.867791842475387e-05, + "loss": 1.4069, + "step": 4560 + }, + { + "epoch": 3.0042638241172552, + "grad_norm": 10.151884078979492, + "learning_rate": 3.864090606262492e-05, + "loss": 1.697, + "step": 4570 + }, + { + "epoch": 3.004930046635576, + "grad_norm": 16.832725524902344, + "learning_rate": 3.860389370049597e-05, + "loss": 1.7264, + "step": 4580 + }, + { + "epoch": 3.0055962691538975, + "grad_norm": 11.218570709228516, + "learning_rate": 3.8566881338367016e-05, + "loss": 1.5317, + "step": 4590 + }, + { + "epoch": 3.0062624916722185, + "grad_norm": 11.21003246307373, + "learning_rate": 3.852986897623806e-05, + "loss": 1.3907, + "step": 4600 + }, + { + "epoch": 3.0069287141905394, + "grad_norm": 18.26798439025879, + "learning_rate": 3.849285661410911e-05, + "loss": 1.407, + "step": 4610 + }, + { + "epoch": 3.007594936708861, + "grad_norm": 10.299131393432617, + "learning_rate": 3.845584425198016e-05, + "loss": 1.4544, + "step": 4620 + }, + { + "epoch": 3.0082611592271817, + "grad_norm": 14.523100852966309, + "learning_rate": 3.841883188985121e-05, + "loss": 1.6025, + "step": 4630 + }, + { + "epoch": 3.008927381745503, + "grad_norm": 21.3692626953125, + "learning_rate": 3.8381819527722266e-05, + "loss": 1.4126, + "step": 4640 + }, + { + "epoch": 3.009593604263824, + "grad_norm": 14.70117473602295, + "learning_rate": 3.834480716559331e-05, + "loss": 1.5408, + "step": 4650 + }, + { + "epoch": 3.0102598267821454, + "grad_norm": 12.915194511413574, + "learning_rate": 3.830779480346436e-05, + "loss": 1.4654, + "step": 4660 + }, + { + "epoch": 3.0109260493004664, + "grad_norm": 20.629234313964844, + "learning_rate": 3.827078244133541e-05, + "loss": 1.5289, 
+ "step": 4670 + }, + { + "epoch": 3.0115922718187873, + "grad_norm": 12.350275993347168, + "learning_rate": 3.8233770079206455e-05, + "loss": 1.4062, + "step": 4680 + }, + { + "epoch": 3.0122584943371087, + "grad_norm": 15.387788772583008, + "learning_rate": 3.819675771707751e-05, + "loss": 1.6281, + "step": 4690 + }, + { + "epoch": 3.0129247168554296, + "grad_norm": 12.144196510314941, + "learning_rate": 3.8159745354948557e-05, + "loss": 1.5496, + "step": 4700 + }, + { + "epoch": 3.013590939373751, + "grad_norm": 12.420608520507812, + "learning_rate": 3.8122732992819604e-05, + "loss": 1.7403, + "step": 4710 + }, + { + "epoch": 3.014257161892072, + "grad_norm": 14.960898399353027, + "learning_rate": 3.808572063069065e-05, + "loss": 1.4505, + "step": 4720 + }, + { + "epoch": 3.014923384410393, + "grad_norm": 13.325089454650879, + "learning_rate": 3.80487082685617e-05, + "loss": 1.4012, + "step": 4730 + }, + { + "epoch": 3.0155896069287143, + "grad_norm": 11.524730682373047, + "learning_rate": 3.8011695906432746e-05, + "loss": 1.6834, + "step": 4740 + }, + { + "epoch": 3.016255829447035, + "grad_norm": 14.186535835266113, + "learning_rate": 3.79746835443038e-05, + "loss": 1.7019, + "step": 4750 + }, + { + "epoch": 3.0169220519653566, + "grad_norm": 14.750042915344238, + "learning_rate": 3.793767118217485e-05, + "loss": 1.4515, + "step": 4760 + }, + { + "epoch": 3.0175882744836775, + "grad_norm": 17.655672073364258, + "learning_rate": 3.79006588200459e-05, + "loss": 1.581, + "step": 4770 + }, + { + "epoch": 3.018254497001999, + "grad_norm": 17.052404403686523, + "learning_rate": 3.786364645791695e-05, + "loss": 1.5761, + "step": 4780 + }, + { + "epoch": 3.01892071952032, + "grad_norm": 11.285076141357422, + "learning_rate": 3.7826634095787996e-05, + "loss": 1.283, + "step": 4790 + }, + { + "epoch": 3.0195869420386408, + "grad_norm": 13.205350875854492, + "learning_rate": 3.778962173365904e-05, + "loss": 1.7407, + "step": 4800 + }, + { + "epoch": 3.020253164556962, + "grad_norm": 14.600531578063965, + "learning_rate": 3.77526093715301e-05, + "loss": 1.4104, + "step": 4810 + }, + { + "epoch": 3.020919387075283, + "grad_norm": 11.197827339172363, + "learning_rate": 3.7715597009401144e-05, + "loss": 1.5037, + "step": 4820 + }, + { + "epoch": 3.0215856095936044, + "grad_norm": 15.034667015075684, + "learning_rate": 3.767858464727219e-05, + "loss": 1.5055, + "step": 4830 + }, + { + "epoch": 3.0222518321119254, + "grad_norm": 11.730103492736816, + "learning_rate": 3.764157228514324e-05, + "loss": 1.4912, + "step": 4840 + }, + { + "epoch": 3.0229180546302463, + "grad_norm": 11.502195358276367, + "learning_rate": 3.7604559923014286e-05, + "loss": 1.5078, + "step": 4850 + }, + { + "epoch": 3.0235842771485677, + "grad_norm": 10.607624053955078, + "learning_rate": 3.7567547560885334e-05, + "loss": 1.6138, + "step": 4860 + }, + { + "epoch": 3.0242504996668886, + "grad_norm": 13.86859130859375, + "learning_rate": 3.753053519875639e-05, + "loss": 1.6391, + "step": 4870 + }, + { + "epoch": 3.02491672218521, + "grad_norm": 9.169739723205566, + "learning_rate": 3.7493522836627435e-05, + "loss": 1.7743, + "step": 4880 + }, + { + "epoch": 3.025582944703531, + "grad_norm": 15.122998237609863, + "learning_rate": 3.745651047449848e-05, + "loss": 1.2746, + "step": 4890 + }, + { + "epoch": 3.026249167221852, + "grad_norm": 15.145633697509766, + "learning_rate": 3.7419498112369536e-05, + "loss": 1.6317, + "step": 4900 + }, + { + "epoch": 3.0269153897401733, + "grad_norm": 17.01380157470703, + "learning_rate": 
3.7382485750240584e-05, + "loss": 1.4506, + "step": 4910 + }, + { + "epoch": 3.027581612258494, + "grad_norm": 15.637210845947266, + "learning_rate": 3.734547338811163e-05, + "loss": 1.5191, + "step": 4920 + }, + { + "epoch": 3.0282478347768156, + "grad_norm": 11.869989395141602, + "learning_rate": 3.730846102598268e-05, + "loss": 1.5614, + "step": 4930 + }, + { + "epoch": 3.0289140572951365, + "grad_norm": 10.91602611541748, + "learning_rate": 3.727144866385373e-05, + "loss": 1.5021, + "step": 4940 + }, + { + "epoch": 3.029580279813458, + "grad_norm": 15.031268119812012, + "learning_rate": 3.723443630172478e-05, + "loss": 1.5928, + "step": 4950 + }, + { + "epoch": 3.030246502331779, + "grad_norm": 17.435476303100586, + "learning_rate": 3.719742393959583e-05, + "loss": 1.4846, + "step": 4960 + }, + { + "epoch": 3.0309127248500998, + "grad_norm": 15.683967590332031, + "learning_rate": 3.7160411577466874e-05, + "loss": 1.6661, + "step": 4970 + }, + { + "epoch": 3.031578947368421, + "grad_norm": 14.337061882019043, + "learning_rate": 3.712339921533792e-05, + "loss": 1.4514, + "step": 4980 + }, + { + "epoch": 3.032245169886742, + "grad_norm": 12.065239906311035, + "learning_rate": 3.708638685320897e-05, + "loss": 1.394, + "step": 4990 + }, + { + "epoch": 3.0329113924050635, + "grad_norm": 8.80199909210205, + "learning_rate": 3.704937449108002e-05, + "loss": 1.5229, + "step": 5000 + }, + { + "epoch": 3.0335776149233844, + "grad_norm": 12.470699310302734, + "learning_rate": 3.701236212895107e-05, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.0342438374417053, + "grad_norm": 16.140684127807617, + "learning_rate": 3.697534976682212e-05, + "loss": 1.6443, + "step": 5020 + }, + { + "epoch": 3.0349100599600267, + "grad_norm": 8.777125358581543, + "learning_rate": 3.693833740469317e-05, + "loss": 1.556, + "step": 5030 + }, + { + "epoch": 3.0355762824783477, + "grad_norm": 18.180103302001953, + "learning_rate": 3.690132504256422e-05, + "loss": 1.5939, + "step": 5040 + }, + { + "epoch": 3.036242504996669, + "grad_norm": 14.178755760192871, + "learning_rate": 3.6864312680435266e-05, + "loss": 1.6334, + "step": 5050 + }, + { + "epoch": 3.03690872751499, + "grad_norm": 12.896017074584961, + "learning_rate": 3.682730031830632e-05, + "loss": 1.7531, + "step": 5060 + }, + { + "epoch": 3.037574950033311, + "grad_norm": 9.471027374267578, + "learning_rate": 3.679028795617737e-05, + "loss": 1.323, + "step": 5070 + }, + { + "epoch": 3.0382411725516323, + "grad_norm": 12.745383262634277, + "learning_rate": 3.6753275594048415e-05, + "loss": 1.8436, + "step": 5080 + }, + { + "epoch": 3.038907395069953, + "grad_norm": 13.957222938537598, + "learning_rate": 3.671626323191946e-05, + "loss": 1.5402, + "step": 5090 + }, + { + "epoch": 3.0395736175882746, + "grad_norm": 9.582544326782227, + "learning_rate": 3.667925086979051e-05, + "loss": 1.4357, + "step": 5100 + }, + { + "epoch": 3.0402398401065955, + "grad_norm": 10.9628324508667, + "learning_rate": 3.664223850766156e-05, + "loss": 1.7103, + "step": 5110 + }, + { + "epoch": 3.040906062624917, + "grad_norm": 9.880194664001465, + "learning_rate": 3.660522614553261e-05, + "loss": 1.5198, + "step": 5120 + }, + { + "epoch": 3.041572285143238, + "grad_norm": 10.77079963684082, + "learning_rate": 3.656821378340366e-05, + "loss": 1.6108, + "step": 5130 + }, + { + "epoch": 3.042238507661559, + "grad_norm": 18.81410026550293, + "learning_rate": 3.6531201421274706e-05, + "loss": 1.4817, + "step": 5140 + }, + { + "epoch": 3.04290473017988, + "grad_norm": 
12.222564697265625, + "learning_rate": 3.649418905914575e-05, + "loss": 1.182, + "step": 5150 + }, + { + "epoch": 3.043570952698201, + "grad_norm": 14.021005630493164, + "learning_rate": 3.645717669701681e-05, + "loss": 1.418, + "step": 5160 + }, + { + "epoch": 3.0442371752165225, + "grad_norm": 15.33260726928711, + "learning_rate": 3.6420164334887854e-05, + "loss": 1.7835, + "step": 5170 + }, + { + "epoch": 3.0449033977348434, + "grad_norm": 25.292016983032227, + "learning_rate": 3.638315197275891e-05, + "loss": 1.7755, + "step": 5180 + }, + { + "epoch": 3.0455696202531644, + "grad_norm": 13.377341270446777, + "learning_rate": 3.6346139610629956e-05, + "loss": 1.5004, + "step": 5190 + }, + { + "epoch": 3.0462358427714857, + "grad_norm": 16.162139892578125, + "learning_rate": 3.6309127248501e-05, + "loss": 1.6479, + "step": 5200 + }, + { + "epoch": 3.0469020652898067, + "grad_norm": 17.52812957763672, + "learning_rate": 3.627211488637205e-05, + "loss": 1.4474, + "step": 5210 + }, + { + "epoch": 3.047568287808128, + "grad_norm": 13.561515808105469, + "learning_rate": 3.62351025242431e-05, + "loss": 1.4563, + "step": 5220 + }, + { + "epoch": 3.048234510326449, + "grad_norm": 10.305264472961426, + "learning_rate": 3.6198090162114145e-05, + "loss": 1.4348, + "step": 5230 + }, + { + "epoch": 3.0489007328447704, + "grad_norm": 17.934368133544922, + "learning_rate": 3.61610777999852e-05, + "loss": 1.3775, + "step": 5240 + }, + { + "epoch": 3.0495669553630913, + "grad_norm": 14.757049560546875, + "learning_rate": 3.6124065437856246e-05, + "loss": 1.5544, + "step": 5250 + }, + { + "epoch": 3.0502331778814122, + "grad_norm": 12.461883544921875, + "learning_rate": 3.6087053075727294e-05, + "loss": 1.4527, + "step": 5260 + }, + { + "epoch": 3.0508994003997336, + "grad_norm": 10.8318452835083, + "learning_rate": 3.605004071359834e-05, + "loss": 1.8238, + "step": 5270 + }, + { + "epoch": 3.0515656229180546, + "grad_norm": 11.785452842712402, + "learning_rate": 3.601302835146939e-05, + "loss": 1.4037, + "step": 5280 + }, + { + "epoch": 3.052231845436376, + "grad_norm": 15.148716926574707, + "learning_rate": 3.597601598934044e-05, + "loss": 1.5518, + "step": 5290 + }, + { + "epoch": 3.052898067954697, + "grad_norm": 7.458914756774902, + "learning_rate": 3.593900362721149e-05, + "loss": 1.5001, + "step": 5300 + }, + { + "epoch": 3.053564290473018, + "grad_norm": 13.059767723083496, + "learning_rate": 3.5901991265082544e-05, + "loss": 1.4981, + "step": 5310 + }, + { + "epoch": 3.054230512991339, + "grad_norm": 12.219048500061035, + "learning_rate": 3.586497890295359e-05, + "loss": 1.355, + "step": 5320 + }, + { + "epoch": 3.05489673550966, + "grad_norm": 16.16537094116211, + "learning_rate": 3.582796654082464e-05, + "loss": 1.5513, + "step": 5330 + }, + { + "epoch": 3.0555629580279815, + "grad_norm": 16.25689125061035, + "learning_rate": 3.5790954178695686e-05, + "loss": 1.4732, + "step": 5340 + }, + { + "epoch": 3.0562291805463024, + "grad_norm": 8.901713371276855, + "learning_rate": 3.575394181656673e-05, + "loss": 1.3732, + "step": 5350 + }, + { + "epoch": 3.0568954030646234, + "grad_norm": 13.870367050170898, + "learning_rate": 3.571692945443779e-05, + "loss": 1.6119, + "step": 5360 + }, + { + "epoch": 3.0575616255829448, + "grad_norm": 13.626014709472656, + "learning_rate": 3.5679917092308834e-05, + "loss": 1.5496, + "step": 5370 + }, + { + "epoch": 3.0582278481012657, + "grad_norm": 11.13468074798584, + "learning_rate": 3.564290473017988e-05, + "loss": 1.4368, + "step": 5380 + }, + { + "epoch": 
3.058894070619587, + "grad_norm": 16.570789337158203, + "learning_rate": 3.560589236805093e-05, + "loss": 1.4659, + "step": 5390 + }, + { + "epoch": 3.059560293137908, + "grad_norm": 21.184246063232422, + "learning_rate": 3.5568880005921976e-05, + "loss": 1.311, + "step": 5400 + }, + { + "epoch": 3.0602265156562294, + "grad_norm": 13.80511474609375, + "learning_rate": 3.5531867643793023e-05, + "loss": 1.3566, + "step": 5410 + }, + { + "epoch": 3.0608927381745503, + "grad_norm": 15.659733772277832, + "learning_rate": 3.549485528166408e-05, + "loss": 1.3253, + "step": 5420 + }, + { + "epoch": 3.0615589606928713, + "grad_norm": 15.352439880371094, + "learning_rate": 3.5457842919535125e-05, + "loss": 1.7833, + "step": 5430 + }, + { + "epoch": 3.0622251832111926, + "grad_norm": 12.850274085998535, + "learning_rate": 3.542083055740618e-05, + "loss": 1.4901, + "step": 5440 + }, + { + "epoch": 3.0628914057295136, + "grad_norm": 25.66019630432129, + "learning_rate": 3.5383818195277226e-05, + "loss": 1.464, + "step": 5450 + }, + { + "epoch": 3.063557628247835, + "grad_norm": 11.96265983581543, + "learning_rate": 3.5346805833148274e-05, + "loss": 1.3499, + "step": 5460 + }, + { + "epoch": 3.064223850766156, + "grad_norm": 8.351175308227539, + "learning_rate": 3.530979347101932e-05, + "loss": 1.5202, + "step": 5470 + }, + { + "epoch": 3.064890073284477, + "grad_norm": 14.557748794555664, + "learning_rate": 3.5272781108890375e-05, + "loss": 1.4381, + "step": 5480 + }, + { + "epoch": 3.065556295802798, + "grad_norm": 11.688433647155762, + "learning_rate": 3.523576874676142e-05, + "loss": 1.5388, + "step": 5490 + }, + { + "epoch": 3.066222518321119, + "grad_norm": 11.493616104125977, + "learning_rate": 3.519875638463247e-05, + "loss": 1.5093, + "step": 5500 + }, + { + "epoch": 3.0668887408394405, + "grad_norm": 13.362151145935059, + "learning_rate": 3.516174402250352e-05, + "loss": 1.5642, + "step": 5510 + }, + { + "epoch": 3.0675549633577615, + "grad_norm": 11.954153060913086, + "learning_rate": 3.5124731660374564e-05, + "loss": 1.4481, + "step": 5520 + }, + { + "epoch": 3.068221185876083, + "grad_norm": 20.144615173339844, + "learning_rate": 3.508771929824561e-05, + "loss": 1.2268, + "step": 5530 + }, + { + "epoch": 3.0688874083944038, + "grad_norm": 14.94782543182373, + "learning_rate": 3.5050706936116666e-05, + "loss": 1.5983, + "step": 5540 + }, + { + "epoch": 3.0695536309127247, + "grad_norm": 20.14879608154297, + "learning_rate": 3.501369457398771e-05, + "loss": 1.5956, + "step": 5550 + }, + { + "epoch": 3.070219853431046, + "grad_norm": 17.22555923461914, + "learning_rate": 3.497668221185876e-05, + "loss": 1.6973, + "step": 5560 + }, + { + "epoch": 3.070886075949367, + "grad_norm": 11.193543434143066, + "learning_rate": 3.4939669849729814e-05, + "loss": 1.4243, + "step": 5570 + }, + { + "epoch": 3.0715522984676884, + "grad_norm": 15.382784843444824, + "learning_rate": 3.490265748760086e-05, + "loss": 1.7221, + "step": 5580 + }, + { + "epoch": 3.0722185209860093, + "grad_norm": 13.465352058410645, + "learning_rate": 3.486564512547191e-05, + "loss": 1.4708, + "step": 5590 + }, + { + "epoch": 3.0728847435043303, + "grad_norm": 18.847583770751953, + "learning_rate": 3.482863276334296e-05, + "loss": 1.652, + "step": 5600 + }, + { + "epoch": 3.0735509660226517, + "grad_norm": 13.221688270568848, + "learning_rate": 3.479162040121401e-05, + "loss": 1.4341, + "step": 5610 + }, + { + "epoch": 3.0742171885409726, + "grad_norm": 13.010218620300293, + "learning_rate": 3.475460803908506e-05, + "loss": 1.3999, 
+ "step": 5620 + }, + { + "epoch": 3.074883411059294, + "grad_norm": 12.14547061920166, + "learning_rate": 3.4717595676956105e-05, + "loss": 1.5665, + "step": 5630 + }, + { + "epoch": 3.075549633577615, + "grad_norm": 21.72825813293457, + "learning_rate": 3.468058331482715e-05, + "loss": 1.6283, + "step": 5640 + }, + { + "epoch": 3.076215856095936, + "grad_norm": 18.014970779418945, + "learning_rate": 3.46435709526982e-05, + "loss": 1.6392, + "step": 5650 + }, + { + "epoch": 3.076882078614257, + "grad_norm": 22.04119110107422, + "learning_rate": 3.460655859056925e-05, + "loss": 1.4623, + "step": 5660 + }, + { + "epoch": 3.077548301132578, + "grad_norm": 9.21822738647461, + "learning_rate": 3.45695462284403e-05, + "loss": 1.3862, + "step": 5670 + }, + { + "epoch": 3.0782145236508995, + "grad_norm": 10.164238929748535, + "learning_rate": 3.453253386631135e-05, + "loss": 1.6107, + "step": 5680 + }, + { + "epoch": 3.0788807461692205, + "grad_norm": 11.371834754943848, + "learning_rate": 3.4495521504182395e-05, + "loss": 1.41, + "step": 5690 + }, + { + "epoch": 3.079546968687542, + "grad_norm": 12.127604484558105, + "learning_rate": 3.445850914205345e-05, + "loss": 1.6278, + "step": 5700 + }, + { + "epoch": 3.080213191205863, + "grad_norm": 9.186652183532715, + "learning_rate": 3.44214967799245e-05, + "loss": 1.3286, + "step": 5710 + }, + { + "epoch": 3.0808794137241837, + "grad_norm": 12.061607360839844, + "learning_rate": 3.4384484417795544e-05, + "loss": 1.2818, + "step": 5720 + }, + { + "epoch": 3.081545636242505, + "grad_norm": 17.80367088317871, + "learning_rate": 3.43474720556666e-05, + "loss": 1.3538, + "step": 5730 + }, + { + "epoch": 3.082211858760826, + "grad_norm": 14.947989463806152, + "learning_rate": 3.4310459693537645e-05, + "loss": 1.6617, + "step": 5740 + }, + { + "epoch": 3.0828780812791474, + "grad_norm": 14.401968002319336, + "learning_rate": 3.427344733140869e-05, + "loss": 1.3803, + "step": 5750 + }, + { + "epoch": 3.0835443037974684, + "grad_norm": 9.967034339904785, + "learning_rate": 3.423643496927974e-05, + "loss": 1.3943, + "step": 5760 + }, + { + "epoch": 3.0842105263157893, + "grad_norm": 9.973755836486816, + "learning_rate": 3.419942260715079e-05, + "loss": 1.6706, + "step": 5770 + }, + { + "epoch": 3.0848767488341107, + "grad_norm": 16.958301544189453, + "learning_rate": 3.4162410245021835e-05, + "loss": 1.5059, + "step": 5780 + }, + { + "epoch": 3.0855429713524316, + "grad_norm": 15.443440437316895, + "learning_rate": 3.412539788289289e-05, + "loss": 1.3541, + "step": 5790 + }, + { + "epoch": 3.086209193870753, + "grad_norm": 24.35504913330078, + "learning_rate": 3.4088385520763936e-05, + "loss": 1.2597, + "step": 5800 + }, + { + "epoch": 3.086875416389074, + "grad_norm": 11.669227600097656, + "learning_rate": 3.405137315863498e-05, + "loss": 1.4889, + "step": 5810 + }, + { + "epoch": 3.087541638907395, + "grad_norm": 18.460765838623047, + "learning_rate": 3.401436079650603e-05, + "loss": 1.6406, + "step": 5820 + }, + { + "epoch": 3.0882078614257162, + "grad_norm": 8.799760818481445, + "learning_rate": 3.3977348434377085e-05, + "loss": 1.3655, + "step": 5830 + }, + { + "epoch": 3.088874083944037, + "grad_norm": 14.372369766235352, + "learning_rate": 3.394033607224813e-05, + "loss": 1.4535, + "step": 5840 + }, + { + "epoch": 3.0895403064623586, + "grad_norm": 16.446149826049805, + "learning_rate": 3.3903323710119186e-05, + "loss": 1.5246, + "step": 5850 + }, + { + "epoch": 3.0902065289806795, + "grad_norm": 13.568804740905762, + "learning_rate": 
3.3866311347990233e-05, + "loss": 1.5568, + "step": 5860 + }, + { + "epoch": 3.090872751499001, + "grad_norm": 13.1392240524292, + "learning_rate": 3.382929898586128e-05, + "loss": 1.4523, + "step": 5870 + }, + { + "epoch": 3.091538974017322, + "grad_norm": 15.326749801635742, + "learning_rate": 3.379228662373233e-05, + "loss": 1.3054, + "step": 5880 + }, + { + "epoch": 3.0922051965356427, + "grad_norm": 8.836103439331055, + "learning_rate": 3.3755274261603375e-05, + "loss": 1.361, + "step": 5890 + }, + { + "epoch": 3.092871419053964, + "grad_norm": 16.035789489746094, + "learning_rate": 3.371826189947442e-05, + "loss": 1.4981, + "step": 5900 + }, + { + "epoch": 3.093537641572285, + "grad_norm": 15.998915672302246, + "learning_rate": 3.368124953734548e-05, + "loss": 1.5503, + "step": 5910 + }, + { + "epoch": 3.0942038640906064, + "grad_norm": 13.637360572814941, + "learning_rate": 3.3644237175216524e-05, + "loss": 1.4543, + "step": 5920 + }, + { + "epoch": 3.0948700866089274, + "grad_norm": 13.598423957824707, + "learning_rate": 3.360722481308757e-05, + "loss": 1.7996, + "step": 5930 + }, + { + "epoch": 3.0955363091272483, + "grad_norm": 10.546525001525879, + "learning_rate": 3.357021245095862e-05, + "loss": 1.3868, + "step": 5940 + }, + { + "epoch": 3.0962025316455697, + "grad_norm": 14.470475196838379, + "learning_rate": 3.353320008882967e-05, + "loss": 1.517, + "step": 5950 + }, + { + "epoch": 3.0968687541638906, + "grad_norm": 9.543516159057617, + "learning_rate": 3.349618772670072e-05, + "loss": 1.3, + "step": 5960 + }, + { + "epoch": 3.097534976682212, + "grad_norm": 12.861827850341797, + "learning_rate": 3.3459175364571774e-05, + "loss": 1.7645, + "step": 5970 + }, + { + "epoch": 3.098201199200533, + "grad_norm": 12.052502632141113, + "learning_rate": 3.342216300244282e-05, + "loss": 1.6164, + "step": 5980 + }, + { + "epoch": 3.098867421718854, + "grad_norm": 11.314814567565918, + "learning_rate": 3.338515064031387e-05, + "loss": 1.2414, + "step": 5990 + }, + { + "epoch": 3.0995336442371753, + "grad_norm": 21.638168334960938, + "learning_rate": 3.3348138278184916e-05, + "loss": 1.5381, + "step": 6000 + }, + { + "epoch": 3.1000666222518323, + "eval_accuracy": 0.4262461851475076, + "eval_loss": 1.8795864582061768, + "eval_runtime": 930.084, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 0.397, + "step": 6008 + }, + { + "epoch": 4.000133244503664, + "grad_norm": 22.866823196411133, + "learning_rate": 3.331112591605596e-05, + "loss": 1.2868, + "step": 6010 + }, + { + "epoch": 4.000799467021985, + "grad_norm": 13.173073768615723, + "learning_rate": 3.327411355392701e-05, + "loss": 1.3054, + "step": 6020 + }, + { + "epoch": 4.001465689540306, + "grad_norm": 9.517548561096191, + "learning_rate": 3.3237101191798065e-05, + "loss": 1.3582, + "step": 6030 + }, + { + "epoch": 4.002131912058627, + "grad_norm": 16.31472396850586, + "learning_rate": 3.320008882966911e-05, + "loss": 1.3807, + "step": 6040 + }, + { + "epoch": 4.002798134576949, + "grad_norm": 15.690023422241211, + "learning_rate": 3.316307646754016e-05, + "loss": 1.3149, + "step": 6050 + }, + { + "epoch": 4.00346435709527, + "grad_norm": 10.64606761932373, + "learning_rate": 3.3126064105411207e-05, + "loss": 1.1538, + "step": 6060 + }, + { + "epoch": 4.004130579613591, + "grad_norm": 15.117137908935547, + "learning_rate": 3.3089051743282254e-05, + "loss": 1.4487, + "step": 6070 + }, + { + "epoch": 4.004796802131912, + "grad_norm": 8.122971534729004, + "learning_rate": 3.305203938115331e-05, + "loss": 1.4562, + 
"step": 6080 + }, + { + "epoch": 4.005463024650233, + "grad_norm": 6.153361797332764, + "learning_rate": 3.3015027019024355e-05, + "loss": 1.2875, + "step": 6090 + }, + { + "epoch": 4.006129247168555, + "grad_norm": 10.3196439743042, + "learning_rate": 3.297801465689541e-05, + "loss": 1.4453, + "step": 6100 + }, + { + "epoch": 4.0067954696868755, + "grad_norm": 10.312451362609863, + "learning_rate": 3.294100229476646e-05, + "loss": 1.5337, + "step": 6110 + }, + { + "epoch": 4.007461692205196, + "grad_norm": 16.3599853515625, + "learning_rate": 3.2903989932637504e-05, + "loss": 1.4675, + "step": 6120 + }, + { + "epoch": 4.008127914723517, + "grad_norm": 29.082304000854492, + "learning_rate": 3.286697757050855e-05, + "loss": 1.5648, + "step": 6130 + }, + { + "epoch": 4.008794137241839, + "grad_norm": 12.260856628417969, + "learning_rate": 3.28299652083796e-05, + "loss": 1.4563, + "step": 6140 + }, + { + "epoch": 4.00946035976016, + "grad_norm": 14.735239028930664, + "learning_rate": 3.279295284625065e-05, + "loss": 1.5013, + "step": 6150 + }, + { + "epoch": 4.010126582278481, + "grad_norm": 12.416278839111328, + "learning_rate": 3.27559404841217e-05, + "loss": 1.4909, + "step": 6160 + }, + { + "epoch": 4.010792804796802, + "grad_norm": 19.984357833862305, + "learning_rate": 3.271892812199275e-05, + "loss": 1.4101, + "step": 6170 + }, + { + "epoch": 4.011459027315123, + "grad_norm": 21.226810455322266, + "learning_rate": 3.2681915759863795e-05, + "loss": 1.7346, + "step": 6180 + }, + { + "epoch": 4.012125249833445, + "grad_norm": 22.12054443359375, + "learning_rate": 3.264490339773484e-05, + "loss": 1.7142, + "step": 6190 + }, + { + "epoch": 4.012791472351766, + "grad_norm": 13.617444038391113, + "learning_rate": 3.260789103560589e-05, + "loss": 1.5081, + "step": 6200 + }, + { + "epoch": 4.013457694870087, + "grad_norm": 12.599283218383789, + "learning_rate": 3.257087867347694e-05, + "loss": 1.5396, + "step": 6210 + }, + { + "epoch": 4.014123917388408, + "grad_norm": 19.034812927246094, + "learning_rate": 3.253386631134799e-05, + "loss": 1.4477, + "step": 6220 + }, + { + "epoch": 4.0147901399067285, + "grad_norm": 11.019577980041504, + "learning_rate": 3.2496853949219045e-05, + "loss": 1.0688, + "step": 6230 + }, + { + "epoch": 4.01545636242505, + "grad_norm": 17.27197265625, + "learning_rate": 3.245984158709009e-05, + "loss": 1.3635, + "step": 6240 + }, + { + "epoch": 4.016122584943371, + "grad_norm": 12.043231964111328, + "learning_rate": 3.242282922496114e-05, + "loss": 1.5222, + "step": 6250 + }, + { + "epoch": 4.016788807461692, + "grad_norm": 35.01735305786133, + "learning_rate": 3.2385816862832187e-05, + "loss": 1.6137, + "step": 6260 + }, + { + "epoch": 4.017455029980013, + "grad_norm": 12.604266166687012, + "learning_rate": 3.234880450070324e-05, + "loss": 1.3264, + "step": 6270 + }, + { + "epoch": 4.018121252498334, + "grad_norm": 12.331528663635254, + "learning_rate": 3.231179213857429e-05, + "loss": 1.3519, + "step": 6280 + }, + { + "epoch": 4.018787475016656, + "grad_norm": 17.354236602783203, + "learning_rate": 3.2274779776445335e-05, + "loss": 1.3254, + "step": 6290 + }, + { + "epoch": 4.019453697534977, + "grad_norm": 14.423813819885254, + "learning_rate": 3.223776741431638e-05, + "loss": 1.2438, + "step": 6300 + }, + { + "epoch": 4.020119920053298, + "grad_norm": 18.979324340820312, + "learning_rate": 3.220075505218743e-05, + "loss": 1.1656, + "step": 6310 + }, + { + "epoch": 4.020786142571619, + "grad_norm": 17.50141143798828, + "learning_rate": 3.216374269005848e-05, + 
"loss": 1.2051, + "step": 6320 + }, + { + "epoch": 4.02145236508994, + "grad_norm": 13.537330627441406, + "learning_rate": 3.2126730327929524e-05, + "loss": 1.4187, + "step": 6330 + }, + { + "epoch": 4.0221185876082615, + "grad_norm": 10.338102340698242, + "learning_rate": 3.208971796580058e-05, + "loss": 1.3292, + "step": 6340 + }, + { + "epoch": 4.022784810126582, + "grad_norm": 13.946428298950195, + "learning_rate": 3.2052705603671626e-05, + "loss": 1.4169, + "step": 6350 + }, + { + "epoch": 4.023451032644903, + "grad_norm": 14.485420227050781, + "learning_rate": 3.201569324154268e-05, + "loss": 1.579, + "step": 6360 + }, + { + "epoch": 4.024117255163224, + "grad_norm": 17.42156219482422, + "learning_rate": 3.197868087941373e-05, + "loss": 1.5805, + "step": 6370 + }, + { + "epoch": 4.024783477681545, + "grad_norm": 10.080490112304688, + "learning_rate": 3.1941668517284775e-05, + "loss": 1.3274, + "step": 6380 + }, + { + "epoch": 4.025449700199867, + "grad_norm": 15.428861618041992, + "learning_rate": 3.190465615515582e-05, + "loss": 1.3294, + "step": 6390 + }, + { + "epoch": 4.026115922718188, + "grad_norm": 7.290970802307129, + "learning_rate": 3.1867643793026876e-05, + "loss": 1.3107, + "step": 6400 + }, + { + "epoch": 4.026782145236509, + "grad_norm": 16.720703125, + "learning_rate": 3.183063143089792e-05, + "loss": 1.1085, + "step": 6410 + }, + { + "epoch": 4.02744836775483, + "grad_norm": 16.478504180908203, + "learning_rate": 3.179361906876897e-05, + "loss": 1.3723, + "step": 6420 + }, + { + "epoch": 4.028114590273152, + "grad_norm": 12.011799812316895, + "learning_rate": 3.175660670664002e-05, + "loss": 1.0051, + "step": 6430 + }, + { + "epoch": 4.028780812791473, + "grad_norm": 13.277886390686035, + "learning_rate": 3.1719594344511065e-05, + "loss": 1.2938, + "step": 6440 + }, + { + "epoch": 4.0294470353097935, + "grad_norm": 19.252532958984375, + "learning_rate": 3.168258198238211e-05, + "loss": 1.3156, + "step": 6450 + }, + { + "epoch": 4.0301132578281145, + "grad_norm": 16.14283561706543, + "learning_rate": 3.1645569620253167e-05, + "loss": 1.5451, + "step": 6460 + }, + { + "epoch": 4.030779480346435, + "grad_norm": 20.538185119628906, + "learning_rate": 3.1608557258124214e-05, + "loss": 1.4057, + "step": 6470 + }, + { + "epoch": 4.031445702864757, + "grad_norm": 22.111997604370117, + "learning_rate": 3.157154489599526e-05, + "loss": 1.2072, + "step": 6480 + }, + { + "epoch": 4.032111925383078, + "grad_norm": 14.090339660644531, + "learning_rate": 3.1534532533866315e-05, + "loss": 1.3606, + "step": 6490 + }, + { + "epoch": 4.032778147901399, + "grad_norm": 14.041748046875, + "learning_rate": 3.149752017173736e-05, + "loss": 1.3211, + "step": 6500 + }, + { + "epoch": 4.03344437041972, + "grad_norm": 6.82070255279541, + "learning_rate": 3.146050780960841e-05, + "loss": 1.3355, + "step": 6510 + }, + { + "epoch": 4.034110592938041, + "grad_norm": 20.088518142700195, + "learning_rate": 3.1423495447479464e-05, + "loss": 1.6087, + "step": 6520 + }, + { + "epoch": 4.034776815456363, + "grad_norm": 11.49791145324707, + "learning_rate": 3.138648308535051e-05, + "loss": 1.3514, + "step": 6530 + }, + { + "epoch": 4.035443037974684, + "grad_norm": 12.951839447021484, + "learning_rate": 3.134947072322156e-05, + "loss": 1.2688, + "step": 6540 + }, + { + "epoch": 4.036109260493005, + "grad_norm": 21.329265594482422, + "learning_rate": 3.1312458361092606e-05, + "loss": 1.2153, + "step": 6550 + }, + { + "epoch": 4.036775483011326, + "grad_norm": 11.322661399841309, + "learning_rate": 
3.127544599896365e-05, + "loss": 1.2848, + "step": 6560 + }, + { + "epoch": 4.0374417055296465, + "grad_norm": 13.17877197265625, + "learning_rate": 3.12384336368347e-05, + "loss": 1.2915, + "step": 6570 + }, + { + "epoch": 4.038107928047968, + "grad_norm": 18.409740447998047, + "learning_rate": 3.1201421274705754e-05, + "loss": 1.2272, + "step": 6580 + }, + { + "epoch": 4.038774150566289, + "grad_norm": 11.204869270324707, + "learning_rate": 3.11644089125768e-05, + "loss": 1.5458, + "step": 6590 + }, + { + "epoch": 4.03944037308461, + "grad_norm": 23.34467315673828, + "learning_rate": 3.112739655044785e-05, + "loss": 1.3412, + "step": 6600 + }, + { + "epoch": 4.040106595602931, + "grad_norm": 19.538471221923828, + "learning_rate": 3.1090384188318896e-05, + "loss": 1.2632, + "step": 6610 + }, + { + "epoch": 4.040772818121252, + "grad_norm": 19.623435974121094, + "learning_rate": 3.105337182618995e-05, + "loss": 1.3473, + "step": 6620 + }, + { + "epoch": 4.041439040639574, + "grad_norm": 13.31259536743164, + "learning_rate": 3.1016359464061e-05, + "loss": 1.4039, + "step": 6630 + }, + { + "epoch": 4.042105263157895, + "grad_norm": 11.650578498840332, + "learning_rate": 3.097934710193205e-05, + "loss": 1.2364, + "step": 6640 + }, + { + "epoch": 4.042771485676216, + "grad_norm": 13.556853294372559, + "learning_rate": 3.09423347398031e-05, + "loss": 1.199, + "step": 6650 + }, + { + "epoch": 4.043437708194537, + "grad_norm": 15.513994216918945, + "learning_rate": 3.0905322377674146e-05, + "loss": 1.25, + "step": 6660 + }, + { + "epoch": 4.044103930712858, + "grad_norm": 16.894397735595703, + "learning_rate": 3.0868310015545194e-05, + "loss": 1.5079, + "step": 6670 + }, + { + "epoch": 4.0447701532311795, + "grad_norm": 20.541175842285156, + "learning_rate": 3.083129765341624e-05, + "loss": 1.3052, + "step": 6680 + }, + { + "epoch": 4.0454363757495, + "grad_norm": 14.237310409545898, + "learning_rate": 3.079428529128729e-05, + "loss": 1.2968, + "step": 6690 + }, + { + "epoch": 4.046102598267821, + "grad_norm": 18.40723991394043, + "learning_rate": 3.075727292915834e-05, + "loss": 1.3942, + "step": 6700 + }, + { + "epoch": 4.046768820786142, + "grad_norm": 15.848162651062012, + "learning_rate": 3.072026056702939e-05, + "loss": 1.3092, + "step": 6710 + }, + { + "epoch": 4.047435043304464, + "grad_norm": 9.55514144897461, + "learning_rate": 3.068324820490044e-05, + "loss": 1.3233, + "step": 6720 + }, + { + "epoch": 4.048101265822785, + "grad_norm": 12.167716979980469, + "learning_rate": 3.0646235842771484e-05, + "loss": 1.296, + "step": 6730 + }, + { + "epoch": 4.048767488341106, + "grad_norm": 16.108800888061523, + "learning_rate": 3.060922348064253e-05, + "loss": 1.6003, + "step": 6740 + }, + { + "epoch": 4.049433710859427, + "grad_norm": 15.742432594299316, + "learning_rate": 3.0572211118513586e-05, + "loss": 1.4423, + "step": 6750 + }, + { + "epoch": 4.050099933377748, + "grad_norm": 16.92340660095215, + "learning_rate": 3.053519875638464e-05, + "loss": 1.5272, + "step": 6760 + }, + { + "epoch": 4.05076615589607, + "grad_norm": 15.265058517456055, + "learning_rate": 3.0498186394255684e-05, + "loss": 1.4342, + "step": 6770 + }, + { + "epoch": 4.051432378414391, + "grad_norm": 12.96069622039795, + "learning_rate": 3.046117403212673e-05, + "loss": 1.24, + "step": 6780 + }, + { + "epoch": 4.052098600932712, + "grad_norm": 17.07809829711914, + "learning_rate": 3.0424161669997782e-05, + "loss": 1.2475, + "step": 6790 + }, + { + "epoch": 4.0527648234510325, + "grad_norm": 14.411175727844238, + 
"learning_rate": 3.038714930786883e-05, + "loss": 1.2367, + "step": 6800 + }, + { + "epoch": 4.053431045969353, + "grad_norm": 16.544776916503906, + "learning_rate": 3.0350136945739876e-05, + "loss": 1.4367, + "step": 6810 + }, + { + "epoch": 4.054097268487675, + "grad_norm": 9.868607521057129, + "learning_rate": 3.031312458361093e-05, + "loss": 1.3964, + "step": 6820 + }, + { + "epoch": 4.054763491005996, + "grad_norm": 21.66016387939453, + "learning_rate": 3.0276112221481978e-05, + "loss": 1.3543, + "step": 6830 + }, + { + "epoch": 4.055429713524317, + "grad_norm": 20.950666427612305, + "learning_rate": 3.0239099859353025e-05, + "loss": 1.2699, + "step": 6840 + }, + { + "epoch": 4.056095936042638, + "grad_norm": 22.680036544799805, + "learning_rate": 3.0202087497224076e-05, + "loss": 1.356, + "step": 6850 + }, + { + "epoch": 4.056762158560959, + "grad_norm": 12.542525291442871, + "learning_rate": 3.0165075135095123e-05, + "loss": 1.3987, + "step": 6860 + }, + { + "epoch": 4.057428381079281, + "grad_norm": 23.960670471191406, + "learning_rate": 3.012806277296617e-05, + "loss": 1.4848, + "step": 6870 + }, + { + "epoch": 4.058094603597602, + "grad_norm": 27.134601593017578, + "learning_rate": 3.0091050410837224e-05, + "loss": 1.2249, + "step": 6880 + }, + { + "epoch": 4.058760826115923, + "grad_norm": 8.823290824890137, + "learning_rate": 3.0054038048708272e-05, + "loss": 1.5314, + "step": 6890 + }, + { + "epoch": 4.059427048634244, + "grad_norm": 13.979783058166504, + "learning_rate": 3.001702568657932e-05, + "loss": 1.7174, + "step": 6900 + }, + { + "epoch": 4.060093271152565, + "grad_norm": 14.71882438659668, + "learning_rate": 2.9980013324450366e-05, + "loss": 1.4145, + "step": 6910 + }, + { + "epoch": 4.060759493670886, + "grad_norm": 11.231287002563477, + "learning_rate": 2.9943000962321417e-05, + "loss": 1.4126, + "step": 6920 + }, + { + "epoch": 4.061425716189207, + "grad_norm": 11.53627872467041, + "learning_rate": 2.9905988600192464e-05, + "loss": 1.2493, + "step": 6930 + }, + { + "epoch": 4.062091938707528, + "grad_norm": 19.879762649536133, + "learning_rate": 2.986897623806352e-05, + "loss": 1.281, + "step": 6940 + }, + { + "epoch": 4.062758161225849, + "grad_norm": 22.309303283691406, + "learning_rate": 2.9831963875934566e-05, + "loss": 1.5756, + "step": 6950 + }, + { + "epoch": 4.06342438374417, + "grad_norm": 20.112037658691406, + "learning_rate": 2.9794951513805613e-05, + "loss": 1.4147, + "step": 6960 + }, + { + "epoch": 4.064090606262492, + "grad_norm": 27.678239822387695, + "learning_rate": 2.975793915167666e-05, + "loss": 1.3312, + "step": 6970 + }, + { + "epoch": 4.064756828780813, + "grad_norm": 21.366147994995117, + "learning_rate": 2.972092678954771e-05, + "loss": 1.4587, + "step": 6980 + }, + { + "epoch": 4.065423051299134, + "grad_norm": 17.812877655029297, + "learning_rate": 2.9683914427418758e-05, + "loss": 1.5225, + "step": 6990 + }, + { + "epoch": 4.066089273817455, + "grad_norm": 15.924351692199707, + "learning_rate": 2.9646902065289806e-05, + "loss": 1.4569, + "step": 7000 + }, + { + "epoch": 4.066755496335777, + "grad_norm": 12.777033805847168, + "learning_rate": 2.960988970316086e-05, + "loss": 1.385, + "step": 7010 + }, + { + "epoch": 4.0674217188540975, + "grad_norm": 12.325570106506348, + "learning_rate": 2.9572877341031907e-05, + "loss": 1.3896, + "step": 7020 + }, + { + "epoch": 4.0680879413724185, + "grad_norm": 15.772327423095703, + "learning_rate": 2.9535864978902954e-05, + "loss": 1.2253, + "step": 7030 + }, + { + "epoch": 4.068754163890739, + 
"grad_norm": 16.967395782470703, + "learning_rate": 2.9498852616774e-05, + "loss": 1.6147, + "step": 7040 + }, + { + "epoch": 4.06942038640906, + "grad_norm": 7.944436550140381, + "learning_rate": 2.9461840254645052e-05, + "loss": 1.1364, + "step": 7050 + }, + { + "epoch": 4.070086608927382, + "grad_norm": 15.581433296203613, + "learning_rate": 2.94248278925161e-05, + "loss": 1.1479, + "step": 7060 + }, + { + "epoch": 4.070752831445703, + "grad_norm": 9.832449913024902, + "learning_rate": 2.9387815530387154e-05, + "loss": 1.1747, + "step": 7070 + }, + { + "epoch": 4.071419053964024, + "grad_norm": 14.627313613891602, + "learning_rate": 2.93508031682582e-05, + "loss": 1.1966, + "step": 7080 + }, + { + "epoch": 4.072085276482345, + "grad_norm": 11.71273422241211, + "learning_rate": 2.9313790806129248e-05, + "loss": 1.5156, + "step": 7090 + }, + { + "epoch": 4.072751499000666, + "grad_norm": 18.158708572387695, + "learning_rate": 2.9276778444000296e-05, + "loss": 1.4667, + "step": 7100 + }, + { + "epoch": 4.073417721518988, + "grad_norm": 18.480003356933594, + "learning_rate": 2.9239766081871346e-05, + "loss": 1.3878, + "step": 7110 + }, + { + "epoch": 4.074083944037309, + "grad_norm": 12.444354057312012, + "learning_rate": 2.9202753719742394e-05, + "loss": 1.4722, + "step": 7120 + }, + { + "epoch": 4.07475016655563, + "grad_norm": 18.845603942871094, + "learning_rate": 2.9165741357613448e-05, + "loss": 1.2856, + "step": 7130 + }, + { + "epoch": 4.0754163890739505, + "grad_norm": 16.586881637573242, + "learning_rate": 2.9128728995484495e-05, + "loss": 1.0164, + "step": 7140 + }, + { + "epoch": 4.0760826115922715, + "grad_norm": 18.517322540283203, + "learning_rate": 2.9091716633355542e-05, + "loss": 1.553, + "step": 7150 + }, + { + "epoch": 4.076748834110593, + "grad_norm": 14.29059886932373, + "learning_rate": 2.905470427122659e-05, + "loss": 1.6955, + "step": 7160 + }, + { + "epoch": 4.077415056628914, + "grad_norm": 13.189104080200195, + "learning_rate": 2.9017691909097637e-05, + "loss": 1.424, + "step": 7170 + }, + { + "epoch": 4.078081279147235, + "grad_norm": 12.376962661743164, + "learning_rate": 2.8980679546968688e-05, + "loss": 1.7534, + "step": 7180 + }, + { + "epoch": 4.078747501665556, + "grad_norm": 24.547611236572266, + "learning_rate": 2.894366718483974e-05, + "loss": 1.6627, + "step": 7190 + }, + { + "epoch": 4.079413724183877, + "grad_norm": 24.342578887939453, + "learning_rate": 2.890665482271079e-05, + "loss": 1.5877, + "step": 7200 + }, + { + "epoch": 4.080079946702199, + "grad_norm": 12.825194358825684, + "learning_rate": 2.8869642460581836e-05, + "loss": 1.4401, + "step": 7210 + }, + { + "epoch": 4.08074616922052, + "grad_norm": 12.576107025146484, + "learning_rate": 2.8832630098452884e-05, + "loss": 1.2816, + "step": 7220 + }, + { + "epoch": 4.081412391738841, + "grad_norm": 16.752649307250977, + "learning_rate": 2.879561773632393e-05, + "loss": 1.3288, + "step": 7230 + }, + { + "epoch": 4.082078614257162, + "grad_norm": 30.863229751586914, + "learning_rate": 2.875860537419498e-05, + "loss": 1.3952, + "step": 7240 + }, + { + "epoch": 4.082744836775483, + "grad_norm": 15.5134916305542, + "learning_rate": 2.8721593012066032e-05, + "loss": 1.3036, + "step": 7250 + }, + { + "epoch": 4.083411059293804, + "grad_norm": 15.511784553527832, + "learning_rate": 2.8684580649937083e-05, + "loss": 1.5325, + "step": 7260 + }, + { + "epoch": 4.084077281812125, + "grad_norm": 12.838709831237793, + "learning_rate": 2.864756828780813e-05, + "loss": 1.4286, + "step": 7270 + }, + { + 
"epoch": 4.084743504330446, + "grad_norm": 12.987570762634277, + "learning_rate": 2.8610555925679178e-05, + "loss": 1.2718, + "step": 7280 + }, + { + "epoch": 4.085409726848767, + "grad_norm": 17.843914031982422, + "learning_rate": 2.8573543563550225e-05, + "loss": 1.1721, + "step": 7290 + }, + { + "epoch": 4.086075949367088, + "grad_norm": 13.999878883361816, + "learning_rate": 2.8536531201421272e-05, + "loss": 1.4786, + "step": 7300 + }, + { + "epoch": 4.08674217188541, + "grad_norm": 18.958114624023438, + "learning_rate": 2.8499518839292326e-05, + "loss": 1.4354, + "step": 7310 + }, + { + "epoch": 4.087408394403731, + "grad_norm": 13.13485336303711, + "learning_rate": 2.8462506477163377e-05, + "loss": 1.1948, + "step": 7320 + }, + { + "epoch": 4.088074616922052, + "grad_norm": 20.817176818847656, + "learning_rate": 2.8425494115034424e-05, + "loss": 1.653, + "step": 7330 + }, + { + "epoch": 4.088740839440373, + "grad_norm": 15.646084785461426, + "learning_rate": 2.838848175290547e-05, + "loss": 1.6082, + "step": 7340 + }, + { + "epoch": 4.089407061958695, + "grad_norm": 10.071416854858398, + "learning_rate": 2.835146939077652e-05, + "loss": 1.0674, + "step": 7350 + }, + { + "epoch": 4.090073284477016, + "grad_norm": 15.38576889038086, + "learning_rate": 2.8314457028647566e-05, + "loss": 1.3246, + "step": 7360 + }, + { + "epoch": 4.0907395069953365, + "grad_norm": 17.8707275390625, + "learning_rate": 2.827744466651862e-05, + "loss": 1.4139, + "step": 7370 + }, + { + "epoch": 4.091405729513657, + "grad_norm": 12.501643180847168, + "learning_rate": 2.8240432304389667e-05, + "loss": 1.2666, + "step": 7380 + }, + { + "epoch": 4.092071952031978, + "grad_norm": 11.446122169494629, + "learning_rate": 2.8203419942260718e-05, + "loss": 1.4666, + "step": 7390 + }, + { + "epoch": 4.0927381745503, + "grad_norm": 13.337681770324707, + "learning_rate": 2.8166407580131765e-05, + "loss": 1.29, + "step": 7400 + }, + { + "epoch": 4.093404397068621, + "grad_norm": 15.705299377441406, + "learning_rate": 2.8129395218002813e-05, + "loss": 1.4705, + "step": 7410 + }, + { + "epoch": 4.094070619586942, + "grad_norm": 10.4541654586792, + "learning_rate": 2.809238285587386e-05, + "loss": 1.253, + "step": 7420 + }, + { + "epoch": 4.094736842105263, + "grad_norm": 12.392687797546387, + "learning_rate": 2.8055370493744914e-05, + "loss": 1.2986, + "step": 7430 + }, + { + "epoch": 4.095403064623584, + "grad_norm": 12.397735595703125, + "learning_rate": 2.801835813161596e-05, + "loss": 1.5369, + "step": 7440 + }, + { + "epoch": 4.096069287141906, + "grad_norm": 11.043461799621582, + "learning_rate": 2.7981345769487012e-05, + "loss": 1.2619, + "step": 7450 + }, + { + "epoch": 4.096735509660227, + "grad_norm": 15.392802238464355, + "learning_rate": 2.794433340735806e-05, + "loss": 1.2489, + "step": 7460 + }, + { + "epoch": 4.097401732178548, + "grad_norm": 15.00299072265625, + "learning_rate": 2.7907321045229107e-05, + "loss": 1.5254, + "step": 7470 + }, + { + "epoch": 4.098067954696869, + "grad_norm": 10.4408597946167, + "learning_rate": 2.7870308683100154e-05, + "loss": 1.1062, + "step": 7480 + }, + { + "epoch": 4.0987341772151895, + "grad_norm": 22.96048355102539, + "learning_rate": 2.7833296320971208e-05, + "loss": 1.5428, + "step": 7490 + }, + { + "epoch": 4.099400399733511, + "grad_norm": 12.79764175415039, + "learning_rate": 2.7796283958842255e-05, + "loss": 1.2185, + "step": 7500 + }, + { + "epoch": 4.100066622251832, + "grad_norm": 67.66664123535156, + "learning_rate": 2.7759271596713303e-05, + "loss": 1.5438, + 
"step": 7510 + }, + { + "epoch": 4.100066622251832, + "eval_accuracy": 0.43981010512037977, + "eval_loss": 1.8850669860839844, + "eval_runtime": 936.3561, + "eval_samples_per_second": 3.149, + "eval_steps_per_second": 0.394, + "step": 7510 + }, + { + "epoch": 5.000666222518321, + "grad_norm": 11.612895965576172, + "learning_rate": 2.7722259234584353e-05, + "loss": 1.1533, + "step": 7520 + }, + { + "epoch": 5.001332445036642, + "grad_norm": 14.573636054992676, + "learning_rate": 2.76852468724554e-05, + "loss": 1.5313, + "step": 7530 + }, + { + "epoch": 5.001998667554964, + "grad_norm": 21.381816864013672, + "learning_rate": 2.7648234510326448e-05, + "loss": 1.168, + "step": 7540 + }, + { + "epoch": 5.002664890073285, + "grad_norm": 18.441518783569336, + "learning_rate": 2.7611222148197502e-05, + "loss": 1.2082, + "step": 7550 + }, + { + "epoch": 5.003331112591606, + "grad_norm": 16.47058868408203, + "learning_rate": 2.757420978606855e-05, + "loss": 1.4219, + "step": 7560 + }, + { + "epoch": 5.0039973351099265, + "grad_norm": 9.968449592590332, + "learning_rate": 2.7537197423939597e-05, + "loss": 1.4211, + "step": 7570 + }, + { + "epoch": 5.004663557628247, + "grad_norm": 19.501323699951172, + "learning_rate": 2.7500185061810647e-05, + "loss": 1.2441, + "step": 7580 + }, + { + "epoch": 5.005329780146569, + "grad_norm": 16.23154067993164, + "learning_rate": 2.7463172699681695e-05, + "loss": 1.5421, + "step": 7590 + }, + { + "epoch": 5.00599600266489, + "grad_norm": 22.80805778503418, + "learning_rate": 2.7426160337552742e-05, + "loss": 1.3562, + "step": 7600 + }, + { + "epoch": 5.006662225183211, + "grad_norm": 14.404147148132324, + "learning_rate": 2.7389147975423796e-05, + "loss": 1.0889, + "step": 7610 + }, + { + "epoch": 5.007328447701532, + "grad_norm": 21.835342407226562, + "learning_rate": 2.7352135613294843e-05, + "loss": 1.1422, + "step": 7620 + }, + { + "epoch": 5.007994670219853, + "grad_norm": 14.204809188842773, + "learning_rate": 2.731512325116589e-05, + "loss": 1.3133, + "step": 7630 + }, + { + "epoch": 5.008660892738175, + "grad_norm": 11.132874488830566, + "learning_rate": 2.7278110889036938e-05, + "loss": 1.1071, + "step": 7640 + }, + { + "epoch": 5.009327115256496, + "grad_norm": 12.24893856048584, + "learning_rate": 2.724109852690799e-05, + "loss": 1.2034, + "step": 7650 + }, + { + "epoch": 5.009993337774817, + "grad_norm": 12.559704780578613, + "learning_rate": 2.7204086164779036e-05, + "loss": 1.3096, + "step": 7660 + }, + { + "epoch": 5.010659560293138, + "grad_norm": 14.044257164001465, + "learning_rate": 2.7167073802650083e-05, + "loss": 1.2636, + "step": 7670 + }, + { + "epoch": 5.0113257828114595, + "grad_norm": 13.864354133605957, + "learning_rate": 2.7130061440521137e-05, + "loss": 1.5267, + "step": 7680 + }, + { + "epoch": 5.01199200532978, + "grad_norm": 21.201732635498047, + "learning_rate": 2.7093049078392185e-05, + "loss": 1.1526, + "step": 7690 + }, + { + "epoch": 5.012658227848101, + "grad_norm": 16.216150283813477, + "learning_rate": 2.7056036716263232e-05, + "loss": 1.1798, + "step": 7700 + }, + { + "epoch": 5.013324450366422, + "grad_norm": 25.177841186523438, + "learning_rate": 2.7019024354134283e-05, + "loss": 1.6048, + "step": 7710 + }, + { + "epoch": 5.013990672884743, + "grad_norm": 21.327478408813477, + "learning_rate": 2.698201199200533e-05, + "loss": 1.1576, + "step": 7720 + }, + { + "epoch": 5.014656895403065, + "grad_norm": 14.58990478515625, + "learning_rate": 2.6944999629876377e-05, + "loss": 1.4466, + "step": 7730 + }, + { + "epoch": 
5.015323117921386, + "grad_norm": 11.63847541809082, + "learning_rate": 2.690798726774743e-05, + "loss": 1.4452, + "step": 7740 + }, + { + "epoch": 5.015989340439707, + "grad_norm": 14.77788257598877, + "learning_rate": 2.687097490561848e-05, + "loss": 1.2652, + "step": 7750 + }, + { + "epoch": 5.016655562958028, + "grad_norm": 19.32585334777832, + "learning_rate": 2.6833962543489526e-05, + "loss": 1.5369, + "step": 7760 + }, + { + "epoch": 5.017321785476349, + "grad_norm": 18.672203063964844, + "learning_rate": 2.6796950181360577e-05, + "loss": 1.3586, + "step": 7770 + }, + { + "epoch": 5.017988007994671, + "grad_norm": 15.706737518310547, + "learning_rate": 2.6759937819231624e-05, + "loss": 1.2251, + "step": 7780 + }, + { + "epoch": 5.0186542305129915, + "grad_norm": 14.212475776672363, + "learning_rate": 2.672292545710267e-05, + "loss": 0.9481, + "step": 7790 + }, + { + "epoch": 5.0193204530313125, + "grad_norm": 18.241979598999023, + "learning_rate": 2.6685913094973725e-05, + "loss": 1.1286, + "step": 7800 + }, + { + "epoch": 5.019986675549633, + "grad_norm": 14.85487174987793, + "learning_rate": 2.6648900732844773e-05, + "loss": 1.2753, + "step": 7810 + }, + { + "epoch": 5.020652898067954, + "grad_norm": 18.47552490234375, + "learning_rate": 2.661188837071582e-05, + "loss": 1.2741, + "step": 7820 + }, + { + "epoch": 5.021319120586276, + "grad_norm": 10.971531867980957, + "learning_rate": 2.6574876008586867e-05, + "loss": 1.2206, + "step": 7830 + }, + { + "epoch": 5.021985343104597, + "grad_norm": 18.05440902709961, + "learning_rate": 2.6537863646457918e-05, + "loss": 1.1457, + "step": 7840 + }, + { + "epoch": 5.022651565622918, + "grad_norm": 36.85404586791992, + "learning_rate": 2.6500851284328965e-05, + "loss": 1.1997, + "step": 7850 + }, + { + "epoch": 5.023317788141239, + "grad_norm": 12.900432586669922, + "learning_rate": 2.646383892220002e-05, + "loss": 1.0813, + "step": 7860 + }, + { + "epoch": 5.02398401065956, + "grad_norm": 26.419130325317383, + "learning_rate": 2.6426826560071067e-05, + "loss": 1.3377, + "step": 7870 + }, + { + "epoch": 5.024650233177882, + "grad_norm": 19.67392921447754, + "learning_rate": 2.6389814197942114e-05, + "loss": 1.406, + "step": 7880 + }, + { + "epoch": 5.025316455696203, + "grad_norm": 24.628110885620117, + "learning_rate": 2.635280183581316e-05, + "loss": 1.5068, + "step": 7890 + }, + { + "epoch": 5.025982678214524, + "grad_norm": 18.93452262878418, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.2426, + "step": 7900 + }, + { + "epoch": 5.0266489007328445, + "grad_norm": 15.747820854187012, + "learning_rate": 2.627877711155526e-05, + "loss": 0.9804, + "step": 7910 + }, + { + "epoch": 5.0273151232511655, + "grad_norm": 13.337479591369629, + "learning_rate": 2.6241764749426313e-05, + "loss": 1.2021, + "step": 7920 + }, + { + "epoch": 5.027981345769487, + "grad_norm": 12.642459869384766, + "learning_rate": 2.620475238729736e-05, + "loss": 1.3055, + "step": 7930 + }, + { + "epoch": 5.028647568287808, + "grad_norm": 13.696338653564453, + "learning_rate": 2.6167740025168408e-05, + "loss": 1.243, + "step": 7940 + }, + { + "epoch": 5.029313790806129, + "grad_norm": 12.312226295471191, + "learning_rate": 2.6130727663039455e-05, + "loss": 1.2425, + "step": 7950 + }, + { + "epoch": 5.02998001332445, + "grad_norm": 10.619965553283691, + "learning_rate": 2.6093715300910503e-05, + "loss": 1.0635, + "step": 7960 + }, + { + "epoch": 5.030646235842772, + "grad_norm": 16.610933303833008, + "learning_rate": 2.6056702938781553e-05, + "loss": 1.2789, + 
"step": 7970 + }, + { + "epoch": 5.031312458361093, + "grad_norm": 25.85894012451172, + "learning_rate": 2.6019690576652604e-05, + "loss": 1.5233, + "step": 7980 + }, + { + "epoch": 5.031978680879414, + "grad_norm": 12.571439743041992, + "learning_rate": 2.5982678214523655e-05, + "loss": 1.2424, + "step": 7990 + }, + { + "epoch": 5.032644903397735, + "grad_norm": 20.10173225402832, + "learning_rate": 2.5945665852394702e-05, + "loss": 1.5217, + "step": 8000 + }, + { + "epoch": 5.033311125916056, + "grad_norm": 21.525365829467773, + "learning_rate": 2.590865349026575e-05, + "loss": 1.0498, + "step": 8010 + }, + { + "epoch": 5.0339773484343775, + "grad_norm": 12.425312995910645, + "learning_rate": 2.5871641128136797e-05, + "loss": 1.4057, + "step": 8020 + }, + { + "epoch": 5.034643570952698, + "grad_norm": 16.854585647583008, + "learning_rate": 2.5834628766007847e-05, + "loss": 1.3043, + "step": 8030 + }, + { + "epoch": 5.035309793471019, + "grad_norm": 22.611907958984375, + "learning_rate": 2.5797616403878898e-05, + "loss": 1.0438, + "step": 8040 + }, + { + "epoch": 5.03597601598934, + "grad_norm": 12.07515811920166, + "learning_rate": 2.576060404174995e-05, + "loss": 1.2947, + "step": 8050 + }, + { + "epoch": 5.036642238507661, + "grad_norm": 14.033244132995605, + "learning_rate": 2.5723591679620996e-05, + "loss": 1.0858, + "step": 8060 + }, + { + "epoch": 5.037308461025983, + "grad_norm": 15.033126831054688, + "learning_rate": 2.5686579317492043e-05, + "loss": 1.1159, + "step": 8070 + }, + { + "epoch": 5.037974683544304, + "grad_norm": 11.701658248901367, + "learning_rate": 2.564956695536309e-05, + "loss": 1.2648, + "step": 8080 + }, + { + "epoch": 5.038640906062625, + "grad_norm": 13.016339302062988, + "learning_rate": 2.5612554593234138e-05, + "loss": 1.2807, + "step": 8090 + }, + { + "epoch": 5.039307128580946, + "grad_norm": 17.96981430053711, + "learning_rate": 2.5575542231105192e-05, + "loss": 1.0827, + "step": 8100 + }, + { + "epoch": 5.039973351099267, + "grad_norm": 15.402466773986816, + "learning_rate": 2.553852986897624e-05, + "loss": 1.3537, + "step": 8110 + }, + { + "epoch": 5.040639573617589, + "grad_norm": 11.735275268554688, + "learning_rate": 2.550151750684729e-05, + "loss": 1.5196, + "step": 8120 + }, + { + "epoch": 5.04130579613591, + "grad_norm": 11.587544441223145, + "learning_rate": 2.5464505144718337e-05, + "loss": 1.3281, + "step": 8130 + }, + { + "epoch": 5.0419720186542305, + "grad_norm": 17.989864349365234, + "learning_rate": 2.5427492782589385e-05, + "loss": 1.5483, + "step": 8140 + }, + { + "epoch": 5.042638241172551, + "grad_norm": 23.9891414642334, + "learning_rate": 2.5390480420460432e-05, + "loss": 1.1688, + "step": 8150 + }, + { + "epoch": 5.043304463690872, + "grad_norm": 21.458175659179688, + "learning_rate": 2.5353468058331486e-05, + "loss": 1.3983, + "step": 8160 + }, + { + "epoch": 5.043970686209194, + "grad_norm": 13.542852401733398, + "learning_rate": 2.5316455696202533e-05, + "loss": 1.3188, + "step": 8170 + }, + { + "epoch": 5.044636908727515, + "grad_norm": 11.665117263793945, + "learning_rate": 2.5279443334073584e-05, + "loss": 1.0982, + "step": 8180 + }, + { + "epoch": 5.045303131245836, + "grad_norm": 10.42524242401123, + "learning_rate": 2.524243097194463e-05, + "loss": 1.5062, + "step": 8190 + }, + { + "epoch": 5.045969353764157, + "grad_norm": 18.53445816040039, + "learning_rate": 2.520541860981568e-05, + "loss": 1.1695, + "step": 8200 + }, + { + "epoch": 5.046635576282478, + "grad_norm": 17.51700210571289, + "learning_rate": 
2.5168406247686726e-05, + "loss": 1.3786, + "step": 8210 + }, + { + "epoch": 5.0473017988008, + "grad_norm": 17.457183837890625, + "learning_rate": 2.513139388555778e-05, + "loss": 1.212, + "step": 8220 + }, + { + "epoch": 5.047968021319121, + "grad_norm": 14.491168022155762, + "learning_rate": 2.5094381523428827e-05, + "loss": 1.1946, + "step": 8230 + }, + { + "epoch": 5.048634243837442, + "grad_norm": 15.916807174682617, + "learning_rate": 2.5057369161299878e-05, + "loss": 1.2123, + "step": 8240 + }, + { + "epoch": 5.049300466355763, + "grad_norm": 13.061064720153809, + "learning_rate": 2.5020356799170925e-05, + "loss": 1.0544, + "step": 8250 + }, + { + "epoch": 5.0499666888740835, + "grad_norm": 14.148636817932129, + "learning_rate": 2.4983344437041972e-05, + "loss": 1.5593, + "step": 8260 + }, + { + "epoch": 5.050632911392405, + "grad_norm": 18.7181396484375, + "learning_rate": 2.4946332074913023e-05, + "loss": 1.1484, + "step": 8270 + }, + { + "epoch": 5.051299133910726, + "grad_norm": 16.810848236083984, + "learning_rate": 2.490931971278407e-05, + "loss": 1.1494, + "step": 8280 + }, + { + "epoch": 5.051965356429047, + "grad_norm": 10.421501159667969, + "learning_rate": 2.4872307350655118e-05, + "loss": 1.3035, + "step": 8290 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 22.520591735839844, + "learning_rate": 2.483529498852617e-05, + "loss": 1.0186, + "step": 8300 + }, + { + "epoch": 5.05329780146569, + "grad_norm": 21.928674697875977, + "learning_rate": 2.479828262639722e-05, + "loss": 1.6327, + "step": 8310 + }, + { + "epoch": 5.053964023984011, + "grad_norm": 12.552172660827637, + "learning_rate": 2.4761270264268266e-05, + "loss": 0.9753, + "step": 8320 + }, + { + "epoch": 5.054630246502332, + "grad_norm": 7.5370988845825195, + "learning_rate": 2.4724257902139317e-05, + "loss": 1.2464, + "step": 8330 + }, + { + "epoch": 5.055296469020653, + "grad_norm": 15.833026885986328, + "learning_rate": 2.4687245540010364e-05, + "loss": 1.3526, + "step": 8340 + }, + { + "epoch": 5.055962691538974, + "grad_norm": 17.667726516723633, + "learning_rate": 2.4650233177881412e-05, + "loss": 1.6755, + "step": 8350 + }, + { + "epoch": 5.0566289140572955, + "grad_norm": 17.385398864746094, + "learning_rate": 2.4613220815752462e-05, + "loss": 1.1396, + "step": 8360 + }, + { + "epoch": 5.0572951365756165, + "grad_norm": 5.932229518890381, + "learning_rate": 2.4576208453623513e-05, + "loss": 1.5436, + "step": 8370 + }, + { + "epoch": 5.057961359093937, + "grad_norm": 19.611263275146484, + "learning_rate": 2.453919609149456e-05, + "loss": 1.1354, + "step": 8380 + }, + { + "epoch": 5.058627581612258, + "grad_norm": 20.095909118652344, + "learning_rate": 2.450218372936561e-05, + "loss": 1.3787, + "step": 8390 + }, + { + "epoch": 5.059293804130579, + "grad_norm": 12.45937728881836, + "learning_rate": 2.446517136723666e-05, + "loss": 1.2642, + "step": 8400 + }, + { + "epoch": 5.059960026648901, + "grad_norm": 8.93193244934082, + "learning_rate": 2.4428159005107706e-05, + "loss": 1.5255, + "step": 8410 + }, + { + "epoch": 5.060626249167222, + "grad_norm": 16.46532440185547, + "learning_rate": 2.4391146642978756e-05, + "loss": 1.6689, + "step": 8420 + }, + { + "epoch": 5.061292471685543, + "grad_norm": 11.329387664794922, + "learning_rate": 2.4354134280849804e-05, + "loss": 1.3907, + "step": 8430 + }, + { + "epoch": 5.061958694203864, + "grad_norm": 14.128633499145508, + "learning_rate": 2.4317121918720854e-05, + "loss": 1.1934, + "step": 8440 + }, + { + "epoch": 5.062624916722185, + "grad_norm": 
10.45793628692627, + "learning_rate": 2.4280109556591905e-05, + "loss": 1.5409, + "step": 8450 + }, + { + "epoch": 5.063291139240507, + "grad_norm": 15.659375190734863, + "learning_rate": 2.4243097194462952e-05, + "loss": 1.1119, + "step": 8460 + }, + { + "epoch": 5.063957361758828, + "grad_norm": 18.073314666748047, + "learning_rate": 2.4206084832334e-05, + "loss": 1.305, + "step": 8470 + }, + { + "epoch": 5.0646235842771485, + "grad_norm": 24.583187103271484, + "learning_rate": 2.416907247020505e-05, + "loss": 1.4173, + "step": 8480 + }, + { + "epoch": 5.0652898067954695, + "grad_norm": 8.584030151367188, + "learning_rate": 2.4132060108076098e-05, + "loss": 1.22, + "step": 8490 + }, + { + "epoch": 5.06595602931379, + "grad_norm": 14.935359954833984, + "learning_rate": 2.409504774594715e-05, + "loss": 1.2675, + "step": 8500 + }, + { + "epoch": 5.066622251832112, + "grad_norm": 23.53730583190918, + "learning_rate": 2.40580353838182e-05, + "loss": 1.3386, + "step": 8510 + }, + { + "epoch": 5.067288474350433, + "grad_norm": 10.066162109375, + "learning_rate": 2.4021023021689246e-05, + "loss": 1.3437, + "step": 8520 + }, + { + "epoch": 5.067954696868754, + "grad_norm": 13.48981761932373, + "learning_rate": 2.3984010659560294e-05, + "loss": 1.1653, + "step": 8530 + }, + { + "epoch": 5.068620919387075, + "grad_norm": 13.490951538085938, + "learning_rate": 2.3946998297431344e-05, + "loss": 1.3357, + "step": 8540 + }, + { + "epoch": 5.069287141905396, + "grad_norm": 17.73015785217285, + "learning_rate": 2.3909985935302392e-05, + "loss": 1.2657, + "step": 8550 + }, + { + "epoch": 5.069953364423718, + "grad_norm": 16.10188102722168, + "learning_rate": 2.387297357317344e-05, + "loss": 1.3206, + "step": 8560 + }, + { + "epoch": 5.070619586942039, + "grad_norm": 13.22396183013916, + "learning_rate": 2.383596121104449e-05, + "loss": 1.4605, + "step": 8570 + }, + { + "epoch": 5.07128580946036, + "grad_norm": 13.858728408813477, + "learning_rate": 2.379894884891554e-05, + "loss": 1.1412, + "step": 8580 + }, + { + "epoch": 5.071952031978681, + "grad_norm": 9.455029487609863, + "learning_rate": 2.3761936486786588e-05, + "loss": 1.2453, + "step": 8590 + }, + { + "epoch": 5.072618254497002, + "grad_norm": 13.854305267333984, + "learning_rate": 2.372492412465764e-05, + "loss": 1.3178, + "step": 8600 + }, + { + "epoch": 5.073284477015323, + "grad_norm": 9.86813735961914, + "learning_rate": 2.3687911762528686e-05, + "loss": 1.4471, + "step": 8610 + }, + { + "epoch": 5.073950699533644, + "grad_norm": 13.251646995544434, + "learning_rate": 2.3650899400399733e-05, + "loss": 1.0193, + "step": 8620 + }, + { + "epoch": 5.074616922051965, + "grad_norm": 13.65928840637207, + "learning_rate": 2.3613887038270784e-05, + "loss": 1.3942, + "step": 8630 + }, + { + "epoch": 5.075283144570286, + "grad_norm": 18.734760284423828, + "learning_rate": 2.3576874676141834e-05, + "loss": 1.1647, + "step": 8640 + }, + { + "epoch": 5.075949367088608, + "grad_norm": 15.465428352355957, + "learning_rate": 2.353986231401288e-05, + "loss": 1.0783, + "step": 8650 + }, + { + "epoch": 5.076615589606929, + "grad_norm": 14.853531837463379, + "learning_rate": 2.350284995188393e-05, + "loss": 1.2047, + "step": 8660 + }, + { + "epoch": 5.07728181212525, + "grad_norm": 16.615352630615234, + "learning_rate": 2.346583758975498e-05, + "loss": 1.1713, + "step": 8670 + }, + { + "epoch": 5.077948034643571, + "grad_norm": 4.484228134155273, + "learning_rate": 2.3428825227626027e-05, + "loss": 1.1507, + "step": 8680 + }, + { + "epoch": 5.078614257161892, + 
"grad_norm": 18.028894424438477, + "learning_rate": 2.3391812865497074e-05, + "loss": 1.239, + "step": 8690 + }, + { + "epoch": 5.079280479680214, + "grad_norm": 20.45878028869629, + "learning_rate": 2.3354800503368125e-05, + "loss": 1.1635, + "step": 8700 + }, + { + "epoch": 5.0799467021985345, + "grad_norm": 15.416409492492676, + "learning_rate": 2.3317788141239176e-05, + "loss": 1.243, + "step": 8710 + }, + { + "epoch": 5.080612924716855, + "grad_norm": 8.840574264526367, + "learning_rate": 2.3280775779110223e-05, + "loss": 1.2485, + "step": 8720 + }, + { + "epoch": 5.081279147235176, + "grad_norm": 13.862686157226562, + "learning_rate": 2.3243763416981274e-05, + "loss": 1.2109, + "step": 8730 + }, + { + "epoch": 5.081945369753497, + "grad_norm": 15.73887825012207, + "learning_rate": 2.320675105485232e-05, + "loss": 1.209, + "step": 8740 + }, + { + "epoch": 5.082611592271819, + "grad_norm": 13.038989067077637, + "learning_rate": 2.3169738692723368e-05, + "loss": 1.2744, + "step": 8750 + }, + { + "epoch": 5.08327781479014, + "grad_norm": 15.106502532958984, + "learning_rate": 2.313272633059442e-05, + "loss": 1.0696, + "step": 8760 + }, + { + "epoch": 5.083944037308461, + "grad_norm": 16.90589714050293, + "learning_rate": 2.309571396846547e-05, + "loss": 1.2162, + "step": 8770 + }, + { + "epoch": 5.084610259826782, + "grad_norm": 9.700875282287598, + "learning_rate": 2.3058701606336517e-05, + "loss": 1.3809, + "step": 8780 + }, + { + "epoch": 5.085276482345103, + "grad_norm": 15.69945240020752, + "learning_rate": 2.3021689244207568e-05, + "loss": 1.3838, + "step": 8790 + }, + { + "epoch": 5.085942704863425, + "grad_norm": 6.215369701385498, + "learning_rate": 2.2984676882078615e-05, + "loss": 1.0835, + "step": 8800 + }, + { + "epoch": 5.086608927381746, + "grad_norm": 13.538189888000488, + "learning_rate": 2.2947664519949662e-05, + "loss": 1.0703, + "step": 8810 + }, + { + "epoch": 5.087275149900067, + "grad_norm": 20.458240509033203, + "learning_rate": 2.2910652157820713e-05, + "loss": 1.2463, + "step": 8820 + }, + { + "epoch": 5.0879413724183875, + "grad_norm": 20.578060150146484, + "learning_rate": 2.2873639795691764e-05, + "loss": 1.1213, + "step": 8830 + }, + { + "epoch": 5.0886075949367084, + "grad_norm": 11.874213218688965, + "learning_rate": 2.283662743356281e-05, + "loss": 1.1985, + "step": 8840 + }, + { + "epoch": 5.08927381745503, + "grad_norm": 11.972701072692871, + "learning_rate": 2.279961507143386e-05, + "loss": 1.2814, + "step": 8850 + }, + { + "epoch": 5.089940039973351, + "grad_norm": 11.481633186340332, + "learning_rate": 2.276260270930491e-05, + "loss": 1.1566, + "step": 8860 + }, + { + "epoch": 5.090606262491672, + "grad_norm": 20.831695556640625, + "learning_rate": 2.2725590347175956e-05, + "loss": 1.2881, + "step": 8870 + }, + { + "epoch": 5.091272485009993, + "grad_norm": 15.78706169128418, + "learning_rate": 2.2688577985047007e-05, + "loss": 1.3575, + "step": 8880 + }, + { + "epoch": 5.091938707528314, + "grad_norm": 10.312272071838379, + "learning_rate": 2.2651565622918054e-05, + "loss": 1.6457, + "step": 8890 + }, + { + "epoch": 5.092604930046636, + "grad_norm": 26.16973876953125, + "learning_rate": 2.2614553260789105e-05, + "loss": 1.4447, + "step": 8900 + }, + { + "epoch": 5.093271152564957, + "grad_norm": 9.408838272094727, + "learning_rate": 2.2577540898660156e-05, + "loss": 1.073, + "step": 8910 + }, + { + "epoch": 5.093937375083278, + "grad_norm": 10.96800422668457, + "learning_rate": 2.2540528536531203e-05, + "loss": 1.2, + "step": 8920 + }, + { + 
"epoch": 5.094603597601599, + "grad_norm": 17.276994705200195, + "learning_rate": 2.250351617440225e-05, + "loss": 1.3668, + "step": 8930 + }, + { + "epoch": 5.0952698201199205, + "grad_norm": 11.48062801361084, + "learning_rate": 2.24665038122733e-05, + "loss": 1.2026, + "step": 8940 + }, + { + "epoch": 5.095936042638241, + "grad_norm": 26.46087646484375, + "learning_rate": 2.2429491450144348e-05, + "loss": 1.1726, + "step": 8950 + }, + { + "epoch": 5.096602265156562, + "grad_norm": 18.596731185913086, + "learning_rate": 2.23924790880154e-05, + "loss": 1.2826, + "step": 8960 + }, + { + "epoch": 5.097268487674883, + "grad_norm": 25.796871185302734, + "learning_rate": 2.235546672588645e-05, + "loss": 1.3141, + "step": 8970 + }, + { + "epoch": 5.097934710193204, + "grad_norm": 13.524627685546875, + "learning_rate": 2.2318454363757497e-05, + "loss": 1.5064, + "step": 8980 + }, + { + "epoch": 5.098600932711526, + "grad_norm": 24.042057037353516, + "learning_rate": 2.2281442001628544e-05, + "loss": 1.3654, + "step": 8990 + }, + { + "epoch": 5.099267155229847, + "grad_norm": 5.356496334075928, + "learning_rate": 2.2244429639499595e-05, + "loss": 1.0528, + "step": 9000 + }, + { + "epoch": 5.099933377748168, + "grad_norm": 8.590046882629395, + "learning_rate": 2.2207417277370642e-05, + "loss": 1.225, + "step": 9010 + }, + { + "epoch": 5.100066622251832, + "eval_accuracy": 0.4649033570701933, + "eval_loss": 1.7874189615249634, + "eval_runtime": 949.5185, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 0.389, + "step": 9012 + }, + { + "epoch": 6.000532978014657, + "grad_norm": 15.83902645111084, + "learning_rate": 2.217040491524169e-05, + "loss": 1.1545, + "step": 9020 + }, + { + "epoch": 6.001199200532978, + "grad_norm": 14.010050773620605, + "learning_rate": 2.213339255311274e-05, + "loss": 1.4341, + "step": 9030 + }, + { + "epoch": 6.001865423051299, + "grad_norm": 10.691797256469727, + "learning_rate": 2.209638019098379e-05, + "loss": 1.0935, + "step": 9040 + }, + { + "epoch": 6.00253164556962, + "grad_norm": 14.35091781616211, + "learning_rate": 2.2059367828854838e-05, + "loss": 1.0831, + "step": 9050 + }, + { + "epoch": 6.003197868087941, + "grad_norm": 22.088050842285156, + "learning_rate": 2.202235546672589e-05, + "loss": 1.3766, + "step": 9060 + }, + { + "epoch": 6.003864090606262, + "grad_norm": 20.55189323425293, + "learning_rate": 2.1985343104596936e-05, + "loss": 1.1779, + "step": 9070 + }, + { + "epoch": 6.004530313124584, + "grad_norm": 8.70118236541748, + "learning_rate": 2.1948330742467983e-05, + "loss": 1.1382, + "step": 9080 + }, + { + "epoch": 6.005196535642905, + "grad_norm": 22.003557205200195, + "learning_rate": 2.1911318380339034e-05, + "loss": 1.2397, + "step": 9090 + }, + { + "epoch": 6.005862758161226, + "grad_norm": 20.362037658691406, + "learning_rate": 2.1874306018210085e-05, + "loss": 1.2817, + "step": 9100 + }, + { + "epoch": 6.006528980679547, + "grad_norm": 18.034761428833008, + "learning_rate": 2.1837293656081132e-05, + "loss": 1.3153, + "step": 9110 + }, + { + "epoch": 6.007195203197868, + "grad_norm": 12.911800384521484, + "learning_rate": 2.1800281293952183e-05, + "loss": 1.1229, + "step": 9120 + }, + { + "epoch": 6.0078614257161895, + "grad_norm": 20.2958984375, + "learning_rate": 2.176326893182323e-05, + "loss": 1.1306, + "step": 9130 + }, + { + "epoch": 6.0085276482345105, + "grad_norm": 10.70205020904541, + "learning_rate": 2.1726256569694277e-05, + "loss": 0.9889, + "step": 9140 + }, + { + "epoch": 6.009193870752831, + "grad_norm": 
14.338765144348145, + "learning_rate": 2.1689244207565328e-05, + "loss": 1.3786, + "step": 9150 + }, + { + "epoch": 6.009860093271152, + "grad_norm": 9.158313751220703, + "learning_rate": 2.1652231845436375e-05, + "loss": 1.0936, + "step": 9160 + }, + { + "epoch": 6.010526315789473, + "grad_norm": 17.45079231262207, + "learning_rate": 2.1615219483307426e-05, + "loss": 1.0422, + "step": 9170 + }, + { + "epoch": 6.011192538307795, + "grad_norm": 19.044965744018555, + "learning_rate": 2.1578207121178477e-05, + "loss": 1.1737, + "step": 9180 + }, + { + "epoch": 6.011858760826116, + "grad_norm": 18.864051818847656, + "learning_rate": 2.1541194759049524e-05, + "loss": 1.099, + "step": 9190 + }, + { + "epoch": 6.012524983344437, + "grad_norm": 16.256820678710938, + "learning_rate": 2.150418239692057e-05, + "loss": 1.0997, + "step": 9200 + }, + { + "epoch": 6.013191205862758, + "grad_norm": 12.893338203430176, + "learning_rate": 2.1467170034791622e-05, + "loss": 1.1097, + "step": 9210 + }, + { + "epoch": 6.013857428381079, + "grad_norm": 21.89535903930664, + "learning_rate": 2.143015767266267e-05, + "loss": 1.4427, + "step": 9220 + }, + { + "epoch": 6.014523650899401, + "grad_norm": 10.747573852539062, + "learning_rate": 2.139314531053372e-05, + "loss": 1.3024, + "step": 9230 + }, + { + "epoch": 6.015189873417722, + "grad_norm": 12.628827095031738, + "learning_rate": 2.135613294840477e-05, + "loss": 1.2129, + "step": 9240 + }, + { + "epoch": 6.0158560959360425, + "grad_norm": 16.268753051757812, + "learning_rate": 2.1319120586275818e-05, + "loss": 1.1722, + "step": 9250 + }, + { + "epoch": 6.0165223184543635, + "grad_norm": 16.101028442382812, + "learning_rate": 2.1282108224146865e-05, + "loss": 1.2843, + "step": 9260 + }, + { + "epoch": 6.017188540972685, + "grad_norm": 12.535353660583496, + "learning_rate": 2.1245095862017916e-05, + "loss": 1.0297, + "step": 9270 + }, + { + "epoch": 6.017854763491006, + "grad_norm": 12.013237953186035, + "learning_rate": 2.1208083499888963e-05, + "loss": 1.2505, + "step": 9280 + }, + { + "epoch": 6.018520986009327, + "grad_norm": 13.258869171142578, + "learning_rate": 2.117107113776001e-05, + "loss": 1.1373, + "step": 9290 + }, + { + "epoch": 6.019187208527648, + "grad_norm": 9.718552589416504, + "learning_rate": 2.113405877563106e-05, + "loss": 1.1208, + "step": 9300 + }, + { + "epoch": 6.019853431045969, + "grad_norm": 11.174440383911133, + "learning_rate": 2.1097046413502112e-05, + "loss": 1.1504, + "step": 9310 + }, + { + "epoch": 6.020519653564291, + "grad_norm": 15.864629745483398, + "learning_rate": 2.106003405137316e-05, + "loss": 1.0444, + "step": 9320 + }, + { + "epoch": 6.021185876082612, + "grad_norm": 16.85066032409668, + "learning_rate": 2.1023021689244207e-05, + "loss": 1.0025, + "step": 9330 + }, + { + "epoch": 6.021852098600933, + "grad_norm": 11.282647132873535, + "learning_rate": 2.0986009327115257e-05, + "loss": 1.4011, + "step": 9340 + }, + { + "epoch": 6.022518321119254, + "grad_norm": 11.690325736999512, + "learning_rate": 2.0948996964986305e-05, + "loss": 1.2895, + "step": 9350 + }, + { + "epoch": 6.023184543637575, + "grad_norm": 8.497153282165527, + "learning_rate": 2.0911984602857355e-05, + "loss": 1.354, + "step": 9360 + }, + { + "epoch": 6.023850766155896, + "grad_norm": 13.286844253540039, + "learning_rate": 2.0874972240728406e-05, + "loss": 1.1424, + "step": 9370 + }, + { + "epoch": 6.024516988674217, + "grad_norm": 17.99744415283203, + "learning_rate": 2.0837959878599453e-05, + "loss": 1.4165, + "step": 9380 + }, + { + "epoch": 
6.025183211192538, + "grad_norm": 21.13187026977539, + "learning_rate": 2.08009475164705e-05, + "loss": 1.1749, + "step": 9390 + }, + { + "epoch": 6.025849433710859, + "grad_norm": 21.85277557373047, + "learning_rate": 2.076393515434155e-05, + "loss": 1.494, + "step": 9400 + }, + { + "epoch": 6.02651565622918, + "grad_norm": 8.210258483886719, + "learning_rate": 2.07269227922126e-05, + "loss": 1.0251, + "step": 9410 + }, + { + "epoch": 6.027181878747502, + "grad_norm": 19.790748596191406, + "learning_rate": 2.0689910430083646e-05, + "loss": 1.1387, + "step": 9420 + }, + { + "epoch": 6.027848101265823, + "grad_norm": 11.26384162902832, + "learning_rate": 2.06528980679547e-05, + "loss": 1.2651, + "step": 9430 + }, + { + "epoch": 6.028514323784144, + "grad_norm": 17.298749923706055, + "learning_rate": 2.0615885705825747e-05, + "loss": 1.0489, + "step": 9440 + }, + { + "epoch": 6.029180546302465, + "grad_norm": 13.616299629211426, + "learning_rate": 2.0578873343696795e-05, + "loss": 1.1934, + "step": 9450 + }, + { + "epoch": 6.029846768820786, + "grad_norm": 30.67489242553711, + "learning_rate": 2.0541860981567845e-05, + "loss": 1.3579, + "step": 9460 + }, + { + "epoch": 6.030512991339108, + "grad_norm": 15.617873191833496, + "learning_rate": 2.0504848619438893e-05, + "loss": 0.96, + "step": 9470 + }, + { + "epoch": 6.0311792138574285, + "grad_norm": 15.543371200561523, + "learning_rate": 2.046783625730994e-05, + "loss": 0.9432, + "step": 9480 + }, + { + "epoch": 6.031845436375749, + "grad_norm": 13.943503379821777, + "learning_rate": 2.043082389518099e-05, + "loss": 1.1023, + "step": 9490 + }, + { + "epoch": 6.03251165889407, + "grad_norm": 13.085801124572754, + "learning_rate": 2.039381153305204e-05, + "loss": 1.1881, + "step": 9500 + }, + { + "epoch": 6.033177881412391, + "grad_norm": 15.746623039245605, + "learning_rate": 2.035679917092309e-05, + "loss": 1.084, + "step": 9510 + }, + { + "epoch": 6.033844103930713, + "grad_norm": 15.39754581451416, + "learning_rate": 2.031978680879414e-05, + "loss": 1.2122, + "step": 9520 + }, + { + "epoch": 6.034510326449034, + "grad_norm": 22.38309669494629, + "learning_rate": 2.0282774446665187e-05, + "loss": 1.1392, + "step": 9530 + }, + { + "epoch": 6.035176548967355, + "grad_norm": 20.638362884521484, + "learning_rate": 2.0245762084536234e-05, + "loss": 1.2329, + "step": 9540 + }, + { + "epoch": 6.035842771485676, + "grad_norm": 16.581417083740234, + "learning_rate": 2.0208749722407285e-05, + "loss": 1.0684, + "step": 9550 + }, + { + "epoch": 6.036508994003998, + "grad_norm": 14.21743392944336, + "learning_rate": 2.0171737360278335e-05, + "loss": 1.1825, + "step": 9560 + }, + { + "epoch": 6.037175216522319, + "grad_norm": 20.944358825683594, + "learning_rate": 2.0134724998149383e-05, + "loss": 1.1024, + "step": 9570 + }, + { + "epoch": 6.03784143904064, + "grad_norm": 19.97979736328125, + "learning_rate": 2.0097712636020433e-05, + "loss": 1.1672, + "step": 9580 + }, + { + "epoch": 6.038507661558961, + "grad_norm": 13.323993682861328, + "learning_rate": 2.006070027389148e-05, + "loss": 1.0126, + "step": 9590 + }, + { + "epoch": 6.0391738840772815, + "grad_norm": 25.246082305908203, + "learning_rate": 2.0023687911762528e-05, + "loss": 1.0119, + "step": 9600 + }, + { + "epoch": 6.039840106595603, + "grad_norm": 27.887540817260742, + "learning_rate": 1.998667554963358e-05, + "loss": 1.1074, + "step": 9610 + }, + { + "epoch": 6.040506329113924, + "grad_norm": 14.763928413391113, + "learning_rate": 1.9949663187504626e-05, + "loss": 0.9407, + "step": 9620 + 
}, + { + "epoch": 6.041172551632245, + "grad_norm": 22.632875442504883, + "learning_rate": 1.9912650825375677e-05, + "loss": 1.5615, + "step": 9630 + }, + { + "epoch": 6.041838774150566, + "grad_norm": 15.380461692810059, + "learning_rate": 1.9875638463246727e-05, + "loss": 1.1684, + "step": 9640 + }, + { + "epoch": 6.042504996668887, + "grad_norm": 12.475933074951172, + "learning_rate": 1.9838626101117775e-05, + "loss": 0.9673, + "step": 9650 + }, + { + "epoch": 6.043171219187209, + "grad_norm": 16.277036666870117, + "learning_rate": 1.9801613738988822e-05, + "loss": 1.0571, + "step": 9660 + }, + { + "epoch": 6.04383744170553, + "grad_norm": 11.018747329711914, + "learning_rate": 1.9764601376859873e-05, + "loss": 0.9112, + "step": 9670 + }, + { + "epoch": 6.044503664223851, + "grad_norm": 25.415081024169922, + "learning_rate": 1.972758901473092e-05, + "loss": 1.1849, + "step": 9680 + }, + { + "epoch": 6.045169886742172, + "grad_norm": 21.805448532104492, + "learning_rate": 1.969057665260197e-05, + "loss": 1.2798, + "step": 9690 + }, + { + "epoch": 6.045836109260493, + "grad_norm": 30.073305130004883, + "learning_rate": 1.965356429047302e-05, + "loss": 1.3491, + "step": 9700 + }, + { + "epoch": 6.0465023317788145, + "grad_norm": 18.26006507873535, + "learning_rate": 1.961655192834407e-05, + "loss": 1.1401, + "step": 9710 + }, + { + "epoch": 6.047168554297135, + "grad_norm": 11.450695037841797, + "learning_rate": 1.9579539566215116e-05, + "loss": 1.15, + "step": 9720 + }, + { + "epoch": 6.047834776815456, + "grad_norm": 15.99052619934082, + "learning_rate": 1.9542527204086167e-05, + "loss": 1.1841, + "step": 9730 + }, + { + "epoch": 6.048500999333777, + "grad_norm": 17.24779510498047, + "learning_rate": 1.9505514841957214e-05, + "loss": 0.9403, + "step": 9740 + }, + { + "epoch": 6.049167221852098, + "grad_norm": 12.234496116638184, + "learning_rate": 1.946850247982826e-05, + "loss": 1.1752, + "step": 9750 + }, + { + "epoch": 6.04983344437042, + "grad_norm": 7.349771022796631, + "learning_rate": 1.9431490117699312e-05, + "loss": 1.1157, + "step": 9760 + }, + { + "epoch": 6.050499666888741, + "grad_norm": 14.889724731445312, + "learning_rate": 1.9394477755570363e-05, + "loss": 1.519, + "step": 9770 + }, + { + "epoch": 6.051165889407062, + "grad_norm": 14.725175857543945, + "learning_rate": 1.935746539344141e-05, + "loss": 1.3066, + "step": 9780 + }, + { + "epoch": 6.051832111925383, + "grad_norm": 19.340238571166992, + "learning_rate": 1.932045303131246e-05, + "loss": 1.2374, + "step": 9790 + }, + { + "epoch": 6.052498334443704, + "grad_norm": 24.564382553100586, + "learning_rate": 1.9283440669183508e-05, + "loss": 1.3411, + "step": 9800 + }, + { + "epoch": 6.053164556962026, + "grad_norm": 16.787242889404297, + "learning_rate": 1.9246428307054555e-05, + "loss": 1.0835, + "step": 9810 + }, + { + "epoch": 6.0538307794803465, + "grad_norm": 16.78311538696289, + "learning_rate": 1.9209415944925606e-05, + "loss": 1.3167, + "step": 9820 + }, + { + "epoch": 6.0544970019986675, + "grad_norm": 16.26388931274414, + "learning_rate": 1.9172403582796657e-05, + "loss": 1.3232, + "step": 9830 + }, + { + "epoch": 6.055163224516988, + "grad_norm": 16.02640151977539, + "learning_rate": 1.9135391220667704e-05, + "loss": 1.1122, + "step": 9840 + }, + { + "epoch": 6.05582944703531, + "grad_norm": 9.208070755004883, + "learning_rate": 1.9098378858538755e-05, + "loss": 1.1834, + "step": 9850 + }, + { + "epoch": 6.056495669553631, + "grad_norm": 13.764708518981934, + "learning_rate": 1.9061366496409802e-05, + 
"loss": 1.1903, + "step": 9860 + }, + { + "epoch": 6.057161892071952, + "grad_norm": 13.623838424682617, + "learning_rate": 1.902435413428085e-05, + "loss": 1.3428, + "step": 9870 + }, + { + "epoch": 6.057828114590273, + "grad_norm": 17.887203216552734, + "learning_rate": 1.89873417721519e-05, + "loss": 1.1983, + "step": 9880 + }, + { + "epoch": 6.058494337108594, + "grad_norm": 12.464969635009766, + "learning_rate": 1.895032941002295e-05, + "loss": 1.0835, + "step": 9890 + }, + { + "epoch": 6.059160559626916, + "grad_norm": 29.823570251464844, + "learning_rate": 1.8913317047893998e-05, + "loss": 1.1248, + "step": 9900 + }, + { + "epoch": 6.059826782145237, + "grad_norm": 14.32949161529541, + "learning_rate": 1.887630468576505e-05, + "loss": 1.2496, + "step": 9910 + }, + { + "epoch": 6.060493004663558, + "grad_norm": 13.004071235656738, + "learning_rate": 1.8839292323636096e-05, + "loss": 1.0403, + "step": 9920 + }, + { + "epoch": 6.061159227181879, + "grad_norm": 34.6653938293457, + "learning_rate": 1.8802279961507143e-05, + "loss": 1.293, + "step": 9930 + }, + { + "epoch": 6.0618254497001995, + "grad_norm": 15.617185592651367, + "learning_rate": 1.8765267599378194e-05, + "loss": 1.2092, + "step": 9940 + }, + { + "epoch": 6.062491672218521, + "grad_norm": 23.877058029174805, + "learning_rate": 1.872825523724924e-05, + "loss": 1.1424, + "step": 9950 + }, + { + "epoch": 6.063157894736842, + "grad_norm": 23.280105590820312, + "learning_rate": 1.8691242875120292e-05, + "loss": 1.3753, + "step": 9960 + }, + { + "epoch": 6.063824117255163, + "grad_norm": 11.883151054382324, + "learning_rate": 1.865423051299134e-05, + "loss": 1.0283, + "step": 9970 + }, + { + "epoch": 6.064490339773484, + "grad_norm": 18.9891414642334, + "learning_rate": 1.861721815086239e-05, + "loss": 1.2308, + "step": 9980 + }, + { + "epoch": 6.065156562291805, + "grad_norm": 14.54801082611084, + "learning_rate": 1.8580205788733437e-05, + "loss": 1.1843, + "step": 9990 + }, + { + "epoch": 6.065822784810127, + "grad_norm": 18.7949161529541, + "learning_rate": 1.8543193426604484e-05, + "loss": 0.9488, + "step": 10000 + }, + { + "epoch": 6.066489007328448, + "grad_norm": 16.36859893798828, + "learning_rate": 1.8506181064475535e-05, + "loss": 1.3812, + "step": 10010 + }, + { + "epoch": 6.067155229846769, + "grad_norm": 13.530806541442871, + "learning_rate": 1.8469168702346586e-05, + "loss": 1.1782, + "step": 10020 + }, + { + "epoch": 6.06782145236509, + "grad_norm": 23.630657196044922, + "learning_rate": 1.8432156340217633e-05, + "loss": 1.0219, + "step": 10030 + }, + { + "epoch": 6.068487674883411, + "grad_norm": 20.588144302368164, + "learning_rate": 1.8395143978088684e-05, + "loss": 1.3127, + "step": 10040 + }, + { + "epoch": 6.0691538974017325, + "grad_norm": 16.623828887939453, + "learning_rate": 1.835813161595973e-05, + "loss": 1.2645, + "step": 10050 + }, + { + "epoch": 6.069820119920053, + "grad_norm": 22.614290237426758, + "learning_rate": 1.832111925383078e-05, + "loss": 1.4497, + "step": 10060 + }, + { + "epoch": 6.070486342438374, + "grad_norm": 10.512471199035645, + "learning_rate": 1.828410689170183e-05, + "loss": 1.0029, + "step": 10070 + }, + { + "epoch": 6.071152564956695, + "grad_norm": 17.14558219909668, + "learning_rate": 1.8247094529572876e-05, + "loss": 1.5766, + "step": 10080 + }, + { + "epoch": 6.071818787475016, + "grad_norm": 22.323314666748047, + "learning_rate": 1.8210082167443927e-05, + "loss": 1.1847, + "step": 10090 + }, + { + "epoch": 6.072485009993338, + "grad_norm": 14.525805473327637, + 
"learning_rate": 1.8173069805314978e-05, + "loss": 1.0161, + "step": 10100 + }, + { + "epoch": 6.073151232511659, + "grad_norm": 6.080660820007324, + "learning_rate": 1.8136057443186025e-05, + "loss": 1.1406, + "step": 10110 + }, + { + "epoch": 6.07381745502998, + "grad_norm": 13.113394737243652, + "learning_rate": 1.8099045081057072e-05, + "loss": 0.9578, + "step": 10120 + }, + { + "epoch": 6.074483677548301, + "grad_norm": 21.47410011291504, + "learning_rate": 1.8062032718928123e-05, + "loss": 1.1755, + "step": 10130 + }, + { + "epoch": 6.075149900066622, + "grad_norm": 13.592970848083496, + "learning_rate": 1.802502035679917e-05, + "loss": 0.9875, + "step": 10140 + }, + { + "epoch": 6.075816122584944, + "grad_norm": 21.81001091003418, + "learning_rate": 1.798800799467022e-05, + "loss": 1.133, + "step": 10150 + }, + { + "epoch": 6.076482345103265, + "grad_norm": 18.186767578125, + "learning_rate": 1.7950995632541272e-05, + "loss": 1.2831, + "step": 10160 + }, + { + "epoch": 6.0771485676215855, + "grad_norm": 9.985696792602539, + "learning_rate": 1.791398327041232e-05, + "loss": 1.6898, + "step": 10170 + }, + { + "epoch": 6.077814790139906, + "grad_norm": 9.399591445922852, + "learning_rate": 1.7876970908283366e-05, + "loss": 1.082, + "step": 10180 + }, + { + "epoch": 6.078481012658228, + "grad_norm": 17.446556091308594, + "learning_rate": 1.7839958546154417e-05, + "loss": 1.0198, + "step": 10190 + }, + { + "epoch": 6.079147235176549, + "grad_norm": 14.212360382080078, + "learning_rate": 1.7802946184025464e-05, + "loss": 1.2316, + "step": 10200 + }, + { + "epoch": 6.07981345769487, + "grad_norm": 14.29467487335205, + "learning_rate": 1.7765933821896512e-05, + "loss": 1.3852, + "step": 10210 + }, + { + "epoch": 6.080479680213191, + "grad_norm": 12.772151947021484, + "learning_rate": 1.7728921459767562e-05, + "loss": 1.1019, + "step": 10220 + }, + { + "epoch": 6.081145902731512, + "grad_norm": 23.23948860168457, + "learning_rate": 1.7691909097638613e-05, + "loss": 1.0991, + "step": 10230 + }, + { + "epoch": 6.081812125249834, + "grad_norm": 16.20070457458496, + "learning_rate": 1.765489673550966e-05, + "loss": 1.0325, + "step": 10240 + }, + { + "epoch": 6.082478347768155, + "grad_norm": 12.131396293640137, + "learning_rate": 1.761788437338071e-05, + "loss": 1.1053, + "step": 10250 + }, + { + "epoch": 6.083144570286476, + "grad_norm": 12.462883949279785, + "learning_rate": 1.758087201125176e-05, + "loss": 1.2074, + "step": 10260 + }, + { + "epoch": 6.083810792804797, + "grad_norm": 14.353355407714844, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.3472, + "step": 10270 + }, + { + "epoch": 6.084477015323118, + "grad_norm": 8.644720077514648, + "learning_rate": 1.7506847286993856e-05, + "loss": 1.2404, + "step": 10280 + }, + { + "epoch": 6.085143237841439, + "grad_norm": 20.34583282470703, + "learning_rate": 1.7469834924864907e-05, + "loss": 1.0389, + "step": 10290 + }, + { + "epoch": 6.08580946035976, + "grad_norm": 15.65890884399414, + "learning_rate": 1.7432822562735954e-05, + "loss": 1.2431, + "step": 10300 + }, + { + "epoch": 6.086475682878081, + "grad_norm": 16.938621520996094, + "learning_rate": 1.7395810200607005e-05, + "loss": 1.1138, + "step": 10310 + }, + { + "epoch": 6.087141905396402, + "grad_norm": 14.84618091583252, + "learning_rate": 1.7358797838478052e-05, + "loss": 1.1909, + "step": 10320 + }, + { + "epoch": 6.087808127914723, + "grad_norm": 17.537261962890625, + "learning_rate": 1.73217854763491e-05, + "loss": 1.3082, + "step": 10330 + }, + { + "epoch": 
6.088474350433045, + "grad_norm": 10.635568618774414, + "learning_rate": 1.728477311422015e-05, + "loss": 0.8098, + "step": 10340 + }, + { + "epoch": 6.089140572951366, + "grad_norm": 25.97487449645996, + "learning_rate": 1.7247760752091198e-05, + "loss": 0.9738, + "step": 10350 + }, + { + "epoch": 6.089806795469687, + "grad_norm": 8.214698791503906, + "learning_rate": 1.721074838996225e-05, + "loss": 1.0363, + "step": 10360 + }, + { + "epoch": 6.090473017988008, + "grad_norm": 17.86111068725586, + "learning_rate": 1.71737360278333e-05, + "loss": 1.1247, + "step": 10370 + }, + { + "epoch": 6.091139240506329, + "grad_norm": 13.101350784301758, + "learning_rate": 1.7136723665704346e-05, + "loss": 0.9959, + "step": 10380 + }, + { + "epoch": 6.0918054630246505, + "grad_norm": 24.15110969543457, + "learning_rate": 1.7099711303575394e-05, + "loss": 1.0879, + "step": 10390 + }, + { + "epoch": 6.0924716855429715, + "grad_norm": 15.315494537353516, + "learning_rate": 1.7062698941446444e-05, + "loss": 1.0249, + "step": 10400 + }, + { + "epoch": 6.093137908061292, + "grad_norm": 13.924478530883789, + "learning_rate": 1.702568657931749e-05, + "loss": 1.379, + "step": 10410 + }, + { + "epoch": 6.093804130579613, + "grad_norm": 17.271005630493164, + "learning_rate": 1.6988674217188542e-05, + "loss": 1.1037, + "step": 10420 + }, + { + "epoch": 6.094470353097934, + "grad_norm": 14.004146575927734, + "learning_rate": 1.6951661855059593e-05, + "loss": 0.8298, + "step": 10430 + }, + { + "epoch": 6.095136575616256, + "grad_norm": 13.17989444732666, + "learning_rate": 1.691464949293064e-05, + "loss": 0.8945, + "step": 10440 + }, + { + "epoch": 6.095802798134577, + "grad_norm": 13.544062614440918, + "learning_rate": 1.6877637130801688e-05, + "loss": 1.023, + "step": 10450 + }, + { + "epoch": 6.096469020652898, + "grad_norm": 18.14484405517578, + "learning_rate": 1.684062476867274e-05, + "loss": 1.1833, + "step": 10460 + }, + { + "epoch": 6.097135243171219, + "grad_norm": 14.675586700439453, + "learning_rate": 1.6803612406543786e-05, + "loss": 1.2292, + "step": 10470 + }, + { + "epoch": 6.097801465689541, + "grad_norm": 14.314397811889648, + "learning_rate": 1.6766600044414836e-05, + "loss": 0.931, + "step": 10480 + }, + { + "epoch": 6.098467688207862, + "grad_norm": 11.240303039550781, + "learning_rate": 1.6729587682285887e-05, + "loss": 1.2544, + "step": 10490 + }, + { + "epoch": 6.099133910726183, + "grad_norm": 16.184127807617188, + "learning_rate": 1.6692575320156934e-05, + "loss": 1.1378, + "step": 10500 + }, + { + "epoch": 6.0998001332445035, + "grad_norm": 16.394861221313477, + "learning_rate": 1.665556295802798e-05, + "loss": 1.038, + "step": 10510 + }, + { + "epoch": 6.100066622251832, + "eval_accuracy": 0.4852492370295015, + "eval_loss": 1.7263031005859375, + "eval_runtime": 935.413, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 0.394, + "step": 10514 + }, + { + "epoch": 7.000399733510993, + "grad_norm": 18.364229202270508, + "learning_rate": 1.6618550595899032e-05, + "loss": 1.0807, + "step": 10520 + }, + { + "epoch": 7.001065956029314, + "grad_norm": 16.4349308013916, + "learning_rate": 1.658153823377008e-05, + "loss": 1.3658, + "step": 10530 + }, + { + "epoch": 7.001732178547635, + "grad_norm": 21.844661712646484, + "learning_rate": 1.6544525871641127e-05, + "loss": 1.011, + "step": 10540 + }, + { + "epoch": 7.002398401065956, + "grad_norm": 13.007594108581543, + "learning_rate": 1.6507513509512178e-05, + "loss": 1.0939, + "step": 10550 + }, + { + "epoch": 7.003064623584277, + 
"grad_norm": 16.862592697143555, + "learning_rate": 1.647050114738323e-05, + "loss": 1.3063, + "step": 10560 + }, + { + "epoch": 7.003730846102599, + "grad_norm": 11.427144050598145, + "learning_rate": 1.6433488785254276e-05, + "loss": 1.1376, + "step": 10570 + }, + { + "epoch": 7.00439706862092, + "grad_norm": 21.839696884155273, + "learning_rate": 1.6396476423125326e-05, + "loss": 1.425, + "step": 10580 + }, + { + "epoch": 7.0050632911392405, + "grad_norm": 23.1754207611084, + "learning_rate": 1.6359464060996374e-05, + "loss": 1.1796, + "step": 10590 + }, + { + "epoch": 7.0057295136575615, + "grad_norm": 14.855545043945312, + "learning_rate": 1.632245169886742e-05, + "loss": 1.2355, + "step": 10600 + }, + { + "epoch": 7.006395736175882, + "grad_norm": 14.48912525177002, + "learning_rate": 1.628543933673847e-05, + "loss": 1.0072, + "step": 10610 + }, + { + "epoch": 7.007061958694204, + "grad_norm": 9.86231803894043, + "learning_rate": 1.6248426974609522e-05, + "loss": 0.9665, + "step": 10620 + }, + { + "epoch": 7.007728181212525, + "grad_norm": 14.741275787353516, + "learning_rate": 1.621141461248057e-05, + "loss": 1.128, + "step": 10630 + }, + { + "epoch": 7.008394403730846, + "grad_norm": 23.821813583374023, + "learning_rate": 1.617440225035162e-05, + "loss": 0.9926, + "step": 10640 + }, + { + "epoch": 7.009060626249167, + "grad_norm": 15.521468162536621, + "learning_rate": 1.6137389888222668e-05, + "loss": 1.1828, + "step": 10650 + }, + { + "epoch": 7.009726848767488, + "grad_norm": 5.935873031616211, + "learning_rate": 1.6100377526093715e-05, + "loss": 1.0812, + "step": 10660 + }, + { + "epoch": 7.01039307128581, + "grad_norm": 17.442787170410156, + "learning_rate": 1.6063365163964762e-05, + "loss": 0.8407, + "step": 10670 + }, + { + "epoch": 7.011059293804131, + "grad_norm": 13.502436637878418, + "learning_rate": 1.6026352801835813e-05, + "loss": 1.0323, + "step": 10680 + }, + { + "epoch": 7.011725516322452, + "grad_norm": 14.81945514678955, + "learning_rate": 1.5989340439706864e-05, + "loss": 1.1952, + "step": 10690 + }, + { + "epoch": 7.012391738840773, + "grad_norm": 13.918295860290527, + "learning_rate": 1.595232807757791e-05, + "loss": 1.2622, + "step": 10700 + }, + { + "epoch": 7.0130579613590935, + "grad_norm": 19.72701644897461, + "learning_rate": 1.591531571544896e-05, + "loss": 1.1882, + "step": 10710 + }, + { + "epoch": 7.013724183877415, + "grad_norm": 20.94658660888672, + "learning_rate": 1.587830335332001e-05, + "loss": 1.2592, + "step": 10720 + }, + { + "epoch": 7.014390406395736, + "grad_norm": 17.63475799560547, + "learning_rate": 1.5841290991191056e-05, + "loss": 1.221, + "step": 10730 + }, + { + "epoch": 7.015056628914057, + "grad_norm": 18.014263153076172, + "learning_rate": 1.5804278629062107e-05, + "loss": 1.0224, + "step": 10740 + }, + { + "epoch": 7.015722851432378, + "grad_norm": 15.626989364624023, + "learning_rate": 1.5767266266933158e-05, + "loss": 1.1008, + "step": 10750 + }, + { + "epoch": 7.016389073950699, + "grad_norm": 25.595731735229492, + "learning_rate": 1.5730253904804205e-05, + "loss": 0.9637, + "step": 10760 + }, + { + "epoch": 7.017055296469021, + "grad_norm": 24.82137680053711, + "learning_rate": 1.5693241542675256e-05, + "loss": 1.0596, + "step": 10770 + }, + { + "epoch": 7.017721518987342, + "grad_norm": 15.368452072143555, + "learning_rate": 1.5656229180546303e-05, + "loss": 1.1609, + "step": 10780 + }, + { + "epoch": 7.018387741505663, + "grad_norm": 13.832304954528809, + "learning_rate": 1.561921681841735e-05, + "loss": 1.225, + "step": 
10790 + }, + { + "epoch": 7.019053964023984, + "grad_norm": 17.498554229736328, + "learning_rate": 1.55822044562884e-05, + "loss": 1.4268, + "step": 10800 + }, + { + "epoch": 7.019720186542306, + "grad_norm": 16.15509605407715, + "learning_rate": 1.5545192094159448e-05, + "loss": 1.2718, + "step": 10810 + }, + { + "epoch": 7.0203864090606265, + "grad_norm": 18.702865600585938, + "learning_rate": 1.55081797320305e-05, + "loss": 1.1827, + "step": 10820 + }, + { + "epoch": 7.021052631578947, + "grad_norm": 11.797273635864258, + "learning_rate": 1.547116736990155e-05, + "loss": 1.1963, + "step": 10830 + }, + { + "epoch": 7.021718854097268, + "grad_norm": 22.64579200744629, + "learning_rate": 1.5434155007772597e-05, + "loss": 1.1895, + "step": 10840 + }, + { + "epoch": 7.022385076615589, + "grad_norm": 14.292520523071289, + "learning_rate": 1.5397142645643644e-05, + "loss": 1.5208, + "step": 10850 + }, + { + "epoch": 7.023051299133911, + "grad_norm": 16.13132095336914, + "learning_rate": 1.5360130283514695e-05, + "loss": 1.251, + "step": 10860 + }, + { + "epoch": 7.023717521652232, + "grad_norm": 12.262897491455078, + "learning_rate": 1.5323117921385742e-05, + "loss": 0.9128, + "step": 10870 + }, + { + "epoch": 7.024383744170553, + "grad_norm": 18.754131317138672, + "learning_rate": 1.5286105559256793e-05, + "loss": 0.9246, + "step": 10880 + }, + { + "epoch": 7.025049966688874, + "grad_norm": 33.72392272949219, + "learning_rate": 1.5249093197127842e-05, + "loss": 1.3048, + "step": 10890 + }, + { + "epoch": 7.025716189207195, + "grad_norm": 14.045398712158203, + "learning_rate": 1.5212080834998891e-05, + "loss": 1.0097, + "step": 10900 + }, + { + "epoch": 7.026382411725517, + "grad_norm": 12.4899263381958, + "learning_rate": 1.5175068472869938e-05, + "loss": 0.9223, + "step": 10910 + }, + { + "epoch": 7.027048634243838, + "grad_norm": 15.914814949035645, + "learning_rate": 1.5138056110740989e-05, + "loss": 1.1063, + "step": 10920 + }, + { + "epoch": 7.027714856762159, + "grad_norm": 9.353304862976074, + "learning_rate": 1.5101043748612038e-05, + "loss": 0.9733, + "step": 10930 + }, + { + "epoch": 7.0283810792804795, + "grad_norm": 14.784786224365234, + "learning_rate": 1.5064031386483085e-05, + "loss": 0.9905, + "step": 10940 + }, + { + "epoch": 7.0290473017988, + "grad_norm": 17.18323516845703, + "learning_rate": 1.5027019024354136e-05, + "loss": 1.1269, + "step": 10950 + }, + { + "epoch": 7.029713524317122, + "grad_norm": 18.267072677612305, + "learning_rate": 1.4990006662225183e-05, + "loss": 1.288, + "step": 10960 + }, + { + "epoch": 7.030379746835443, + "grad_norm": 15.0663480758667, + "learning_rate": 1.4952994300096232e-05, + "loss": 1.1113, + "step": 10970 + }, + { + "epoch": 7.031045969353764, + "grad_norm": 14.281476974487305, + "learning_rate": 1.4915981937967283e-05, + "loss": 0.9255, + "step": 10980 + }, + { + "epoch": 7.031712191872085, + "grad_norm": 20.614591598510742, + "learning_rate": 1.487896957583833e-05, + "loss": 0.9914, + "step": 10990 + }, + { + "epoch": 7.032378414390406, + "grad_norm": 17.782739639282227, + "learning_rate": 1.4841957213709379e-05, + "loss": 1.1604, + "step": 11000 + }, + { + "epoch": 7.033044636908728, + "grad_norm": 14.316519737243652, + "learning_rate": 1.480494485158043e-05, + "loss": 0.9679, + "step": 11010 + }, + { + "epoch": 7.033710859427049, + "grad_norm": 13.61143684387207, + "learning_rate": 1.4767932489451477e-05, + "loss": 1.1681, + "step": 11020 + }, + { + "epoch": 7.03437708194537, + "grad_norm": 16.448678970336914, + "learning_rate": 
1.4730920127322526e-05, + "loss": 1.36, + "step": 11030 + }, + { + "epoch": 7.035043304463691, + "grad_norm": 10.194297790527344, + "learning_rate": 1.4693907765193577e-05, + "loss": 0.9916, + "step": 11040 + }, + { + "epoch": 7.035709526982012, + "grad_norm": 13.044581413269043, + "learning_rate": 1.4656895403064624e-05, + "loss": 1.0431, + "step": 11050 + }, + { + "epoch": 7.036375749500333, + "grad_norm": 14.166892051696777, + "learning_rate": 1.4619883040935673e-05, + "loss": 1.2727, + "step": 11060 + }, + { + "epoch": 7.037041972018654, + "grad_norm": 14.891949653625488, + "learning_rate": 1.4582870678806724e-05, + "loss": 0.7753, + "step": 11070 + }, + { + "epoch": 7.037708194536975, + "grad_norm": 14.074814796447754, + "learning_rate": 1.4545858316677771e-05, + "loss": 1.0036, + "step": 11080 + }, + { + "epoch": 7.038374417055296, + "grad_norm": 13.037166595458984, + "learning_rate": 1.4508845954548818e-05, + "loss": 1.1219, + "step": 11090 + }, + { + "epoch": 7.039040639573617, + "grad_norm": 15.729752540588379, + "learning_rate": 1.447183359241987e-05, + "loss": 1.107, + "step": 11100 + }, + { + "epoch": 7.039706862091939, + "grad_norm": 10.30783748626709, + "learning_rate": 1.4434821230290918e-05, + "loss": 1.0747, + "step": 11110 + }, + { + "epoch": 7.04037308461026, + "grad_norm": 20.32052993774414, + "learning_rate": 1.4397808868161965e-05, + "loss": 1.082, + "step": 11120 + }, + { + "epoch": 7.041039307128581, + "grad_norm": 4.13175106048584, + "learning_rate": 1.4360796506033016e-05, + "loss": 0.9936, + "step": 11130 + }, + { + "epoch": 7.041705529646902, + "grad_norm": 16.794660568237305, + "learning_rate": 1.4323784143904065e-05, + "loss": 1.2215, + "step": 11140 + }, + { + "epoch": 7.042371752165224, + "grad_norm": 17.221820831298828, + "learning_rate": 1.4286771781775112e-05, + "loss": 0.9512, + "step": 11150 + }, + { + "epoch": 7.0430379746835445, + "grad_norm": 12.29546070098877, + "learning_rate": 1.4249759419646163e-05, + "loss": 0.7544, + "step": 11160 + }, + { + "epoch": 7.0437041972018655, + "grad_norm": 14.759180068969727, + "learning_rate": 1.4212747057517212e-05, + "loss": 1.0056, + "step": 11170 + }, + { + "epoch": 7.044370419720186, + "grad_norm": 15.466297149658203, + "learning_rate": 1.417573469538826e-05, + "loss": 0.9993, + "step": 11180 + }, + { + "epoch": 7.045036642238507, + "grad_norm": 10.176376342773438, + "learning_rate": 1.413872233325931e-05, + "loss": 0.9486, + "step": 11190 + }, + { + "epoch": 7.045702864756829, + "grad_norm": 12.766093254089355, + "learning_rate": 1.4101709971130359e-05, + "loss": 1.2051, + "step": 11200 + }, + { + "epoch": 7.04636908727515, + "grad_norm": 13.715441703796387, + "learning_rate": 1.4064697609001406e-05, + "loss": 1.1165, + "step": 11210 + }, + { + "epoch": 7.047035309793471, + "grad_norm": 19.838533401489258, + "learning_rate": 1.4027685246872457e-05, + "loss": 1.0698, + "step": 11220 + }, + { + "epoch": 7.047701532311792, + "grad_norm": 24.530807495117188, + "learning_rate": 1.3990672884743506e-05, + "loss": 1.0449, + "step": 11230 + }, + { + "epoch": 7.048367754830113, + "grad_norm": 14.116693496704102, + "learning_rate": 1.3953660522614553e-05, + "loss": 0.9139, + "step": 11240 + }, + { + "epoch": 7.049033977348435, + "grad_norm": 17.854320526123047, + "learning_rate": 1.3916648160485604e-05, + "loss": 1.0455, + "step": 11250 + }, + { + "epoch": 7.049700199866756, + "grad_norm": 11.167586326599121, + "learning_rate": 1.3879635798356651e-05, + "loss": 0.9966, + "step": 11260 + }, + { + "epoch": 
7.050366422385077, + "grad_norm": 21.27550506591797, + "learning_rate": 1.38426234362277e-05, + "loss": 1.3097, + "step": 11270 + }, + { + "epoch": 7.0510326449033975, + "grad_norm": 9.350018501281738, + "learning_rate": 1.3805611074098751e-05, + "loss": 0.985, + "step": 11280 + }, + { + "epoch": 7.0516988674217185, + "grad_norm": 26.047470092773438, + "learning_rate": 1.3768598711969798e-05, + "loss": 1.2039, + "step": 11290 + }, + { + "epoch": 7.05236508994004, + "grad_norm": 10.058138847351074, + "learning_rate": 1.3731586349840847e-05, + "loss": 0.9254, + "step": 11300 + }, + { + "epoch": 7.053031312458361, + "grad_norm": 22.37078285217285, + "learning_rate": 1.3694573987711898e-05, + "loss": 1.0746, + "step": 11310 + }, + { + "epoch": 7.053697534976682, + "grad_norm": 15.933229446411133, + "learning_rate": 1.3657561625582945e-05, + "loss": 1.0301, + "step": 11320 + }, + { + "epoch": 7.054363757495003, + "grad_norm": 18.064044952392578, + "learning_rate": 1.3620549263453994e-05, + "loss": 1.1172, + "step": 11330 + }, + { + "epoch": 7.055029980013324, + "grad_norm": 14.76087760925293, + "learning_rate": 1.3583536901325042e-05, + "loss": 1.2446, + "step": 11340 + }, + { + "epoch": 7.055696202531646, + "grad_norm": 15.363812446594238, + "learning_rate": 1.3546524539196092e-05, + "loss": 1.0692, + "step": 11350 + }, + { + "epoch": 7.056362425049967, + "grad_norm": 5.732789516448975, + "learning_rate": 1.3509512177067141e-05, + "loss": 0.9816, + "step": 11360 + }, + { + "epoch": 7.057028647568288, + "grad_norm": 22.603395462036133, + "learning_rate": 1.3472499814938189e-05, + "loss": 1.1305, + "step": 11370 + }, + { + "epoch": 7.057694870086609, + "grad_norm": 15.2554349899292, + "learning_rate": 1.343548745280924e-05, + "loss": 1.0819, + "step": 11380 + }, + { + "epoch": 7.05836109260493, + "grad_norm": 10.416316032409668, + "learning_rate": 1.3398475090680288e-05, + "loss": 0.8564, + "step": 11390 + }, + { + "epoch": 7.059027315123251, + "grad_norm": 16.662059783935547, + "learning_rate": 1.3361462728551336e-05, + "loss": 0.905, + "step": 11400 + }, + { + "epoch": 7.059693537641572, + "grad_norm": 15.249651908874512, + "learning_rate": 1.3324450366422386e-05, + "loss": 1.0009, + "step": 11410 + }, + { + "epoch": 7.060359760159893, + "grad_norm": 17.14960479736328, + "learning_rate": 1.3287438004293434e-05, + "loss": 1.0018, + "step": 11420 + }, + { + "epoch": 7.061025982678214, + "grad_norm": 19.571836471557617, + "learning_rate": 1.3250425642164483e-05, + "loss": 1.0117, + "step": 11430 + }, + { + "epoch": 7.061692205196536, + "grad_norm": 12.646673202514648, + "learning_rate": 1.3213413280035533e-05, + "loss": 1.0774, + "step": 11440 + }, + { + "epoch": 7.062358427714857, + "grad_norm": 18.20713996887207, + "learning_rate": 1.317640091790658e-05, + "loss": 1.1195, + "step": 11450 + }, + { + "epoch": 7.063024650233178, + "grad_norm": 20.833433151245117, + "learning_rate": 1.313938855577763e-05, + "loss": 1.1935, + "step": 11460 + }, + { + "epoch": 7.063690872751499, + "grad_norm": 22.983657836914062, + "learning_rate": 1.310237619364868e-05, + "loss": 0.8122, + "step": 11470 + }, + { + "epoch": 7.06435709526982, + "grad_norm": 15.948904991149902, + "learning_rate": 1.3065363831519728e-05, + "loss": 1.0293, + "step": 11480 + }, + { + "epoch": 7.065023317788142, + "grad_norm": 16.243017196655273, + "learning_rate": 1.3028351469390777e-05, + "loss": 1.0259, + "step": 11490 + }, + { + "epoch": 7.065689540306463, + "grad_norm": 10.228900909423828, + "learning_rate": 1.2991339107261827e-05, + 
"loss": 1.0818, + "step": 11500 + }, + { + "epoch": 7.0663557628247835, + "grad_norm": 17.355710983276367, + "learning_rate": 1.2954326745132875e-05, + "loss": 0.9217, + "step": 11510 + }, + { + "epoch": 7.067021985343104, + "grad_norm": 18.271076202392578, + "learning_rate": 1.2917314383003924e-05, + "loss": 1.0031, + "step": 11520 + }, + { + "epoch": 7.067688207861425, + "grad_norm": 22.773921966552734, + "learning_rate": 1.2880302020874974e-05, + "loss": 0.9776, + "step": 11530 + }, + { + "epoch": 7.068354430379747, + "grad_norm": 14.880889892578125, + "learning_rate": 1.2843289658746022e-05, + "loss": 1.1638, + "step": 11540 + }, + { + "epoch": 7.069020652898068, + "grad_norm": 14.236210823059082, + "learning_rate": 1.2806277296617069e-05, + "loss": 0.8889, + "step": 11550 + }, + { + "epoch": 7.069686875416389, + "grad_norm": 11.51760196685791, + "learning_rate": 1.276926493448812e-05, + "loss": 1.0509, + "step": 11560 + }, + { + "epoch": 7.07035309793471, + "grad_norm": 12.758858680725098, + "learning_rate": 1.2732252572359169e-05, + "loss": 0.9589, + "step": 11570 + }, + { + "epoch": 7.071019320453031, + "grad_norm": 17.774017333984375, + "learning_rate": 1.2695240210230216e-05, + "loss": 0.7703, + "step": 11580 + }, + { + "epoch": 7.071685542971353, + "grad_norm": 19.448102951049805, + "learning_rate": 1.2658227848101267e-05, + "loss": 1.1762, + "step": 11590 + }, + { + "epoch": 7.072351765489674, + "grad_norm": 8.015789985656738, + "learning_rate": 1.2621215485972316e-05, + "loss": 1.0327, + "step": 11600 + }, + { + "epoch": 7.073017988007995, + "grad_norm": 12.257453918457031, + "learning_rate": 1.2584203123843363e-05, + "loss": 0.8952, + "step": 11610 + }, + { + "epoch": 7.073684210526316, + "grad_norm": 19.525419235229492, + "learning_rate": 1.2547190761714414e-05, + "loss": 0.9944, + "step": 11620 + }, + { + "epoch": 7.0743504330446365, + "grad_norm": 19.29220199584961, + "learning_rate": 1.2510178399585463e-05, + "loss": 1.0505, + "step": 11630 + }, + { + "epoch": 7.075016655562958, + "grad_norm": 18.580411911010742, + "learning_rate": 1.2473166037456512e-05, + "loss": 1.4173, + "step": 11640 + }, + { + "epoch": 7.075682878081279, + "grad_norm": 18.8646240234375, + "learning_rate": 1.2436153675327559e-05, + "loss": 0.9532, + "step": 11650 + }, + { + "epoch": 7.0763491005996, + "grad_norm": 18.226675033569336, + "learning_rate": 1.239914131319861e-05, + "loss": 1.2546, + "step": 11660 + }, + { + "epoch": 7.077015323117921, + "grad_norm": 16.938735961914062, + "learning_rate": 1.2362128951069659e-05, + "loss": 1.1066, + "step": 11670 + }, + { + "epoch": 7.077681545636242, + "grad_norm": 19.677278518676758, + "learning_rate": 1.2325116588940706e-05, + "loss": 0.7528, + "step": 11680 + }, + { + "epoch": 7.078347768154564, + "grad_norm": 27.626800537109375, + "learning_rate": 1.2288104226811757e-05, + "loss": 0.9611, + "step": 11690 + }, + { + "epoch": 7.079013990672885, + "grad_norm": 16.934402465820312, + "learning_rate": 1.2251091864682806e-05, + "loss": 1.0842, + "step": 11700 + }, + { + "epoch": 7.079680213191206, + "grad_norm": 18.02473258972168, + "learning_rate": 1.2214079502553853e-05, + "loss": 1.1856, + "step": 11710 + }, + { + "epoch": 7.080346435709527, + "grad_norm": 13.752556800842285, + "learning_rate": 1.2177067140424902e-05, + "loss": 1.1673, + "step": 11720 + }, + { + "epoch": 7.0810126582278485, + "grad_norm": 9.916633605957031, + "learning_rate": 1.2140054778295953e-05, + "loss": 0.8257, + "step": 11730 + }, + { + "epoch": 7.0816788807461695, + "grad_norm": 
18.2783145904541, + "learning_rate": 1.2103042416167e-05, + "loss": 1.2965, + "step": 11740 + }, + { + "epoch": 7.08234510326449, + "grad_norm": 14.082517623901367, + "learning_rate": 1.2066030054038049e-05, + "loss": 0.8431, + "step": 11750 + }, + { + "epoch": 7.083011325782811, + "grad_norm": 11.931652069091797, + "learning_rate": 1.20290176919091e-05, + "loss": 1.0103, + "step": 11760 + }, + { + "epoch": 7.083677548301132, + "grad_norm": 18.79412841796875, + "learning_rate": 1.1992005329780147e-05, + "loss": 1.1975, + "step": 11770 + }, + { + "epoch": 7.084343770819454, + "grad_norm": 16.138456344604492, + "learning_rate": 1.1954992967651196e-05, + "loss": 0.8683, + "step": 11780 + }, + { + "epoch": 7.085009993337775, + "grad_norm": 21.204801559448242, + "learning_rate": 1.1917980605522245e-05, + "loss": 0.9148, + "step": 11790 + }, + { + "epoch": 7.085676215856096, + "grad_norm": 13.393819808959961, + "learning_rate": 1.1880968243393294e-05, + "loss": 1.3899, + "step": 11800 + }, + { + "epoch": 7.086342438374417, + "grad_norm": 25.119752883911133, + "learning_rate": 1.1843955881264343e-05, + "loss": 0.9742, + "step": 11810 + }, + { + "epoch": 7.087008660892738, + "grad_norm": 19.688535690307617, + "learning_rate": 1.1806943519135392e-05, + "loss": 1.0377, + "step": 11820 + }, + { + "epoch": 7.08767488341106, + "grad_norm": 11.750617980957031, + "learning_rate": 1.176993115700644e-05, + "loss": 0.9941, + "step": 11830 + }, + { + "epoch": 7.088341105929381, + "grad_norm": 18.055028915405273, + "learning_rate": 1.173291879487749e-05, + "loss": 1.2615, + "step": 11840 + }, + { + "epoch": 7.0890073284477015, + "grad_norm": 17.120216369628906, + "learning_rate": 1.1695906432748537e-05, + "loss": 1.0148, + "step": 11850 + }, + { + "epoch": 7.0896735509660225, + "grad_norm": 14.143601417541504, + "learning_rate": 1.1658894070619588e-05, + "loss": 1.0558, + "step": 11860 + }, + { + "epoch": 7.090339773484343, + "grad_norm": 23.571935653686523, + "learning_rate": 1.1621881708490637e-05, + "loss": 1.0717, + "step": 11870 + }, + { + "epoch": 7.091005996002665, + "grad_norm": 26.634227752685547, + "learning_rate": 1.1584869346361684e-05, + "loss": 1.2216, + "step": 11880 + }, + { + "epoch": 7.091672218520986, + "grad_norm": 19.495254516601562, + "learning_rate": 1.1547856984232735e-05, + "loss": 1.2251, + "step": 11890 + }, + { + "epoch": 7.092338441039307, + "grad_norm": 11.348212242126465, + "learning_rate": 1.1510844622103784e-05, + "loss": 1.1654, + "step": 11900 + }, + { + "epoch": 7.093004663557628, + "grad_norm": 14.961196899414062, + "learning_rate": 1.1473832259974831e-05, + "loss": 1.1382, + "step": 11910 + }, + { + "epoch": 7.093670886075949, + "grad_norm": 20.4808292388916, + "learning_rate": 1.1436819897845882e-05, + "loss": 1.2279, + "step": 11920 + }, + { + "epoch": 7.094337108594271, + "grad_norm": 19.699079513549805, + "learning_rate": 1.139980753571693e-05, + "loss": 0.9463, + "step": 11930 + }, + { + "epoch": 7.095003331112592, + "grad_norm": 14.806946754455566, + "learning_rate": 1.1362795173587978e-05, + "loss": 1.086, + "step": 11940 + }, + { + "epoch": 7.095669553630913, + "grad_norm": 21.047077178955078, + "learning_rate": 1.1325782811459027e-05, + "loss": 1.3118, + "step": 11950 + }, + { + "epoch": 7.096335776149234, + "grad_norm": 14.972464561462402, + "learning_rate": 1.1288770449330078e-05, + "loss": 0.9168, + "step": 11960 + }, + { + "epoch": 7.0970019986675545, + "grad_norm": 13.109081268310547, + "learning_rate": 1.1251758087201125e-05, + "loss": 0.7897, + "step": 
11970 + }, + { + "epoch": 7.097668221185876, + "grad_norm": 11.749272346496582, + "learning_rate": 1.1214745725072174e-05, + "loss": 1.0413, + "step": 11980 + }, + { + "epoch": 7.098334443704197, + "grad_norm": 16.486719131469727, + "learning_rate": 1.1177733362943225e-05, + "loss": 1.1579, + "step": 11990 + }, + { + "epoch": 7.099000666222518, + "grad_norm": 30.194507598876953, + "learning_rate": 1.1140721000814272e-05, + "loss": 0.8827, + "step": 12000 + }, + { + "epoch": 7.099666888740839, + "grad_norm": 14.817559242248535, + "learning_rate": 1.1103708638685321e-05, + "loss": 1.0129, + "step": 12010 + }, + { + "epoch": 7.100066622251832, + "eval_accuracy": 0.5045778229908443, + "eval_loss": 1.7192351818084717, + "eval_runtime": 951.4359, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.388, + "step": 12016 + }, + { + "epoch": 8.000266489007329, + "grad_norm": 17.354957580566406, + "learning_rate": 1.106669627655637e-05, + "loss": 1.4574, + "step": 12020 + }, + { + "epoch": 8.00093271152565, + "grad_norm": 14.763650894165039, + "learning_rate": 1.1029683914427419e-05, + "loss": 0.8932, + "step": 12030 + }, + { + "epoch": 8.00159893404397, + "grad_norm": 25.071596145629883, + "learning_rate": 1.0992671552298468e-05, + "loss": 0.9523, + "step": 12040 + }, + { + "epoch": 8.002265156562292, + "grad_norm": 19.431076049804688, + "learning_rate": 1.0955659190169517e-05, + "loss": 0.8871, + "step": 12050 + }, + { + "epoch": 8.002931379080612, + "grad_norm": 11.56208324432373, + "learning_rate": 1.0918646828040566e-05, + "loss": 0.8143, + "step": 12060 + }, + { + "epoch": 8.003597601598933, + "grad_norm": 25.064655303955078, + "learning_rate": 1.0881634465911615e-05, + "loss": 0.8988, + "step": 12070 + }, + { + "epoch": 8.004263824117254, + "grad_norm": 32.08738708496094, + "learning_rate": 1.0844622103782664e-05, + "loss": 0.783, + "step": 12080 + }, + { + "epoch": 8.004930046635577, + "grad_norm": 17.351627349853516, + "learning_rate": 1.0807609741653713e-05, + "loss": 1.02, + "step": 12090 + }, + { + "epoch": 8.005596269153898, + "grad_norm": 12.986157417297363, + "learning_rate": 1.0770597379524762e-05, + "loss": 0.7795, + "step": 12100 + }, + { + "epoch": 8.006262491672219, + "grad_norm": 10.791961669921875, + "learning_rate": 1.0733585017395811e-05, + "loss": 1.0024, + "step": 12110 + }, + { + "epoch": 8.00692871419054, + "grad_norm": 11.830477714538574, + "learning_rate": 1.069657265526686e-05, + "loss": 0.9519, + "step": 12120 + }, + { + "epoch": 8.00759493670886, + "grad_norm": 16.85247039794922, + "learning_rate": 1.0659560293137909e-05, + "loss": 1.026, + "step": 12130 + }, + { + "epoch": 8.008261159227182, + "grad_norm": 11.4721040725708, + "learning_rate": 1.0622547931008958e-05, + "loss": 1.0379, + "step": 12140 + }, + { + "epoch": 8.008927381745503, + "grad_norm": 19.24578285217285, + "learning_rate": 1.0585535568880005e-05, + "loss": 1.0133, + "step": 12150 + }, + { + "epoch": 8.009593604263824, + "grad_norm": 13.95763874053955, + "learning_rate": 1.0548523206751056e-05, + "loss": 0.7758, + "step": 12160 + }, + { + "epoch": 8.010259826782145, + "grad_norm": 22.450647354125977, + "learning_rate": 1.0511510844622103e-05, + "loss": 0.996, + "step": 12170 + }, + { + "epoch": 8.010926049300465, + "grad_norm": 10.975811958312988, + "learning_rate": 1.0474498482493152e-05, + "loss": 0.9537, + "step": 12180 + }, + { + "epoch": 8.011592271818788, + "grad_norm": 19.196500778198242, + "learning_rate": 1.0437486120364203e-05, + "loss": 0.6828, + "step": 12190 + }, + { + "epoch": 
8.01225849433711, + "grad_norm": 19.093387603759766, + "learning_rate": 1.040047375823525e-05, + "loss": 1.1093, + "step": 12200 + }, + { + "epoch": 8.01292471685543, + "grad_norm": 23.93227195739746, + "learning_rate": 1.03634613961063e-05, + "loss": 0.7009, + "step": 12210 + }, + { + "epoch": 8.013590939373751, + "grad_norm": 24.3724365234375, + "learning_rate": 1.032644903397735e-05, + "loss": 1.0617, + "step": 12220 + }, + { + "epoch": 8.014257161892072, + "grad_norm": 25.974380493164062, + "learning_rate": 1.0289436671848397e-05, + "loss": 0.9848, + "step": 12230 + }, + { + "epoch": 8.014923384410393, + "grad_norm": 18.305068969726562, + "learning_rate": 1.0252424309719446e-05, + "loss": 1.204, + "step": 12240 + }, + { + "epoch": 8.015589606928714, + "grad_norm": 23.677614212036133, + "learning_rate": 1.0215411947590495e-05, + "loss": 1.003, + "step": 12250 + }, + { + "epoch": 8.016255829447035, + "grad_norm": 15.935638427734375, + "learning_rate": 1.0178399585461544e-05, + "loss": 1.2671, + "step": 12260 + }, + { + "epoch": 8.016922051965356, + "grad_norm": 14.754202842712402, + "learning_rate": 1.0141387223332593e-05, + "loss": 0.8956, + "step": 12270 + }, + { + "epoch": 8.017588274483678, + "grad_norm": 29.415117263793945, + "learning_rate": 1.0104374861203642e-05, + "loss": 1.1597, + "step": 12280 + }, + { + "epoch": 8.018254497002, + "grad_norm": 23.605554580688477, + "learning_rate": 1.0067362499074691e-05, + "loss": 1.0729, + "step": 12290 + }, + { + "epoch": 8.01892071952032, + "grad_norm": 20.42319107055664, + "learning_rate": 1.003035013694574e-05, + "loss": 1.1159, + "step": 12300 + }, + { + "epoch": 8.019586942038641, + "grad_norm": 15.65279769897461, + "learning_rate": 9.99333777481679e-06, + "loss": 0.9321, + "step": 12310 + }, + { + "epoch": 8.020253164556962, + "grad_norm": 16.063478469848633, + "learning_rate": 9.956325412687838e-06, + "loss": 0.6542, + "step": 12320 + }, + { + "epoch": 8.020919387075283, + "grad_norm": 21.866817474365234, + "learning_rate": 9.919313050558887e-06, + "loss": 0.8203, + "step": 12330 + }, + { + "epoch": 8.021585609593604, + "grad_norm": 15.718562126159668, + "learning_rate": 9.882300688429936e-06, + "loss": 1.0571, + "step": 12340 + }, + { + "epoch": 8.022251832111925, + "grad_norm": 16.71888542175293, + "learning_rate": 9.845288326300985e-06, + "loss": 0.8927, + "step": 12350 + }, + { + "epoch": 8.022918054630246, + "grad_norm": 17.722354888916016, + "learning_rate": 9.808275964172034e-06, + "loss": 1.0851, + "step": 12360 + }, + { + "epoch": 8.023584277148567, + "grad_norm": 10.810696601867676, + "learning_rate": 9.771263602043083e-06, + "loss": 0.9633, + "step": 12370 + }, + { + "epoch": 8.02425049966689, + "grad_norm": 14.196084976196289, + "learning_rate": 9.73425123991413e-06, + "loss": 1.1421, + "step": 12380 + }, + { + "epoch": 8.02491672218521, + "grad_norm": 18.883628845214844, + "learning_rate": 9.697238877785181e-06, + "loss": 0.8193, + "step": 12390 + }, + { + "epoch": 8.025582944703531, + "grad_norm": 24.4031982421875, + "learning_rate": 9.66022651565623e-06, + "loss": 1.2563, + "step": 12400 + }, + { + "epoch": 8.026249167221852, + "grad_norm": 17.76465606689453, + "learning_rate": 9.623214153527278e-06, + "loss": 0.8486, + "step": 12410 + }, + { + "epoch": 8.026915389740173, + "grad_norm": 23.54026222229004, + "learning_rate": 9.586201791398328e-06, + "loss": 1.0868, + "step": 12420 + }, + { + "epoch": 8.027581612258494, + "grad_norm": 19.324783325195312, + "learning_rate": 9.549189429269377e-06, + "loss": 0.9587, + 
"step": 12430 + }, + { + "epoch": 8.028247834776815, + "grad_norm": 16.5588321685791, + "learning_rate": 9.512177067140425e-06, + "loss": 0.8787, + "step": 12440 + }, + { + "epoch": 8.028914057295136, + "grad_norm": 19.387126922607422, + "learning_rate": 9.475164705011475e-06, + "loss": 0.972, + "step": 12450 + }, + { + "epoch": 8.029580279813457, + "grad_norm": 15.966093063354492, + "learning_rate": 9.438152342882524e-06, + "loss": 1.2985, + "step": 12460 + }, + { + "epoch": 8.030246502331778, + "grad_norm": 14.803018569946289, + "learning_rate": 9.401139980753572e-06, + "loss": 1.1003, + "step": 12470 + }, + { + "epoch": 8.0309127248501, + "grad_norm": 8.160183906555176, + "learning_rate": 9.36412761862462e-06, + "loss": 0.8992, + "step": 12480 + }, + { + "epoch": 8.031578947368422, + "grad_norm": 20.219057083129883, + "learning_rate": 9.32711525649567e-06, + "loss": 1.0183, + "step": 12490 + }, + { + "epoch": 8.032245169886743, + "grad_norm": 16.644651412963867, + "learning_rate": 9.290102894366719e-06, + "loss": 1.1365, + "step": 12500 + }, + { + "epoch": 8.032911392405063, + "grad_norm": 23.346940994262695, + "learning_rate": 9.253090532237768e-06, + "loss": 1.0652, + "step": 12510 + }, + { + "epoch": 8.033577614923384, + "grad_norm": 18.153305053710938, + "learning_rate": 9.216078170108817e-06, + "loss": 0.9135, + "step": 12520 + }, + { + "epoch": 8.034243837441705, + "grad_norm": 20.254547119140625, + "learning_rate": 9.179065807979866e-06, + "loss": 1.1375, + "step": 12530 + }, + { + "epoch": 8.034910059960026, + "grad_norm": 28.64413070678711, + "learning_rate": 9.142053445850915e-06, + "loss": 0.9989, + "step": 12540 + }, + { + "epoch": 8.035576282478347, + "grad_norm": 18.472246170043945, + "learning_rate": 9.105041083721964e-06, + "loss": 0.9601, + "step": 12550 + }, + { + "epoch": 8.036242504996668, + "grad_norm": 10.703330993652344, + "learning_rate": 9.068028721593013e-06, + "loss": 0.9746, + "step": 12560 + }, + { + "epoch": 8.03690872751499, + "grad_norm": 14.770788192749023, + "learning_rate": 9.031016359464062e-06, + "loss": 0.9644, + "step": 12570 + }, + { + "epoch": 8.037574950033312, + "grad_norm": 9.637856483459473, + "learning_rate": 8.99400399733511e-06, + "loss": 1.0055, + "step": 12580 + }, + { + "epoch": 8.038241172551633, + "grad_norm": 14.531105041503906, + "learning_rate": 8.95699163520616e-06, + "loss": 1.0244, + "step": 12590 + }, + { + "epoch": 8.038907395069954, + "grad_norm": 23.367136001586914, + "learning_rate": 8.919979273077209e-06, + "loss": 0.9277, + "step": 12600 + }, + { + "epoch": 8.039573617588275, + "grad_norm": 14.899935722351074, + "learning_rate": 8.882966910948256e-06, + "loss": 1.0579, + "step": 12610 + }, + { + "epoch": 8.040239840106596, + "grad_norm": 15.697758674621582, + "learning_rate": 8.845954548819307e-06, + "loss": 0.7105, + "step": 12620 + }, + { + "epoch": 8.040906062624916, + "grad_norm": 23.88060760498047, + "learning_rate": 8.808942186690356e-06, + "loss": 0.884, + "step": 12630 + }, + { + "epoch": 8.041572285143237, + "grad_norm": 27.046621322631836, + "learning_rate": 8.771929824561403e-06, + "loss": 1.0206, + "step": 12640 + }, + { + "epoch": 8.042238507661558, + "grad_norm": 12.780049324035645, + "learning_rate": 8.734917462432454e-06, + "loss": 1.0478, + "step": 12650 + }, + { + "epoch": 8.04290473017988, + "grad_norm": 24.664676666259766, + "learning_rate": 8.697905100303503e-06, + "loss": 1.0203, + "step": 12660 + }, + { + "epoch": 8.043570952698202, + "grad_norm": 19.14145278930664, + "learning_rate": 
8.66089273817455e-06, + "loss": 1.126, + "step": 12670 + }, + { + "epoch": 8.044237175216523, + "grad_norm": 14.153072357177734, + "learning_rate": 8.623880376045599e-06, + "loss": 1.0269, + "step": 12680 + }, + { + "epoch": 8.044903397734844, + "grad_norm": 16.468942642211914, + "learning_rate": 8.58686801391665e-06, + "loss": 0.9886, + "step": 12690 + }, + { + "epoch": 8.045569620253165, + "grad_norm": 12.845463752746582, + "learning_rate": 8.549855651787697e-06, + "loss": 1.0562, + "step": 12700 + }, + { + "epoch": 8.046235842771486, + "grad_norm": 16.37422752380371, + "learning_rate": 8.512843289658746e-06, + "loss": 0.789, + "step": 12710 + }, + { + "epoch": 8.046902065289807, + "grad_norm": 12.973320007324219, + "learning_rate": 8.475830927529797e-06, + "loss": 0.9808, + "step": 12720 + }, + { + "epoch": 8.047568287808128, + "grad_norm": 22.305334091186523, + "learning_rate": 8.438818565400844e-06, + "loss": 0.8152, + "step": 12730 + }, + { + "epoch": 8.048234510326449, + "grad_norm": 28.418701171875, + "learning_rate": 8.401806203271893e-06, + "loss": 0.9093, + "step": 12740 + }, + { + "epoch": 8.04890073284477, + "grad_norm": 17.776851654052734, + "learning_rate": 8.364793841142944e-06, + "loss": 1.0529, + "step": 12750 + }, + { + "epoch": 8.04956695536309, + "grad_norm": 12.215154647827148, + "learning_rate": 8.32778147901399e-06, + "loss": 1.0425, + "step": 12760 + }, + { + "epoch": 8.050233177881413, + "grad_norm": 19.526031494140625, + "learning_rate": 8.29076911688504e-06, + "loss": 1.1786, + "step": 12770 + }, + { + "epoch": 8.050899400399734, + "grad_norm": 15.074190139770508, + "learning_rate": 8.253756754756089e-06, + "loss": 1.059, + "step": 12780 + }, + { + "epoch": 8.051565622918055, + "grad_norm": 12.098401069641113, + "learning_rate": 8.216744392627138e-06, + "loss": 0.8655, + "step": 12790 + }, + { + "epoch": 8.052231845436376, + "grad_norm": 24.401731491088867, + "learning_rate": 8.179732030498187e-06, + "loss": 0.9449, + "step": 12800 + }, + { + "epoch": 8.052898067954697, + "grad_norm": 19.46772575378418, + "learning_rate": 8.142719668369236e-06, + "loss": 1.1686, + "step": 12810 + }, + { + "epoch": 8.053564290473018, + "grad_norm": 15.265368461608887, + "learning_rate": 8.105707306240285e-06, + "loss": 1.1018, + "step": 12820 + }, + { + "epoch": 8.054230512991339, + "grad_norm": 10.117189407348633, + "learning_rate": 8.068694944111334e-06, + "loss": 1.2335, + "step": 12830 + }, + { + "epoch": 8.05489673550966, + "grad_norm": 20.11189842224121, + "learning_rate": 8.031682581982381e-06, + "loss": 1.2719, + "step": 12840 + }, + { + "epoch": 8.05556295802798, + "grad_norm": 10.103278160095215, + "learning_rate": 7.994670219853432e-06, + "loss": 1.0022, + "step": 12850 + }, + { + "epoch": 8.056229180546303, + "grad_norm": 14.04018783569336, + "learning_rate": 7.95765785772448e-06, + "loss": 0.9545, + "step": 12860 + }, + { + "epoch": 8.056895403064624, + "grad_norm": 21.26001739501953, + "learning_rate": 7.920645495595528e-06, + "loss": 1.2282, + "step": 12870 + }, + { + "epoch": 8.057561625582945, + "grad_norm": 22.010358810424805, + "learning_rate": 7.883633133466579e-06, + "loss": 1.0444, + "step": 12880 + }, + { + "epoch": 8.058227848101266, + "grad_norm": 15.85307788848877, + "learning_rate": 7.846620771337628e-06, + "loss": 0.9449, + "step": 12890 + }, + { + "epoch": 8.058894070619587, + "grad_norm": 27.069652557373047, + "learning_rate": 7.809608409208675e-06, + "loss": 1.0265, + "step": 12900 + }, + { + "epoch": 8.059560293137908, + "grad_norm": 
16.252498626708984, + "learning_rate": 7.772596047079724e-06, + "loss": 0.9446, + "step": 12910 + }, + { + "epoch": 8.060226515656229, + "grad_norm": 26.529401779174805, + "learning_rate": 7.735583684950775e-06, + "loss": 1.0682, + "step": 12920 + }, + { + "epoch": 8.06089273817455, + "grad_norm": 16.16809844970703, + "learning_rate": 7.698571322821822e-06, + "loss": 0.9913, + "step": 12930 + }, + { + "epoch": 8.06155896069287, + "grad_norm": 21.500802993774414, + "learning_rate": 7.661558960692871e-06, + "loss": 0.8234, + "step": 12940 + }, + { + "epoch": 8.062225183211192, + "grad_norm": 9.534919738769531, + "learning_rate": 7.624546598563921e-06, + "loss": 0.8943, + "step": 12950 + }, + { + "epoch": 8.062891405729514, + "grad_norm": 24.226545333862305, + "learning_rate": 7.587534236434969e-06, + "loss": 1.2594, + "step": 12960 + }, + { + "epoch": 8.063557628247835, + "grad_norm": 17.659679412841797, + "learning_rate": 7.550521874306019e-06, + "loss": 1.1662, + "step": 12970 + }, + { + "epoch": 8.064223850766156, + "grad_norm": 19.02828025817871, + "learning_rate": 7.513509512177068e-06, + "loss": 0.979, + "step": 12980 + }, + { + "epoch": 8.064890073284477, + "grad_norm": 26.834571838378906, + "learning_rate": 7.476497150048116e-06, + "loss": 0.8033, + "step": 12990 + }, + { + "epoch": 8.065556295802798, + "grad_norm": 15.116127967834473, + "learning_rate": 7.439484787919165e-06, + "loss": 1.0022, + "step": 13000 + }, + { + "epoch": 8.06622251832112, + "grad_norm": 23.068113327026367, + "learning_rate": 7.402472425790215e-06, + "loss": 1.0216, + "step": 13010 + }, + { + "epoch": 8.06688874083944, + "grad_norm": 33.717430114746094, + "learning_rate": 7.365460063661263e-06, + "loss": 0.9599, + "step": 13020 + }, + { + "epoch": 8.067554963357761, + "grad_norm": 14.382299423217773, + "learning_rate": 7.328447701532312e-06, + "loss": 0.8928, + "step": 13030 + }, + { + "epoch": 8.068221185876082, + "grad_norm": 11.484391212463379, + "learning_rate": 7.291435339403362e-06, + "loss": 1.1551, + "step": 13040 + }, + { + "epoch": 8.068887408394403, + "grad_norm": 16.246004104614258, + "learning_rate": 7.254422977274409e-06, + "loss": 0.7789, + "step": 13050 + }, + { + "epoch": 8.069553630912726, + "grad_norm": 21.760108947753906, + "learning_rate": 7.217410615145459e-06, + "loss": 0.9097, + "step": 13060 + }, + { + "epoch": 8.070219853431047, + "grad_norm": 21.727842330932617, + "learning_rate": 7.180398253016508e-06, + "loss": 0.8161, + "step": 13070 + }, + { + "epoch": 8.070886075949367, + "grad_norm": 16.951379776000977, + "learning_rate": 7.143385890887556e-06, + "loss": 1.1907, + "step": 13080 + }, + { + "epoch": 8.071552298467688, + "grad_norm": 14.930109024047852, + "learning_rate": 7.106373528758606e-06, + "loss": 0.9768, + "step": 13090 + }, + { + "epoch": 8.07221852098601, + "grad_norm": 13.25757122039795, + "learning_rate": 7.069361166629655e-06, + "loss": 1.0442, + "step": 13100 + }, + { + "epoch": 8.07288474350433, + "grad_norm": 17.497682571411133, + "learning_rate": 7.032348804500703e-06, + "loss": 1.0555, + "step": 13110 + }, + { + "epoch": 8.073550966022651, + "grad_norm": 27.137048721313477, + "learning_rate": 6.995336442371753e-06, + "loss": 0.9927, + "step": 13120 + }, + { + "epoch": 8.074217188540972, + "grad_norm": 22.3636531829834, + "learning_rate": 6.958324080242802e-06, + "loss": 1.0467, + "step": 13130 + }, + { + "epoch": 8.074883411059293, + "grad_norm": 21.966400146484375, + "learning_rate": 6.92131171811385e-06, + "loss": 0.9746, + "step": 13140 + }, + { + "epoch": 
8.075549633577616, + "grad_norm": 10.819857597351074, + "learning_rate": 6.884299355984899e-06, + "loss": 1.1301, + "step": 13150 + }, + { + "epoch": 8.076215856095937, + "grad_norm": 22.201786041259766, + "learning_rate": 6.847286993855949e-06, + "loss": 1.1137, + "step": 13160 + }, + { + "epoch": 8.076882078614258, + "grad_norm": 15.351531028747559, + "learning_rate": 6.810274631726997e-06, + "loss": 1.075, + "step": 13170 + }, + { + "epoch": 8.077548301132579, + "grad_norm": 9.249370574951172, + "learning_rate": 6.773262269598046e-06, + "loss": 0.807, + "step": 13180 + }, + { + "epoch": 8.0782145236509, + "grad_norm": 20.31663703918457, + "learning_rate": 6.736249907469094e-06, + "loss": 1.0561, + "step": 13190 + }, + { + "epoch": 8.07888074616922, + "grad_norm": 22.812593460083008, + "learning_rate": 6.699237545340144e-06, + "loss": 1.1236, + "step": 13200 + }, + { + "epoch": 8.079546968687541, + "grad_norm": 25.445377349853516, + "learning_rate": 6.662225183211193e-06, + "loss": 1.0179, + "step": 13210 + }, + { + "epoch": 8.080213191205862, + "grad_norm": 11.689116477966309, + "learning_rate": 6.625212821082241e-06, + "loss": 0.7599, + "step": 13220 + }, + { + "epoch": 8.080879413724183, + "grad_norm": 18.420169830322266, + "learning_rate": 6.58820045895329e-06, + "loss": 0.9931, + "step": 13230 + }, + { + "epoch": 8.081545636242504, + "grad_norm": 12.665791511535645, + "learning_rate": 6.55118809682434e-06, + "loss": 0.8801, + "step": 13240 + }, + { + "epoch": 8.082211858760827, + "grad_norm": 20.70166778564453, + "learning_rate": 6.514175734695388e-06, + "loss": 1.2889, + "step": 13250 + }, + { + "epoch": 8.082878081279148, + "grad_norm": 10.324226379394531, + "learning_rate": 6.477163372566437e-06, + "loss": 0.9308, + "step": 13260 + }, + { + "epoch": 8.083544303797469, + "grad_norm": 14.437554359436035, + "learning_rate": 6.440151010437487e-06, + "loss": 1.192, + "step": 13270 + }, + { + "epoch": 8.08421052631579, + "grad_norm": 17.76905632019043, + "learning_rate": 6.4031386483085345e-06, + "loss": 0.9883, + "step": 13280 + }, + { + "epoch": 8.08487674883411, + "grad_norm": 13.64749526977539, + "learning_rate": 6.366126286179584e-06, + "loss": 0.9376, + "step": 13290 + }, + { + "epoch": 8.085542971352432, + "grad_norm": 6.335607051849365, + "learning_rate": 6.329113924050633e-06, + "loss": 0.7484, + "step": 13300 + }, + { + "epoch": 8.086209193870753, + "grad_norm": 28.03476333618164, + "learning_rate": 6.2921015619216815e-06, + "loss": 1.2082, + "step": 13310 + }, + { + "epoch": 8.086875416389073, + "grad_norm": 17.74762535095215, + "learning_rate": 6.255089199792731e-06, + "loss": 0.8309, + "step": 13320 + }, + { + "epoch": 8.087541638907394, + "grad_norm": 21.163410186767578, + "learning_rate": 6.2180768376637794e-06, + "loss": 0.9534, + "step": 13330 + }, + { + "epoch": 8.088207861425715, + "grad_norm": 7.922489643096924, + "learning_rate": 6.181064475534829e-06, + "loss": 0.9617, + "step": 13340 + }, + { + "epoch": 8.088874083944038, + "grad_norm": 15.332942962646484, + "learning_rate": 6.144052113405878e-06, + "loss": 0.9534, + "step": 13350 + }, + { + "epoch": 8.089540306462359, + "grad_norm": 15.467304229736328, + "learning_rate": 6.1070397512769264e-06, + "loss": 0.7679, + "step": 13360 + }, + { + "epoch": 8.09020652898068, + "grad_norm": 27.217870712280273, + "learning_rate": 6.070027389147976e-06, + "loss": 0.8557, + "step": 13370 + }, + { + "epoch": 8.090872751499, + "grad_norm": 16.020980834960938, + "learning_rate": 6.0330150270190244e-06, + "loss": 0.9202, + 
"step": 13380 + }, + { + "epoch": 8.091538974017322, + "grad_norm": 14.924962997436523, + "learning_rate": 5.9960026648900734e-06, + "loss": 0.8126, + "step": 13390 + }, + { + "epoch": 8.092205196535643, + "grad_norm": 24.40899658203125, + "learning_rate": 5.9589903027611224e-06, + "loss": 1.0785, + "step": 13400 + }, + { + "epoch": 8.092871419053964, + "grad_norm": 21.064863204956055, + "learning_rate": 5.9219779406321714e-06, + "loss": 0.917, + "step": 13410 + }, + { + "epoch": 8.093537641572285, + "grad_norm": 24.186546325683594, + "learning_rate": 5.88496557850322e-06, + "loss": 0.6881, + "step": 13420 + }, + { + "epoch": 8.094203864090606, + "grad_norm": 21.761343002319336, + "learning_rate": 5.8479532163742686e-06, + "loss": 0.7778, + "step": 13430 + }, + { + "epoch": 8.094870086608928, + "grad_norm": 18.209165573120117, + "learning_rate": 5.810940854245318e-06, + "loss": 1.0125, + "step": 13440 + }, + { + "epoch": 8.09553630912725, + "grad_norm": 15.490808486938477, + "learning_rate": 5.773928492116367e-06, + "loss": 1.0377, + "step": 13450 + }, + { + "epoch": 8.09620253164557, + "grad_norm": 12.829586029052734, + "learning_rate": 5.7369161299874156e-06, + "loss": 1.366, + "step": 13460 + }, + { + "epoch": 8.096868754163891, + "grad_norm": 14.535886764526367, + "learning_rate": 5.699903767858465e-06, + "loss": 1.0422, + "step": 13470 + }, + { + "epoch": 8.097534976682212, + "grad_norm": 11.570591926574707, + "learning_rate": 5.6628914057295136e-06, + "loss": 0.9691, + "step": 13480 + }, + { + "epoch": 8.098201199200533, + "grad_norm": 13.828847885131836, + "learning_rate": 5.6258790436005626e-06, + "loss": 0.7427, + "step": 13490 + }, + { + "epoch": 8.098867421718854, + "grad_norm": 21.245874404907227, + "learning_rate": 5.588866681471612e-06, + "loss": 1.0387, + "step": 13500 + }, + { + "epoch": 8.099533644237175, + "grad_norm": 19.132814407348633, + "learning_rate": 5.5518543193426606e-06, + "loss": 1.0309, + "step": 13510 + }, + { + "epoch": 8.100066622251832, + "eval_accuracy": 0.5269582909460834, + "eval_loss": 1.6447622776031494, + "eval_runtime": 931.4513, + "eval_samples_per_second": 3.166, + "eval_steps_per_second": 0.396, + "step": 13518 + }, + { + "epoch": 9.000133244503663, + "grad_norm": 13.561812400817871, + "learning_rate": 5.5148419572137095e-06, + "loss": 0.8546, + "step": 13520 + }, + { + "epoch": 9.000799467021986, + "grad_norm": 12.23229694366455, + "learning_rate": 5.4778295950847585e-06, + "loss": 0.7912, + "step": 13530 + }, + { + "epoch": 9.001465689540307, + "grad_norm": 39.896392822265625, + "learning_rate": 5.4408172329558075e-06, + "loss": 0.961, + "step": 13540 + }, + { + "epoch": 9.002131912058628, + "grad_norm": 17.12689971923828, + "learning_rate": 5.4038048708268565e-06, + "loss": 1.0882, + "step": 13550 + }, + { + "epoch": 9.002798134576949, + "grad_norm": 31.749732971191406, + "learning_rate": 5.3667925086979055e-06, + "loss": 0.6823, + "step": 13560 + }, + { + "epoch": 9.00346435709527, + "grad_norm": 25.51563262939453, + "learning_rate": 5.3297801465689545e-06, + "loss": 1.0396, + "step": 13570 + }, + { + "epoch": 9.00413057961359, + "grad_norm": 21.32733154296875, + "learning_rate": 5.292767784440003e-06, + "loss": 0.875, + "step": 13580 + }, + { + "epoch": 9.004796802131912, + "grad_norm": 8.315468788146973, + "learning_rate": 5.255755422311052e-06, + "loss": 0.9438, + "step": 13590 + }, + { + "epoch": 9.005463024650233, + "grad_norm": 29.727874755859375, + "learning_rate": 5.2187430601821015e-06, + "loss": 0.9843, + "step": 13600 + }, + { + 
"epoch": 9.006129247168554, + "grad_norm": 21.11098861694336, + "learning_rate": 5.18173069805315e-06, + "loss": 1.0088, + "step": 13610 + }, + { + "epoch": 9.006795469686875, + "grad_norm": 9.857388496398926, + "learning_rate": 5.144718335924199e-06, + "loss": 0.7325, + "step": 13620 + }, + { + "epoch": 9.007461692205197, + "grad_norm": 23.14414405822754, + "learning_rate": 5.107705973795248e-06, + "loss": 1.0938, + "step": 13630 + }, + { + "epoch": 9.008127914723518, + "grad_norm": 11.88949966430664, + "learning_rate": 5.070693611666297e-06, + "loss": 0.8109, + "step": 13640 + }, + { + "epoch": 9.00879413724184, + "grad_norm": 18.450145721435547, + "learning_rate": 5.033681249537346e-06, + "loss": 0.6526, + "step": 13650 + }, + { + "epoch": 9.00946035976016, + "grad_norm": 19.765588760375977, + "learning_rate": 4.996668887408395e-06, + "loss": 0.7992, + "step": 13660 + }, + { + "epoch": 9.010126582278481, + "grad_norm": 12.451539039611816, + "learning_rate": 4.959656525279444e-06, + "loss": 1.2102, + "step": 13670 + }, + { + "epoch": 9.010792804796802, + "grad_norm": 9.399398803710938, + "learning_rate": 4.922644163150493e-06, + "loss": 0.9433, + "step": 13680 + }, + { + "epoch": 9.011459027315123, + "grad_norm": 22.869823455810547, + "learning_rate": 4.885631801021542e-06, + "loss": 0.9461, + "step": 13690 + }, + { + "epoch": 9.012125249833444, + "grad_norm": 14.958271026611328, + "learning_rate": 4.848619438892591e-06, + "loss": 0.7305, + "step": 13700 + }, + { + "epoch": 9.012791472351765, + "grad_norm": 16.759414672851562, + "learning_rate": 4.811607076763639e-06, + "loss": 1.0578, + "step": 13710 + }, + { + "epoch": 9.013457694870086, + "grad_norm": 15.498473167419434, + "learning_rate": 4.774594714634689e-06, + "loss": 0.8799, + "step": 13720 + }, + { + "epoch": 9.014123917388408, + "grad_norm": 15.972556114196777, + "learning_rate": 4.737582352505738e-06, + "loss": 0.9694, + "step": 13730 + }, + { + "epoch": 9.01479013990673, + "grad_norm": 19.409330368041992, + "learning_rate": 4.700569990376786e-06, + "loss": 0.9809, + "step": 13740 + }, + { + "epoch": 9.01545636242505, + "grad_norm": 21.176664352416992, + "learning_rate": 4.663557628247835e-06, + "loss": 1.0444, + "step": 13750 + }, + { + "epoch": 9.016122584943371, + "grad_norm": 14.635652542114258, + "learning_rate": 4.626545266118884e-06, + "loss": 1.1901, + "step": 13760 + }, + { + "epoch": 9.016788807461692, + "grad_norm": 13.545337677001953, + "learning_rate": 4.589532903989933e-06, + "loss": 0.8258, + "step": 13770 + }, + { + "epoch": 9.017455029980013, + "grad_norm": 15.609186172485352, + "learning_rate": 4.552520541860982e-06, + "loss": 1.0008, + "step": 13780 + }, + { + "epoch": 9.018121252498334, + "grad_norm": 17.932830810546875, + "learning_rate": 4.515508179732031e-06, + "loss": 1.0108, + "step": 13790 + }, + { + "epoch": 9.018787475016655, + "grad_norm": 19.69332504272461, + "learning_rate": 4.47849581760308e-06, + "loss": 0.8656, + "step": 13800 + }, + { + "epoch": 9.019453697534976, + "grad_norm": 18.6381778717041, + "learning_rate": 4.441483455474128e-06, + "loss": 1.1187, + "step": 13810 + }, + { + "epoch": 9.020119920053299, + "grad_norm": 17.65327262878418, + "learning_rate": 4.404471093345178e-06, + "loss": 0.745, + "step": 13820 + }, + { + "epoch": 9.02078614257162, + "grad_norm": 21.64731788635254, + "learning_rate": 4.367458731216227e-06, + "loss": 0.9958, + "step": 13830 + }, + { + "epoch": 9.02145236508994, + "grad_norm": 28.844314575195312, + "learning_rate": 4.330446369087275e-06, + "loss": 0.9836, 
+ "step": 13840 + }, + { + "epoch": 9.022118587608261, + "grad_norm": 18.975547790527344, + "learning_rate": 4.293434006958325e-06, + "loss": 1.1603, + "step": 13850 + }, + { + "epoch": 9.022784810126582, + "grad_norm": 17.587818145751953, + "learning_rate": 4.256421644829373e-06, + "loss": 1.1473, + "step": 13860 + }, + { + "epoch": 9.023451032644903, + "grad_norm": 13.153083801269531, + "learning_rate": 4.219409282700422e-06, + "loss": 0.9033, + "step": 13870 + }, + { + "epoch": 9.024117255163224, + "grad_norm": 19.280969619750977, + "learning_rate": 4.182396920571472e-06, + "loss": 1.0406, + "step": 13880 + }, + { + "epoch": 9.024783477681545, + "grad_norm": 27.012893676757812, + "learning_rate": 4.14538455844252e-06, + "loss": 0.6843, + "step": 13890 + }, + { + "epoch": 9.025449700199866, + "grad_norm": 24.629638671875, + "learning_rate": 4.108372196313569e-06, + "loss": 1.0875, + "step": 13900 + }, + { + "epoch": 9.026115922718187, + "grad_norm": 22.807397842407227, + "learning_rate": 4.071359834184618e-06, + "loss": 0.7494, + "step": 13910 + }, + { + "epoch": 9.02678214523651, + "grad_norm": 15.220276832580566, + "learning_rate": 4.034347472055667e-06, + "loss": 0.9768, + "step": 13920 + }, + { + "epoch": 9.02744836775483, + "grad_norm": 12.251656532287598, + "learning_rate": 3.997335109926716e-06, + "loss": 1.0346, + "step": 13930 + }, + { + "epoch": 9.028114590273152, + "grad_norm": 24.144866943359375, + "learning_rate": 3.960322747797764e-06, + "loss": 1.0701, + "step": 13940 + }, + { + "epoch": 9.028780812791473, + "grad_norm": 25.019176483154297, + "learning_rate": 3.923310385668814e-06, + "loss": 0.7641, + "step": 13950 + }, + { + "epoch": 9.029447035309794, + "grad_norm": 19.51873779296875, + "learning_rate": 3.886298023539862e-06, + "loss": 0.8013, + "step": 13960 + }, + { + "epoch": 9.030113257828114, + "grad_norm": 24.272336959838867, + "learning_rate": 3.849285661410911e-06, + "loss": 0.9524, + "step": 13970 + }, + { + "epoch": 9.030779480346435, + "grad_norm": 15.087508201599121, + "learning_rate": 3.8122732992819605e-06, + "loss": 1.2019, + "step": 13980 + }, + { + "epoch": 9.031445702864756, + "grad_norm": 26.37247085571289, + "learning_rate": 3.7752609371530095e-06, + "loss": 1.1275, + "step": 13990 + }, + { + "epoch": 9.032111925383077, + "grad_norm": 10.386835098266602, + "learning_rate": 3.738248575024058e-06, + "loss": 1.0538, + "step": 14000 + }, + { + "epoch": 9.032778147901398, + "grad_norm": 38.59900665283203, + "learning_rate": 3.7012362128951075e-06, + "loss": 0.7886, + "step": 14010 + }, + { + "epoch": 9.033444370419721, + "grad_norm": 22.676761627197266, + "learning_rate": 3.664223850766156e-06, + "loss": 1.2528, + "step": 14020 + }, + { + "epoch": 9.034110592938042, + "grad_norm": 13.058721542358398, + "learning_rate": 3.6272114886372046e-06, + "loss": 0.7393, + "step": 14030 + }, + { + "epoch": 9.034776815456363, + "grad_norm": 27.642520904541016, + "learning_rate": 3.590199126508254e-06, + "loss": 1.1666, + "step": 14040 + }, + { + "epoch": 9.035443037974684, + "grad_norm": 13.986379623413086, + "learning_rate": 3.553186764379303e-06, + "loss": 0.7992, + "step": 14050 + }, + { + "epoch": 9.036109260493005, + "grad_norm": 12.255427360534668, + "learning_rate": 3.5161744022503516e-06, + "loss": 0.77, + "step": 14060 + }, + { + "epoch": 9.036775483011326, + "grad_norm": 15.774858474731445, + "learning_rate": 3.479162040121401e-06, + "loss": 0.8866, + "step": 14070 + }, + { + "epoch": 9.037441705529647, + "grad_norm": 29.642332077026367, + "learning_rate": 
3.4421496779924496e-06, + "loss": 1.1094, + "step": 14080 + }, + { + "epoch": 9.038107928047967, + "grad_norm": 12.797449111938477, + "learning_rate": 3.4051373158634986e-06, + "loss": 0.8335, + "step": 14090 + }, + { + "epoch": 9.038774150566288, + "grad_norm": 28.062767028808594, + "learning_rate": 3.368124953734547e-06, + "loss": 1.1449, + "step": 14100 + }, + { + "epoch": 9.039440373084611, + "grad_norm": 14.836297988891602, + "learning_rate": 3.3311125916055966e-06, + "loss": 0.9809, + "step": 14110 + }, + { + "epoch": 9.040106595602932, + "grad_norm": 18.37227439880371, + "learning_rate": 3.294100229476645e-06, + "loss": 1.0279, + "step": 14120 + }, + { + "epoch": 9.040772818121253, + "grad_norm": 14.559772491455078, + "learning_rate": 3.257087867347694e-06, + "loss": 0.9172, + "step": 14130 + }, + { + "epoch": 9.041439040639574, + "grad_norm": 18.678054809570312, + "learning_rate": 3.2200755052187436e-06, + "loss": 0.7594, + "step": 14140 + }, + { + "epoch": 9.042105263157895, + "grad_norm": 25.523271560668945, + "learning_rate": 3.183063143089792e-06, + "loss": 1.0462, + "step": 14150 + }, + { + "epoch": 9.042771485676216, + "grad_norm": 11.672037124633789, + "learning_rate": 3.1460507809608407e-06, + "loss": 1.104, + "step": 14160 + }, + { + "epoch": 9.043437708194537, + "grad_norm": 21.4510498046875, + "learning_rate": 3.1090384188318897e-06, + "loss": 0.9898, + "step": 14170 + }, + { + "epoch": 9.044103930712858, + "grad_norm": 17.42133903503418, + "learning_rate": 3.072026056702939e-06, + "loss": 0.9704, + "step": 14180 + }, + { + "epoch": 9.044770153231179, + "grad_norm": 13.38853645324707, + "learning_rate": 3.035013694573988e-06, + "loss": 0.869, + "step": 14190 + }, + { + "epoch": 9.0454363757495, + "grad_norm": 19.02942657470703, + "learning_rate": 2.9980013324450367e-06, + "loss": 1.1295, + "step": 14200 + }, + { + "epoch": 9.046102598267822, + "grad_norm": 44.093040466308594, + "learning_rate": 2.9609889703160857e-06, + "loss": 1.1286, + "step": 14210 + }, + { + "epoch": 9.046768820786143, + "grad_norm": 18.100770950317383, + "learning_rate": 2.9239766081871343e-06, + "loss": 0.9475, + "step": 14220 + }, + { + "epoch": 9.047435043304464, + "grad_norm": 20.55099105834961, + "learning_rate": 2.8869642460581837e-06, + "loss": 0.971, + "step": 14230 + }, + { + "epoch": 9.048101265822785, + "grad_norm": 10.212332725524902, + "learning_rate": 2.8499518839292327e-06, + "loss": 0.8425, + "step": 14240 + }, + { + "epoch": 9.048767488341106, + "grad_norm": 17.744693756103516, + "learning_rate": 2.8129395218002813e-06, + "loss": 1.0067, + "step": 14250 + }, + { + "epoch": 9.049433710859427, + "grad_norm": 7.877253532409668, + "learning_rate": 2.7759271596713303e-06, + "loss": 0.8747, + "step": 14260 + }, + { + "epoch": 9.050099933377748, + "grad_norm": 21.845111846923828, + "learning_rate": 2.7389147975423793e-06, + "loss": 1.0271, + "step": 14270 + }, + { + "epoch": 9.050766155896069, + "grad_norm": 14.694886207580566, + "learning_rate": 2.7019024354134283e-06, + "loss": 0.8168, + "step": 14280 + }, + { + "epoch": 9.05143237841439, + "grad_norm": 18.603790283203125, + "learning_rate": 2.6648900732844773e-06, + "loss": 0.9808, + "step": 14290 + }, + { + "epoch": 9.05209860093271, + "grad_norm": 12.570313453674316, + "learning_rate": 2.627877711155526e-06, + "loss": 0.7926, + "step": 14300 + }, + { + "epoch": 9.052764823451033, + "grad_norm": 22.884662628173828, + "learning_rate": 2.590865349026575e-06, + "loss": 1.0487, + "step": 14310 + }, + { + "epoch": 9.053431045969354, + 
"grad_norm": 4.630619049072266, + "learning_rate": 2.553852986897624e-06, + "loss": 0.9699, + "step": 14320 + }, + { + "epoch": 9.054097268487675, + "grad_norm": 25.252151489257812, + "learning_rate": 2.516840624768673e-06, + "loss": 0.732, + "step": 14330 + }, + { + "epoch": 9.054763491005996, + "grad_norm": 25.540369033813477, + "learning_rate": 2.479828262639722e-06, + "loss": 1.0209, + "step": 14340 + }, + { + "epoch": 9.055429713524317, + "grad_norm": 27.00995445251465, + "learning_rate": 2.442815900510771e-06, + "loss": 0.9898, + "step": 14350 + }, + { + "epoch": 9.056095936042638, + "grad_norm": 22.111276626586914, + "learning_rate": 2.4058035383818194e-06, + "loss": 1.1266, + "step": 14360 + }, + { + "epoch": 9.056762158560959, + "grad_norm": 26.307374954223633, + "learning_rate": 2.368791176252869e-06, + "loss": 0.7751, + "step": 14370 + }, + { + "epoch": 9.05742838107928, + "grad_norm": 14.811867713928223, + "learning_rate": 2.3317788141239174e-06, + "loss": 0.7983, + "step": 14380 + }, + { + "epoch": 9.0580946035976, + "grad_norm": 10.100497245788574, + "learning_rate": 2.2947664519949664e-06, + "loss": 0.9609, + "step": 14390 + }, + { + "epoch": 9.058760826115924, + "grad_norm": 17.1610107421875, + "learning_rate": 2.2577540898660154e-06, + "loss": 1.0045, + "step": 14400 + }, + { + "epoch": 9.059427048634245, + "grad_norm": 9.35977840423584, + "learning_rate": 2.220741727737064e-06, + "loss": 0.8101, + "step": 14410 + }, + { + "epoch": 9.060093271152565, + "grad_norm": 10.435575485229492, + "learning_rate": 2.1837293656081134e-06, + "loss": 0.9247, + "step": 14420 + }, + { + "epoch": 9.060759493670886, + "grad_norm": 34.150230407714844, + "learning_rate": 2.1467170034791624e-06, + "loss": 0.9052, + "step": 14430 + }, + { + "epoch": 9.061425716189207, + "grad_norm": 14.470364570617676, + "learning_rate": 2.109704641350211e-06, + "loss": 0.9014, + "step": 14440 + }, + { + "epoch": 9.062091938707528, + "grad_norm": 12.446331024169922, + "learning_rate": 2.07269227922126e-06, + "loss": 0.9708, + "step": 14450 + }, + { + "epoch": 9.06275816122585, + "grad_norm": 12.751906394958496, + "learning_rate": 2.035679917092309e-06, + "loss": 1.1148, + "step": 14460 + }, + { + "epoch": 9.06342438374417, + "grad_norm": 11.392036437988281, + "learning_rate": 1.998667554963358e-06, + "loss": 0.7479, + "step": 14470 + }, + { + "epoch": 9.064090606262491, + "grad_norm": 26.22311019897461, + "learning_rate": 1.961655192834407e-06, + "loss": 1.0442, + "step": 14480 + }, + { + "epoch": 9.064756828780812, + "grad_norm": 14.497088432312012, + "learning_rate": 1.9246428307054555e-06, + "loss": 0.7365, + "step": 14490 + }, + { + "epoch": 9.065423051299135, + "grad_norm": 15.458540916442871, + "learning_rate": 1.8876304685765047e-06, + "loss": 0.869, + "step": 14500 + }, + { + "epoch": 9.066089273817456, + "grad_norm": 19.013675689697266, + "learning_rate": 1.8506181064475537e-06, + "loss": 0.8209, + "step": 14510 + }, + { + "epoch": 9.066755496335777, + "grad_norm": 24.11631202697754, + "learning_rate": 1.8136057443186023e-06, + "loss": 0.873, + "step": 14520 + }, + { + "epoch": 9.067421718854098, + "grad_norm": 16.98272132873535, + "learning_rate": 1.7765933821896515e-06, + "loss": 0.8914, + "step": 14530 + }, + { + "epoch": 9.068087941372418, + "grad_norm": 16.685596466064453, + "learning_rate": 1.7395810200607005e-06, + "loss": 0.9063, + "step": 14540 + }, + { + "epoch": 9.06875416389074, + "grad_norm": 10.478558540344238, + "learning_rate": 1.7025686579317493e-06, + "loss": 0.7955, + "step": 14550 + 
}, + { + "epoch": 9.06942038640906, + "grad_norm": 14.148734092712402, + "learning_rate": 1.6655562958027983e-06, + "loss": 0.8543, + "step": 14560 + }, + { + "epoch": 9.070086608927381, + "grad_norm": 9.12399959564209, + "learning_rate": 1.628543933673847e-06, + "loss": 0.8076, + "step": 14570 + }, + { + "epoch": 9.070752831445702, + "grad_norm": 8.952740669250488, + "learning_rate": 1.591531571544896e-06, + "loss": 0.64, + "step": 14580 + }, + { + "epoch": 9.071419053964023, + "grad_norm": 21.861600875854492, + "learning_rate": 1.5545192094159449e-06, + "loss": 0.8724, + "step": 14590 + }, + { + "epoch": 9.072085276482346, + "grad_norm": 22.62474250793457, + "learning_rate": 1.517506847286994e-06, + "loss": 1.0243, + "step": 14600 + }, + { + "epoch": 9.072751499000667, + "grad_norm": 22.50910758972168, + "learning_rate": 1.4804944851580429e-06, + "loss": 1.3157, + "step": 14610 + }, + { + "epoch": 9.073417721518988, + "grad_norm": 17.900442123413086, + "learning_rate": 1.4434821230290919e-06, + "loss": 0.9803, + "step": 14620 + }, + { + "epoch": 9.074083944037309, + "grad_norm": 10.286649703979492, + "learning_rate": 1.4064697609001406e-06, + "loss": 0.8724, + "step": 14630 + }, + { + "epoch": 9.07475016655563, + "grad_norm": 27.137405395507812, + "learning_rate": 1.3694573987711896e-06, + "loss": 1.0241, + "step": 14640 + }, + { + "epoch": 9.07541638907395, + "grad_norm": 10.3027982711792, + "learning_rate": 1.3324450366422386e-06, + "loss": 1.0086, + "step": 14650 + }, + { + "epoch": 9.076082611592271, + "grad_norm": 13.301602363586426, + "learning_rate": 1.2954326745132874e-06, + "loss": 0.9288, + "step": 14660 + }, + { + "epoch": 9.076748834110592, + "grad_norm": 18.995988845825195, + "learning_rate": 1.2584203123843364e-06, + "loss": 1.383, + "step": 14670 + }, + { + "epoch": 9.077415056628913, + "grad_norm": 8.098334312438965, + "learning_rate": 1.2214079502553854e-06, + "loss": 0.6623, + "step": 14680 + }, + { + "epoch": 9.078081279147234, + "grad_norm": 12.405116081237793, + "learning_rate": 1.1843955881264344e-06, + "loss": 0.9487, + "step": 14690 + }, + { + "epoch": 9.078747501665557, + "grad_norm": 33.01274108886719, + "learning_rate": 1.1473832259974832e-06, + "loss": 0.9061, + "step": 14700 + }, + { + "epoch": 9.079413724183878, + "grad_norm": 25.67142105102539, + "learning_rate": 1.110370863868532e-06, + "loss": 0.7192, + "step": 14710 + }, + { + "epoch": 9.080079946702199, + "grad_norm": 25.35344123840332, + "learning_rate": 1.0733585017395812e-06, + "loss": 0.8415, + "step": 14720 + }, + { + "epoch": 9.08074616922052, + "grad_norm": 37.96670150756836, + "learning_rate": 1.03634613961063e-06, + "loss": 0.907, + "step": 14730 + }, + { + "epoch": 9.08141239173884, + "grad_norm": 24.11415672302246, + "learning_rate": 9.99333777481679e-07, + "loss": 1.2368, + "step": 14740 + }, + { + "epoch": 9.082078614257162, + "grad_norm": 33.59077072143555, + "learning_rate": 9.623214153527278e-07, + "loss": 1.1337, + "step": 14750 + }, + { + "epoch": 9.082744836775483, + "grad_norm": 33.37315368652344, + "learning_rate": 9.253090532237769e-07, + "loss": 0.9817, + "step": 14760 + }, + { + "epoch": 9.083411059293804, + "grad_norm": 13.038928031921387, + "learning_rate": 8.882966910948258e-07, + "loss": 0.8589, + "step": 14770 + }, + { + "epoch": 9.084077281812124, + "grad_norm": 5.54771614074707, + "learning_rate": 8.512843289658746e-07, + "loss": 0.783, + "step": 14780 + }, + { + "epoch": 9.084743504330447, + "grad_norm": 21.039886474609375, + "learning_rate": 8.142719668369235e-07, + 
"loss": 0.7402, + "step": 14790 + }, + { + "epoch": 9.085409726848768, + "grad_norm": 14.071789741516113, + "learning_rate": 7.772596047079724e-07, + "loss": 0.7536, + "step": 14800 + }, + { + "epoch": 9.086075949367089, + "grad_norm": 17.663433074951172, + "learning_rate": 7.402472425790214e-07, + "loss": 1.0053, + "step": 14810 + }, + { + "epoch": 9.08674217188541, + "grad_norm": 18.997224807739258, + "learning_rate": 7.032348804500703e-07, + "loss": 0.8382, + "step": 14820 + }, + { + "epoch": 9.087408394403731, + "grad_norm": 28.97583770751953, + "learning_rate": 6.662225183211193e-07, + "loss": 1.1474, + "step": 14830 + }, + { + "epoch": 9.088074616922052, + "grad_norm": 19.859697341918945, + "learning_rate": 6.292101561921682e-07, + "loss": 0.9496, + "step": 14840 + }, + { + "epoch": 9.088740839440373, + "grad_norm": 15.790230751037598, + "learning_rate": 5.921977940632172e-07, + "loss": 0.8637, + "step": 14850 + }, + { + "epoch": 9.089407061958694, + "grad_norm": 24.994583129882812, + "learning_rate": 5.55185431934266e-07, + "loss": 0.9386, + "step": 14860 + }, + { + "epoch": 9.090073284477015, + "grad_norm": 21.8890438079834, + "learning_rate": 5.18173069805315e-07, + "loss": 1.003, + "step": 14870 + }, + { + "epoch": 9.090739506995336, + "grad_norm": 24.587318420410156, + "learning_rate": 4.811607076763639e-07, + "loss": 0.9787, + "step": 14880 + }, + { + "epoch": 9.091405729513658, + "grad_norm": 38.843536376953125, + "learning_rate": 4.441483455474129e-07, + "loss": 0.9614, + "step": 14890 + }, + { + "epoch": 9.09207195203198, + "grad_norm": 26.183332443237305, + "learning_rate": 4.0713598341846177e-07, + "loss": 0.6525, + "step": 14900 + }, + { + "epoch": 9.0927381745503, + "grad_norm": 18.82017707824707, + "learning_rate": 3.701236212895107e-07, + "loss": 0.7726, + "step": 14910 + }, + { + "epoch": 9.093404397068621, + "grad_norm": 15.483589172363281, + "learning_rate": 3.3311125916055966e-07, + "loss": 0.9772, + "step": 14920 + }, + { + "epoch": 9.094070619586942, + "grad_norm": 21.13338279724121, + "learning_rate": 2.960988970316086e-07, + "loss": 1.1048, + "step": 14930 + }, + { + "epoch": 9.094736842105263, + "grad_norm": 16.59648323059082, + "learning_rate": 2.590865349026575e-07, + "loss": 0.9057, + "step": 14940 + }, + { + "epoch": 9.095403064623584, + "grad_norm": 12.583951950073242, + "learning_rate": 2.2207417277370644e-07, + "loss": 0.7593, + "step": 14950 + }, + { + "epoch": 9.096069287141905, + "grad_norm": 20.98554229736328, + "learning_rate": 1.8506181064475536e-07, + "loss": 0.987, + "step": 14960 + }, + { + "epoch": 9.096735509660226, + "grad_norm": 23.434919357299805, + "learning_rate": 1.480494485158043e-07, + "loss": 1.1058, + "step": 14970 + }, + { + "epoch": 9.097401732178547, + "grad_norm": 15.87350845336914, + "learning_rate": 1.1103708638685322e-07, + "loss": 0.6816, + "step": 14980 + }, + { + "epoch": 9.09806795469687, + "grad_norm": 8.800328254699707, + "learning_rate": 7.402472425790215e-08, + "loss": 0.8993, + "step": 14990 + }, + { + "epoch": 9.09873417721519, + "grad_norm": 18.93852996826172, + "learning_rate": 3.7012362128951075e-08, + "loss": 0.7328, + "step": 15000 + }, + { + "epoch": 9.099400399733511, + "grad_norm": 10.444864273071289, + "learning_rate": 0.0, + "loss": 0.91, + "step": 15010 + }, + { + "epoch": 9.099400399733511, + "eval_accuracy": 0.5144116649711766, + "eval_loss": 1.6655091047286987, + "eval_runtime": 936.6478, + "eval_samples_per_second": 3.148, + "eval_steps_per_second": 0.394, + "step": 15010 + }, + { + "epoch": 
9.099400399733511, + "step": 15010, + "total_flos": 1.495788215937553e+20, + "train_loss": 1.4517486766367893, + "train_runtime": 70108.3948, + "train_samples_per_second": 1.713, + "train_steps_per_second": 0.214 + }, + { + "epoch": 9.099400399733511, + "eval_accuracy": 0.5269582909460834, + "eval_loss": 1.6447620391845703, + "eval_runtime": 921.4763, + "eval_samples_per_second": 3.2, + "eval_steps_per_second": 0.4, + "step": 15010 + }, + { + "epoch": 9.099400399733511, + "eval_accuracy": 0.5269582909460834, + "eval_loss": 1.6447620391845703, + "eval_runtime": 947.2239, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 0.39, + "step": 15010 + } + ], + "logging_steps": 10, + "max_steps": 15010, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.495788215937553e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}