diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21875 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 15590, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.414368184733804e-05, + "grad_norm": 2.122934421744886, + "learning_rate": 1.2828736369467608e-07, + "loss": 1.1536, + "step": 1 + }, + { + "epoch": 0.00032071840923669016, + "grad_norm": 2.6120313710646594, + "learning_rate": 6.414368184733804e-07, + "loss": 0.8767, + "step": 5 + }, + { + "epoch": 0.0006414368184733803, + "grad_norm": 1.9036320491341159, + "learning_rate": 1.2828736369467608e-06, + "loss": 1.1617, + "step": 10 + }, + { + "epoch": 0.0009621552277100705, + "grad_norm": 2.2675442011112223, + "learning_rate": 1.9243104554201413e-06, + "loss": 0.9348, + "step": 15 + }, + { + "epoch": 0.0012828736369467607, + "grad_norm": 1.8668180685612314, + "learning_rate": 2.5657472738935216e-06, + "loss": 0.9887, + "step": 20 + }, + { + "epoch": 0.001603592046183451, + "grad_norm": 3.7598222149067344, + "learning_rate": 3.2071840923669024e-06, + "loss": 0.9933, + "step": 25 + }, + { + "epoch": 0.001924310455420141, + "grad_norm": 0.7497789448952673, + "learning_rate": 3.848620910840283e-06, + "loss": 0.8951, + "step": 30 + }, + { + "epoch": 0.0022450288646568314, + "grad_norm": 0.7497466999070768, + "learning_rate": 4.490057729313663e-06, + "loss": 1.1075, + "step": 35 + }, + { + "epoch": 0.0025657472738935213, + "grad_norm": 0.6269808680953072, + "learning_rate": 5.131494547787043e-06, + "loss": 0.9902, + "step": 40 + }, + { + "epoch": 0.0028864656831302116, + "grad_norm": 0.7670239135600633, + "learning_rate": 5.7729313662604236e-06, + "loss": 0.8813, + "step": 45 + }, + { + "epoch": 0.003207184092366902, + "grad_norm": 2.7386818812948612, + "learning_rate": 6.414368184733805e-06, + "loss": 1.0156, + "step": 50 + }, + { + "epoch": 0.003527902501603592, + "grad_norm": 0.6074882325288049, + "learning_rate": 7.055805003207184e-06, + "loss": 0.8396, + "step": 55 + }, + { + "epoch": 0.003848620910840282, + "grad_norm": 0.5127156771589034, + "learning_rate": 7.697241821680565e-06, + "loss": 0.8716, + "step": 60 + }, + { + "epoch": 0.004169339320076972, + "grad_norm": 1.9056440284966447, + "learning_rate": 8.338678640153946e-06, + "loss": 0.854, + "step": 65 + }, + { + "epoch": 0.004490057729313663, + "grad_norm": 0.5220731546600494, + "learning_rate": 8.980115458627326e-06, + "loss": 0.7033, + "step": 70 + }, + { + "epoch": 0.004810776138550353, + "grad_norm": 0.8528246103677898, + "learning_rate": 9.621552277100706e-06, + "loss": 0.7803, + "step": 75 + }, + { + "epoch": 0.005131494547787043, + "grad_norm": 0.9553069537717978, + "learning_rate": 1.0262989095574087e-05, + "loss": 0.8577, + "step": 80 + }, + { + "epoch": 0.005452212957023733, + "grad_norm": 1.089966886886208, + "learning_rate": 1.0904425914047467e-05, + "loss": 0.5766, + "step": 85 + }, + { + "epoch": 0.005772931366260423, + "grad_norm": 1.3259601452666736, + "learning_rate": 1.1545862732520847e-05, + "loss": 0.6605, + "step": 90 + }, + { + "epoch": 0.006093649775497113, + "grad_norm": 0.5501705053457395, + "learning_rate": 1.2187299550994227e-05, + "loss": 0.622, + "step": 95 + }, + { + "epoch": 0.006414368184733804, + "grad_norm": 0.565393913728341, + "learning_rate": 1.282873636946761e-05, + "loss": 0.6598, + "step": 100 + }, + { + "epoch": 0.006735086593970494, + "grad_norm": 0.8355349528605185, + "learning_rate": 1.3470173187940988e-05, + "loss": 0.57, + "step": 105 + }, + { + "epoch": 0.007055805003207184, + "grad_norm": 0.7311107470147664, + "learning_rate": 1.4111610006414368e-05, + "loss": 0.7012, + "step": 110 + }, + { + "epoch": 0.0073765234124438745, + "grad_norm": 0.7172503923642882, + "learning_rate": 1.4753046824887749e-05, + "loss": 0.6794, + "step": 115 + }, + { + "epoch": 0.007697241821680564, + "grad_norm": 0.6517343625027339, + "learning_rate": 1.539448364336113e-05, + "loss": 0.7512, + "step": 120 + }, + { + "epoch": 0.008017960230917255, + "grad_norm": 0.7506138412539792, + "learning_rate": 1.603592046183451e-05, + "loss": 0.6939, + "step": 125 + }, + { + "epoch": 0.008338678640153944, + "grad_norm": 0.6731898912128177, + "learning_rate": 1.667735728030789e-05, + "loss": 0.6853, + "step": 130 + }, + { + "epoch": 0.008659397049390635, + "grad_norm": 0.5526935784877048, + "learning_rate": 1.731879409878127e-05, + "loss": 0.7614, + "step": 135 + }, + { + "epoch": 0.008980115458627326, + "grad_norm": 0.7407967682593112, + "learning_rate": 1.7960230917254652e-05, + "loss": 0.6847, + "step": 140 + }, + { + "epoch": 0.009300833867864015, + "grad_norm": 1.0558900513241394, + "learning_rate": 1.8601667735728032e-05, + "loss": 0.6291, + "step": 145 + }, + { + "epoch": 0.009621552277100705, + "grad_norm": 0.534720549756236, + "learning_rate": 1.9243104554201412e-05, + "loss": 0.6933, + "step": 150 + }, + { + "epoch": 0.009942270686337396, + "grad_norm": 0.8533880407106053, + "learning_rate": 1.9884541372674793e-05, + "loss": 0.7405, + "step": 155 + }, + { + "epoch": 0.010262989095574085, + "grad_norm": 2.2157811263492633, + "learning_rate": 2.0525978191148173e-05, + "loss": 0.6605, + "step": 160 + }, + { + "epoch": 0.010583707504810776, + "grad_norm": 0.741553600188979, + "learning_rate": 2.1167415009621553e-05, + "loss": 0.6929, + "step": 165 + }, + { + "epoch": 0.010904425914047467, + "grad_norm": 0.592672329081525, + "learning_rate": 2.1808851828094934e-05, + "loss": 0.7712, + "step": 170 + }, + { + "epoch": 0.011225144323284156, + "grad_norm": 0.7143661642401767, + "learning_rate": 2.2450288646568314e-05, + "loss": 0.7264, + "step": 175 + }, + { + "epoch": 0.011545862732520847, + "grad_norm": 0.7168820160805862, + "learning_rate": 2.3091725465041694e-05, + "loss": 0.7147, + "step": 180 + }, + { + "epoch": 0.011866581141757537, + "grad_norm": 0.8106566714421187, + "learning_rate": 2.3733162283515075e-05, + "loss": 0.7091, + "step": 185 + }, + { + "epoch": 0.012187299550994226, + "grad_norm": 1.131984585130431, + "learning_rate": 2.4374599101988455e-05, + "loss": 0.6725, + "step": 190 + }, + { + "epoch": 0.012508017960230917, + "grad_norm": 0.5991057607118903, + "learning_rate": 2.5016035920461832e-05, + "loss": 0.5288, + "step": 195 + }, + { + "epoch": 0.012828736369467608, + "grad_norm": 0.7441333776346593, + "learning_rate": 2.565747273893522e-05, + "loss": 0.6001, + "step": 200 + }, + { + "epoch": 0.013149454778704297, + "grad_norm": 0.7177668887803592, + "learning_rate": 2.6298909557408596e-05, + "loss": 0.729, + "step": 205 + }, + { + "epoch": 0.013470173187940988, + "grad_norm": 1.152356658408425, + "learning_rate": 2.6940346375881976e-05, + "loss": 0.649, + "step": 210 + }, + { + "epoch": 0.013790891597177678, + "grad_norm": 0.8692844040434968, + "learning_rate": 2.758178319435536e-05, + "loss": 0.7514, + "step": 215 + }, + { + "epoch": 0.014111610006414367, + "grad_norm": 0.7731506164196528, + "learning_rate": 2.8223220012828737e-05, + "loss": 0.7303, + "step": 220 + }, + { + "epoch": 0.014432328415651058, + "grad_norm": 0.6675669855403799, + "learning_rate": 2.8864656831302117e-05, + "loss": 0.5974, + "step": 225 + }, + { + "epoch": 0.014753046824887749, + "grad_norm": 0.6511258667141646, + "learning_rate": 2.9506093649775497e-05, + "loss": 0.6502, + "step": 230 + }, + { + "epoch": 0.015073765234124438, + "grad_norm": 0.8153736796805081, + "learning_rate": 3.014753046824888e-05, + "loss": 0.7187, + "step": 235 + }, + { + "epoch": 0.015394483643361129, + "grad_norm": 0.682020511101791, + "learning_rate": 3.078896728672226e-05, + "loss": 0.7687, + "step": 240 + }, + { + "epoch": 0.01571520205259782, + "grad_norm": 0.9723518475601368, + "learning_rate": 3.143040410519564e-05, + "loss": 0.6333, + "step": 245 + }, + { + "epoch": 0.01603592046183451, + "grad_norm": 0.6642430373016617, + "learning_rate": 3.207184092366902e-05, + "loss": 0.7503, + "step": 250 + }, + { + "epoch": 0.0163566388710712, + "grad_norm": 1.0604072659225818, + "learning_rate": 3.27132777421424e-05, + "loss": 0.7296, + "step": 255 + }, + { + "epoch": 0.01667735728030789, + "grad_norm": 0.5389238146909613, + "learning_rate": 3.335471456061578e-05, + "loss": 0.6449, + "step": 260 + }, + { + "epoch": 0.01699807568954458, + "grad_norm": 1.0886777633244675, + "learning_rate": 3.3996151379089166e-05, + "loss": 0.6087, + "step": 265 + }, + { + "epoch": 0.01731879409878127, + "grad_norm": 0.7740455363235514, + "learning_rate": 3.463758819756254e-05, + "loss": 0.7716, + "step": 270 + }, + { + "epoch": 0.01763951250801796, + "grad_norm": 0.7842668340726671, + "learning_rate": 3.527902501603592e-05, + "loss": 0.6184, + "step": 275 + }, + { + "epoch": 0.01796023091725465, + "grad_norm": 0.8724306321758412, + "learning_rate": 3.5920461834509304e-05, + "loss": 0.5647, + "step": 280 + }, + { + "epoch": 0.018280949326491342, + "grad_norm": 0.6108159651722537, + "learning_rate": 3.656189865298269e-05, + "loss": 0.7748, + "step": 285 + }, + { + "epoch": 0.01860166773572803, + "grad_norm": 0.554729905784846, + "learning_rate": 3.7203335471456064e-05, + "loss": 0.6969, + "step": 290 + }, + { + "epoch": 0.01892238614496472, + "grad_norm": 0.5263100377774543, + "learning_rate": 3.784477228992944e-05, + "loss": 0.6331, + "step": 295 + }, + { + "epoch": 0.01924310455420141, + "grad_norm": 0.7458575860438468, + "learning_rate": 3.8486209108402825e-05, + "loss": 0.7178, + "step": 300 + }, + { + "epoch": 0.0195638229634381, + "grad_norm": 0.7209749688824592, + "learning_rate": 3.912764592687621e-05, + "loss": 0.7774, + "step": 305 + }, + { + "epoch": 0.019884541372674792, + "grad_norm": 0.8894616503150261, + "learning_rate": 3.9769082745349585e-05, + "loss": 0.8354, + "step": 310 + }, + { + "epoch": 0.020205259781911483, + "grad_norm": 0.6322923436990817, + "learning_rate": 4.041051956382296e-05, + "loss": 0.6009, + "step": 315 + }, + { + "epoch": 0.02052597819114817, + "grad_norm": 0.9519419320088668, + "learning_rate": 4.1051956382296346e-05, + "loss": 0.61, + "step": 320 + }, + { + "epoch": 0.02084669660038486, + "grad_norm": 0.654969001631436, + "learning_rate": 4.169339320076972e-05, + "loss": 0.602, + "step": 325 + }, + { + "epoch": 0.021167415009621552, + "grad_norm": 0.6250956091655624, + "learning_rate": 4.233483001924311e-05, + "loss": 0.6451, + "step": 330 + }, + { + "epoch": 0.021488133418858243, + "grad_norm": 0.7392153639819625, + "learning_rate": 4.297626683771649e-05, + "loss": 0.7724, + "step": 335 + }, + { + "epoch": 0.021808851828094934, + "grad_norm": 0.7914340872699686, + "learning_rate": 4.361770365618987e-05, + "loss": 0.7245, + "step": 340 + }, + { + "epoch": 0.022129570237331624, + "grad_norm": 0.5688389882467555, + "learning_rate": 4.4259140474663244e-05, + "loss": 0.5756, + "step": 345 + }, + { + "epoch": 0.02245028864656831, + "grad_norm": 0.6860675746425041, + "learning_rate": 4.490057729313663e-05, + "loss": 0.6515, + "step": 350 + }, + { + "epoch": 0.022771007055805002, + "grad_norm": 0.8497624484329163, + "learning_rate": 4.554201411161001e-05, + "loss": 0.742, + "step": 355 + }, + { + "epoch": 0.023091725465041693, + "grad_norm": 0.9589070592978919, + "learning_rate": 4.618345093008339e-05, + "loss": 0.7261, + "step": 360 + }, + { + "epoch": 0.023412443874278384, + "grad_norm": 0.5397605849852198, + "learning_rate": 4.6824887748556765e-05, + "loss": 0.721, + "step": 365 + }, + { + "epoch": 0.023733162283515075, + "grad_norm": 0.4218758453965537, + "learning_rate": 4.746632456703015e-05, + "loss": 0.7008, + "step": 370 + }, + { + "epoch": 0.024053880692751765, + "grad_norm": 0.4660237223228576, + "learning_rate": 4.810776138550353e-05, + "loss": 0.5954, + "step": 375 + }, + { + "epoch": 0.024374599101988453, + "grad_norm": 1.1414044523272346, + "learning_rate": 4.874919820397691e-05, + "loss": 0.7092, + "step": 380 + }, + { + "epoch": 0.024695317511225143, + "grad_norm": 0.7794538849217394, + "learning_rate": 4.939063502245029e-05, + "loss": 0.6556, + "step": 385 + }, + { + "epoch": 0.025016035920461834, + "grad_norm": 0.6784254428885176, + "learning_rate": 5.0032071840923663e-05, + "loss": 0.6523, + "step": 390 + }, + { + "epoch": 0.025336754329698525, + "grad_norm": 0.5550050199692612, + "learning_rate": 5.0673508659397054e-05, + "loss": 0.7065, + "step": 395 + }, + { + "epoch": 0.025657472738935216, + "grad_norm": 1.3489642897531091, + "learning_rate": 5.131494547787044e-05, + "loss": 0.657, + "step": 400 + }, + { + "epoch": 0.025978191148171906, + "grad_norm": 0.8799442657849393, + "learning_rate": 5.195638229634381e-05, + "loss": 0.7712, + "step": 405 + }, + { + "epoch": 0.026298909557408594, + "grad_norm": 0.6211518086394292, + "learning_rate": 5.259781911481719e-05, + "loss": 0.6556, + "step": 410 + }, + { + "epoch": 0.026619627966645285, + "grad_norm": 0.527786179579098, + "learning_rate": 5.3239255933290575e-05, + "loss": 0.6304, + "step": 415 + }, + { + "epoch": 0.026940346375881975, + "grad_norm": 0.6225940856068456, + "learning_rate": 5.388069275176395e-05, + "loss": 0.7504, + "step": 420 + }, + { + "epoch": 0.027261064785118666, + "grad_norm": 0.7472577597094603, + "learning_rate": 5.4522129570237336e-05, + "loss": 0.5737, + "step": 425 + }, + { + "epoch": 0.027581783194355357, + "grad_norm": 0.9003123884674169, + "learning_rate": 5.516356638871072e-05, + "loss": 0.6751, + "step": 430 + }, + { + "epoch": 0.027902501603592048, + "grad_norm": 1.193348964937134, + "learning_rate": 5.580500320718409e-05, + "loss": 0.6685, + "step": 435 + }, + { + "epoch": 0.028223220012828735, + "grad_norm": 0.8207452374854483, + "learning_rate": 5.644644002565747e-05, + "loss": 0.5606, + "step": 440 + }, + { + "epoch": 0.028543938422065426, + "grad_norm": 0.6253317338492933, + "learning_rate": 5.7087876844130864e-05, + "loss": 0.6848, + "step": 445 + }, + { + "epoch": 0.028864656831302116, + "grad_norm": 0.5089340890778841, + "learning_rate": 5.7729313662604234e-05, + "loss": 0.5969, + "step": 450 + }, + { + "epoch": 0.029185375240538807, + "grad_norm": 0.6403611822232731, + "learning_rate": 5.837075048107762e-05, + "loss": 0.6663, + "step": 455 + }, + { + "epoch": 0.029506093649775498, + "grad_norm": 0.9017481128452324, + "learning_rate": 5.9012187299550994e-05, + "loss": 0.6253, + "step": 460 + }, + { + "epoch": 0.02982681205901219, + "grad_norm": 0.7102935907261797, + "learning_rate": 5.965362411802438e-05, + "loss": 0.6032, + "step": 465 + }, + { + "epoch": 0.030147530468248876, + "grad_norm": 0.572528044090495, + "learning_rate": 6.029506093649776e-05, + "loss": 0.7059, + "step": 470 + }, + { + "epoch": 0.030468248877485567, + "grad_norm": 0.6507630672872388, + "learning_rate": 6.093649775497113e-05, + "loss": 0.551, + "step": 475 + }, + { + "epoch": 0.030788967286722257, + "grad_norm": 0.4787872258590136, + "learning_rate": 6.157793457344452e-05, + "loss": 0.4953, + "step": 480 + }, + { + "epoch": 0.031109685695958948, + "grad_norm": 0.6446626662145857, + "learning_rate": 6.22193713919179e-05, + "loss": 0.7073, + "step": 485 + }, + { + "epoch": 0.03143040410519564, + "grad_norm": 0.46176975999305003, + "learning_rate": 6.286080821039128e-05, + "loss": 0.7031, + "step": 490 + }, + { + "epoch": 0.03175112251443233, + "grad_norm": 0.6364571216466376, + "learning_rate": 6.350224502886466e-05, + "loss": 0.7208, + "step": 495 + }, + { + "epoch": 0.03207184092366902, + "grad_norm": 0.6441271299481783, + "learning_rate": 6.414368184733804e-05, + "loss": 0.651, + "step": 500 + }, + { + "epoch": 0.03239255933290571, + "grad_norm": 0.5277240516380076, + "learning_rate": 6.478511866581141e-05, + "loss": 0.7596, + "step": 505 + }, + { + "epoch": 0.0327132777421424, + "grad_norm": 0.6102741778617242, + "learning_rate": 6.54265554842848e-05, + "loss": 0.8127, + "step": 510 + }, + { + "epoch": 0.033033996151379086, + "grad_norm": 1.2909493866489476, + "learning_rate": 6.606799230275818e-05, + "loss": 0.6172, + "step": 515 + }, + { + "epoch": 0.03335471456061578, + "grad_norm": 0.8290001510292774, + "learning_rate": 6.670942912123156e-05, + "loss": 0.7024, + "step": 520 + }, + { + "epoch": 0.03367543296985247, + "grad_norm": 0.5082074367378367, + "learning_rate": 6.735086593970495e-05, + "loss": 0.5993, + "step": 525 + }, + { + "epoch": 0.03399615137908916, + "grad_norm": 0.8948141239538124, + "learning_rate": 6.799230275817833e-05, + "loss": 0.6288, + "step": 530 + }, + { + "epoch": 0.03431686978832585, + "grad_norm": 0.704188041016483, + "learning_rate": 6.86337395766517e-05, + "loss": 0.6173, + "step": 535 + }, + { + "epoch": 0.03463758819756254, + "grad_norm": 0.8493617205406083, + "learning_rate": 6.927517639512509e-05, + "loss": 0.6472, + "step": 540 + }, + { + "epoch": 0.03495830660679923, + "grad_norm": 0.6071336551640186, + "learning_rate": 6.991661321359846e-05, + "loss": 0.7066, + "step": 545 + }, + { + "epoch": 0.03527902501603592, + "grad_norm": 0.6299761061285323, + "learning_rate": 7.055805003207184e-05, + "loss": 0.5004, + "step": 550 + }, + { + "epoch": 0.03559974342527261, + "grad_norm": 0.36030076856010784, + "learning_rate": 7.119948685054522e-05, + "loss": 0.5939, + "step": 555 + }, + { + "epoch": 0.0359204618345093, + "grad_norm": 0.5657747344505833, + "learning_rate": 7.184092366901861e-05, + "loss": 0.6394, + "step": 560 + }, + { + "epoch": 0.03624118024374599, + "grad_norm": 0.5512464769253931, + "learning_rate": 7.248236048749199e-05, + "loss": 0.7496, + "step": 565 + }, + { + "epoch": 0.036561898652982684, + "grad_norm": 1.6012481016769327, + "learning_rate": 7.312379730596537e-05, + "loss": 0.74, + "step": 570 + }, + { + "epoch": 0.03688261706221937, + "grad_norm": 0.539931431422469, + "learning_rate": 7.376523412443874e-05, + "loss": 0.655, + "step": 575 + }, + { + "epoch": 0.03720333547145606, + "grad_norm": 0.5792692922947517, + "learning_rate": 7.440667094291213e-05, + "loss": 0.6268, + "step": 580 + }, + { + "epoch": 0.03752405388069275, + "grad_norm": 0.44904646394711184, + "learning_rate": 7.504810776138551e-05, + "loss": 0.621, + "step": 585 + }, + { + "epoch": 0.03784477228992944, + "grad_norm": 0.38995414700568637, + "learning_rate": 7.568954457985888e-05, + "loss": 0.479, + "step": 590 + }, + { + "epoch": 0.03816549069916613, + "grad_norm": 0.6100292909911376, + "learning_rate": 7.633098139833227e-05, + "loss": 0.5087, + "step": 595 + }, + { + "epoch": 0.03848620910840282, + "grad_norm": 0.767232067956154, + "learning_rate": 7.697241821680565e-05, + "loss": 0.7094, + "step": 600 + }, + { + "epoch": 0.03880692751763951, + "grad_norm": 0.5093223662182627, + "learning_rate": 7.761385503527902e-05, + "loss": 0.6216, + "step": 605 + }, + { + "epoch": 0.0391276459268762, + "grad_norm": 0.604996949026468, + "learning_rate": 7.825529185375242e-05, + "loss": 0.6343, + "step": 610 + }, + { + "epoch": 0.039448364336112894, + "grad_norm": 0.6313031887029451, + "learning_rate": 7.88967286722258e-05, + "loss": 0.6814, + "step": 615 + }, + { + "epoch": 0.039769082745349585, + "grad_norm": 0.5515684818028812, + "learning_rate": 7.953816549069917e-05, + "loss": 0.6319, + "step": 620 + }, + { + "epoch": 0.040089801154586276, + "grad_norm": 0.9067875561472081, + "learning_rate": 8.017960230917255e-05, + "loss": 0.626, + "step": 625 + }, + { + "epoch": 0.040410519563822966, + "grad_norm": 0.4402348046376401, + "learning_rate": 8.082103912764592e-05, + "loss": 0.6581, + "step": 630 + }, + { + "epoch": 0.04073123797305965, + "grad_norm": 0.6653624732467279, + "learning_rate": 8.146247594611931e-05, + "loss": 0.6266, + "step": 635 + }, + { + "epoch": 0.04105195638229634, + "grad_norm": 0.7506028416479603, + "learning_rate": 8.210391276459269e-05, + "loss": 0.7304, + "step": 640 + }, + { + "epoch": 0.04137267479153303, + "grad_norm": 0.43305772472870374, + "learning_rate": 8.274534958306606e-05, + "loss": 0.6272, + "step": 645 + }, + { + "epoch": 0.04169339320076972, + "grad_norm": 0.7883927079167802, + "learning_rate": 8.338678640153945e-05, + "loss": 0.564, + "step": 650 + }, + { + "epoch": 0.04201411161000641, + "grad_norm": 0.6406069976891953, + "learning_rate": 8.402822322001283e-05, + "loss": 0.6594, + "step": 655 + }, + { + "epoch": 0.042334830019243104, + "grad_norm": 0.6650787540082842, + "learning_rate": 8.466966003848621e-05, + "loss": 0.6086, + "step": 660 + }, + { + "epoch": 0.042655548428479795, + "grad_norm": 0.6280025445964529, + "learning_rate": 8.53110968569596e-05, + "loss": 0.6188, + "step": 665 + }, + { + "epoch": 0.042976266837716486, + "grad_norm": 0.6181001304138187, + "learning_rate": 8.595253367543298e-05, + "loss": 0.6454, + "step": 670 + }, + { + "epoch": 0.043296985246953176, + "grad_norm": 0.9164302121431295, + "learning_rate": 8.659397049390635e-05, + "loss": 0.7409, + "step": 675 + }, + { + "epoch": 0.04361770365618987, + "grad_norm": 0.5146934352157929, + "learning_rate": 8.723540731237973e-05, + "loss": 0.7961, + "step": 680 + }, + { + "epoch": 0.04393842206542656, + "grad_norm": 0.8884783771604745, + "learning_rate": 8.787684413085312e-05, + "loss": 0.7023, + "step": 685 + }, + { + "epoch": 0.04425914047466325, + "grad_norm": 0.5972459928844025, + "learning_rate": 8.851828094932649e-05, + "loss": 0.6437, + "step": 690 + }, + { + "epoch": 0.04457985888389993, + "grad_norm": 1.027137591537084, + "learning_rate": 8.915971776779987e-05, + "loss": 0.6461, + "step": 695 + }, + { + "epoch": 0.04490057729313662, + "grad_norm": 0.684561126713197, + "learning_rate": 8.980115458627326e-05, + "loss": 0.6417, + "step": 700 + }, + { + "epoch": 0.045221295702373314, + "grad_norm": 0.5791897637489775, + "learning_rate": 9.044259140474664e-05, + "loss": 0.6545, + "step": 705 + }, + { + "epoch": 0.045542014111610005, + "grad_norm": 0.6093322265483176, + "learning_rate": 9.108402822322002e-05, + "loss": 0.5431, + "step": 710 + }, + { + "epoch": 0.045862732520846695, + "grad_norm": 1.20412780035678, + "learning_rate": 9.172546504169339e-05, + "loss": 0.6122, + "step": 715 + }, + { + "epoch": 0.046183450930083386, + "grad_norm": 0.4344736289735069, + "learning_rate": 9.236690186016678e-05, + "loss": 0.6896, + "step": 720 + }, + { + "epoch": 0.04650416933932008, + "grad_norm": 0.479553471093618, + "learning_rate": 9.300833867864016e-05, + "loss": 0.7446, + "step": 725 + }, + { + "epoch": 0.04682488774855677, + "grad_norm": 0.4175717995477323, + "learning_rate": 9.364977549711353e-05, + "loss": 0.5635, + "step": 730 + }, + { + "epoch": 0.04714560615779346, + "grad_norm": 0.43527442203162864, + "learning_rate": 9.429121231558691e-05, + "loss": 0.5984, + "step": 735 + }, + { + "epoch": 0.04746632456703015, + "grad_norm": 0.6764034597420034, + "learning_rate": 9.49326491340603e-05, + "loss": 0.6575, + "step": 740 + }, + { + "epoch": 0.04778704297626684, + "grad_norm": 0.6994297524226791, + "learning_rate": 9.557408595253368e-05, + "loss": 0.6381, + "step": 745 + }, + { + "epoch": 0.04810776138550353, + "grad_norm": 0.5924112864276749, + "learning_rate": 9.621552277100707e-05, + "loss": 0.6273, + "step": 750 + }, + { + "epoch": 0.04842847979474022, + "grad_norm": 0.529839489096258, + "learning_rate": 9.685695958948045e-05, + "loss": 0.5524, + "step": 755 + }, + { + "epoch": 0.048749198203976905, + "grad_norm": 0.5412474092793377, + "learning_rate": 9.749839640795382e-05, + "loss": 0.6584, + "step": 760 + }, + { + "epoch": 0.049069916613213596, + "grad_norm": 0.62325178443721, + "learning_rate": 9.81398332264272e-05, + "loss": 0.7556, + "step": 765 + }, + { + "epoch": 0.04939063502245029, + "grad_norm": 0.6185109985068113, + "learning_rate": 9.878127004490059e-05, + "loss": 0.6396, + "step": 770 + }, + { + "epoch": 0.04971135343168698, + "grad_norm": 0.5650081284141024, + "learning_rate": 9.942270686337396e-05, + "loss": 0.6761, + "step": 775 + }, + { + "epoch": 0.05003207184092367, + "grad_norm": 0.6838574740900004, + "learning_rate": 0.00010006414368184733, + "loss": 0.6228, + "step": 780 + }, + { + "epoch": 0.05035279025016036, + "grad_norm": 0.6196830613093786, + "learning_rate": 0.00010070558050032072, + "loss": 0.6648, + "step": 785 + }, + { + "epoch": 0.05067350865939705, + "grad_norm": 0.5504649558203162, + "learning_rate": 0.00010134701731879411, + "loss": 0.697, + "step": 790 + }, + { + "epoch": 0.05099422706863374, + "grad_norm": 0.654837344932131, + "learning_rate": 0.00010198845413726748, + "loss": 0.6986, + "step": 795 + }, + { + "epoch": 0.05131494547787043, + "grad_norm": 0.7011329232246133, + "learning_rate": 0.00010262989095574088, + "loss": 0.7206, + "step": 800 + }, + { + "epoch": 0.05163566388710712, + "grad_norm": 0.6807528459174979, + "learning_rate": 0.00010327132777421425, + "loss": 0.6834, + "step": 805 + }, + { + "epoch": 0.05195638229634381, + "grad_norm": 0.8856217259425705, + "learning_rate": 0.00010391276459268762, + "loss": 0.7028, + "step": 810 + }, + { + "epoch": 0.052277100705580504, + "grad_norm": 0.5962908888781525, + "learning_rate": 0.00010455420141116101, + "loss": 0.5113, + "step": 815 + }, + { + "epoch": 0.05259781911481719, + "grad_norm": 0.9014177998142, + "learning_rate": 0.00010519563822963438, + "loss": 0.6129, + "step": 820 + }, + { + "epoch": 0.05291853752405388, + "grad_norm": 0.6753791164158136, + "learning_rate": 0.00010583707504810775, + "loss": 0.756, + "step": 825 + }, + { + "epoch": 0.05323925593329057, + "grad_norm": 0.48791891735015575, + "learning_rate": 0.00010647851186658115, + "loss": 0.5352, + "step": 830 + }, + { + "epoch": 0.05355997434252726, + "grad_norm": 0.7373582383544524, + "learning_rate": 0.00010711994868505453, + "loss": 0.7345, + "step": 835 + }, + { + "epoch": 0.05388069275176395, + "grad_norm": 0.49964472362766127, + "learning_rate": 0.0001077613855035279, + "loss": 0.7314, + "step": 840 + }, + { + "epoch": 0.05420141116100064, + "grad_norm": 0.48415921267506284, + "learning_rate": 0.0001084028223220013, + "loss": 0.5548, + "step": 845 + }, + { + "epoch": 0.05452212957023733, + "grad_norm": 0.6197607704084165, + "learning_rate": 0.00010904425914047467, + "loss": 0.6271, + "step": 850 + }, + { + "epoch": 0.05484284797947402, + "grad_norm": 0.677683386452661, + "learning_rate": 0.00010968569595894804, + "loss": 0.7739, + "step": 855 + }, + { + "epoch": 0.055163566388710714, + "grad_norm": 0.7298215600744931, + "learning_rate": 0.00011032713277742144, + "loss": 0.6813, + "step": 860 + }, + { + "epoch": 0.055484284797947404, + "grad_norm": 0.49556474863687744, + "learning_rate": 0.00011096856959589481, + "loss": 0.7165, + "step": 865 + }, + { + "epoch": 0.055805003207184095, + "grad_norm": 0.4755941527376833, + "learning_rate": 0.00011161000641436818, + "loss": 0.7439, + "step": 870 + }, + { + "epoch": 0.056125721616420786, + "grad_norm": 0.8183131489420952, + "learning_rate": 0.00011225144323284158, + "loss": 0.7741, + "step": 875 + }, + { + "epoch": 0.05644644002565747, + "grad_norm": 0.577588746397813, + "learning_rate": 0.00011289288005131495, + "loss": 0.6951, + "step": 880 + }, + { + "epoch": 0.05676715843489416, + "grad_norm": 0.3104626766912227, + "learning_rate": 0.00011353431686978833, + "loss": 0.6068, + "step": 885 + }, + { + "epoch": 0.05708787684413085, + "grad_norm": 0.6364607751424182, + "learning_rate": 0.00011417575368826173, + "loss": 0.6601, + "step": 890 + }, + { + "epoch": 0.05740859525336754, + "grad_norm": 0.5489548053878326, + "learning_rate": 0.0001148171905067351, + "loss": 0.6498, + "step": 895 + }, + { + "epoch": 0.05772931366260423, + "grad_norm": 0.8290809901584166, + "learning_rate": 0.00011545862732520847, + "loss": 0.7598, + "step": 900 + }, + { + "epoch": 0.058050032071840924, + "grad_norm": 0.9889805070312973, + "learning_rate": 0.00011610006414368186, + "loss": 0.6528, + "step": 905 + }, + { + "epoch": 0.058370750481077614, + "grad_norm": 0.5034027315098741, + "learning_rate": 0.00011674150096215524, + "loss": 0.6916, + "step": 910 + }, + { + "epoch": 0.058691468890314305, + "grad_norm": 0.5211514737547632, + "learning_rate": 0.0001173829377806286, + "loss": 0.6455, + "step": 915 + }, + { + "epoch": 0.059012187299550996, + "grad_norm": 0.5915915443611912, + "learning_rate": 0.00011802437459910199, + "loss": 0.619, + "step": 920 + }, + { + "epoch": 0.05933290570878769, + "grad_norm": 0.6356669965403786, + "learning_rate": 0.00011866581141757537, + "loss": 0.6339, + "step": 925 + }, + { + "epoch": 0.05965362411802438, + "grad_norm": 0.5203747383599147, + "learning_rate": 0.00011930724823604876, + "loss": 0.7174, + "step": 930 + }, + { + "epoch": 0.05997434252726107, + "grad_norm": 0.4400681567105204, + "learning_rate": 0.00011994868505452213, + "loss": 0.757, + "step": 935 + }, + { + "epoch": 0.06029506093649775, + "grad_norm": 0.5134463977576896, + "learning_rate": 0.00012059012187299552, + "loss": 0.5941, + "step": 940 + }, + { + "epoch": 0.06061577934573444, + "grad_norm": 2.1514572255404563, + "learning_rate": 0.0001212315586914689, + "loss": 0.5872, + "step": 945 + }, + { + "epoch": 0.06093649775497113, + "grad_norm": 0.5533804918183362, + "learning_rate": 0.00012187299550994226, + "loss": 0.681, + "step": 950 + }, + { + "epoch": 0.061257216164207824, + "grad_norm": 0.43736512301454394, + "learning_rate": 0.00012251443232841566, + "loss": 0.6241, + "step": 955 + }, + { + "epoch": 0.061577934573444515, + "grad_norm": 0.7036625039029036, + "learning_rate": 0.00012315586914688904, + "loss": 0.7528, + "step": 960 + }, + { + "epoch": 0.061898652982681206, + "grad_norm": 0.5883952786255479, + "learning_rate": 0.0001237973059653624, + "loss": 0.6132, + "step": 965 + }, + { + "epoch": 0.062219371391917896, + "grad_norm": 0.593687347482467, + "learning_rate": 0.0001244387427838358, + "loss": 0.6453, + "step": 970 + }, + { + "epoch": 0.06254008980115458, + "grad_norm": 0.8797836564455341, + "learning_rate": 0.00012508017960230917, + "loss": 0.6658, + "step": 975 + }, + { + "epoch": 0.06286080821039128, + "grad_norm": 0.8231331839998992, + "learning_rate": 0.00012572161642078255, + "loss": 0.6615, + "step": 980 + }, + { + "epoch": 0.06318152661962796, + "grad_norm": 0.5202568995405973, + "learning_rate": 0.00012636305323925594, + "loss": 0.8156, + "step": 985 + }, + { + "epoch": 0.06350224502886466, + "grad_norm": 0.623580493806845, + "learning_rate": 0.00012700449005772932, + "loss": 0.6959, + "step": 990 + }, + { + "epoch": 0.06382296343810134, + "grad_norm": 0.5798575607273242, + "learning_rate": 0.0001276459268762027, + "loss": 0.5538, + "step": 995 + }, + { + "epoch": 0.06414368184733804, + "grad_norm": 0.6970653558425355, + "learning_rate": 0.0001282873636946761, + "loss": 0.7063, + "step": 1000 + }, + { + "epoch": 0.06446440025657472, + "grad_norm": 0.8241115273976609, + "learning_rate": 0.00012892880051314947, + "loss": 0.6371, + "step": 1005 + }, + { + "epoch": 0.06478511866581142, + "grad_norm": 0.7769868872755683, + "learning_rate": 0.00012957023733162283, + "loss": 0.6202, + "step": 1010 + }, + { + "epoch": 0.0651058370750481, + "grad_norm": 0.4974832858382039, + "learning_rate": 0.00013021167415009624, + "loss": 0.652, + "step": 1015 + }, + { + "epoch": 0.0654265554842848, + "grad_norm": 0.7988613498086312, + "learning_rate": 0.0001308531109685696, + "loss": 0.6179, + "step": 1020 + }, + { + "epoch": 0.06574727389352149, + "grad_norm": 0.5975032929676001, + "learning_rate": 0.00013149454778704298, + "loss": 0.7551, + "step": 1025 + }, + { + "epoch": 0.06606799230275817, + "grad_norm": 0.46478481189365806, + "learning_rate": 0.00013213598460551636, + "loss": 0.6643, + "step": 1030 + }, + { + "epoch": 0.06638871071199487, + "grad_norm": 0.5467473022741837, + "learning_rate": 0.00013277742142398975, + "loss": 0.6786, + "step": 1035 + }, + { + "epoch": 0.06670942912123155, + "grad_norm": 0.788511157965346, + "learning_rate": 0.00013341885824246313, + "loss": 0.699, + "step": 1040 + }, + { + "epoch": 0.06703014753046825, + "grad_norm": 0.7378591658959022, + "learning_rate": 0.0001340602950609365, + "loss": 0.5498, + "step": 1045 + }, + { + "epoch": 0.06735086593970493, + "grad_norm": 0.524580967213953, + "learning_rate": 0.0001347017318794099, + "loss": 0.7092, + "step": 1050 + }, + { + "epoch": 0.06767158434894163, + "grad_norm": 10.11033461685559, + "learning_rate": 0.00013534316869788325, + "loss": 0.6694, + "step": 1055 + }, + { + "epoch": 0.06799230275817832, + "grad_norm": 0.6039061177211199, + "learning_rate": 0.00013598460551635666, + "loss": 0.6105, + "step": 1060 + }, + { + "epoch": 0.06831302116741501, + "grad_norm": 0.7863303522868051, + "learning_rate": 0.00013662604233483002, + "loss": 0.6867, + "step": 1065 + }, + { + "epoch": 0.0686337395766517, + "grad_norm": 0.6197712573428509, + "learning_rate": 0.0001372674791533034, + "loss": 0.6893, + "step": 1070 + }, + { + "epoch": 0.0689544579858884, + "grad_norm": 0.43888192750291055, + "learning_rate": 0.0001379089159717768, + "loss": 0.7157, + "step": 1075 + }, + { + "epoch": 0.06927517639512508, + "grad_norm": 0.7306535592576365, + "learning_rate": 0.00013855035279025017, + "loss": 0.7648, + "step": 1080 + }, + { + "epoch": 0.06959589480436178, + "grad_norm": 0.5833095869044383, + "learning_rate": 0.00013919178960872356, + "loss": 0.6655, + "step": 1085 + }, + { + "epoch": 0.06991661321359846, + "grad_norm": 0.3330431009666685, + "learning_rate": 0.0001398332264271969, + "loss": 0.5681, + "step": 1090 + }, + { + "epoch": 0.07023733162283514, + "grad_norm": 0.8485768964159431, + "learning_rate": 0.00014047466324567032, + "loss": 0.6139, + "step": 1095 + }, + { + "epoch": 0.07055805003207184, + "grad_norm": 0.48935398848123357, + "learning_rate": 0.00014111610006414368, + "loss": 0.6591, + "step": 1100 + }, + { + "epoch": 0.07087876844130853, + "grad_norm": 0.6694840056428312, + "learning_rate": 0.00014175753688261706, + "loss": 0.5986, + "step": 1105 + }, + { + "epoch": 0.07119948685054522, + "grad_norm": 0.7907065672480846, + "learning_rate": 0.00014239897370109045, + "loss": 0.8477, + "step": 1110 + }, + { + "epoch": 0.07152020525978191, + "grad_norm": 0.45721463553494507, + "learning_rate": 0.00014304041051956383, + "loss": 0.6511, + "step": 1115 + }, + { + "epoch": 0.0718409236690186, + "grad_norm": 0.5932773719713492, + "learning_rate": 0.00014368184733803721, + "loss": 0.6205, + "step": 1120 + }, + { + "epoch": 0.07216164207825529, + "grad_norm": 0.7933284443225256, + "learning_rate": 0.0001443232841565106, + "loss": 0.524, + "step": 1125 + }, + { + "epoch": 0.07248236048749199, + "grad_norm": 0.4677884329123659, + "learning_rate": 0.00014496472097498398, + "loss": 0.555, + "step": 1130 + }, + { + "epoch": 0.07280307889672867, + "grad_norm": 0.850254756515873, + "learning_rate": 0.00014560615779345734, + "loss": 0.7627, + "step": 1135 + }, + { + "epoch": 0.07312379730596537, + "grad_norm": 0.522103651356661, + "learning_rate": 0.00014624759461193075, + "loss": 0.7255, + "step": 1140 + }, + { + "epoch": 0.07344451571520205, + "grad_norm": 0.6063292373713933, + "learning_rate": 0.0001468890314304041, + "loss": 0.6222, + "step": 1145 + }, + { + "epoch": 0.07376523412443874, + "grad_norm": 0.9713303841273095, + "learning_rate": 0.0001475304682488775, + "loss": 0.7341, + "step": 1150 + }, + { + "epoch": 0.07408595253367543, + "grad_norm": 0.837884018201796, + "learning_rate": 0.00014817190506735087, + "loss": 0.6822, + "step": 1155 + }, + { + "epoch": 0.07440667094291212, + "grad_norm": 0.39437246960153793, + "learning_rate": 0.00014881334188582426, + "loss": 0.7116, + "step": 1160 + }, + { + "epoch": 0.07472738935214882, + "grad_norm": 0.6202094758512229, + "learning_rate": 0.0001494547787042976, + "loss": 0.6015, + "step": 1165 + }, + { + "epoch": 0.0750481077613855, + "grad_norm": 0.8135054592447762, + "learning_rate": 0.00015009621552277102, + "loss": 0.6487, + "step": 1170 + }, + { + "epoch": 0.0753688261706222, + "grad_norm": 0.5507524560111344, + "learning_rate": 0.0001507376523412444, + "loss": 0.5846, + "step": 1175 + }, + { + "epoch": 0.07568954457985888, + "grad_norm": 0.5961939171868111, + "learning_rate": 0.00015137908915971776, + "loss": 0.6111, + "step": 1180 + }, + { + "epoch": 0.07601026298909558, + "grad_norm": 0.5352884760699661, + "learning_rate": 0.00015202052597819118, + "loss": 0.6401, + "step": 1185 + }, + { + "epoch": 0.07633098139833226, + "grad_norm": 0.6620834657515849, + "learning_rate": 0.00015266196279666453, + "loss": 0.7108, + "step": 1190 + }, + { + "epoch": 0.07665169980756896, + "grad_norm": 0.24886726646481336, + "learning_rate": 0.00015330339961513792, + "loss": 0.465, + "step": 1195 + }, + { + "epoch": 0.07697241821680564, + "grad_norm": 0.5949618384904851, + "learning_rate": 0.0001539448364336113, + "loss": 0.6872, + "step": 1200 + }, + { + "epoch": 0.07729313662604234, + "grad_norm": 0.7888477619326826, + "learning_rate": 0.00015458627325208468, + "loss": 0.5609, + "step": 1205 + }, + { + "epoch": 0.07761385503527903, + "grad_norm": 1.0669700966748508, + "learning_rate": 0.00015522771007055804, + "loss": 0.7743, + "step": 1210 + }, + { + "epoch": 0.07793457344451571, + "grad_norm": 0.7068283314311553, + "learning_rate": 0.00015586914688903145, + "loss": 0.6263, + "step": 1215 + }, + { + "epoch": 0.0782552918537524, + "grad_norm": 0.5841407337661559, + "learning_rate": 0.00015651058370750483, + "loss": 0.6187, + "step": 1220 + }, + { + "epoch": 0.07857601026298909, + "grad_norm": 0.6229227132294815, + "learning_rate": 0.0001571520205259782, + "loss": 0.7183, + "step": 1225 + }, + { + "epoch": 0.07889672867222579, + "grad_norm": 0.6002586833079545, + "learning_rate": 0.0001577934573444516, + "loss": 0.7077, + "step": 1230 + }, + { + "epoch": 0.07921744708146247, + "grad_norm": 0.5383734940611982, + "learning_rate": 0.00015843489416292496, + "loss": 0.6251, + "step": 1235 + }, + { + "epoch": 0.07953816549069917, + "grad_norm": 0.6051581628244698, + "learning_rate": 0.00015907633098139834, + "loss": 0.6742, + "step": 1240 + }, + { + "epoch": 0.07985888389993585, + "grad_norm": 0.6524111511727346, + "learning_rate": 0.0001597177677998717, + "loss": 0.6258, + "step": 1245 + }, + { + "epoch": 0.08017960230917255, + "grad_norm": 0.8452071724294624, + "learning_rate": 0.0001603592046183451, + "loss": 0.6583, + "step": 1250 + }, + { + "epoch": 0.08050032071840924, + "grad_norm": 0.5380526459581828, + "learning_rate": 0.00016100064143681847, + "loss": 0.7976, + "step": 1255 + }, + { + "epoch": 0.08082103912764593, + "grad_norm": 0.846065125270878, + "learning_rate": 0.00016164207825529185, + "loss": 0.5684, + "step": 1260 + }, + { + "epoch": 0.08114175753688262, + "grad_norm": 1.2668855662638892, + "learning_rate": 0.00016228351507376523, + "loss": 0.6079, + "step": 1265 + }, + { + "epoch": 0.0814624759461193, + "grad_norm": 0.7795964267281216, + "learning_rate": 0.00016292495189223862, + "loss": 0.646, + "step": 1270 + }, + { + "epoch": 0.081783194355356, + "grad_norm": 0.7027735707273621, + "learning_rate": 0.000163566388710712, + "loss": 0.7358, + "step": 1275 + }, + { + "epoch": 0.08210391276459268, + "grad_norm": 0.6792816013615487, + "learning_rate": 0.00016420782552918538, + "loss": 0.6695, + "step": 1280 + }, + { + "epoch": 0.08242463117382938, + "grad_norm": 0.6182179483058359, + "learning_rate": 0.00016484926234765877, + "loss": 0.6096, + "step": 1285 + }, + { + "epoch": 0.08274534958306606, + "grad_norm": 0.7701573171054498, + "learning_rate": 0.00016549069916613212, + "loss": 0.6467, + "step": 1290 + }, + { + "epoch": 0.08306606799230276, + "grad_norm": 0.8699247842006342, + "learning_rate": 0.00016613213598460554, + "loss": 0.5635, + "step": 1295 + }, + { + "epoch": 0.08338678640153944, + "grad_norm": 1.5815526952211336, + "learning_rate": 0.0001667735728030789, + "loss": 0.7091, + "step": 1300 + }, + { + "epoch": 0.08370750481077614, + "grad_norm": 1.1184328365345817, + "learning_rate": 0.00016741500962155228, + "loss": 0.6598, + "step": 1305 + }, + { + "epoch": 0.08402822322001283, + "grad_norm": 0.5795213958251844, + "learning_rate": 0.00016805644644002566, + "loss": 0.6638, + "step": 1310 + }, + { + "epoch": 0.08434894162924952, + "grad_norm": 0.9373149156332843, + "learning_rate": 0.00016869788325849904, + "loss": 0.6091, + "step": 1315 + }, + { + "epoch": 0.08466966003848621, + "grad_norm": 1.581754110063961, + "learning_rate": 0.00016933932007697243, + "loss": 0.6641, + "step": 1320 + }, + { + "epoch": 0.0849903784477229, + "grad_norm": 0.8932544552326179, + "learning_rate": 0.0001699807568954458, + "loss": 0.7052, + "step": 1325 + }, + { + "epoch": 0.08531109685695959, + "grad_norm": 0.7663989496912428, + "learning_rate": 0.0001706221937139192, + "loss": 0.6517, + "step": 1330 + }, + { + "epoch": 0.08563181526619627, + "grad_norm": 0.46405474836741084, + "learning_rate": 0.00017126363053239255, + "loss": 0.6357, + "step": 1335 + }, + { + "epoch": 0.08595253367543297, + "grad_norm": 0.6382018266002509, + "learning_rate": 0.00017190506735086596, + "loss": 0.5614, + "step": 1340 + }, + { + "epoch": 0.08627325208466965, + "grad_norm": 0.43085923514031815, + "learning_rate": 0.00017254650416933932, + "loss": 0.6499, + "step": 1345 + }, + { + "epoch": 0.08659397049390635, + "grad_norm": 0.8362450675258178, + "learning_rate": 0.0001731879409878127, + "loss": 0.7166, + "step": 1350 + }, + { + "epoch": 0.08691468890314304, + "grad_norm": 0.6383324045212788, + "learning_rate": 0.00017382937780628609, + "loss": 0.6072, + "step": 1355 + }, + { + "epoch": 0.08723540731237973, + "grad_norm": 0.8935601954358443, + "learning_rate": 0.00017447081462475947, + "loss": 0.6483, + "step": 1360 + }, + { + "epoch": 0.08755612572161642, + "grad_norm": 0.7202566228037989, + "learning_rate": 0.00017511225144323285, + "loss": 0.5967, + "step": 1365 + }, + { + "epoch": 0.08787684413085312, + "grad_norm": 0.5230697995372986, + "learning_rate": 0.00017575368826170624, + "loss": 0.7513, + "step": 1370 + }, + { + "epoch": 0.0881975625400898, + "grad_norm": 0.816705171178794, + "learning_rate": 0.00017639512508017962, + "loss": 0.6651, + "step": 1375 + }, + { + "epoch": 0.0885182809493265, + "grad_norm": 0.5342326962298032, + "learning_rate": 0.00017703656189865298, + "loss": 0.5963, + "step": 1380 + }, + { + "epoch": 0.08883899935856318, + "grad_norm": 0.5984545509333057, + "learning_rate": 0.0001776779987171264, + "loss": 0.6455, + "step": 1385 + }, + { + "epoch": 0.08915971776779986, + "grad_norm": 0.6477898184624558, + "learning_rate": 0.00017831943553559974, + "loss": 0.7328, + "step": 1390 + }, + { + "epoch": 0.08948043617703656, + "grad_norm": 0.5092110599627088, + "learning_rate": 0.00017896087235407313, + "loss": 0.6279, + "step": 1395 + }, + { + "epoch": 0.08980115458627325, + "grad_norm": 0.7029802255673286, + "learning_rate": 0.0001796023091725465, + "loss": 0.7776, + "step": 1400 + }, + { + "epoch": 0.09012187299550994, + "grad_norm": 0.8832343335799728, + "learning_rate": 0.0001802437459910199, + "loss": 0.6111, + "step": 1405 + }, + { + "epoch": 0.09044259140474663, + "grad_norm": 0.8016746694750925, + "learning_rate": 0.00018088518280949328, + "loss": 0.6695, + "step": 1410 + }, + { + "epoch": 0.09076330981398333, + "grad_norm": 1.1634306884211862, + "learning_rate": 0.00018152661962796664, + "loss": 0.7114, + "step": 1415 + }, + { + "epoch": 0.09108402822322001, + "grad_norm": 0.6624473023568856, + "learning_rate": 0.00018216805644644005, + "loss": 0.7559, + "step": 1420 + }, + { + "epoch": 0.09140474663245671, + "grad_norm": 0.8042759336949421, + "learning_rate": 0.0001828094932649134, + "loss": 0.7583, + "step": 1425 + }, + { + "epoch": 0.09172546504169339, + "grad_norm": 0.9772784468918035, + "learning_rate": 0.00018345093008338679, + "loss": 0.6853, + "step": 1430 + }, + { + "epoch": 0.09204618345093009, + "grad_norm": 0.5803428867246113, + "learning_rate": 0.00018409236690186017, + "loss": 0.6201, + "step": 1435 + }, + { + "epoch": 0.09236690186016677, + "grad_norm": 1.0135605254585267, + "learning_rate": 0.00018473380372033355, + "loss": 0.5897, + "step": 1440 + }, + { + "epoch": 0.09268762026940347, + "grad_norm": 1.0146665351265378, + "learning_rate": 0.00018537524053880694, + "loss": 0.7739, + "step": 1445 + }, + { + "epoch": 0.09300833867864015, + "grad_norm": 0.6409550994368336, + "learning_rate": 0.00018601667735728032, + "loss": 0.652, + "step": 1450 + }, + { + "epoch": 0.09332905708787684, + "grad_norm": 0.9063409381829404, + "learning_rate": 0.0001866581141757537, + "loss": 0.5091, + "step": 1455 + }, + { + "epoch": 0.09364977549711354, + "grad_norm": 0.6035311851346433, + "learning_rate": 0.00018729955099422706, + "loss": 0.5951, + "step": 1460 + }, + { + "epoch": 0.09397049390635022, + "grad_norm": 0.4305914555852047, + "learning_rate": 0.00018794098781270047, + "loss": 0.6979, + "step": 1465 + }, + { + "epoch": 0.09429121231558692, + "grad_norm": 0.592322337116948, + "learning_rate": 0.00018858242463117383, + "loss": 0.7894, + "step": 1470 + }, + { + "epoch": 0.0946119307248236, + "grad_norm": 0.7019994823024447, + "learning_rate": 0.0001892238614496472, + "loss": 0.6685, + "step": 1475 + }, + { + "epoch": 0.0949326491340603, + "grad_norm": 0.6511984672543305, + "learning_rate": 0.0001898652982681206, + "loss": 0.7993, + "step": 1480 + }, + { + "epoch": 0.09525336754329698, + "grad_norm": 0.7220123377652353, + "learning_rate": 0.00019050673508659398, + "loss": 0.6424, + "step": 1485 + }, + { + "epoch": 0.09557408595253368, + "grad_norm": 0.569165004645741, + "learning_rate": 0.00019114817190506736, + "loss": 0.5879, + "step": 1490 + }, + { + "epoch": 0.09589480436177036, + "grad_norm": 0.6841283140830406, + "learning_rate": 0.00019178960872354075, + "loss": 0.6944, + "step": 1495 + }, + { + "epoch": 0.09621552277100706, + "grad_norm": 0.5806780565962407, + "learning_rate": 0.00019243104554201413, + "loss": 0.7039, + "step": 1500 + }, + { + "epoch": 0.09653624118024375, + "grad_norm": 1.0231588558162683, + "learning_rate": 0.0001930724823604875, + "loss": 0.7447, + "step": 1505 + }, + { + "epoch": 0.09685695958948044, + "grad_norm": 0.6513202839027658, + "learning_rate": 0.0001937139191789609, + "loss": 0.6013, + "step": 1510 + }, + { + "epoch": 0.09717767799871713, + "grad_norm": 0.7845659853361092, + "learning_rate": 0.00019435535599743425, + "loss": 0.6069, + "step": 1515 + }, + { + "epoch": 0.09749839640795381, + "grad_norm": 0.7194048768316849, + "learning_rate": 0.00019499679281590764, + "loss": 0.7641, + "step": 1520 + }, + { + "epoch": 0.09781911481719051, + "grad_norm": 0.6191788469641755, + "learning_rate": 0.00019563822963438102, + "loss": 0.7448, + "step": 1525 + }, + { + "epoch": 0.09813983322642719, + "grad_norm": 0.7426546495672112, + "learning_rate": 0.0001962796664528544, + "loss": 0.7761, + "step": 1530 + }, + { + "epoch": 0.09846055163566389, + "grad_norm": 0.7572762314827131, + "learning_rate": 0.0001969211032713278, + "loss": 0.8618, + "step": 1535 + }, + { + "epoch": 0.09878127004490057, + "grad_norm": 0.6372317781767599, + "learning_rate": 0.00019756254008980117, + "loss": 0.6666, + "step": 1540 + }, + { + "epoch": 0.09910198845413727, + "grad_norm": 0.6326871836739665, + "learning_rate": 0.00019820397690827456, + "loss": 0.6547, + "step": 1545 + }, + { + "epoch": 0.09942270686337396, + "grad_norm": 1.1563371750862326, + "learning_rate": 0.0001988454137267479, + "loss": 0.6453, + "step": 1550 + }, + { + "epoch": 0.09974342527261065, + "grad_norm": 0.9479492008239019, + "learning_rate": 0.00019948685054522132, + "loss": 0.651, + "step": 1555 + }, + { + "epoch": 0.10006414368184734, + "grad_norm": 0.7535422723224012, + "learning_rate": 0.00019999999749335695, + "loss": 0.7093, + "step": 1560 + }, + { + "epoch": 0.10038486209108403, + "grad_norm": 0.5932112107729582, + "learning_rate": 0.00019999990976086248, + "loss": 0.7182, + "step": 1565 + }, + { + "epoch": 0.10070558050032072, + "grad_norm": 0.6993778331415806, + "learning_rate": 0.00019999969669633985, + "loss": 0.6146, + "step": 1570 + }, + { + "epoch": 0.1010262989095574, + "grad_norm": 0.7283971397341802, + "learning_rate": 0.00019999935830005615, + "loss": 0.6496, + "step": 1575 + }, + { + "epoch": 0.1013470173187941, + "grad_norm": 1.0242803435192598, + "learning_rate": 0.00019999889457243545, + "loss": 0.8042, + "step": 1580 + }, + { + "epoch": 0.10166773572803078, + "grad_norm": 0.6322290861624766, + "learning_rate": 0.000199998305514059, + "loss": 0.7667, + "step": 1585 + }, + { + "epoch": 0.10198845413726748, + "grad_norm": 0.6155965395909687, + "learning_rate": 0.00019999759112566498, + "loss": 0.6363, + "step": 1590 + }, + { + "epoch": 0.10230917254650417, + "grad_norm": 0.4557230080410517, + "learning_rate": 0.00019999675140814887, + "loss": 0.5606, + "step": 1595 + }, + { + "epoch": 0.10262989095574086, + "grad_norm": 0.6477761450960091, + "learning_rate": 0.00019999578636256302, + "loss": 0.6693, + "step": 1600 + }, + { + "epoch": 0.10295060936497755, + "grad_norm": 0.8654904236010101, + "learning_rate": 0.000199994695990117, + "loss": 0.6314, + "step": 1605 + }, + { + "epoch": 0.10327132777421424, + "grad_norm": 0.6903326653951578, + "learning_rate": 0.00019999348029217732, + "loss": 0.6179, + "step": 1610 + }, + { + "epoch": 0.10359204618345093, + "grad_norm": 0.8840202106741641, + "learning_rate": 0.00019999213927026775, + "loss": 0.6985, + "step": 1615 + }, + { + "epoch": 0.10391276459268763, + "grad_norm": 0.668673191642802, + "learning_rate": 0.00019999067292606894, + "loss": 0.6218, + "step": 1620 + }, + { + "epoch": 0.10423348300192431, + "grad_norm": 0.8731375253636731, + "learning_rate": 0.00019998908126141868, + "loss": 0.6898, + "step": 1625 + }, + { + "epoch": 0.10455420141116101, + "grad_norm": 1.0983344411460778, + "learning_rate": 0.00019998736427831194, + "loss": 0.7532, + "step": 1630 + }, + { + "epoch": 0.10487491982039769, + "grad_norm": 0.5721731282366914, + "learning_rate": 0.00019998552197890052, + "loss": 0.6003, + "step": 1635 + }, + { + "epoch": 0.10519563822963438, + "grad_norm": 0.679795611170959, + "learning_rate": 0.0001999835543654935, + "loss": 0.7003, + "step": 1640 + }, + { + "epoch": 0.10551635663887107, + "grad_norm": 0.7659746917304108, + "learning_rate": 0.0001999814614405569, + "loss": 0.6359, + "step": 1645 + }, + { + "epoch": 0.10583707504810776, + "grad_norm": 1.1962756283471876, + "learning_rate": 0.00019997924320671383, + "loss": 0.6308, + "step": 1650 + }, + { + "epoch": 0.10615779345734445, + "grad_norm": 0.7621683185763631, + "learning_rate": 0.00019997689966674446, + "loss": 0.7957, + "step": 1655 + }, + { + "epoch": 0.10647851186658114, + "grad_norm": 0.7338531701197929, + "learning_rate": 0.00019997443082358601, + "loss": 0.6757, + "step": 1660 + }, + { + "epoch": 0.10679923027581784, + "grad_norm": 0.7150664806057576, + "learning_rate": 0.00019997183668033267, + "loss": 0.694, + "step": 1665 + }, + { + "epoch": 0.10711994868505452, + "grad_norm": 0.7869356473972234, + "learning_rate": 0.0001999691172402358, + "loss": 0.719, + "step": 1670 + }, + { + "epoch": 0.10744066709429122, + "grad_norm": 0.611503667039071, + "learning_rate": 0.00019996627250670374, + "loss": 0.6343, + "step": 1675 + }, + { + "epoch": 0.1077613855035279, + "grad_norm": 0.7766135920581687, + "learning_rate": 0.00019996330248330183, + "loss": 0.693, + "step": 1680 + }, + { + "epoch": 0.1080821039127646, + "grad_norm": 0.4786388847248821, + "learning_rate": 0.00019996020717375247, + "loss": 0.6194, + "step": 1685 + }, + { + "epoch": 0.10840282232200128, + "grad_norm": 0.6991936018277035, + "learning_rate": 0.000199956986581935, + "loss": 0.7263, + "step": 1690 + }, + { + "epoch": 0.10872354073123797, + "grad_norm": 0.7205841321201338, + "learning_rate": 0.000199953640711886, + "loss": 0.4831, + "step": 1695 + }, + { + "epoch": 0.10904425914047466, + "grad_norm": 0.9131191032401795, + "learning_rate": 0.00019995016956779886, + "loss": 0.5177, + "step": 1700 + }, + { + "epoch": 0.10936497754971135, + "grad_norm": 0.5536147800325968, + "learning_rate": 0.000199946573154024, + "loss": 0.6789, + "step": 1705 + }, + { + "epoch": 0.10968569595894805, + "grad_norm": 0.6451976876558219, + "learning_rate": 0.00019994285147506888, + "loss": 0.7275, + "step": 1710 + }, + { + "epoch": 0.11000641436818473, + "grad_norm": 0.9579506214333907, + "learning_rate": 0.00019993900453559805, + "loss": 0.6589, + "step": 1715 + }, + { + "epoch": 0.11032713277742143, + "grad_norm": 0.9260040237199151, + "learning_rate": 0.00019993503234043284, + "loss": 0.6823, + "step": 1720 + }, + { + "epoch": 0.11064785118665811, + "grad_norm": 0.9505358223036796, + "learning_rate": 0.00019993093489455182, + "loss": 0.7616, + "step": 1725 + }, + { + "epoch": 0.11096856959589481, + "grad_norm": 0.7825553328319829, + "learning_rate": 0.0001999267122030903, + "loss": 0.6443, + "step": 1730 + }, + { + "epoch": 0.11128928800513149, + "grad_norm": 1.277608679789176, + "learning_rate": 0.00019992236427134069, + "loss": 0.6155, + "step": 1735 + }, + { + "epoch": 0.11161000641436819, + "grad_norm": 0.5889261013180431, + "learning_rate": 0.00019991789110475238, + "loss": 0.6994, + "step": 1740 + }, + { + "epoch": 0.11193072482360487, + "grad_norm": 0.8029959511201281, + "learning_rate": 0.00019991329270893163, + "loss": 0.5902, + "step": 1745 + }, + { + "epoch": 0.11225144323284157, + "grad_norm": 0.8303612970994603, + "learning_rate": 0.00019990856908964178, + "loss": 0.783, + "step": 1750 + }, + { + "epoch": 0.11257216164207826, + "grad_norm": 0.7054559375502497, + "learning_rate": 0.00019990372025280304, + "loss": 0.6792, + "step": 1755 + }, + { + "epoch": 0.11289288005131494, + "grad_norm": 0.7420987703476908, + "learning_rate": 0.0001998987462044925, + "loss": 0.6013, + "step": 1760 + }, + { + "epoch": 0.11321359846055164, + "grad_norm": 0.7094425366646243, + "learning_rate": 0.00019989364695094426, + "loss": 0.5688, + "step": 1765 + }, + { + "epoch": 0.11353431686978832, + "grad_norm": 0.569373653159604, + "learning_rate": 0.00019988842249854934, + "loss": 0.58, + "step": 1770 + }, + { + "epoch": 0.11385503527902502, + "grad_norm": 0.46978550262066865, + "learning_rate": 0.00019988307285385566, + "loss": 0.7256, + "step": 1775 + }, + { + "epoch": 0.1141757536882617, + "grad_norm": 0.6612438373633108, + "learning_rate": 0.00019987759802356803, + "loss": 0.7488, + "step": 1780 + }, + { + "epoch": 0.1144964720974984, + "grad_norm": 0.7309333682103005, + "learning_rate": 0.00019987199801454816, + "loss": 0.7284, + "step": 1785 + }, + { + "epoch": 0.11481719050673508, + "grad_norm": 0.9460563497076551, + "learning_rate": 0.00019986627283381472, + "loss": 0.6057, + "step": 1790 + }, + { + "epoch": 0.11513790891597178, + "grad_norm": 0.6266870049609108, + "learning_rate": 0.00019986042248854312, + "loss": 0.6476, + "step": 1795 + }, + { + "epoch": 0.11545862732520847, + "grad_norm": 0.7739200309128734, + "learning_rate": 0.0001998544469860658, + "loss": 0.6622, + "step": 1800 + }, + { + "epoch": 0.11577934573444516, + "grad_norm": 0.7469556806210228, + "learning_rate": 0.00019984834633387193, + "loss": 0.5747, + "step": 1805 + }, + { + "epoch": 0.11610006414368185, + "grad_norm": 0.5367955199234249, + "learning_rate": 0.00019984212053960763, + "loss": 0.671, + "step": 1810 + }, + { + "epoch": 0.11642078255291853, + "grad_norm": 1.0704497861003814, + "learning_rate": 0.00019983576961107576, + "loss": 0.6748, + "step": 1815 + }, + { + "epoch": 0.11674150096215523, + "grad_norm": 0.6669764759339204, + "learning_rate": 0.00019982929355623615, + "loss": 0.7167, + "step": 1820 + }, + { + "epoch": 0.11706221937139191, + "grad_norm": 0.6039796198063991, + "learning_rate": 0.00019982269238320532, + "loss": 0.6067, + "step": 1825 + }, + { + "epoch": 0.11738293778062861, + "grad_norm": 1.7098001118613075, + "learning_rate": 0.00019981596610025668, + "loss": 0.7805, + "step": 1830 + }, + { + "epoch": 0.1177036561898653, + "grad_norm": 0.7398855694010563, + "learning_rate": 0.00019980911471582043, + "loss": 0.6427, + "step": 1835 + }, + { + "epoch": 0.11802437459910199, + "grad_norm": 0.8354800121875872, + "learning_rate": 0.0001998021382384836, + "loss": 0.7408, + "step": 1840 + }, + { + "epoch": 0.11834509300833868, + "grad_norm": 0.6722235019789473, + "learning_rate": 0.00019979503667698985, + "loss": 0.6435, + "step": 1845 + }, + { + "epoch": 0.11866581141757537, + "grad_norm": 0.717593721397057, + "learning_rate": 0.00019978781004023982, + "loss": 0.6241, + "step": 1850 + }, + { + "epoch": 0.11898652982681206, + "grad_norm": 0.7195515776738803, + "learning_rate": 0.00019978045833729074, + "loss": 0.5947, + "step": 1855 + }, + { + "epoch": 0.11930724823604875, + "grad_norm": 0.8882886022840869, + "learning_rate": 0.00019977298157735672, + "loss": 0.7388, + "step": 1860 + }, + { + "epoch": 0.11962796664528544, + "grad_norm": 0.989988319302347, + "learning_rate": 0.0001997653797698085, + "loss": 0.7599, + "step": 1865 + }, + { + "epoch": 0.11994868505452214, + "grad_norm": 0.8403633651058144, + "learning_rate": 0.00019975765292417358, + "loss": 0.6432, + "step": 1870 + }, + { + "epoch": 0.12026940346375882, + "grad_norm": 1.2049771636877937, + "learning_rate": 0.00019974980105013623, + "loss": 0.7333, + "step": 1875 + }, + { + "epoch": 0.1205901218729955, + "grad_norm": 0.8525983520687547, + "learning_rate": 0.00019974182415753732, + "loss": 0.6699, + "step": 1880 + }, + { + "epoch": 0.1209108402822322, + "grad_norm": 0.5716659731530915, + "learning_rate": 0.00019973372225637453, + "loss": 0.5793, + "step": 1885 + }, + { + "epoch": 0.12123155869146889, + "grad_norm": 0.6060632420377923, + "learning_rate": 0.00019972549535680206, + "loss": 0.671, + "step": 1890 + }, + { + "epoch": 0.12155227710070558, + "grad_norm": 0.7561918292328402, + "learning_rate": 0.00019971714346913086, + "loss": 0.5316, + "step": 1895 + }, + { + "epoch": 0.12187299550994227, + "grad_norm": 0.9824211285333242, + "learning_rate": 0.00019970866660382863, + "loss": 0.5868, + "step": 1900 + }, + { + "epoch": 0.12219371391917896, + "grad_norm": 0.7951038927386893, + "learning_rate": 0.00019970006477151953, + "loss": 0.7, + "step": 1905 + }, + { + "epoch": 0.12251443232841565, + "grad_norm": 0.747912075117886, + "learning_rate": 0.0001996913379829844, + "loss": 0.5798, + "step": 1910 + }, + { + "epoch": 0.12283515073765235, + "grad_norm": 1.2254454430699995, + "learning_rate": 0.00019968248624916077, + "loss": 0.6667, + "step": 1915 + }, + { + "epoch": 0.12315586914688903, + "grad_norm": 1.1768102485963885, + "learning_rate": 0.00019967350958114267, + "loss": 0.5774, + "step": 1920 + }, + { + "epoch": 0.12347658755612573, + "grad_norm": 0.6310183951664794, + "learning_rate": 0.0001996644079901808, + "loss": 0.4486, + "step": 1925 + }, + { + "epoch": 0.12379730596536241, + "grad_norm": 0.8260925792950813, + "learning_rate": 0.00019965518148768233, + "loss": 0.5623, + "step": 1930 + }, + { + "epoch": 0.1241180243745991, + "grad_norm": 0.9150306074218141, + "learning_rate": 0.000199645830085211, + "loss": 0.83, + "step": 1935 + }, + { + "epoch": 0.12443874278383579, + "grad_norm": 0.9369210275043979, + "learning_rate": 0.00019963635379448722, + "loss": 0.7223, + "step": 1940 + }, + { + "epoch": 0.12475946119307248, + "grad_norm": 0.748894355642791, + "learning_rate": 0.00019962675262738774, + "loss": 0.6919, + "step": 1945 + }, + { + "epoch": 0.12508017960230916, + "grad_norm": 1.1961745083017192, + "learning_rate": 0.00019961702659594598, + "loss": 0.5536, + "step": 1950 + }, + { + "epoch": 0.12540089801154586, + "grad_norm": 0.568991855421978, + "learning_rate": 0.00019960717571235173, + "loss": 0.639, + "step": 1955 + }, + { + "epoch": 0.12572161642078256, + "grad_norm": 1.0900526061976745, + "learning_rate": 0.00019959719998895135, + "loss": 0.6571, + "step": 1960 + }, + { + "epoch": 0.12604233483001925, + "grad_norm": 0.7953938211319622, + "learning_rate": 0.00019958709943824758, + "loss": 0.7077, + "step": 1965 + }, + { + "epoch": 0.12636305323925592, + "grad_norm": 1.0090362549424627, + "learning_rate": 0.0001995768740728997, + "loss": 0.629, + "step": 1970 + }, + { + "epoch": 0.12668377164849262, + "grad_norm": 0.7822194115921188, + "learning_rate": 0.0001995665239057234, + "loss": 0.7948, + "step": 1975 + }, + { + "epoch": 0.12700449005772932, + "grad_norm": 0.82569207599097, + "learning_rate": 0.00019955604894969067, + "loss": 0.6823, + "step": 1980 + }, + { + "epoch": 0.12732520846696602, + "grad_norm": 0.5455388809406508, + "learning_rate": 0.0001995454492179301, + "loss": 0.5594, + "step": 1985 + }, + { + "epoch": 0.1276459268762027, + "grad_norm": 0.7695218529222057, + "learning_rate": 0.00019953472472372647, + "loss": 0.7198, + "step": 1990 + }, + { + "epoch": 0.12796664528543938, + "grad_norm": 0.8673513110262479, + "learning_rate": 0.00019952387548052112, + "loss": 0.7148, + "step": 1995 + }, + { + "epoch": 0.12828736369467608, + "grad_norm": 0.919881076337375, + "learning_rate": 0.00019951290150191158, + "loss": 0.6439, + "step": 2000 + }, + { + "epoch": 0.12860808210391275, + "grad_norm": 0.9262998866124367, + "learning_rate": 0.00019950180280165175, + "loss": 0.5764, + "step": 2005 + }, + { + "epoch": 0.12892880051314945, + "grad_norm": 0.6765034342263078, + "learning_rate": 0.00019949057939365193, + "loss": 0.4096, + "step": 2010 + }, + { + "epoch": 0.12924951892238615, + "grad_norm": 0.7219277816800387, + "learning_rate": 0.00019947923129197862, + "loss": 0.7127, + "step": 2015 + }, + { + "epoch": 0.12957023733162285, + "grad_norm": 0.8406570776216719, + "learning_rate": 0.0001994677585108546, + "loss": 0.6191, + "step": 2020 + }, + { + "epoch": 0.12989095574085952, + "grad_norm": 0.7458490203268737, + "learning_rate": 0.00019945616106465904, + "loss": 0.5701, + "step": 2025 + }, + { + "epoch": 0.1302116741500962, + "grad_norm": 1.293735176011679, + "learning_rate": 0.0001994444389679272, + "loss": 0.6852, + "step": 2030 + }, + { + "epoch": 0.1305323925593329, + "grad_norm": 0.9148850105541353, + "learning_rate": 0.00019943259223535067, + "loss": 0.7057, + "step": 2035 + }, + { + "epoch": 0.1308531109685696, + "grad_norm": 0.6641079479178653, + "learning_rate": 0.0001994206208817772, + "loss": 0.7629, + "step": 2040 + }, + { + "epoch": 0.13117382937780628, + "grad_norm": 0.791984066260629, + "learning_rate": 0.00019940852492221075, + "loss": 0.6992, + "step": 2045 + }, + { + "epoch": 0.13149454778704298, + "grad_norm": 0.849479398893481, + "learning_rate": 0.00019939630437181143, + "loss": 0.6966, + "step": 2050 + }, + { + "epoch": 0.13181526619627967, + "grad_norm": 0.8367106501858504, + "learning_rate": 0.00019938395924589552, + "loss": 0.5852, + "step": 2055 + }, + { + "epoch": 0.13213598460551634, + "grad_norm": 0.6790358847768235, + "learning_rate": 0.00019937148955993545, + "loss": 0.6393, + "step": 2060 + }, + { + "epoch": 0.13245670301475304, + "grad_norm": 0.9502499514885022, + "learning_rate": 0.00019935889532955977, + "loss": 0.6777, + "step": 2065 + }, + { + "epoch": 0.13277742142398974, + "grad_norm": 0.8134631960781032, + "learning_rate": 0.000199346176570553, + "loss": 0.6862, + "step": 2070 + }, + { + "epoch": 0.13309813983322644, + "grad_norm": 0.6366664689319048, + "learning_rate": 0.00019933333329885593, + "loss": 0.604, + "step": 2075 + }, + { + "epoch": 0.1334188582424631, + "grad_norm": 0.8465634973529981, + "learning_rate": 0.00019932036553056524, + "loss": 0.7162, + "step": 2080 + }, + { + "epoch": 0.1337395766516998, + "grad_norm": 0.8425039370601171, + "learning_rate": 0.00019930727328193378, + "loss": 0.6855, + "step": 2085 + }, + { + "epoch": 0.1340602950609365, + "grad_norm": 1.14970228361299, + "learning_rate": 0.00019929405656937032, + "loss": 0.7191, + "step": 2090 + }, + { + "epoch": 0.1343810134701732, + "grad_norm": 1.0969227215850126, + "learning_rate": 0.0001992807154094396, + "loss": 0.728, + "step": 2095 + }, + { + "epoch": 0.13470173187940987, + "grad_norm": 0.5634883710558874, + "learning_rate": 0.00019926724981886244, + "loss": 0.6794, + "step": 2100 + }, + { + "epoch": 0.13502245028864657, + "grad_norm": 0.9532151941436401, + "learning_rate": 0.0001992536598145155, + "loss": 0.6422, + "step": 2105 + }, + { + "epoch": 0.13534316869788326, + "grad_norm": 0.8529397357920244, + "learning_rate": 0.0001992399454134315, + "loss": 0.8323, + "step": 2110 + }, + { + "epoch": 0.13566388710711993, + "grad_norm": 0.5995161683553816, + "learning_rate": 0.00019922610663279894, + "loss": 0.6443, + "step": 2115 + }, + { + "epoch": 0.13598460551635663, + "grad_norm": 1.1645114047730085, + "learning_rate": 0.00019921214348996228, + "loss": 0.638, + "step": 2120 + }, + { + "epoch": 0.13630532392559333, + "grad_norm": 0.7254426735765782, + "learning_rate": 0.00019919805600242176, + "loss": 0.6233, + "step": 2125 + }, + { + "epoch": 0.13662604233483003, + "grad_norm": 1.2630556570142795, + "learning_rate": 0.00019918384418783362, + "loss": 0.7941, + "step": 2130 + }, + { + "epoch": 0.1369467607440667, + "grad_norm": 0.5842349667453849, + "learning_rate": 0.00019916950806400983, + "loss": 0.7714, + "step": 2135 + }, + { + "epoch": 0.1372674791533034, + "grad_norm": 0.5918400976970277, + "learning_rate": 0.00019915504764891808, + "loss": 0.7118, + "step": 2140 + }, + { + "epoch": 0.1375881975625401, + "grad_norm": 0.8666504796220831, + "learning_rate": 0.000199140462960682, + "loss": 0.7462, + "step": 2145 + }, + { + "epoch": 0.1379089159717768, + "grad_norm": 0.7764199666330917, + "learning_rate": 0.00019912575401758082, + "loss": 0.6395, + "step": 2150 + }, + { + "epoch": 0.13822963438101346, + "grad_norm": 0.9186504138753783, + "learning_rate": 0.00019911092083804962, + "loss": 0.6289, + "step": 2155 + }, + { + "epoch": 0.13855035279025016, + "grad_norm": 0.8035713423211853, + "learning_rate": 0.00019909596344067914, + "loss": 0.7541, + "step": 2160 + }, + { + "epoch": 0.13887107119948686, + "grad_norm": 0.7189520752077799, + "learning_rate": 0.00019908088184421578, + "loss": 0.6826, + "step": 2165 + }, + { + "epoch": 0.13919178960872355, + "grad_norm": 0.6655350088157191, + "learning_rate": 0.00019906567606756167, + "loss": 0.7409, + "step": 2170 + }, + { + "epoch": 0.13951250801796022, + "grad_norm": 0.3224597929224718, + "learning_rate": 0.0001990503461297745, + "loss": 0.5904, + "step": 2175 + }, + { + "epoch": 0.13983322642719692, + "grad_norm": 0.8267424045917116, + "learning_rate": 0.00019903489205006764, + "loss": 0.6894, + "step": 2180 + }, + { + "epoch": 0.14015394483643362, + "grad_norm": 0.6123341217762982, + "learning_rate": 0.00019901931384780995, + "loss": 0.703, + "step": 2185 + }, + { + "epoch": 0.1404746632456703, + "grad_norm": 0.45163827780119753, + "learning_rate": 0.00019900361154252602, + "loss": 0.59, + "step": 2190 + }, + { + "epoch": 0.140795381654907, + "grad_norm": 0.9556170145817368, + "learning_rate": 0.00019898778515389584, + "loss": 0.5857, + "step": 2195 + }, + { + "epoch": 0.14111610006414368, + "grad_norm": 0.7479105122087544, + "learning_rate": 0.00019897183470175494, + "loss": 0.6585, + "step": 2200 + }, + { + "epoch": 0.14143681847338038, + "grad_norm": 1.0326719597420064, + "learning_rate": 0.0001989557602060944, + "loss": 0.7534, + "step": 2205 + }, + { + "epoch": 0.14175753688261705, + "grad_norm": 0.8658293920784573, + "learning_rate": 0.00019893956168706066, + "loss": 0.7002, + "step": 2210 + }, + { + "epoch": 0.14207825529185375, + "grad_norm": 0.8622344203075765, + "learning_rate": 0.00019892323916495582, + "loss": 0.7086, + "step": 2215 + }, + { + "epoch": 0.14239897370109045, + "grad_norm": 0.7259813554322444, + "learning_rate": 0.00019890679266023709, + "loss": 0.4999, + "step": 2220 + }, + { + "epoch": 0.14271969211032715, + "grad_norm": 0.6647794000879613, + "learning_rate": 0.0001988902221935173, + "loss": 0.7005, + "step": 2225 + }, + { + "epoch": 0.14304041051956382, + "grad_norm": 0.8451399712054074, + "learning_rate": 0.00019887352778556454, + "loss": 0.6435, + "step": 2230 + }, + { + "epoch": 0.1433611289288005, + "grad_norm": 0.7567525634116421, + "learning_rate": 0.0001988567094573023, + "loss": 0.7609, + "step": 2235 + }, + { + "epoch": 0.1436818473380372, + "grad_norm": 0.8106441964345322, + "learning_rate": 0.00019883976722980936, + "loss": 0.6969, + "step": 2240 + }, + { + "epoch": 0.14400256574727388, + "grad_norm": 0.6312440700944748, + "learning_rate": 0.00019882270112431974, + "loss": 0.6787, + "step": 2245 + }, + { + "epoch": 0.14432328415651058, + "grad_norm": 0.8698670635315567, + "learning_rate": 0.00019880551116222277, + "loss": 0.79, + "step": 2250 + }, + { + "epoch": 0.14464400256574728, + "grad_norm": 0.5675337075202405, + "learning_rate": 0.00019878819736506297, + "loss": 0.6922, + "step": 2255 + }, + { + "epoch": 0.14496472097498397, + "grad_norm": 0.8080748220001619, + "learning_rate": 0.00019877075975454015, + "loss": 0.6723, + "step": 2260 + }, + { + "epoch": 0.14528543938422064, + "grad_norm": 1.18598966284805, + "learning_rate": 0.00019875319835250922, + "loss": 0.6078, + "step": 2265 + }, + { + "epoch": 0.14560615779345734, + "grad_norm": 0.7396735588781944, + "learning_rate": 0.00019873551318098026, + "loss": 0.6555, + "step": 2270 + }, + { + "epoch": 0.14592687620269404, + "grad_norm": 0.9421384978371221, + "learning_rate": 0.00019871770426211843, + "loss": 0.6763, + "step": 2275 + }, + { + "epoch": 0.14624759461193074, + "grad_norm": 1.3557865695262534, + "learning_rate": 0.0001986997716182441, + "loss": 0.6517, + "step": 2280 + }, + { + "epoch": 0.1465683130211674, + "grad_norm": 0.7620628179190014, + "learning_rate": 0.0001986817152718326, + "loss": 0.8213, + "step": 2285 + }, + { + "epoch": 0.1468890314304041, + "grad_norm": 1.1665229535256436, + "learning_rate": 0.0001986635352455143, + "loss": 0.6593, + "step": 2290 + }, + { + "epoch": 0.1472097498396408, + "grad_norm": 0.549262325529975, + "learning_rate": 0.0001986452315620747, + "loss": 0.5682, + "step": 2295 + }, + { + "epoch": 0.14753046824887747, + "grad_norm": 0.6290840720109729, + "learning_rate": 0.00019862680424445413, + "loss": 0.5891, + "step": 2300 + }, + { + "epoch": 0.14785118665811417, + "grad_norm": 0.6806013181414412, + "learning_rate": 0.00019860825331574798, + "loss": 0.7814, + "step": 2305 + }, + { + "epoch": 0.14817190506735087, + "grad_norm": 0.9105112621167852, + "learning_rate": 0.00019858957879920647, + "loss": 0.6707, + "step": 2310 + }, + { + "epoch": 0.14849262347658757, + "grad_norm": 0.8528821816779594, + "learning_rate": 0.00019857078071823484, + "loss": 0.664, + "step": 2315 + }, + { + "epoch": 0.14881334188582424, + "grad_norm": 0.7181914153458827, + "learning_rate": 0.0001985518590963931, + "loss": 0.6854, + "step": 2320 + }, + { + "epoch": 0.14913406029506093, + "grad_norm": 0.7397278453494517, + "learning_rate": 0.00019853281395739613, + "loss": 0.6665, + "step": 2325 + }, + { + "epoch": 0.14945477870429763, + "grad_norm": 0.8745968398949746, + "learning_rate": 0.00019851364532511362, + "loss": 0.5766, + "step": 2330 + }, + { + "epoch": 0.14977549711353433, + "grad_norm": 1.2088886679730004, + "learning_rate": 0.00019849435322356995, + "loss": 0.7018, + "step": 2335 + }, + { + "epoch": 0.150096215522771, + "grad_norm": 1.0443479254100274, + "learning_rate": 0.00019847493767694444, + "loss": 0.5986, + "step": 2340 + }, + { + "epoch": 0.1504169339320077, + "grad_norm": 1.0921241128817574, + "learning_rate": 0.00019845539870957092, + "loss": 0.5923, + "step": 2345 + }, + { + "epoch": 0.1507376523412444, + "grad_norm": 0.9646802917631114, + "learning_rate": 0.00019843573634593806, + "loss": 0.7926, + "step": 2350 + }, + { + "epoch": 0.1510583707504811, + "grad_norm": 0.7656847484095911, + "learning_rate": 0.00019841595061068906, + "loss": 0.7207, + "step": 2355 + }, + { + "epoch": 0.15137908915971776, + "grad_norm": 0.5049528849051477, + "learning_rate": 0.0001983960415286219, + "loss": 0.6228, + "step": 2360 + }, + { + "epoch": 0.15169980756895446, + "grad_norm": 0.9068993192806996, + "learning_rate": 0.00019837600912468893, + "loss": 0.5693, + "step": 2365 + }, + { + "epoch": 0.15202052597819116, + "grad_norm": 0.8676250105736654, + "learning_rate": 0.00019835585342399732, + "loss": 0.5743, + "step": 2370 + }, + { + "epoch": 0.15234124438742783, + "grad_norm": 0.5246385631697503, + "learning_rate": 0.00019833557445180855, + "loss": 0.7401, + "step": 2375 + }, + { + "epoch": 0.15266196279666452, + "grad_norm": 0.7016792226152242, + "learning_rate": 0.0001983151722335387, + "loss": 0.6403, + "step": 2380 + }, + { + "epoch": 0.15298268120590122, + "grad_norm": 0.7722496289657441, + "learning_rate": 0.00019829464679475836, + "loss": 0.5484, + "step": 2385 + }, + { + "epoch": 0.15330339961513792, + "grad_norm": 1.2298123662291214, + "learning_rate": 0.00019827399816119243, + "loss": 0.7674, + "step": 2390 + }, + { + "epoch": 0.1536241180243746, + "grad_norm": 0.7861238282945989, + "learning_rate": 0.00019825322635872036, + "loss": 0.619, + "step": 2395 + }, + { + "epoch": 0.1539448364336113, + "grad_norm": 0.9211911752664865, + "learning_rate": 0.00019823233141337584, + "loss": 0.6211, + "step": 2400 + }, + { + "epoch": 0.15426555484284799, + "grad_norm": 0.7151255909037463, + "learning_rate": 0.00019821131335134696, + "loss": 0.548, + "step": 2405 + }, + { + "epoch": 0.15458627325208468, + "grad_norm": 0.9458426635711992, + "learning_rate": 0.00019819017219897613, + "loss": 0.6482, + "step": 2410 + }, + { + "epoch": 0.15490699166132135, + "grad_norm": 1.0258204800171964, + "learning_rate": 0.00019816890798276, + "loss": 0.6717, + "step": 2415 + }, + { + "epoch": 0.15522771007055805, + "grad_norm": 1.2116376507078799, + "learning_rate": 0.00019814752072934945, + "loss": 0.6242, + "step": 2420 + }, + { + "epoch": 0.15554842847979475, + "grad_norm": 0.7799968415850017, + "learning_rate": 0.00019812601046554962, + "loss": 0.6257, + "step": 2425 + }, + { + "epoch": 0.15586914688903142, + "grad_norm": 0.4916761578519649, + "learning_rate": 0.00019810437721831976, + "loss": 0.7221, + "step": 2430 + }, + { + "epoch": 0.15618986529826812, + "grad_norm": 0.9089669003206741, + "learning_rate": 0.00019808262101477328, + "loss": 0.6457, + "step": 2435 + }, + { + "epoch": 0.1565105837075048, + "grad_norm": 0.5752941624716728, + "learning_rate": 0.00019806074188217766, + "loss": 0.5367, + "step": 2440 + }, + { + "epoch": 0.1568313021167415, + "grad_norm": 0.7889396413468497, + "learning_rate": 0.0001980387398479546, + "loss": 0.5704, + "step": 2445 + }, + { + "epoch": 0.15715202052597818, + "grad_norm": 0.7974301152247996, + "learning_rate": 0.00019801661493967955, + "loss": 0.7438, + "step": 2450 + }, + { + "epoch": 0.15747273893521488, + "grad_norm": 0.9099718674001662, + "learning_rate": 0.00019799436718508228, + "loss": 0.7057, + "step": 2455 + }, + { + "epoch": 0.15779345734445158, + "grad_norm": 0.7460789907183486, + "learning_rate": 0.0001979719966120463, + "loss": 0.6769, + "step": 2460 + }, + { + "epoch": 0.15811417575368827, + "grad_norm": 0.9026682063218279, + "learning_rate": 0.00019794950324860918, + "loss": 0.6677, + "step": 2465 + }, + { + "epoch": 0.15843489416292494, + "grad_norm": 0.706813388972018, + "learning_rate": 0.0001979268871229623, + "loss": 0.652, + "step": 2470 + }, + { + "epoch": 0.15875561257216164, + "grad_norm": 0.7951893501420781, + "learning_rate": 0.00019790414826345094, + "loss": 0.7231, + "step": 2475 + }, + { + "epoch": 0.15907633098139834, + "grad_norm": 0.9695064104615378, + "learning_rate": 0.0001978812866985742, + "loss": 0.6308, + "step": 2480 + }, + { + "epoch": 0.159397049390635, + "grad_norm": 0.5344509876021667, + "learning_rate": 0.00019785830245698497, + "loss": 0.6997, + "step": 2485 + }, + { + "epoch": 0.1597177677998717, + "grad_norm": 0.834051661967047, + "learning_rate": 0.00019783519556748987, + "loss": 0.6783, + "step": 2490 + }, + { + "epoch": 0.1600384862091084, + "grad_norm": 0.9723305146917021, + "learning_rate": 0.0001978119660590493, + "loss": 0.7798, + "step": 2495 + }, + { + "epoch": 0.1603592046183451, + "grad_norm": 0.8859242414039744, + "learning_rate": 0.00019778861396077725, + "loss": 0.793, + "step": 2500 + }, + { + "epoch": 0.16067992302758177, + "grad_norm": 0.7241777810857887, + "learning_rate": 0.00019776513930194148, + "loss": 0.504, + "step": 2505 + }, + { + "epoch": 0.16100064143681847, + "grad_norm": 1.054121315907452, + "learning_rate": 0.00019774154211196318, + "loss": 0.7509, + "step": 2510 + }, + { + "epoch": 0.16132135984605517, + "grad_norm": 0.8701449793412597, + "learning_rate": 0.0001977178224204173, + "loss": 0.7875, + "step": 2515 + }, + { + "epoch": 0.16164207825529187, + "grad_norm": 0.7757819809049686, + "learning_rate": 0.00019769398025703224, + "loss": 0.6047, + "step": 2520 + }, + { + "epoch": 0.16196279666452854, + "grad_norm": 1.0713357367053484, + "learning_rate": 0.00019767001565168982, + "loss": 0.7384, + "step": 2525 + }, + { + "epoch": 0.16228351507376523, + "grad_norm": 0.43793306094407475, + "learning_rate": 0.00019764592863442544, + "loss": 0.6156, + "step": 2530 + }, + { + "epoch": 0.16260423348300193, + "grad_norm": 0.9034469617213254, + "learning_rate": 0.0001976217192354279, + "loss": 0.6383, + "step": 2535 + }, + { + "epoch": 0.1629249518922386, + "grad_norm": 0.7090465404578327, + "learning_rate": 0.0001975973874850393, + "loss": 0.59, + "step": 2540 + }, + { + "epoch": 0.1632456703014753, + "grad_norm": 0.7781025944113742, + "learning_rate": 0.00019757293341375517, + "loss": 0.6829, + "step": 2545 + }, + { + "epoch": 0.163566388710712, + "grad_norm": 0.701765797555506, + "learning_rate": 0.00019754835705222435, + "loss": 0.6682, + "step": 2550 + }, + { + "epoch": 0.1638871071199487, + "grad_norm": 0.8486110822681391, + "learning_rate": 0.00019752365843124885, + "loss": 0.7107, + "step": 2555 + }, + { + "epoch": 0.16420782552918536, + "grad_norm": 1.2183183484648679, + "learning_rate": 0.00019749883758178404, + "loss": 0.7092, + "step": 2560 + }, + { + "epoch": 0.16452854393842206, + "grad_norm": 0.5747438190450085, + "learning_rate": 0.0001974738945349384, + "loss": 0.5635, + "step": 2565 + }, + { + "epoch": 0.16484926234765876, + "grad_norm": 0.754766366798954, + "learning_rate": 0.0001974488293219736, + "loss": 0.7561, + "step": 2570 + }, + { + "epoch": 0.16516998075689546, + "grad_norm": 0.9579439740753294, + "learning_rate": 0.00019742364197430443, + "loss": 0.6015, + "step": 2575 + }, + { + "epoch": 0.16549069916613213, + "grad_norm": 0.6786544154968012, + "learning_rate": 0.00019739833252349867, + "loss": 0.5112, + "step": 2580 + }, + { + "epoch": 0.16581141757536882, + "grad_norm": 0.7934214823629537, + "learning_rate": 0.00019737290100127722, + "loss": 0.7203, + "step": 2585 + }, + { + "epoch": 0.16613213598460552, + "grad_norm": 1.33220621050734, + "learning_rate": 0.00019734734743951396, + "loss": 0.6863, + "step": 2590 + }, + { + "epoch": 0.16645285439384222, + "grad_norm": 0.8267900862256077, + "learning_rate": 0.00019732167187023572, + "loss": 0.6449, + "step": 2595 + }, + { + "epoch": 0.1667735728030789, + "grad_norm": 0.7287938245757929, + "learning_rate": 0.0001972958743256222, + "loss": 0.7308, + "step": 2600 + }, + { + "epoch": 0.1670942912123156, + "grad_norm": 0.5363094807734924, + "learning_rate": 0.00019726995483800613, + "loss": 0.6403, + "step": 2605 + }, + { + "epoch": 0.16741500962155229, + "grad_norm": 0.7277617239159246, + "learning_rate": 0.00019724391343987284, + "loss": 0.6777, + "step": 2610 + }, + { + "epoch": 0.16773572803078896, + "grad_norm": 0.9462519719607535, + "learning_rate": 0.00019721775016386057, + "loss": 0.6895, + "step": 2615 + }, + { + "epoch": 0.16805644644002565, + "grad_norm": 0.8528897030121969, + "learning_rate": 0.0001971914650427604, + "loss": 0.5536, + "step": 2620 + }, + { + "epoch": 0.16837716484926235, + "grad_norm": 0.9319172497451002, + "learning_rate": 0.000197165058109516, + "loss": 0.5724, + "step": 2625 + }, + { + "epoch": 0.16869788325849905, + "grad_norm": 0.7410196474628663, + "learning_rate": 0.0001971385293972237, + "loss": 0.6785, + "step": 2630 + }, + { + "epoch": 0.16901860166773572, + "grad_norm": 0.9192207798068145, + "learning_rate": 0.00019711187893913255, + "loss": 0.7219, + "step": 2635 + }, + { + "epoch": 0.16933932007697242, + "grad_norm": 0.5750937169325536, + "learning_rate": 0.00019708510676864414, + "loss": 0.482, + "step": 2640 + }, + { + "epoch": 0.1696600384862091, + "grad_norm": 0.7158603995106417, + "learning_rate": 0.0001970582129193126, + "loss": 0.577, + "step": 2645 + }, + { + "epoch": 0.1699807568954458, + "grad_norm": 0.9152254783119084, + "learning_rate": 0.00019703119742484453, + "loss": 0.6657, + "step": 2650 + }, + { + "epoch": 0.17030147530468248, + "grad_norm": 0.7435319188039847, + "learning_rate": 0.00019700406031909905, + "loss": 0.6779, + "step": 2655 + }, + { + "epoch": 0.17062219371391918, + "grad_norm": 1.504228508241335, + "learning_rate": 0.0001969768016360877, + "loss": 0.7278, + "step": 2660 + }, + { + "epoch": 0.17094291212315588, + "grad_norm": 1.2092049917834673, + "learning_rate": 0.00019694942140997435, + "loss": 0.7341, + "step": 2665 + }, + { + "epoch": 0.17126363053239255, + "grad_norm": 0.6080302726719192, + "learning_rate": 0.00019692191967507524, + "loss": 0.6543, + "step": 2670 + }, + { + "epoch": 0.17158434894162924, + "grad_norm": 0.7373008700852878, + "learning_rate": 0.0001968942964658589, + "loss": 0.6152, + "step": 2675 + }, + { + "epoch": 0.17190506735086594, + "grad_norm": 0.9214476765346659, + "learning_rate": 0.000196866551816946, + "loss": 0.6878, + "step": 2680 + }, + { + "epoch": 0.17222578576010264, + "grad_norm": 0.7450194855735123, + "learning_rate": 0.0001968386857631096, + "loss": 0.6173, + "step": 2685 + }, + { + "epoch": 0.1725465041693393, + "grad_norm": 0.6242054305521421, + "learning_rate": 0.00019681069833927476, + "loss": 0.6746, + "step": 2690 + }, + { + "epoch": 0.172867222578576, + "grad_norm": 0.711220248168634, + "learning_rate": 0.00019678258958051877, + "loss": 0.6821, + "step": 2695 + }, + { + "epoch": 0.1731879409878127, + "grad_norm": 0.7496584977206721, + "learning_rate": 0.00019675435952207088, + "loss": 0.5238, + "step": 2700 + }, + { + "epoch": 0.1735086593970494, + "grad_norm": 0.7084413643635924, + "learning_rate": 0.00019672600819931247, + "loss": 0.7056, + "step": 2705 + }, + { + "epoch": 0.17382937780628607, + "grad_norm": 1.0439027628488613, + "learning_rate": 0.00019669753564777688, + "loss": 0.6513, + "step": 2710 + }, + { + "epoch": 0.17415009621552277, + "grad_norm": 0.71498067288977, + "learning_rate": 0.0001966689419031493, + "loss": 0.7406, + "step": 2715 + }, + { + "epoch": 0.17447081462475947, + "grad_norm": 0.7033452927937216, + "learning_rate": 0.00019664022700126695, + "loss": 0.6923, + "step": 2720 + }, + { + "epoch": 0.17479153303399614, + "grad_norm": 0.8919976779446186, + "learning_rate": 0.00019661139097811877, + "loss": 0.6326, + "step": 2725 + }, + { + "epoch": 0.17511225144323284, + "grad_norm": 0.9493437873661492, + "learning_rate": 0.00019658243386984562, + "loss": 0.5783, + "step": 2730 + }, + { + "epoch": 0.17543296985246953, + "grad_norm": 0.9860728443591087, + "learning_rate": 0.00019655335571274003, + "loss": 0.7279, + "step": 2735 + }, + { + "epoch": 0.17575368826170623, + "grad_norm": 0.6352021684421743, + "learning_rate": 0.0001965241565432463, + "loss": 0.6397, + "step": 2740 + }, + { + "epoch": 0.1760744066709429, + "grad_norm": 1.099016920497353, + "learning_rate": 0.00019649483639796032, + "loss": 0.6756, + "step": 2745 + }, + { + "epoch": 0.1763951250801796, + "grad_norm": 0.7058834343210731, + "learning_rate": 0.00019646539531362973, + "loss": 0.7218, + "step": 2750 + }, + { + "epoch": 0.1767158434894163, + "grad_norm": 0.8020832284905198, + "learning_rate": 0.00019643583332715366, + "loss": 0.5708, + "step": 2755 + }, + { + "epoch": 0.177036561898653, + "grad_norm": 0.8014855578510585, + "learning_rate": 0.0001964061504755827, + "loss": 0.7843, + "step": 2760 + }, + { + "epoch": 0.17735728030788966, + "grad_norm": 1.0134184586337234, + "learning_rate": 0.0001963763467961191, + "loss": 0.6599, + "step": 2765 + }, + { + "epoch": 0.17767799871712636, + "grad_norm": 0.6050193347531744, + "learning_rate": 0.0001963464223261164, + "loss": 0.7984, + "step": 2770 + }, + { + "epoch": 0.17799871712636306, + "grad_norm": 0.7479913165773774, + "learning_rate": 0.0001963163771030796, + "loss": 0.7469, + "step": 2775 + }, + { + "epoch": 0.17831943553559973, + "grad_norm": 1.091278392341476, + "learning_rate": 0.00019628621116466502, + "loss": 0.6991, + "step": 2780 + }, + { + "epoch": 0.17864015394483643, + "grad_norm": 1.0105012542968526, + "learning_rate": 0.00019625592454868026, + "loss": 0.6867, + "step": 2785 + }, + { + "epoch": 0.17896087235407312, + "grad_norm": 0.8032083651463552, + "learning_rate": 0.0001962255172930842, + "loss": 0.7184, + "step": 2790 + }, + { + "epoch": 0.17928159076330982, + "grad_norm": 0.8193497605449357, + "learning_rate": 0.00019619498943598688, + "loss": 0.5785, + "step": 2795 + }, + { + "epoch": 0.1796023091725465, + "grad_norm": 0.7772046040254091, + "learning_rate": 0.00019616434101564956, + "loss": 0.7471, + "step": 2800 + }, + { + "epoch": 0.1799230275817832, + "grad_norm": 1.224565960941351, + "learning_rate": 0.00019613357207048452, + "loss": 0.856, + "step": 2805 + }, + { + "epoch": 0.1802437459910199, + "grad_norm": 0.6591412427417273, + "learning_rate": 0.00019610268263905515, + "loss": 0.5893, + "step": 2810 + }, + { + "epoch": 0.18056446440025659, + "grad_norm": 0.8875976837711199, + "learning_rate": 0.00019607167276007587, + "loss": 0.7161, + "step": 2815 + }, + { + "epoch": 0.18088518280949326, + "grad_norm": 0.8225479052301773, + "learning_rate": 0.00019604054247241193, + "loss": 0.5873, + "step": 2820 + }, + { + "epoch": 0.18120590121872995, + "grad_norm": 1.2087539785527361, + "learning_rate": 0.00019600929181507972, + "loss": 0.6542, + "step": 2825 + }, + { + "epoch": 0.18152661962796665, + "grad_norm": 0.8050140113302814, + "learning_rate": 0.00019597792082724625, + "loss": 0.5778, + "step": 2830 + }, + { + "epoch": 0.18184733803720335, + "grad_norm": 1.321288241534433, + "learning_rate": 0.00019594642954822952, + "loss": 0.5994, + "step": 2835 + }, + { + "epoch": 0.18216805644644002, + "grad_norm": 0.9376939681240336, + "learning_rate": 0.00019591481801749816, + "loss": 0.5046, + "step": 2840 + }, + { + "epoch": 0.18248877485567672, + "grad_norm": 0.6185458970009285, + "learning_rate": 0.00019588308627467162, + "loss": 0.6859, + "step": 2845 + }, + { + "epoch": 0.18280949326491341, + "grad_norm": 0.7801762201714135, + "learning_rate": 0.00019585123435952, + "loss": 0.7015, + "step": 2850 + }, + { + "epoch": 0.18313021167415008, + "grad_norm": 0.7265831165052501, + "learning_rate": 0.00019581926231196391, + "loss": 0.823, + "step": 2855 + }, + { + "epoch": 0.18345093008338678, + "grad_norm": 0.8151220320154888, + "learning_rate": 0.00019578717017207467, + "loss": 0.689, + "step": 2860 + }, + { + "epoch": 0.18377164849262348, + "grad_norm": 0.9213195972340709, + "learning_rate": 0.000195754957980074, + "loss": 0.7382, + "step": 2865 + }, + { + "epoch": 0.18409236690186018, + "grad_norm": 0.782822592817081, + "learning_rate": 0.0001957226257763342, + "loss": 0.6929, + "step": 2870 + }, + { + "epoch": 0.18441308531109685, + "grad_norm": 0.980335474676683, + "learning_rate": 0.0001956901736013778, + "loss": 0.6156, + "step": 2875 + }, + { + "epoch": 0.18473380372033354, + "grad_norm": 0.9039810035947186, + "learning_rate": 0.00019565760149587794, + "loss": 0.7664, + "step": 2880 + }, + { + "epoch": 0.18505452212957024, + "grad_norm": 0.000701834979829147, + "learning_rate": 0.0001956249095006578, + "loss": 0.5249, + "step": 2885 + }, + { + "epoch": 0.18537524053880694, + "grad_norm": 1.0237955976436885, + "learning_rate": 0.00019559209765669105, + "loss": 0.6839, + "step": 2890 + }, + { + "epoch": 0.1856959589480436, + "grad_norm": 0.6769833810242086, + "learning_rate": 0.00019555916600510145, + "loss": 0.6537, + "step": 2895 + }, + { + "epoch": 0.1860166773572803, + "grad_norm": 0.6462485885713231, + "learning_rate": 0.00019552611458716296, + "loss": 0.723, + "step": 2900 + }, + { + "epoch": 0.186337395766517, + "grad_norm": 0.8722147531755802, + "learning_rate": 0.0001954929434442996, + "loss": 0.6837, + "step": 2905 + }, + { + "epoch": 0.18665811417575368, + "grad_norm": 0.6906487731551919, + "learning_rate": 0.0001954596526180855, + "loss": 0.6678, + "step": 2910 + }, + { + "epoch": 0.18697883258499037, + "grad_norm": 0.8754536117451718, + "learning_rate": 0.00019542624215024474, + "loss": 0.7607, + "step": 2915 + }, + { + "epoch": 0.18729955099422707, + "grad_norm": 0.7481215119155424, + "learning_rate": 0.0001953927120826514, + "loss": 0.7354, + "step": 2920 + }, + { + "epoch": 0.18762026940346377, + "grad_norm": 0.7173045174318763, + "learning_rate": 0.0001953590624573294, + "loss": 0.6889, + "step": 2925 + }, + { + "epoch": 0.18794098781270044, + "grad_norm": 0.688657494500447, + "learning_rate": 0.00019532529331645258, + "loss": 0.7716, + "step": 2930 + }, + { + "epoch": 0.18826170622193714, + "grad_norm": 0.8542179699315836, + "learning_rate": 0.0001952914047023445, + "loss": 0.6846, + "step": 2935 + }, + { + "epoch": 0.18858242463117383, + "grad_norm": 0.6693936334963977, + "learning_rate": 0.0001952573966574785, + "loss": 0.6893, + "step": 2940 + }, + { + "epoch": 0.18890314304041053, + "grad_norm": 1.1047249058364512, + "learning_rate": 0.00019522326922447755, + "loss": 0.7203, + "step": 2945 + }, + { + "epoch": 0.1892238614496472, + "grad_norm": 0.6082855408476369, + "learning_rate": 0.00019518902244611435, + "loss": 0.7069, + "step": 2950 + }, + { + "epoch": 0.1895445798588839, + "grad_norm": 0.5867678432004605, + "learning_rate": 0.00019515465636531107, + "loss": 0.7485, + "step": 2955 + }, + { + "epoch": 0.1898652982681206, + "grad_norm": 0.6389524482986783, + "learning_rate": 0.0001951201710251395, + "loss": 0.6291, + "step": 2960 + }, + { + "epoch": 0.19018601667735727, + "grad_norm": 0.40852828777296263, + "learning_rate": 0.00019508556646882083, + "loss": 0.6572, + "step": 2965 + }, + { + "epoch": 0.19050673508659396, + "grad_norm": 0.6625359401782684, + "learning_rate": 0.00019505084273972568, + "loss": 0.6905, + "step": 2970 + }, + { + "epoch": 0.19082745349583066, + "grad_norm": 0.6733266631590418, + "learning_rate": 0.00019501599988137406, + "loss": 0.6065, + "step": 2975 + }, + { + "epoch": 0.19114817190506736, + "grad_norm": 0.8217762217578838, + "learning_rate": 0.00019498103793743528, + "loss": 0.6843, + "step": 2980 + }, + { + "epoch": 0.19146889031430403, + "grad_norm": 1.220514466724885, + "learning_rate": 0.00019494595695172787, + "loss": 0.604, + "step": 2985 + }, + { + "epoch": 0.19178960872354073, + "grad_norm": 0.792446196427873, + "learning_rate": 0.00019491075696821962, + "loss": 0.6326, + "step": 2990 + }, + { + "epoch": 0.19211032713277743, + "grad_norm": 0.8158356531364367, + "learning_rate": 0.00019487543803102736, + "loss": 0.7795, + "step": 2995 + }, + { + "epoch": 0.19243104554201412, + "grad_norm": 1.3297681323714916, + "learning_rate": 0.00019484000018441715, + "loss": 0.6776, + "step": 3000 + }, + { + "epoch": 0.1927517639512508, + "grad_norm": 1.1206878255004398, + "learning_rate": 0.00019480444347280392, + "loss": 0.7425, + "step": 3005 + }, + { + "epoch": 0.1930724823604875, + "grad_norm": 0.5668482553685025, + "learning_rate": 0.00019476876794075168, + "loss": 0.6463, + "step": 3010 + }, + { + "epoch": 0.1933932007697242, + "grad_norm": 0.9274228876056752, + "learning_rate": 0.0001947329736329734, + "loss": 0.7253, + "step": 3015 + }, + { + "epoch": 0.19371391917896089, + "grad_norm": 0.8934110376365801, + "learning_rate": 0.0001946970605943308, + "loss": 0.8008, + "step": 3020 + }, + { + "epoch": 0.19403463758819756, + "grad_norm": 0.7054346176332205, + "learning_rate": 0.00019466102886983445, + "loss": 0.6421, + "step": 3025 + }, + { + "epoch": 0.19435535599743425, + "grad_norm": 1.112312708275422, + "learning_rate": 0.0001946248785046437, + "loss": 0.5448, + "step": 3030 + }, + { + "epoch": 0.19467607440667095, + "grad_norm": 0.9514480454813623, + "learning_rate": 0.00019458860954406655, + "loss": 0.8921, + "step": 3035 + }, + { + "epoch": 0.19499679281590762, + "grad_norm": 0.8289559763958162, + "learning_rate": 0.00019455222203355974, + "loss": 0.6384, + "step": 3040 + }, + { + "epoch": 0.19531751122514432, + "grad_norm": 1.6772904982725059, + "learning_rate": 0.00019451571601872842, + "loss": 0.593, + "step": 3045 + }, + { + "epoch": 0.19563822963438102, + "grad_norm": 0.933959150583705, + "learning_rate": 0.00019447909154532642, + "loss": 0.7033, + "step": 3050 + }, + { + "epoch": 0.19595894804361771, + "grad_norm": 0.9836848697506737, + "learning_rate": 0.00019444234865925597, + "loss": 0.694, + "step": 3055 + }, + { + "epoch": 0.19627966645285438, + "grad_norm": 0.752058149609346, + "learning_rate": 0.00019440548740656772, + "loss": 0.8419, + "step": 3060 + }, + { + "epoch": 0.19660038486209108, + "grad_norm": 0.5564595991041628, + "learning_rate": 0.00019436850783346063, + "loss": 0.5868, + "step": 3065 + }, + { + "epoch": 0.19692110327132778, + "grad_norm": 1.1233031900082198, + "learning_rate": 0.00019433140998628202, + "loss": 0.7213, + "step": 3070 + }, + { + "epoch": 0.19724182168056448, + "grad_norm": 0.9846847511141703, + "learning_rate": 0.00019429419391152743, + "loss": 0.6083, + "step": 3075 + }, + { + "epoch": 0.19756254008980115, + "grad_norm": 0.9133697850179805, + "learning_rate": 0.00019425685965584056, + "loss": 0.7509, + "step": 3080 + }, + { + "epoch": 0.19788325849903785, + "grad_norm": 1.1268873349974773, + "learning_rate": 0.0001942194072660132, + "loss": 0.6734, + "step": 3085 + }, + { + "epoch": 0.19820397690827454, + "grad_norm": 0.663450697814864, + "learning_rate": 0.00019418183678898525, + "loss": 0.7093, + "step": 3090 + }, + { + "epoch": 0.1985246953175112, + "grad_norm": 0.6245075928754343, + "learning_rate": 0.0001941441482718446, + "loss": 0.7194, + "step": 3095 + }, + { + "epoch": 0.1988454137267479, + "grad_norm": 0.9587885835266485, + "learning_rate": 0.00019410634176182705, + "loss": 0.6995, + "step": 3100 + }, + { + "epoch": 0.1991661321359846, + "grad_norm": 0.8163502504890695, + "learning_rate": 0.00019406841730631636, + "loss": 0.7503, + "step": 3105 + }, + { + "epoch": 0.1994868505452213, + "grad_norm": 0.9426439782405206, + "learning_rate": 0.00019403037495284398, + "loss": 0.7404, + "step": 3110 + }, + { + "epoch": 0.19980756895445798, + "grad_norm": 0.8220300785309613, + "learning_rate": 0.00019399221474908932, + "loss": 0.6744, + "step": 3115 + }, + { + "epoch": 0.20012828736369467, + "grad_norm": 0.9955681688037235, + "learning_rate": 0.00019395393674287927, + "loss": 0.6852, + "step": 3120 + }, + { + "epoch": 0.20044900577293137, + "grad_norm": 1.1278721654085937, + "learning_rate": 0.00019391554098218853, + "loss": 0.8426, + "step": 3125 + }, + { + "epoch": 0.20076972418216807, + "grad_norm": 1.289322139002122, + "learning_rate": 0.00019387702751513932, + "loss": 0.7352, + "step": 3130 + }, + { + "epoch": 0.20109044259140474, + "grad_norm": 1.4969951218148942, + "learning_rate": 0.0001938383963900014, + "loss": 0.7202, + "step": 3135 + }, + { + "epoch": 0.20141116100064144, + "grad_norm": 0.8939306827167222, + "learning_rate": 0.000193799647655192, + "loss": 0.6326, + "step": 3140 + }, + { + "epoch": 0.20173187940987813, + "grad_norm": 1.038193039895127, + "learning_rate": 0.00019376078135927566, + "loss": 0.5945, + "step": 3145 + }, + { + "epoch": 0.2020525978191148, + "grad_norm": 0.8466700431352269, + "learning_rate": 0.00019372179755096448, + "loss": 0.4709, + "step": 3150 + }, + { + "epoch": 0.2023733162283515, + "grad_norm": 0.8353167491615692, + "learning_rate": 0.00019368269627911757, + "loss": 0.6145, + "step": 3155 + }, + { + "epoch": 0.2026940346375882, + "grad_norm": 0.5826569638112876, + "learning_rate": 0.00019364347759274144, + "loss": 0.6798, + "step": 3160 + }, + { + "epoch": 0.2030147530468249, + "grad_norm": 0.6596971126256945, + "learning_rate": 0.0001936041415409897, + "loss": 0.7164, + "step": 3165 + }, + { + "epoch": 0.20333547145606157, + "grad_norm": 1.1459761657771013, + "learning_rate": 0.00019356468817316311, + "loss": 0.6503, + "step": 3170 + }, + { + "epoch": 0.20365618986529826, + "grad_norm": 0.6795054057142108, + "learning_rate": 0.0001935251175387094, + "loss": 0.624, + "step": 3175 + }, + { + "epoch": 0.20397690827453496, + "grad_norm": 0.740763733162126, + "learning_rate": 0.00019348542968722324, + "loss": 0.6297, + "step": 3180 + }, + { + "epoch": 0.20429762668377166, + "grad_norm": 0.7064796503029271, + "learning_rate": 0.00019344562466844635, + "loss": 0.6003, + "step": 3185 + }, + { + "epoch": 0.20461834509300833, + "grad_norm": 1.6506358182547065, + "learning_rate": 0.00019340570253226712, + "loss": 0.4787, + "step": 3190 + }, + { + "epoch": 0.20493906350224503, + "grad_norm": 1.1332295207671033, + "learning_rate": 0.0001933656633287209, + "loss": 0.7126, + "step": 3195 + }, + { + "epoch": 0.20525978191148173, + "grad_norm": 0.617200353783866, + "learning_rate": 0.00019332550710798966, + "loss": 0.598, + "step": 3200 + }, + { + "epoch": 0.2055805003207184, + "grad_norm": 0.868513802069887, + "learning_rate": 0.000193285233920402, + "loss": 0.7152, + "step": 3205 + }, + { + "epoch": 0.2059012187299551, + "grad_norm": 1.1852925025104672, + "learning_rate": 0.00019324484381643325, + "loss": 0.7774, + "step": 3210 + }, + { + "epoch": 0.2062219371391918, + "grad_norm": 1.0280680170586727, + "learning_rate": 0.00019320433684670514, + "loss": 0.7043, + "step": 3215 + }, + { + "epoch": 0.2065426555484285, + "grad_norm": 0.6987881012001924, + "learning_rate": 0.00019316371306198592, + "loss": 0.7619, + "step": 3220 + }, + { + "epoch": 0.20686337395766516, + "grad_norm": 0.8392027535004901, + "learning_rate": 0.00019312297251319026, + "loss": 0.6781, + "step": 3225 + }, + { + "epoch": 0.20718409236690186, + "grad_norm": 1.2842078269698645, + "learning_rate": 0.00019308211525137915, + "loss": 0.7145, + "step": 3230 + }, + { + "epoch": 0.20750481077613855, + "grad_norm": 0.6603411917591546, + "learning_rate": 0.0001930411413277599, + "loss": 0.6411, + "step": 3235 + }, + { + "epoch": 0.20782552918537525, + "grad_norm": 1.3159150838945801, + "learning_rate": 0.000193000050793686, + "loss": 0.7067, + "step": 3240 + }, + { + "epoch": 0.20814624759461192, + "grad_norm": 1.2826837962016335, + "learning_rate": 0.0001929588437006571, + "loss": 0.657, + "step": 3245 + }, + { + "epoch": 0.20846696600384862, + "grad_norm": 0.7429467281992763, + "learning_rate": 0.00019291752010031887, + "loss": 0.6783, + "step": 3250 + }, + { + "epoch": 0.20878768441308532, + "grad_norm": 0.9388767995389723, + "learning_rate": 0.00019287608004446314, + "loss": 0.6873, + "step": 3255 + }, + { + "epoch": 0.20910840282232201, + "grad_norm": 0.8840070141339184, + "learning_rate": 0.0001928345235850276, + "loss": 0.6159, + "step": 3260 + }, + { + "epoch": 0.20942912123155868, + "grad_norm": 1.0732885802726535, + "learning_rate": 0.00019279285077409582, + "loss": 0.6713, + "step": 3265 + }, + { + "epoch": 0.20974983964079538, + "grad_norm": 0.7289657532988314, + "learning_rate": 0.00019275106166389725, + "loss": 0.6831, + "step": 3270 + }, + { + "epoch": 0.21007055805003208, + "grad_norm": 0.6492856906135663, + "learning_rate": 0.00019270915630680707, + "loss": 0.7126, + "step": 3275 + }, + { + "epoch": 0.21039127645926875, + "grad_norm": 0.8073736143636202, + "learning_rate": 0.0001926671347553462, + "loss": 0.7527, + "step": 3280 + }, + { + "epoch": 0.21071199486850545, + "grad_norm": 0.8682418292741673, + "learning_rate": 0.0001926249970621811, + "loss": 0.5924, + "step": 3285 + }, + { + "epoch": 0.21103271327774215, + "grad_norm": 0.553914766273313, + "learning_rate": 0.00019258274328012384, + "loss": 0.5456, + "step": 3290 + }, + { + "epoch": 0.21135343168697884, + "grad_norm": 0.9718939215705609, + "learning_rate": 0.00019254037346213204, + "loss": 0.5976, + "step": 3295 + }, + { + "epoch": 0.2116741500962155, + "grad_norm": 0.9064065621099515, + "learning_rate": 0.00019249788766130863, + "loss": 0.7424, + "step": 3300 + }, + { + "epoch": 0.2119948685054522, + "grad_norm": 0.6693670165919959, + "learning_rate": 0.00019245528593090204, + "loss": 0.7834, + "step": 3305 + }, + { + "epoch": 0.2123155869146889, + "grad_norm": 0.68000110275399, + "learning_rate": 0.0001924125683243059, + "loss": 0.8261, + "step": 3310 + }, + { + "epoch": 0.2126363053239256, + "grad_norm": 0.8936655552945705, + "learning_rate": 0.0001923697348950591, + "loss": 0.7315, + "step": 3315 + }, + { + "epoch": 0.21295702373316228, + "grad_norm": 0.9370537429273521, + "learning_rate": 0.0001923267856968457, + "loss": 0.6054, + "step": 3320 + }, + { + "epoch": 0.21327774214239897, + "grad_norm": 1.5321045176308976, + "learning_rate": 0.00019228372078349486, + "loss": 0.6995, + "step": 3325 + }, + { + "epoch": 0.21359846055163567, + "grad_norm": 0.8164083897600656, + "learning_rate": 0.00019224054020898073, + "loss": 0.7217, + "step": 3330 + }, + { + "epoch": 0.21391917896087234, + "grad_norm": 0.9360751302506096, + "learning_rate": 0.00019219724402742247, + "loss": 0.7071, + "step": 3335 + }, + { + "epoch": 0.21423989737010904, + "grad_norm": 1.1474158049320227, + "learning_rate": 0.00019215383229308412, + "loss": 0.696, + "step": 3340 + }, + { + "epoch": 0.21456061577934574, + "grad_norm": 0.6286443687036616, + "learning_rate": 0.0001921103050603745, + "loss": 0.6582, + "step": 3345 + }, + { + "epoch": 0.21488133418858243, + "grad_norm": 0.930008180786893, + "learning_rate": 0.00019206666238384728, + "loss": 0.7267, + "step": 3350 + }, + { + "epoch": 0.2152020525978191, + "grad_norm": 0.8966235538817937, + "learning_rate": 0.0001920229043182007, + "loss": 0.7461, + "step": 3355 + }, + { + "epoch": 0.2155227710070558, + "grad_norm": 0.6075118442836386, + "learning_rate": 0.0001919790309182777, + "loss": 0.6218, + "step": 3360 + }, + { + "epoch": 0.2158434894162925, + "grad_norm": 1.120521483944113, + "learning_rate": 0.00019193504223906577, + "loss": 0.7854, + "step": 3365 + }, + { + "epoch": 0.2161642078255292, + "grad_norm": 0.7536443555714086, + "learning_rate": 0.00019189093833569686, + "loss": 0.6665, + "step": 3370 + }, + { + "epoch": 0.21648492623476587, + "grad_norm": 0.7306155955546904, + "learning_rate": 0.00019184671926344732, + "loss": 0.5562, + "step": 3375 + }, + { + "epoch": 0.21680564464400257, + "grad_norm": 1.4066089443224215, + "learning_rate": 0.00019180238507773788, + "loss": 0.7206, + "step": 3380 + }, + { + "epoch": 0.21712636305323926, + "grad_norm": 1.0420087314885336, + "learning_rate": 0.0001917579358341335, + "loss": 0.8488, + "step": 3385 + }, + { + "epoch": 0.21744708146247593, + "grad_norm": 1.24092779077047, + "learning_rate": 0.0001917133715883434, + "loss": 0.7737, + "step": 3390 + }, + { + "epoch": 0.21776779987171263, + "grad_norm": 1.2683256948043233, + "learning_rate": 0.00019166869239622085, + "loss": 0.5991, + "step": 3395 + }, + { + "epoch": 0.21808851828094933, + "grad_norm": 1.0154708506536307, + "learning_rate": 0.0001916238983137633, + "loss": 0.6921, + "step": 3400 + }, + { + "epoch": 0.21840923669018603, + "grad_norm": 1.250860867590444, + "learning_rate": 0.00019157898939711212, + "loss": 0.772, + "step": 3405 + }, + { + "epoch": 0.2187299550994227, + "grad_norm": 1.0205976247637063, + "learning_rate": 0.0001915339657025526, + "loss": 0.6262, + "step": 3410 + }, + { + "epoch": 0.2190506735086594, + "grad_norm": 0.6808470166264919, + "learning_rate": 0.0001914888272865139, + "loss": 0.5628, + "step": 3415 + }, + { + "epoch": 0.2193713919178961, + "grad_norm": 1.0460679318245396, + "learning_rate": 0.00019144357420556893, + "loss": 0.6497, + "step": 3420 + }, + { + "epoch": 0.2196921103271328, + "grad_norm": 0.8912439646989759, + "learning_rate": 0.00019139820651643442, + "loss": 0.5868, + "step": 3425 + }, + { + "epoch": 0.22001282873636946, + "grad_norm": 0.6690277429678054, + "learning_rate": 0.00019135272427597063, + "loss": 0.6833, + "step": 3430 + }, + { + "epoch": 0.22033354714560616, + "grad_norm": 1.0200781753500376, + "learning_rate": 0.00019130712754118138, + "loss": 0.6225, + "step": 3435 + }, + { + "epoch": 0.22065426555484285, + "grad_norm": 1.0186432727769665, + "learning_rate": 0.00019126141636921414, + "loss": 0.769, + "step": 3440 + }, + { + "epoch": 0.22097498396407952, + "grad_norm": 0.671761473616358, + "learning_rate": 0.0001912155908173596, + "loss": 0.6917, + "step": 3445 + }, + { + "epoch": 0.22129570237331622, + "grad_norm": 0.7493482108843831, + "learning_rate": 0.00019116965094305197, + "loss": 0.7762, + "step": 3450 + }, + { + "epoch": 0.22161642078255292, + "grad_norm": 0.9676529237022933, + "learning_rate": 0.00019112359680386863, + "loss": 0.6426, + "step": 3455 + }, + { + "epoch": 0.22193713919178962, + "grad_norm": 0.7117654744699794, + "learning_rate": 0.00019107742845753025, + "loss": 0.6968, + "step": 3460 + }, + { + "epoch": 0.2222578576010263, + "grad_norm": 1.0489562483489054, + "learning_rate": 0.0001910311459619006, + "loss": 0.7852, + "step": 3465 + }, + { + "epoch": 0.22257857601026299, + "grad_norm": 0.7103830582474117, + "learning_rate": 0.00019098474937498652, + "loss": 0.6496, + "step": 3470 + }, + { + "epoch": 0.22289929441949968, + "grad_norm": 1.1088261693908699, + "learning_rate": 0.00019093823875493784, + "loss": 0.7313, + "step": 3475 + }, + { + "epoch": 0.22322001282873638, + "grad_norm": 1.1659589438084368, + "learning_rate": 0.00019089161416004733, + "loss": 0.6526, + "step": 3480 + }, + { + "epoch": 0.22354073123797305, + "grad_norm": 0.7493230462026259, + "learning_rate": 0.0001908448756487506, + "loss": 0.6629, + "step": 3485 + }, + { + "epoch": 0.22386144964720975, + "grad_norm": 0.8650060759204274, + "learning_rate": 0.000190798023279626, + "loss": 0.7321, + "step": 3490 + }, + { + "epoch": 0.22418216805644645, + "grad_norm": 0.8002336983221607, + "learning_rate": 0.0001907510571113946, + "loss": 0.7816, + "step": 3495 + }, + { + "epoch": 0.22450288646568314, + "grad_norm": 0.6840069838552998, + "learning_rate": 0.00019070397720292014, + "loss": 0.6472, + "step": 3500 + }, + { + "epoch": 0.2248236048749198, + "grad_norm": 0.9253534124109082, + "learning_rate": 0.0001906567836132089, + "loss": 0.7952, + "step": 3505 + }, + { + "epoch": 0.2251443232841565, + "grad_norm": 0.8707427934510977, + "learning_rate": 0.0001906094764014095, + "loss": 0.6403, + "step": 3510 + }, + { + "epoch": 0.2254650416933932, + "grad_norm": 0.8952137846177877, + "learning_rate": 0.00019056205562681324, + "loss": 0.7713, + "step": 3515 + }, + { + "epoch": 0.22578576010262988, + "grad_norm": 1.2157321282590767, + "learning_rate": 0.00019051452134885346, + "loss": 0.7791, + "step": 3520 + }, + { + "epoch": 0.22610647851186658, + "grad_norm": 1.1942747630269164, + "learning_rate": 0.000190466873627106, + "loss": 0.7107, + "step": 3525 + }, + { + "epoch": 0.22642719692110327, + "grad_norm": 0.7534228887260359, + "learning_rate": 0.00019041911252128864, + "loss": 0.7748, + "step": 3530 + }, + { + "epoch": 0.22674791533033997, + "grad_norm": 0.7020738108193582, + "learning_rate": 0.0001903712380912615, + "loss": 0.641, + "step": 3535 + }, + { + "epoch": 0.22706863373957664, + "grad_norm": 0.8822584692031392, + "learning_rate": 0.0001903232503970266, + "loss": 0.7302, + "step": 3540 + }, + { + "epoch": 0.22738935214881334, + "grad_norm": 0.7669563154301963, + "learning_rate": 0.00019027514949872794, + "loss": 0.6305, + "step": 3545 + }, + { + "epoch": 0.22771007055805004, + "grad_norm": 0.75341665833547, + "learning_rate": 0.0001902269354566514, + "loss": 0.5966, + "step": 3550 + }, + { + "epoch": 0.22803078896728673, + "grad_norm": 1.3621102982113154, + "learning_rate": 0.00019017860833122466, + "loss": 0.7256, + "step": 3555 + }, + { + "epoch": 0.2283515073765234, + "grad_norm": 0.6413371506739955, + "learning_rate": 0.00019013016818301718, + "loss": 0.7576, + "step": 3560 + }, + { + "epoch": 0.2286722257857601, + "grad_norm": 0.9240762303756279, + "learning_rate": 0.00019008161507274004, + "loss": 0.6412, + "step": 3565 + }, + { + "epoch": 0.2289929441949968, + "grad_norm": 0.600216888507175, + "learning_rate": 0.0001900329490612458, + "loss": 0.6077, + "step": 3570 + }, + { + "epoch": 0.22931366260423347, + "grad_norm": 0.7764633127488129, + "learning_rate": 0.0001899841702095287, + "loss": 0.7296, + "step": 3575 + }, + { + "epoch": 0.22963438101347017, + "grad_norm": 0.8982484209272996, + "learning_rate": 0.00018993527857872437, + "loss": 0.7016, + "step": 3580 + }, + { + "epoch": 0.22995509942270687, + "grad_norm": 1.0720659350142319, + "learning_rate": 0.0001898862742301096, + "loss": 0.7538, + "step": 3585 + }, + { + "epoch": 0.23027581783194356, + "grad_norm": 1.1146855770453603, + "learning_rate": 0.00018983715722510267, + "loss": 0.7336, + "step": 3590 + }, + { + "epoch": 0.23059653624118023, + "grad_norm": 1.0183157286000422, + "learning_rate": 0.00018978792762526297, + "loss": 0.7608, + "step": 3595 + }, + { + "epoch": 0.23091725465041693, + "grad_norm": 0.5987067875621542, + "learning_rate": 0.000189738585492291, + "loss": 0.7482, + "step": 3600 + }, + { + "epoch": 0.23123797305965363, + "grad_norm": 1.2051854914953493, + "learning_rate": 0.0001896891308880283, + "loss": 0.6866, + "step": 3605 + }, + { + "epoch": 0.23155869146889033, + "grad_norm": 0.6469997389423526, + "learning_rate": 0.00018963956387445743, + "loss": 0.5533, + "step": 3610 + }, + { + "epoch": 0.231879409878127, + "grad_norm": 0.751435050187464, + "learning_rate": 0.00018958988451370172, + "loss": 0.5345, + "step": 3615 + }, + { + "epoch": 0.2322001282873637, + "grad_norm": 0.9296699512717883, + "learning_rate": 0.00018954009286802545, + "loss": 0.6395, + "step": 3620 + }, + { + "epoch": 0.2325208466966004, + "grad_norm": 0.8523320100136826, + "learning_rate": 0.0001894901889998335, + "loss": 0.6699, + "step": 3625 + }, + { + "epoch": 0.23284156510583706, + "grad_norm": 0.8927205659717501, + "learning_rate": 0.0001894401729716715, + "loss": 0.7016, + "step": 3630 + }, + { + "epoch": 0.23316228351507376, + "grad_norm": 0.9773519130062428, + "learning_rate": 0.00018939004484622556, + "loss": 0.5938, + "step": 3635 + }, + { + "epoch": 0.23348300192431046, + "grad_norm": 1.205672119851859, + "learning_rate": 0.00018933980468632236, + "loss": 0.6659, + "step": 3640 + }, + { + "epoch": 0.23380372033354715, + "grad_norm": 0.7579640404532227, + "learning_rate": 0.00018928945255492898, + "loss": 0.6189, + "step": 3645 + }, + { + "epoch": 0.23412443874278382, + "grad_norm": 0.7167559954703847, + "learning_rate": 0.0001892389885151528, + "loss": 0.7174, + "step": 3650 + }, + { + "epoch": 0.23444515715202052, + "grad_norm": 0.9211676236510546, + "learning_rate": 0.0001891884126302415, + "loss": 0.7194, + "step": 3655 + }, + { + "epoch": 0.23476587556125722, + "grad_norm": 1.0264289808335763, + "learning_rate": 0.00018913772496358293, + "loss": 0.7518, + "step": 3660 + }, + { + "epoch": 0.23508659397049392, + "grad_norm": 0.7037785727516465, + "learning_rate": 0.000189086925578705, + "loss": 0.6463, + "step": 3665 + }, + { + "epoch": 0.2354073123797306, + "grad_norm": 0.7939519982595736, + "learning_rate": 0.0001890360145392757, + "loss": 0.6679, + "step": 3670 + }, + { + "epoch": 0.23572803078896729, + "grad_norm": 0.9346634485226615, + "learning_rate": 0.00018898499190910285, + "loss": 0.6707, + "step": 3675 + }, + { + "epoch": 0.23604874919820398, + "grad_norm": 0.9205144038862676, + "learning_rate": 0.00018893385775213428, + "loss": 0.5932, + "step": 3680 + }, + { + "epoch": 0.23636946760744068, + "grad_norm": 0.7662986014450179, + "learning_rate": 0.00018888261213245751, + "loss": 0.626, + "step": 3685 + }, + { + "epoch": 0.23669018601667735, + "grad_norm": 0.9540864146877855, + "learning_rate": 0.00018883125511429976, + "loss": 0.6775, + "step": 3690 + }, + { + "epoch": 0.23701090442591405, + "grad_norm": 0.8236472390358622, + "learning_rate": 0.0001887797867620279, + "loss": 0.5783, + "step": 3695 + }, + { + "epoch": 0.23733162283515075, + "grad_norm": 1.1046319576589374, + "learning_rate": 0.00018872820714014828, + "loss": 0.7178, + "step": 3700 + }, + { + "epoch": 0.23765234124438742, + "grad_norm": 0.8687058181792315, + "learning_rate": 0.0001886765163133068, + "loss": 0.7188, + "step": 3705 + }, + { + "epoch": 0.2379730596536241, + "grad_norm": 0.8074055463421766, + "learning_rate": 0.0001886247143462886, + "loss": 0.6839, + "step": 3710 + }, + { + "epoch": 0.2382937780628608, + "grad_norm": 0.9477091526553252, + "learning_rate": 0.0001885728013040183, + "loss": 0.694, + "step": 3715 + }, + { + "epoch": 0.2386144964720975, + "grad_norm": 1.4070444194213776, + "learning_rate": 0.00018852077725155955, + "loss": 0.6443, + "step": 3720 + }, + { + "epoch": 0.23893521488133418, + "grad_norm": 0.7885481772614231, + "learning_rate": 0.00018846864225411522, + "loss": 0.6975, + "step": 3725 + }, + { + "epoch": 0.23925593329057088, + "grad_norm": 1.416662073982706, + "learning_rate": 0.0001884163963770272, + "loss": 0.5101, + "step": 3730 + }, + { + "epoch": 0.23957665169980757, + "grad_norm": 1.1458969994696415, + "learning_rate": 0.00018836403968577642, + "loss": 0.6615, + "step": 3735 + }, + { + "epoch": 0.23989737010904427, + "grad_norm": 0.8353107592687541, + "learning_rate": 0.00018831157224598265, + "loss": 0.6361, + "step": 3740 + }, + { + "epoch": 0.24021808851828094, + "grad_norm": 0.9588837283118316, + "learning_rate": 0.0001882589941234044, + "loss": 0.6013, + "step": 3745 + }, + { + "epoch": 0.24053880692751764, + "grad_norm": 0.9378372320194371, + "learning_rate": 0.00018820630538393896, + "loss": 0.6638, + "step": 3750 + }, + { + "epoch": 0.24085952533675434, + "grad_norm": 0.657630819098, + "learning_rate": 0.0001881535060936223, + "loss": 0.6291, + "step": 3755 + }, + { + "epoch": 0.241180243745991, + "grad_norm": 0.8483718480641205, + "learning_rate": 0.00018810059631862885, + "loss": 0.7489, + "step": 3760 + }, + { + "epoch": 0.2415009621552277, + "grad_norm": 0.6502718844446955, + "learning_rate": 0.0001880475761252716, + "loss": 0.7414, + "step": 3765 + }, + { + "epoch": 0.2418216805644644, + "grad_norm": 1.1168778404379636, + "learning_rate": 0.00018799444558000188, + "loss": 0.5148, + "step": 3770 + }, + { + "epoch": 0.2421423989737011, + "grad_norm": 0.7913864245267141, + "learning_rate": 0.00018794120474940936, + "loss": 0.7854, + "step": 3775 + }, + { + "epoch": 0.24246311738293777, + "grad_norm": 0.6448828952136001, + "learning_rate": 0.00018788785370022187, + "loss": 0.7078, + "step": 3780 + }, + { + "epoch": 0.24278383579217447, + "grad_norm": 1.5060141096885609, + "learning_rate": 0.00018783439249930544, + "loss": 0.6149, + "step": 3785 + }, + { + "epoch": 0.24310455420141117, + "grad_norm": 1.1449759900992198, + "learning_rate": 0.00018778082121366415, + "loss": 0.6848, + "step": 3790 + }, + { + "epoch": 0.24342527261064786, + "grad_norm": 0.8978384550293506, + "learning_rate": 0.00018772713991044006, + "loss": 0.5786, + "step": 3795 + }, + { + "epoch": 0.24374599101988453, + "grad_norm": 1.0307173194583823, + "learning_rate": 0.0001876733486569131, + "loss": 0.6089, + "step": 3800 + }, + { + "epoch": 0.24406670942912123, + "grad_norm": 1.0460496173819018, + "learning_rate": 0.00018761944752050092, + "loss": 0.7205, + "step": 3805 + }, + { + "epoch": 0.24438742783835793, + "grad_norm": 0.7905784500183457, + "learning_rate": 0.00018756543656875903, + "loss": 0.6866, + "step": 3810 + }, + { + "epoch": 0.2447081462475946, + "grad_norm": 0.8146037687112702, + "learning_rate": 0.0001875113158693805, + "loss": 0.6722, + "step": 3815 + }, + { + "epoch": 0.2450288646568313, + "grad_norm": 0.6700527883378358, + "learning_rate": 0.00018745708549019598, + "loss": 0.69, + "step": 3820 + }, + { + "epoch": 0.245349583066068, + "grad_norm": 0.86059539710882, + "learning_rate": 0.00018740274549917355, + "loss": 0.6951, + "step": 3825 + }, + { + "epoch": 0.2456703014753047, + "grad_norm": 0.754486021920581, + "learning_rate": 0.00018734829596441869, + "loss": 0.669, + "step": 3830 + }, + { + "epoch": 0.24599101988454136, + "grad_norm": 1.2671234138000913, + "learning_rate": 0.00018729373695417411, + "loss": 0.53, + "step": 3835 + }, + { + "epoch": 0.24631173829377806, + "grad_norm": 0.6932982987761634, + "learning_rate": 0.0001872390685368199, + "loss": 0.6588, + "step": 3840 + }, + { + "epoch": 0.24663245670301476, + "grad_norm": 0.8973942648351731, + "learning_rate": 0.00018718429078087306, + "loss": 0.759, + "step": 3845 + }, + { + "epoch": 0.24695317511225146, + "grad_norm": 0.8232879633687452, + "learning_rate": 0.00018712940375498777, + "loss": 0.7228, + "step": 3850 + }, + { + "epoch": 0.24727389352148813, + "grad_norm": 0.6326649992249508, + "learning_rate": 0.0001870744075279551, + "loss": 0.7392, + "step": 3855 + }, + { + "epoch": 0.24759461193072482, + "grad_norm": 1.097141467166474, + "learning_rate": 0.000187019302168703, + "loss": 0.6787, + "step": 3860 + }, + { + "epoch": 0.24791533033996152, + "grad_norm": 0.3009107744843191, + "learning_rate": 0.00018696408774629623, + "loss": 0.5101, + "step": 3865 + }, + { + "epoch": 0.2482360487491982, + "grad_norm": 0.8763665765416497, + "learning_rate": 0.00018690876432993616, + "loss": 0.6693, + "step": 3870 + }, + { + "epoch": 0.2485567671584349, + "grad_norm": 0.8358957515633696, + "learning_rate": 0.00018685333198896085, + "loss": 0.4624, + "step": 3875 + }, + { + "epoch": 0.24887748556767159, + "grad_norm": 0.7954157351888587, + "learning_rate": 0.00018679779079284478, + "loss": 0.6448, + "step": 3880 + }, + { + "epoch": 0.24919820397690828, + "grad_norm": 0.8015671945298257, + "learning_rate": 0.00018674214081119899, + "loss": 0.7378, + "step": 3885 + }, + { + "epoch": 0.24951892238614495, + "grad_norm": 0.4176253877935304, + "learning_rate": 0.00018668638211377075, + "loss": 0.6243, + "step": 3890 + }, + { + "epoch": 0.24983964079538165, + "grad_norm": 0.9442754652275936, + "learning_rate": 0.00018663051477044363, + "loss": 0.7179, + "step": 3895 + }, + { + "epoch": 0.2501603592046183, + "grad_norm": 0.4823245844586911, + "learning_rate": 0.00018657453885123743, + "loss": 0.6911, + "step": 3900 + }, + { + "epoch": 0.250481077613855, + "grad_norm": 1.2379921804802545, + "learning_rate": 0.00018651845442630788, + "loss": 0.7287, + "step": 3905 + }, + { + "epoch": 0.2508017960230917, + "grad_norm": 0.8025900155844875, + "learning_rate": 0.00018646226156594683, + "loss": 0.6996, + "step": 3910 + }, + { + "epoch": 0.2511225144323284, + "grad_norm": 0.7107570481507937, + "learning_rate": 0.00018640596034058202, + "loss": 0.6547, + "step": 3915 + }, + { + "epoch": 0.2514432328415651, + "grad_norm": 1.0641358272949475, + "learning_rate": 0.00018634955082077694, + "loss": 0.6644, + "step": 3920 + }, + { + "epoch": 0.2517639512508018, + "grad_norm": 0.47480734009901776, + "learning_rate": 0.00018629303307723087, + "loss": 0.573, + "step": 3925 + }, + { + "epoch": 0.2520846696600385, + "grad_norm": 0.793188561410365, + "learning_rate": 0.0001862364071807787, + "loss": 0.5214, + "step": 3930 + }, + { + "epoch": 0.25240538806927515, + "grad_norm": 1.0592935580458442, + "learning_rate": 0.00018617967320239088, + "loss": 0.7271, + "step": 3935 + }, + { + "epoch": 0.25272610647851185, + "grad_norm": 1.2256726599433683, + "learning_rate": 0.00018612283121317334, + "loss": 0.6422, + "step": 3940 + }, + { + "epoch": 0.25304682488774854, + "grad_norm": 0.7519903384129473, + "learning_rate": 0.00018606588128436733, + "loss": 0.5867, + "step": 3945 + }, + { + "epoch": 0.25336754329698524, + "grad_norm": 0.7245403184900441, + "learning_rate": 0.00018600882348734942, + "loss": 0.595, + "step": 3950 + }, + { + "epoch": 0.25368826170622194, + "grad_norm": 0.8118238034713691, + "learning_rate": 0.0001859516578936314, + "loss": 0.6789, + "step": 3955 + }, + { + "epoch": 0.25400898011545864, + "grad_norm": 0.94671989401086, + "learning_rate": 0.0001858943845748601, + "loss": 0.5563, + "step": 3960 + }, + { + "epoch": 0.25432969852469534, + "grad_norm": 1.2366250568429358, + "learning_rate": 0.00018583700360281743, + "loss": 0.7508, + "step": 3965 + }, + { + "epoch": 0.25465041693393203, + "grad_norm": 0.79253106009907, + "learning_rate": 0.00018577951504942014, + "loss": 0.8067, + "step": 3970 + }, + { + "epoch": 0.2549711353431687, + "grad_norm": 0.8702530726486416, + "learning_rate": 0.0001857219189867199, + "loss": 0.617, + "step": 3975 + }, + { + "epoch": 0.2552918537524054, + "grad_norm": 1.0941049074741396, + "learning_rate": 0.0001856642154869031, + "loss": 0.6722, + "step": 3980 + }, + { + "epoch": 0.25561257216164207, + "grad_norm": 0.8439431895631772, + "learning_rate": 0.00018560640462229072, + "loss": 0.4939, + "step": 3985 + }, + { + "epoch": 0.25593329057087877, + "grad_norm": 0.6351905484581176, + "learning_rate": 0.00018554848646533842, + "loss": 0.6447, + "step": 3990 + }, + { + "epoch": 0.25625400898011547, + "grad_norm": 0.5405523691592523, + "learning_rate": 0.00018549046108863623, + "loss": 0.619, + "step": 3995 + }, + { + "epoch": 0.25657472738935216, + "grad_norm": 0.9663208760661458, + "learning_rate": 0.00018543232856490857, + "loss": 0.7077, + "step": 4000 + }, + { + "epoch": 0.25689544579858886, + "grad_norm": 1.1847646315539586, + "learning_rate": 0.00018537408896701426, + "loss": 0.645, + "step": 4005 + }, + { + "epoch": 0.2572161642078255, + "grad_norm": 0.9615403982388305, + "learning_rate": 0.00018531574236794614, + "loss": 0.6811, + "step": 4010 + }, + { + "epoch": 0.2575368826170622, + "grad_norm": 0.8358212875135942, + "learning_rate": 0.0001852572888408313, + "loss": 0.7614, + "step": 4015 + }, + { + "epoch": 0.2578576010262989, + "grad_norm": 0.654849517944886, + "learning_rate": 0.00018519872845893084, + "loss": 0.7217, + "step": 4020 + }, + { + "epoch": 0.2581783194355356, + "grad_norm": 1.2575079996892056, + "learning_rate": 0.00018514006129563966, + "loss": 0.6607, + "step": 4025 + }, + { + "epoch": 0.2584990378447723, + "grad_norm": 0.9922068320402926, + "learning_rate": 0.00018508128742448664, + "loss": 0.837, + "step": 4030 + }, + { + "epoch": 0.258819756254009, + "grad_norm": 0.6769732353504583, + "learning_rate": 0.00018502240691913423, + "loss": 0.5391, + "step": 4035 + }, + { + "epoch": 0.2591404746632457, + "grad_norm": 1.0085400425349142, + "learning_rate": 0.00018496341985337872, + "loss": 0.6348, + "step": 4040 + }, + { + "epoch": 0.2594611930724824, + "grad_norm": 1.0848700957447277, + "learning_rate": 0.00018490432630114987, + "loss": 0.6778, + "step": 4045 + }, + { + "epoch": 0.25978191148171903, + "grad_norm": 2.0271957707532953, + "learning_rate": 0.00018484512633651083, + "loss": 0.654, + "step": 4050 + }, + { + "epoch": 0.2601026298909557, + "grad_norm": 0.7805695373329654, + "learning_rate": 0.00018478582003365822, + "loss": 0.7096, + "step": 4055 + }, + { + "epoch": 0.2604233483001924, + "grad_norm": 0.9870035129297559, + "learning_rate": 0.0001847264074669219, + "loss": 0.6384, + "step": 4060 + }, + { + "epoch": 0.2607440667094291, + "grad_norm": 1.4231275295206969, + "learning_rate": 0.00018466688871076492, + "loss": 0.7516, + "step": 4065 + }, + { + "epoch": 0.2610647851186658, + "grad_norm": 0.9526984436593213, + "learning_rate": 0.00018460726383978337, + "loss": 0.7593, + "step": 4070 + }, + { + "epoch": 0.2613855035279025, + "grad_norm": 0.8092373561884175, + "learning_rate": 0.00018454753292870645, + "loss": 0.7056, + "step": 4075 + }, + { + "epoch": 0.2617062219371392, + "grad_norm": 1.0372403017182314, + "learning_rate": 0.0001844876960523961, + "loss": 0.7301, + "step": 4080 + }, + { + "epoch": 0.26202694034637586, + "grad_norm": 1.0864230414581424, + "learning_rate": 0.0001844277532858472, + "loss": 0.7108, + "step": 4085 + }, + { + "epoch": 0.26234765875561256, + "grad_norm": 1.1180610427980169, + "learning_rate": 0.00018436770470418734, + "loss": 0.6945, + "step": 4090 + }, + { + "epoch": 0.26266837716484925, + "grad_norm": 0.7213205274182185, + "learning_rate": 0.00018430755038267664, + "loss": 0.5532, + "step": 4095 + }, + { + "epoch": 0.26298909557408595, + "grad_norm": 1.1163686122257008, + "learning_rate": 0.00018424729039670786, + "loss": 0.6516, + "step": 4100 + }, + { + "epoch": 0.26330981398332265, + "grad_norm": 1.2583036183921432, + "learning_rate": 0.00018418692482180605, + "loss": 0.6414, + "step": 4105 + }, + { + "epoch": 0.26363053239255935, + "grad_norm": 0.9930140372439703, + "learning_rate": 0.0001841264537336287, + "loss": 0.6207, + "step": 4110 + }, + { + "epoch": 0.26395125080179604, + "grad_norm": 1.0089622154428168, + "learning_rate": 0.00018406587720796555, + "loss": 0.584, + "step": 4115 + }, + { + "epoch": 0.2642719692110327, + "grad_norm": 0.7458841041229098, + "learning_rate": 0.00018400519532073845, + "loss": 0.5883, + "step": 4120 + }, + { + "epoch": 0.2645926876202694, + "grad_norm": 0.8089823917563255, + "learning_rate": 0.0001839444081480013, + "loss": 0.7034, + "step": 4125 + }, + { + "epoch": 0.2649134060295061, + "grad_norm": 0.6692062310802624, + "learning_rate": 0.00018388351576594, + "loss": 0.6344, + "step": 4130 + }, + { + "epoch": 0.2652341244387428, + "grad_norm": 1.1933403776576017, + "learning_rate": 0.0001838225182508722, + "loss": 0.6661, + "step": 4135 + }, + { + "epoch": 0.2655548428479795, + "grad_norm": 0.8440572180162611, + "learning_rate": 0.00018376141567924746, + "loss": 0.748, + "step": 4140 + }, + { + "epoch": 0.2658755612572162, + "grad_norm": 0.8186841087339073, + "learning_rate": 0.0001837002081276469, + "loss": 0.7713, + "step": 4145 + }, + { + "epoch": 0.2661962796664529, + "grad_norm": 1.0666433490645642, + "learning_rate": 0.0001836388956727833, + "loss": 0.8609, + "step": 4150 + }, + { + "epoch": 0.26651699807568957, + "grad_norm": 1.1355241254608384, + "learning_rate": 0.00018357747839150082, + "loss": 0.6469, + "step": 4155 + }, + { + "epoch": 0.2668377164849262, + "grad_norm": 0.7464964673319473, + "learning_rate": 0.00018351595636077509, + "loss": 0.5979, + "step": 4160 + }, + { + "epoch": 0.2671584348941629, + "grad_norm": 0.8983502422541593, + "learning_rate": 0.00018345432965771296, + "loss": 0.6956, + "step": 4165 + }, + { + "epoch": 0.2674791533033996, + "grad_norm": 1.0667530685360391, + "learning_rate": 0.00018339259835955252, + "loss": 0.613, + "step": 4170 + }, + { + "epoch": 0.2677998717126363, + "grad_norm": 0.9132017699113576, + "learning_rate": 0.00018333076254366292, + "loss": 0.7377, + "step": 4175 + }, + { + "epoch": 0.268120590121873, + "grad_norm": 0.820877622590415, + "learning_rate": 0.0001832688222875443, + "loss": 0.6287, + "step": 4180 + }, + { + "epoch": 0.2684413085311097, + "grad_norm": 1.118619920969021, + "learning_rate": 0.00018320677766882777, + "loss": 0.6384, + "step": 4185 + }, + { + "epoch": 0.2687620269403464, + "grad_norm": 1.4366554572404993, + "learning_rate": 0.00018314462876527508, + "loss": 0.6833, + "step": 4190 + }, + { + "epoch": 0.26908274534958304, + "grad_norm": 1.0835964639148083, + "learning_rate": 0.00018308237565477887, + "loss": 0.5727, + "step": 4195 + }, + { + "epoch": 0.26940346375881974, + "grad_norm": 0.9256686315486947, + "learning_rate": 0.00018302001841536222, + "loss": 0.6766, + "step": 4200 + }, + { + "epoch": 0.26972418216805644, + "grad_norm": 0.9133924374197757, + "learning_rate": 0.00018295755712517887, + "loss": 0.6114, + "step": 4205 + }, + { + "epoch": 0.27004490057729313, + "grad_norm": 0.9886601065235708, + "learning_rate": 0.00018289499186251282, + "loss": 0.6487, + "step": 4210 + }, + { + "epoch": 0.27036561898652983, + "grad_norm": 0.7921503565458989, + "learning_rate": 0.00018283232270577854, + "loss": 0.5979, + "step": 4215 + }, + { + "epoch": 0.27068633739576653, + "grad_norm": 0.6150099468882971, + "learning_rate": 0.00018276954973352053, + "loss": 0.6981, + "step": 4220 + }, + { + "epoch": 0.2710070558050032, + "grad_norm": 1.0834800425960802, + "learning_rate": 0.00018270667302441355, + "loss": 0.5754, + "step": 4225 + }, + { + "epoch": 0.27132777421423987, + "grad_norm": 1.6569395813805736, + "learning_rate": 0.00018264369265726232, + "loss": 0.6754, + "step": 4230 + }, + { + "epoch": 0.27164849262347657, + "grad_norm": 1.1904706994873762, + "learning_rate": 0.0001825806087110015, + "loss": 0.6955, + "step": 4235 + }, + { + "epoch": 0.27196921103271327, + "grad_norm": 0.9036845887010689, + "learning_rate": 0.00018251742126469553, + "loss": 0.6245, + "step": 4240 + }, + { + "epoch": 0.27228992944194996, + "grad_norm": 1.2154289806047023, + "learning_rate": 0.00018245413039753858, + "loss": 0.6966, + "step": 4245 + }, + { + "epoch": 0.27261064785118666, + "grad_norm": 0.7781670764658554, + "learning_rate": 0.00018239073618885447, + "loss": 0.5014, + "step": 4250 + }, + { + "epoch": 0.27293136626042336, + "grad_norm": 0.9312674308580604, + "learning_rate": 0.00018232723871809654, + "loss": 0.7177, + "step": 4255 + }, + { + "epoch": 0.27325208466966006, + "grad_norm": 0.7997579086131462, + "learning_rate": 0.00018226363806484749, + "loss": 0.6622, + "step": 4260 + }, + { + "epoch": 0.27357280307889675, + "grad_norm": 1.1414064891921076, + "learning_rate": 0.00018219993430881935, + "loss": 0.7326, + "step": 4265 + }, + { + "epoch": 0.2738935214881334, + "grad_norm": 0.8488220516302005, + "learning_rate": 0.00018213612752985346, + "loss": 0.6111, + "step": 4270 + }, + { + "epoch": 0.2742142398973701, + "grad_norm": 0.6785943182404776, + "learning_rate": 0.00018207221780792022, + "loss": 0.568, + "step": 4275 + }, + { + "epoch": 0.2745349583066068, + "grad_norm": 0.7407135493281501, + "learning_rate": 0.00018200820522311907, + "loss": 0.9428, + "step": 4280 + }, + { + "epoch": 0.2748556767158435, + "grad_norm": 0.7785838981084623, + "learning_rate": 0.00018194408985567826, + "loss": 0.6602, + "step": 4285 + }, + { + "epoch": 0.2751763951250802, + "grad_norm": 1.3274741440702664, + "learning_rate": 0.00018187987178595506, + "loss": 0.6326, + "step": 4290 + }, + { + "epoch": 0.2754971135343169, + "grad_norm": 0.7698326162883183, + "learning_rate": 0.00018181555109443527, + "loss": 0.7828, + "step": 4295 + }, + { + "epoch": 0.2758178319435536, + "grad_norm": 0.9874438661020553, + "learning_rate": 0.00018175112786173345, + "loss": 0.6177, + "step": 4300 + }, + { + "epoch": 0.2761385503527902, + "grad_norm": 1.2983806783457539, + "learning_rate": 0.0001816866021685926, + "loss": 0.5931, + "step": 4305 + }, + { + "epoch": 0.2764592687620269, + "grad_norm": 0.6650133276949847, + "learning_rate": 0.00018162197409588414, + "loss": 0.6065, + "step": 4310 + }, + { + "epoch": 0.2767799871712636, + "grad_norm": 0.6615532414642794, + "learning_rate": 0.0001815572437246078, + "loss": 0.6777, + "step": 4315 + }, + { + "epoch": 0.2771007055805003, + "grad_norm": 0.9856674878658384, + "learning_rate": 0.00018149241113589158, + "loss": 0.7992, + "step": 4320 + }, + { + "epoch": 0.277421423989737, + "grad_norm": 0.9736624117716728, + "learning_rate": 0.00018142747641099156, + "loss": 0.6433, + "step": 4325 + }, + { + "epoch": 0.2777421423989737, + "grad_norm": 0.6411826659070557, + "learning_rate": 0.00018136243963129176, + "loss": 0.6934, + "step": 4330 + }, + { + "epoch": 0.2780628608082104, + "grad_norm": 1.1535749419623087, + "learning_rate": 0.00018129730087830423, + "loss": 0.6763, + "step": 4335 + }, + { + "epoch": 0.2783835792174471, + "grad_norm": 0.9545043501616219, + "learning_rate": 0.00018123206023366875, + "loss": 0.6913, + "step": 4340 + }, + { + "epoch": 0.27870429762668375, + "grad_norm": 0.8726709507710128, + "learning_rate": 0.00018116671777915279, + "loss": 0.6719, + "step": 4345 + }, + { + "epoch": 0.27902501603592045, + "grad_norm": 0.8365717106126314, + "learning_rate": 0.00018110127359665144, + "loss": 0.8124, + "step": 4350 + }, + { + "epoch": 0.27934573444515715, + "grad_norm": 1.2549482014888076, + "learning_rate": 0.00018103572776818734, + "loss": 0.6818, + "step": 4355 + }, + { + "epoch": 0.27966645285439384, + "grad_norm": 1.0842835676700455, + "learning_rate": 0.00018097008037591046, + "loss": 0.6671, + "step": 4360 + }, + { + "epoch": 0.27998717126363054, + "grad_norm": 0.9380406537541407, + "learning_rate": 0.00018090433150209809, + "loss": 0.6949, + "step": 4365 + }, + { + "epoch": 0.28030788967286724, + "grad_norm": 1.150794578223368, + "learning_rate": 0.00018083848122915468, + "loss": 0.7515, + "step": 4370 + }, + { + "epoch": 0.28062860808210394, + "grad_norm": 0.8083227750174746, + "learning_rate": 0.0001807725296396118, + "loss": 0.7616, + "step": 4375 + }, + { + "epoch": 0.2809493264913406, + "grad_norm": 0.7534176713677331, + "learning_rate": 0.000180706476816128, + "loss": 0.7793, + "step": 4380 + }, + { + "epoch": 0.2812700449005773, + "grad_norm": 0.8339195487244033, + "learning_rate": 0.00018064032284148868, + "loss": 0.6498, + "step": 4385 + }, + { + "epoch": 0.281590763309814, + "grad_norm": 1.0737472499663367, + "learning_rate": 0.00018057406779860603, + "loss": 0.717, + "step": 4390 + }, + { + "epoch": 0.28191148171905067, + "grad_norm": 0.9978477560799941, + "learning_rate": 0.00018050771177051896, + "loss": 0.5892, + "step": 4395 + }, + { + "epoch": 0.28223220012828737, + "grad_norm": 1.3027101386742324, + "learning_rate": 0.00018044125484039284, + "loss": 0.7084, + "step": 4400 + }, + { + "epoch": 0.28255291853752407, + "grad_norm": 0.930029771124351, + "learning_rate": 0.0001803746970915196, + "loss": 0.6916, + "step": 4405 + }, + { + "epoch": 0.28287363694676076, + "grad_norm": 0.7778850969886842, + "learning_rate": 0.00018030803860731744, + "loss": 0.7685, + "step": 4410 + }, + { + "epoch": 0.2831943553559974, + "grad_norm": 0.7650986542927773, + "learning_rate": 0.00018024127947133096, + "loss": 0.6537, + "step": 4415 + }, + { + "epoch": 0.2835150737652341, + "grad_norm": 1.5408988991120984, + "learning_rate": 0.00018017441976723073, + "loss": 0.7775, + "step": 4420 + }, + { + "epoch": 0.2838357921744708, + "grad_norm": 1.2912216339714508, + "learning_rate": 0.0001801074595788135, + "loss": 0.6968, + "step": 4425 + }, + { + "epoch": 0.2841565105837075, + "grad_norm": 1.0528277674684878, + "learning_rate": 0.00018004039899000186, + "loss": 0.6352, + "step": 4430 + }, + { + "epoch": 0.2844772289929442, + "grad_norm": 0.9968577641995723, + "learning_rate": 0.00017997323808484434, + "loss": 0.681, + "step": 4435 + }, + { + "epoch": 0.2847979474021809, + "grad_norm": 0.7048566927661232, + "learning_rate": 0.0001799059769475151, + "loss": 0.589, + "step": 4440 + }, + { + "epoch": 0.2851186658114176, + "grad_norm": 1.2752536855080614, + "learning_rate": 0.00017983861566231397, + "loss": 0.6021, + "step": 4445 + }, + { + "epoch": 0.2854393842206543, + "grad_norm": 0.6838772733375945, + "learning_rate": 0.0001797711543136663, + "loss": 0.62, + "step": 4450 + }, + { + "epoch": 0.28576010262989093, + "grad_norm": 1.0992940781905054, + "learning_rate": 0.00017970359298612282, + "loss": 0.7695, + "step": 4455 + }, + { + "epoch": 0.28608082103912763, + "grad_norm": 0.9891320713998334, + "learning_rate": 0.00017963593176435964, + "loss": 0.7417, + "step": 4460 + }, + { + "epoch": 0.28640153944836433, + "grad_norm": 1.0219509493165506, + "learning_rate": 0.00017956817073317793, + "loss": 0.8078, + "step": 4465 + }, + { + "epoch": 0.286722257857601, + "grad_norm": 0.601838514745307, + "learning_rate": 0.00017950030997750414, + "loss": 0.6521, + "step": 4470 + }, + { + "epoch": 0.2870429762668377, + "grad_norm": 0.6658616403524804, + "learning_rate": 0.00017943234958238952, + "loss": 0.4757, + "step": 4475 + }, + { + "epoch": 0.2873636946760744, + "grad_norm": 1.007316511383742, + "learning_rate": 0.00017936428963301036, + "loss": 0.7311, + "step": 4480 + }, + { + "epoch": 0.2876844130853111, + "grad_norm": 1.1189936485732135, + "learning_rate": 0.00017929613021466765, + "loss": 0.6303, + "step": 4485 + }, + { + "epoch": 0.28800513149454776, + "grad_norm": 0.7720709103171642, + "learning_rate": 0.000179227871412787, + "loss": 0.5517, + "step": 4490 + }, + { + "epoch": 0.28832584990378446, + "grad_norm": 0.840259961080622, + "learning_rate": 0.00017915951331291864, + "loss": 0.7003, + "step": 4495 + }, + { + "epoch": 0.28864656831302116, + "grad_norm": 0.7950998217641071, + "learning_rate": 0.00017909105600073726, + "loss": 0.6693, + "step": 4500 + }, + { + "epoch": 0.28896728672225785, + "grad_norm": 0.8828219239731676, + "learning_rate": 0.00017902249956204183, + "loss": 0.613, + "step": 4505 + }, + { + "epoch": 0.28928800513149455, + "grad_norm": 0.8050366826668545, + "learning_rate": 0.0001789538440827557, + "loss": 0.5657, + "step": 4510 + }, + { + "epoch": 0.28960872354073125, + "grad_norm": 1.0967164706749888, + "learning_rate": 0.00017888508964892616, + "loss": 0.8128, + "step": 4515 + }, + { + "epoch": 0.28992944194996795, + "grad_norm": 0.9150715640614145, + "learning_rate": 0.00017881623634672465, + "loss": 0.7572, + "step": 4520 + }, + { + "epoch": 0.29025016035920465, + "grad_norm": 1.2602671775870735, + "learning_rate": 0.00017874728426244647, + "loss": 0.6905, + "step": 4525 + }, + { + "epoch": 0.2905708787684413, + "grad_norm": 0.9346668957570068, + "learning_rate": 0.00017867823348251076, + "loss": 0.7051, + "step": 4530 + }, + { + "epoch": 0.290891597177678, + "grad_norm": 0.7910849436025686, + "learning_rate": 0.00017860908409346034, + "loss": 0.709, + "step": 4535 + }, + { + "epoch": 0.2912123155869147, + "grad_norm": 0.8218374279342303, + "learning_rate": 0.0001785398361819616, + "loss": 0.5839, + "step": 4540 + }, + { + "epoch": 0.2915330339961514, + "grad_norm": 0.8511332345341893, + "learning_rate": 0.0001784704898348045, + "loss": 0.7218, + "step": 4545 + }, + { + "epoch": 0.2918537524053881, + "grad_norm": 1.2396495867604176, + "learning_rate": 0.0001784010451389022, + "loss": 0.5707, + "step": 4550 + }, + { + "epoch": 0.2921744708146248, + "grad_norm": 0.5453795713818735, + "learning_rate": 0.00017833150218129129, + "loss": 0.7248, + "step": 4555 + }, + { + "epoch": 0.2924951892238615, + "grad_norm": 0.8544441259057197, + "learning_rate": 0.00017826186104913142, + "loss": 0.6706, + "step": 4560 + }, + { + "epoch": 0.2928159076330981, + "grad_norm": 0.7078874543955929, + "learning_rate": 0.00017819212182970535, + "loss": 0.6732, + "step": 4565 + }, + { + "epoch": 0.2931366260423348, + "grad_norm": 1.1258864806353122, + "learning_rate": 0.0001781222846104187, + "loss": 0.696, + "step": 4570 + }, + { + "epoch": 0.2934573444515715, + "grad_norm": 0.8952983146425741, + "learning_rate": 0.00017805234947879993, + "loss": 0.6778, + "step": 4575 + }, + { + "epoch": 0.2937780628608082, + "grad_norm": 1.078013753440664, + "learning_rate": 0.0001779823165225003, + "loss": 0.6494, + "step": 4580 + }, + { + "epoch": 0.2940987812700449, + "grad_norm": 1.2457998074637708, + "learning_rate": 0.0001779121858292936, + "loss": 0.6356, + "step": 4585 + }, + { + "epoch": 0.2944194996792816, + "grad_norm": 0.9452414867290724, + "learning_rate": 0.0001778419574870761, + "loss": 0.7049, + "step": 4590 + }, + { + "epoch": 0.2947402180885183, + "grad_norm": 1.0903318911783695, + "learning_rate": 0.00017777163158386647, + "loss": 0.653, + "step": 4595 + }, + { + "epoch": 0.29506093649775494, + "grad_norm": 1.172298521370259, + "learning_rate": 0.00017770120820780573, + "loss": 0.7285, + "step": 4600 + }, + { + "epoch": 0.29538165490699164, + "grad_norm": 0.6583420678299451, + "learning_rate": 0.00017763068744715697, + "loss": 0.6031, + "step": 4605 + }, + { + "epoch": 0.29570237331622834, + "grad_norm": 0.8591774180151724, + "learning_rate": 0.00017756006939030535, + "loss": 0.7409, + "step": 4610 + }, + { + "epoch": 0.29602309172546504, + "grad_norm": 0.6898541329818539, + "learning_rate": 0.00017748935412575804, + "loss": 0.589, + "step": 4615 + }, + { + "epoch": 0.29634381013470174, + "grad_norm": 0.5395272492697519, + "learning_rate": 0.000177418541742144, + "loss": 0.708, + "step": 4620 + }, + { + "epoch": 0.29666452854393843, + "grad_norm": 1.0169898045901036, + "learning_rate": 0.0001773476323282138, + "loss": 0.6948, + "step": 4625 + }, + { + "epoch": 0.29698524695317513, + "grad_norm": 1.0000948614259928, + "learning_rate": 0.00017727662597283986, + "loss": 0.7215, + "step": 4630 + }, + { + "epoch": 0.29730596536241183, + "grad_norm": 0.9689865733719959, + "learning_rate": 0.00017720552276501592, + "loss": 0.6701, + "step": 4635 + }, + { + "epoch": 0.29762668377164847, + "grad_norm": 0.6557948134140331, + "learning_rate": 0.00017713432279385712, + "loss": 0.6235, + "step": 4640 + }, + { + "epoch": 0.29794740218088517, + "grad_norm": 1.1877573091679572, + "learning_rate": 0.00017706302614859992, + "loss": 0.7863, + "step": 4645 + }, + { + "epoch": 0.29826812059012187, + "grad_norm": 0.8462973100804213, + "learning_rate": 0.00017699163291860198, + "loss": 0.5724, + "step": 4650 + }, + { + "epoch": 0.29858883899935856, + "grad_norm": 0.9236445624740109, + "learning_rate": 0.0001769201431933419, + "loss": 0.5787, + "step": 4655 + }, + { + "epoch": 0.29890955740859526, + "grad_norm": 1.0716376234952218, + "learning_rate": 0.00017684855706241934, + "loss": 0.7401, + "step": 4660 + }, + { + "epoch": 0.29923027581783196, + "grad_norm": 1.1600311786248418, + "learning_rate": 0.00017677687461555467, + "loss": 0.708, + "step": 4665 + }, + { + "epoch": 0.29955099422706866, + "grad_norm": 0.7413385734559219, + "learning_rate": 0.00017670509594258912, + "loss": 0.5718, + "step": 4670 + }, + { + "epoch": 0.2998717126363053, + "grad_norm": 0.9348593211146833, + "learning_rate": 0.00017663322113348434, + "loss": 0.7492, + "step": 4675 + }, + { + "epoch": 0.300192431045542, + "grad_norm": 1.5696315279326167, + "learning_rate": 0.0001765612502783226, + "loss": 0.6552, + "step": 4680 + }, + { + "epoch": 0.3005131494547787, + "grad_norm": 1.0990775256909542, + "learning_rate": 0.00017648918346730653, + "loss": 0.582, + "step": 4685 + }, + { + "epoch": 0.3008338678640154, + "grad_norm": 0.7467674097224691, + "learning_rate": 0.00017641702079075904, + "loss": 0.6326, + "step": 4690 + }, + { + "epoch": 0.3011545862732521, + "grad_norm": 0.7256436706311058, + "learning_rate": 0.00017634476233912308, + "loss": 0.7717, + "step": 4695 + }, + { + "epoch": 0.3014753046824888, + "grad_norm": 0.754840650778496, + "learning_rate": 0.00017627240820296177, + "loss": 0.6896, + "step": 4700 + }, + { + "epoch": 0.3017960230917255, + "grad_norm": 0.7072150395545665, + "learning_rate": 0.0001761999584729581, + "loss": 0.6332, + "step": 4705 + }, + { + "epoch": 0.3021167415009622, + "grad_norm": 1.2009873604762311, + "learning_rate": 0.00017612741323991488, + "loss": 0.6393, + "step": 4710 + }, + { + "epoch": 0.3024374599101988, + "grad_norm": 0.6086745243060716, + "learning_rate": 0.0001760547725947545, + "loss": 0.6681, + "step": 4715 + }, + { + "epoch": 0.3027581783194355, + "grad_norm": 0.9853085984018423, + "learning_rate": 0.0001759820366285192, + "loss": 0.5961, + "step": 4720 + }, + { + "epoch": 0.3030788967286722, + "grad_norm": 1.0109466174974706, + "learning_rate": 0.00017590920543237036, + "loss": 0.7225, + "step": 4725 + }, + { + "epoch": 0.3033996151379089, + "grad_norm": 1.2139597067132748, + "learning_rate": 0.00017583627909758902, + "loss": 0.6542, + "step": 4730 + }, + { + "epoch": 0.3037203335471456, + "grad_norm": 0.9478885183065455, + "learning_rate": 0.00017576325771557518, + "loss": 0.6881, + "step": 4735 + }, + { + "epoch": 0.3040410519563823, + "grad_norm": 0.8539507613861936, + "learning_rate": 0.00017569014137784822, + "loss": 0.6331, + "step": 4740 + }, + { + "epoch": 0.304361770365619, + "grad_norm": 0.9679885840401695, + "learning_rate": 0.00017561693017604637, + "loss": 0.7997, + "step": 4745 + }, + { + "epoch": 0.30468248877485565, + "grad_norm": 0.9422216475894025, + "learning_rate": 0.00017554362420192676, + "loss": 0.6769, + "step": 4750 + }, + { + "epoch": 0.30500320718409235, + "grad_norm": 1.0998446041770769, + "learning_rate": 0.00017547022354736538, + "loss": 0.6072, + "step": 4755 + }, + { + "epoch": 0.30532392559332905, + "grad_norm": 1.0857238442878236, + "learning_rate": 0.00017539672830435682, + "loss": 0.7689, + "step": 4760 + }, + { + "epoch": 0.30564464400256575, + "grad_norm": 0.7440444931879342, + "learning_rate": 0.00017532313856501427, + "loss": 0.5841, + "step": 4765 + }, + { + "epoch": 0.30596536241180244, + "grad_norm": 0.7172978744287396, + "learning_rate": 0.0001752494544215693, + "loss": 0.6583, + "step": 4770 + }, + { + "epoch": 0.30628608082103914, + "grad_norm": 1.2045039512423583, + "learning_rate": 0.00017517567596637184, + "loss": 0.6052, + "step": 4775 + }, + { + "epoch": 0.30660679923027584, + "grad_norm": 0.6334336485782317, + "learning_rate": 0.00017510180329189, + "loss": 0.6194, + "step": 4780 + }, + { + "epoch": 0.3069275176395125, + "grad_norm": 1.3899325242838065, + "learning_rate": 0.00017502783649070994, + "loss": 0.7102, + "step": 4785 + }, + { + "epoch": 0.3072482360487492, + "grad_norm": 1.1877009077958471, + "learning_rate": 0.00017495377565553594, + "loss": 0.683, + "step": 4790 + }, + { + "epoch": 0.3075689544579859, + "grad_norm": 1.1043105680832985, + "learning_rate": 0.00017487962087918993, + "loss": 0.6165, + "step": 4795 + }, + { + "epoch": 0.3078896728672226, + "grad_norm": 0.9571802341999754, + "learning_rate": 0.00017480537225461178, + "loss": 0.499, + "step": 4800 + }, + { + "epoch": 0.3082103912764593, + "grad_norm": 1.0846077393930171, + "learning_rate": 0.00017473102987485876, + "loss": 0.7685, + "step": 4805 + }, + { + "epoch": 0.30853110968569597, + "grad_norm": 0.9095961738585777, + "learning_rate": 0.00017465659383310587, + "loss": 0.6373, + "step": 4810 + }, + { + "epoch": 0.30885182809493267, + "grad_norm": 1.1872255037042634, + "learning_rate": 0.00017458206422264533, + "loss": 0.6564, + "step": 4815 + }, + { + "epoch": 0.30917254650416937, + "grad_norm": 1.0600317447426089, + "learning_rate": 0.00017450744113688672, + "loss": 0.6103, + "step": 4820 + }, + { + "epoch": 0.309493264913406, + "grad_norm": 0.89956531270657, + "learning_rate": 0.00017443272466935675, + "loss": 0.7056, + "step": 4825 + }, + { + "epoch": 0.3098139833226427, + "grad_norm": 0.6138048573378617, + "learning_rate": 0.00017435791491369917, + "loss": 0.6437, + "step": 4830 + }, + { + "epoch": 0.3101347017318794, + "grad_norm": 0.6479672204769544, + "learning_rate": 0.00017428301196367464, + "loss": 0.7149, + "step": 4835 + }, + { + "epoch": 0.3104554201411161, + "grad_norm": 0.9059240016877552, + "learning_rate": 0.00017420801591316062, + "loss": 0.6641, + "step": 4840 + }, + { + "epoch": 0.3107761385503528, + "grad_norm": 0.7000331742442105, + "learning_rate": 0.00017413292685615134, + "loss": 0.6227, + "step": 4845 + }, + { + "epoch": 0.3110968569595895, + "grad_norm": 0.8706735159170973, + "learning_rate": 0.00017405774488675742, + "loss": 0.6191, + "step": 4850 + }, + { + "epoch": 0.3114175753688262, + "grad_norm": 0.9657278531523165, + "learning_rate": 0.0001739824700992061, + "loss": 0.5956, + "step": 4855 + }, + { + "epoch": 0.31173829377806284, + "grad_norm": 0.9553637466697323, + "learning_rate": 0.0001739071025878409, + "loss": 0.7627, + "step": 4860 + }, + { + "epoch": 0.31205901218729953, + "grad_norm": 1.1595347795694808, + "learning_rate": 0.00017383164244712146, + "loss": 0.6432, + "step": 4865 + }, + { + "epoch": 0.31237973059653623, + "grad_norm": 1.3557930665103466, + "learning_rate": 0.0001737560897716236, + "loss": 0.6965, + "step": 4870 + }, + { + "epoch": 0.31270044900577293, + "grad_norm": 0.919377290874929, + "learning_rate": 0.00017368044465603915, + "loss": 0.6913, + "step": 4875 + }, + { + "epoch": 0.3130211674150096, + "grad_norm": 0.9179711638304333, + "learning_rate": 0.00017360470719517577, + "loss": 0.5516, + "step": 4880 + }, + { + "epoch": 0.3133418858242463, + "grad_norm": 0.8074363475177312, + "learning_rate": 0.00017352887748395678, + "loss": 0.6421, + "step": 4885 + }, + { + "epoch": 0.313662604233483, + "grad_norm": 1.3217851235374773, + "learning_rate": 0.00017345295561742123, + "loss": 0.7387, + "step": 4890 + }, + { + "epoch": 0.31398332264271966, + "grad_norm": 0.8100107368582629, + "learning_rate": 0.0001733769416907236, + "loss": 0.6104, + "step": 4895 + }, + { + "epoch": 0.31430404105195636, + "grad_norm": 1.0974582938152775, + "learning_rate": 0.0001733008357991338, + "loss": 0.649, + "step": 4900 + }, + { + "epoch": 0.31462475946119306, + "grad_norm": 1.233711986487123, + "learning_rate": 0.00017322463803803688, + "loss": 0.5448, + "step": 4905 + }, + { + "epoch": 0.31494547787042976, + "grad_norm": 0.8777266459889339, + "learning_rate": 0.00017314834850293325, + "loss": 0.7512, + "step": 4910 + }, + { + "epoch": 0.31526619627966646, + "grad_norm": 0.8794148401176598, + "learning_rate": 0.00017307196728943812, + "loss": 0.6314, + "step": 4915 + }, + { + "epoch": 0.31558691468890315, + "grad_norm": 0.7021113325319495, + "learning_rate": 0.00017299549449328175, + "loss": 0.5404, + "step": 4920 + }, + { + "epoch": 0.31590763309813985, + "grad_norm": 0.76819009517203, + "learning_rate": 0.00017291893021030913, + "loss": 0.7646, + "step": 4925 + }, + { + "epoch": 0.31622835150737655, + "grad_norm": 1.3281150753972946, + "learning_rate": 0.00017284227453647993, + "loss": 0.6404, + "step": 4930 + }, + { + "epoch": 0.3165490699166132, + "grad_norm": 0.8777792257027988, + "learning_rate": 0.00017276552756786831, + "loss": 0.7211, + "step": 4935 + }, + { + "epoch": 0.3168697883258499, + "grad_norm": 0.9522765071117524, + "learning_rate": 0.00017268868940066288, + "loss": 0.7659, + "step": 4940 + }, + { + "epoch": 0.3171905067350866, + "grad_norm": 0.7347381221386469, + "learning_rate": 0.0001726117601311666, + "loss": 0.7521, + "step": 4945 + }, + { + "epoch": 0.3175112251443233, + "grad_norm": 0.947686463596072, + "learning_rate": 0.00017253473985579657, + "loss": 0.6981, + "step": 4950 + }, + { + "epoch": 0.31783194355356, + "grad_norm": 0.9948270615790568, + "learning_rate": 0.0001724576286710839, + "loss": 0.5347, + "step": 4955 + }, + { + "epoch": 0.3181526619627967, + "grad_norm": 0.7412951434019396, + "learning_rate": 0.00017238042667367377, + "loss": 0.6563, + "step": 4960 + }, + { + "epoch": 0.3184733803720334, + "grad_norm": 0.9060455966464537, + "learning_rate": 0.00017230313396032504, + "loss": 0.8452, + "step": 4965 + }, + { + "epoch": 0.31879409878127, + "grad_norm": 0.7926379737171755, + "learning_rate": 0.00017222575062791033, + "loss": 0.6834, + "step": 4970 + }, + { + "epoch": 0.3191148171905067, + "grad_norm": 1.1978749811848812, + "learning_rate": 0.00017214827677341582, + "loss": 0.5959, + "step": 4975 + }, + { + "epoch": 0.3194355355997434, + "grad_norm": 1.1382243993856835, + "learning_rate": 0.00017207071249394118, + "loss": 0.8144, + "step": 4980 + }, + { + "epoch": 0.3197562540089801, + "grad_norm": 0.9207041310652729, + "learning_rate": 0.00017199305788669937, + "loss": 0.7515, + "step": 4985 + }, + { + "epoch": 0.3200769724182168, + "grad_norm": 0.7762438521118743, + "learning_rate": 0.00017191531304901653, + "loss": 0.7128, + "step": 4990 + }, + { + "epoch": 0.3203976908274535, + "grad_norm": 1.0657161158728048, + "learning_rate": 0.000171837478078332, + "loss": 0.7206, + "step": 4995 + }, + { + "epoch": 0.3207184092366902, + "grad_norm": 0.8853471042976426, + "learning_rate": 0.00017175955307219796, + "loss": 0.6661, + "step": 5000 + }, + { + "epoch": 0.3210391276459269, + "grad_norm": 0.730931049927295, + "learning_rate": 0.00017168153812827957, + "loss": 0.7177, + "step": 5005 + }, + { + "epoch": 0.32135984605516354, + "grad_norm": 1.24238938271146, + "learning_rate": 0.0001716034333443545, + "loss": 0.7264, + "step": 5010 + }, + { + "epoch": 0.32168056446440024, + "grad_norm": 1.0598509644567646, + "learning_rate": 0.00017152523881831325, + "loss": 0.5868, + "step": 5015 + }, + { + "epoch": 0.32200128287363694, + "grad_norm": 1.142674205123222, + "learning_rate": 0.00017144695464815866, + "loss": 0.7652, + "step": 5020 + }, + { + "epoch": 0.32232200128287364, + "grad_norm": 1.2248444413302872, + "learning_rate": 0.00017136858093200593, + "loss": 0.6078, + "step": 5025 + }, + { + "epoch": 0.32264271969211034, + "grad_norm": 0.9090404485944782, + "learning_rate": 0.00017129011776808258, + "loss": 0.6921, + "step": 5030 + }, + { + "epoch": 0.32296343810134703, + "grad_norm": 1.0978730524660503, + "learning_rate": 0.00017121156525472814, + "loss": 0.7593, + "step": 5035 + }, + { + "epoch": 0.32328415651058373, + "grad_norm": 1.8023280272488704, + "learning_rate": 0.00017113292349039413, + "loss": 0.7583, + "step": 5040 + }, + { + "epoch": 0.3236048749198204, + "grad_norm": 1.0487723489551213, + "learning_rate": 0.000171054192573644, + "loss": 0.7754, + "step": 5045 + }, + { + "epoch": 0.32392559332905707, + "grad_norm": 0.7931120571928945, + "learning_rate": 0.0001709753726031529, + "loss": 0.7182, + "step": 5050 + }, + { + "epoch": 0.32424631173829377, + "grad_norm": 1.3448284362405596, + "learning_rate": 0.00017089646367770756, + "loss": 0.6391, + "step": 5055 + }, + { + "epoch": 0.32456703014753047, + "grad_norm": 0.9771883061194023, + "learning_rate": 0.0001708174658962062, + "loss": 0.632, + "step": 5060 + }, + { + "epoch": 0.32488774855676716, + "grad_norm": 0.944625885099161, + "learning_rate": 0.00017073837935765846, + "loss": 0.6235, + "step": 5065 + }, + { + "epoch": 0.32520846696600386, + "grad_norm": 0.9899695819556337, + "learning_rate": 0.00017065920416118522, + "loss": 0.7345, + "step": 5070 + }, + { + "epoch": 0.32552918537524056, + "grad_norm": 0.5815153267452241, + "learning_rate": 0.00017057994040601838, + "loss": 0.5988, + "step": 5075 + }, + { + "epoch": 0.3258499037844772, + "grad_norm": 0.7182304509869034, + "learning_rate": 0.00017050058819150098, + "loss": 0.5962, + "step": 5080 + }, + { + "epoch": 0.3261706221937139, + "grad_norm": 0.7916342652857238, + "learning_rate": 0.0001704211476170868, + "loss": 0.5903, + "step": 5085 + }, + { + "epoch": 0.3264913406029506, + "grad_norm": 1.186592480709318, + "learning_rate": 0.00017034161878234043, + "loss": 0.7071, + "step": 5090 + }, + { + "epoch": 0.3268120590121873, + "grad_norm": 1.4501384859209354, + "learning_rate": 0.00017026200178693704, + "loss": 0.5699, + "step": 5095 + }, + { + "epoch": 0.327132777421424, + "grad_norm": 0.4770414244602479, + "learning_rate": 0.0001701822967306624, + "loss": 0.6942, + "step": 5100 + }, + { + "epoch": 0.3274534958306607, + "grad_norm": 1.2188679878291713, + "learning_rate": 0.00017010250371341244, + "loss": 0.6633, + "step": 5105 + }, + { + "epoch": 0.3277742142398974, + "grad_norm": 1.0813857287425748, + "learning_rate": 0.0001700226228351935, + "loss": 0.6257, + "step": 5110 + }, + { + "epoch": 0.3280949326491341, + "grad_norm": 0.8540165463861037, + "learning_rate": 0.00016994265419612205, + "loss": 0.5918, + "step": 5115 + }, + { + "epoch": 0.32841565105837073, + "grad_norm": 1.1642007608342173, + "learning_rate": 0.00016986259789642444, + "loss": 0.6911, + "step": 5120 + }, + { + "epoch": 0.3287363694676074, + "grad_norm": 0.8539433327300491, + "learning_rate": 0.00016978245403643694, + "loss": 0.7732, + "step": 5125 + }, + { + "epoch": 0.3290570878768441, + "grad_norm": 1.0202618411725253, + "learning_rate": 0.0001697022227166056, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.3293778062860808, + "grad_norm": 0.8876324268732894, + "learning_rate": 0.00016962190403748605, + "loss": 0.714, + "step": 5135 + }, + { + "epoch": 0.3296985246953175, + "grad_norm": 0.7783501191713772, + "learning_rate": 0.0001695414980997434, + "loss": 0.7987, + "step": 5140 + }, + { + "epoch": 0.3300192431045542, + "grad_norm": 1.204240570280653, + "learning_rate": 0.00016946100500415213, + "loss": 0.6914, + "step": 5145 + }, + { + "epoch": 0.3303399615137909, + "grad_norm": 0.7152048301163425, + "learning_rate": 0.00016938042485159594, + "loss": 0.6703, + "step": 5150 + }, + { + "epoch": 0.33066067992302756, + "grad_norm": 1.191922058294469, + "learning_rate": 0.0001692997577430677, + "loss": 0.6539, + "step": 5155 + }, + { + "epoch": 0.33098139833226425, + "grad_norm": 0.8187793173057333, + "learning_rate": 0.00016921900377966923, + "loss": 0.7468, + "step": 5160 + }, + { + "epoch": 0.33130211674150095, + "grad_norm": 0.9381392106872509, + "learning_rate": 0.00016913816306261112, + "loss": 0.766, + "step": 5165 + }, + { + "epoch": 0.33162283515073765, + "grad_norm": 0.7128118797176758, + "learning_rate": 0.00016905723569321288, + "loss": 0.6719, + "step": 5170 + }, + { + "epoch": 0.33194355355997435, + "grad_norm": 1.500297575057347, + "learning_rate": 0.00016897622177290244, + "loss": 0.7072, + "step": 5175 + }, + { + "epoch": 0.33226427196921104, + "grad_norm": 0.9800774031498481, + "learning_rate": 0.0001688951214032163, + "loss": 0.6549, + "step": 5180 + }, + { + "epoch": 0.33258499037844774, + "grad_norm": 0.8808790723791357, + "learning_rate": 0.00016881393468579932, + "loss": 0.6955, + "step": 5185 + }, + { + "epoch": 0.33290570878768444, + "grad_norm": 0.8920914860291771, + "learning_rate": 0.00016873266172240452, + "loss": 0.5649, + "step": 5190 + }, + { + "epoch": 0.3332264271969211, + "grad_norm": 0.6851960157071083, + "learning_rate": 0.00016865130261489305, + "loss": 0.6897, + "step": 5195 + }, + { + "epoch": 0.3335471456061578, + "grad_norm": 0.8407283098592762, + "learning_rate": 0.00016856985746523405, + "loss": 0.6559, + "step": 5200 + }, + { + "epoch": 0.3338678640153945, + "grad_norm": 0.9215186470532375, + "learning_rate": 0.00016848832637550437, + "loss": 0.7664, + "step": 5205 + }, + { + "epoch": 0.3341885824246312, + "grad_norm": 0.7299164606010856, + "learning_rate": 0.00016840670944788882, + "loss": 0.5981, + "step": 5210 + }, + { + "epoch": 0.3345093008338679, + "grad_norm": 0.8732424966610127, + "learning_rate": 0.00016832500678467952, + "loss": 0.7035, + "step": 5215 + }, + { + "epoch": 0.33483001924310457, + "grad_norm": 0.9750167638289885, + "learning_rate": 0.00016824321848827624, + "loss": 0.5995, + "step": 5220 + }, + { + "epoch": 0.33515073765234127, + "grad_norm": 1.0976388995980935, + "learning_rate": 0.00016816134466118596, + "loss": 0.7107, + "step": 5225 + }, + { + "epoch": 0.3354714560615779, + "grad_norm": 1.0135781126967063, + "learning_rate": 0.00016807938540602292, + "loss": 0.7174, + "step": 5230 + }, + { + "epoch": 0.3357921744708146, + "grad_norm": 0.8189118457664761, + "learning_rate": 0.00016799734082550844, + "loss": 0.6645, + "step": 5235 + }, + { + "epoch": 0.3361128928800513, + "grad_norm": 0.6996919391876488, + "learning_rate": 0.0001679152110224707, + "loss": 0.6629, + "step": 5240 + }, + { + "epoch": 0.336433611289288, + "grad_norm": 0.7381428623848976, + "learning_rate": 0.00016783299609984478, + "loss": 0.6016, + "step": 5245 + }, + { + "epoch": 0.3367543296985247, + "grad_norm": 0.9095764087290898, + "learning_rate": 0.00016775069616067233, + "loss": 0.8577, + "step": 5250 + }, + { + "epoch": 0.3370750481077614, + "grad_norm": 0.7032412366347235, + "learning_rate": 0.00016766831130810171, + "loss": 0.7342, + "step": 5255 + }, + { + "epoch": 0.3373957665169981, + "grad_norm": 0.9697869860649856, + "learning_rate": 0.00016758584164538757, + "loss": 0.6338, + "step": 5260 + }, + { + "epoch": 0.33771648492623474, + "grad_norm": 0.7784503288752077, + "learning_rate": 0.00016750328727589095, + "loss": 0.6666, + "step": 5265 + }, + { + "epoch": 0.33803720333547144, + "grad_norm": 0.5156266401874552, + "learning_rate": 0.00016742064830307897, + "loss": 0.7699, + "step": 5270 + }, + { + "epoch": 0.33835792174470813, + "grad_norm": 1.0003590365934907, + "learning_rate": 0.0001673379248305248, + "loss": 0.6751, + "step": 5275 + }, + { + "epoch": 0.33867864015394483, + "grad_norm": 0.8026066074245787, + "learning_rate": 0.0001672551169619076, + "loss": 0.7573, + "step": 5280 + }, + { + "epoch": 0.33899935856318153, + "grad_norm": 1.0369937352211243, + "learning_rate": 0.00016717222480101221, + "loss": 0.667, + "step": 5285 + }, + { + "epoch": 0.3393200769724182, + "grad_norm": 0.9644006720446381, + "learning_rate": 0.0001670892484517292, + "loss": 0.6383, + "step": 5290 + }, + { + "epoch": 0.3396407953816549, + "grad_norm": 1.0076204289252497, + "learning_rate": 0.00016700618801805453, + "loss": 0.7178, + "step": 5295 + }, + { + "epoch": 0.3399615137908916, + "grad_norm": 0.5579579624666732, + "learning_rate": 0.00016692304360408966, + "loss": 0.6665, + "step": 5300 + }, + { + "epoch": 0.34028223220012827, + "grad_norm": 0.8064350566112853, + "learning_rate": 0.00016683981531404125, + "loss": 0.5122, + "step": 5305 + }, + { + "epoch": 0.34060295060936496, + "grad_norm": 0.9816255727453933, + "learning_rate": 0.0001667565032522211, + "loss": 0.6926, + "step": 5310 + }, + { + "epoch": 0.34092366901860166, + "grad_norm": 0.817929460216783, + "learning_rate": 0.00016667310752304602, + "loss": 0.5491, + "step": 5315 + }, + { + "epoch": 0.34124438742783836, + "grad_norm": 0.9215347160545883, + "learning_rate": 0.00016658962823103764, + "loss": 0.6835, + "step": 5320 + }, + { + "epoch": 0.34156510583707506, + "grad_norm": 1.1290419292904414, + "learning_rate": 0.00016650606548082236, + "loss": 0.735, + "step": 5325 + }, + { + "epoch": 0.34188582424631175, + "grad_norm": 1.1930691902617288, + "learning_rate": 0.0001664224193771312, + "loss": 0.5138, + "step": 5330 + }, + { + "epoch": 0.34220654265554845, + "grad_norm": 0.8088938421114102, + "learning_rate": 0.0001663386900247995, + "loss": 0.6654, + "step": 5335 + }, + { + "epoch": 0.3425272610647851, + "grad_norm": 0.5514542526950761, + "learning_rate": 0.0001662548775287672, + "loss": 0.6456, + "step": 5340 + }, + { + "epoch": 0.3428479794740218, + "grad_norm": 0.8205842308107273, + "learning_rate": 0.00016617098199407814, + "loss": 0.7144, + "step": 5345 + }, + { + "epoch": 0.3431686978832585, + "grad_norm": 0.9295493105678805, + "learning_rate": 0.00016608700352588053, + "loss": 0.6876, + "step": 5350 + }, + { + "epoch": 0.3434894162924952, + "grad_norm": 0.7296614219020304, + "learning_rate": 0.00016600294222942626, + "loss": 0.6785, + "step": 5355 + }, + { + "epoch": 0.3438101347017319, + "grad_norm": 0.6002339895362847, + "learning_rate": 0.00016591879821007126, + "loss": 0.5796, + "step": 5360 + }, + { + "epoch": 0.3441308531109686, + "grad_norm": 1.6160052086574104, + "learning_rate": 0.00016583457157327497, + "loss": 0.7118, + "step": 5365 + }, + { + "epoch": 0.3444515715202053, + "grad_norm": 1.2282552121625845, + "learning_rate": 0.00016575026242460046, + "loss": 0.6564, + "step": 5370 + }, + { + "epoch": 0.344772289929442, + "grad_norm": 0.9643175110463178, + "learning_rate": 0.00016566587086971416, + "loss": 0.669, + "step": 5375 + }, + { + "epoch": 0.3450930083386786, + "grad_norm": 0.9607772443483632, + "learning_rate": 0.00016558139701438584, + "loss": 0.6276, + "step": 5380 + }, + { + "epoch": 0.3454137267479153, + "grad_norm": 0.9147875672042459, + "learning_rate": 0.0001654968409644884, + "loss": 0.5905, + "step": 5385 + }, + { + "epoch": 0.345734445157152, + "grad_norm": 0.7334238812099275, + "learning_rate": 0.00016541220282599773, + "loss": 0.6261, + "step": 5390 + }, + { + "epoch": 0.3460551635663887, + "grad_norm": 1.1742953273617749, + "learning_rate": 0.00016532748270499262, + "loss": 0.7, + "step": 5395 + }, + { + "epoch": 0.3463758819756254, + "grad_norm": 1.1387016781633938, + "learning_rate": 0.00016524268070765465, + "loss": 0.7061, + "step": 5400 + }, + { + "epoch": 0.3466966003848621, + "grad_norm": 0.9794060869341327, + "learning_rate": 0.0001651577969402679, + "loss": 0.7031, + "step": 5405 + }, + { + "epoch": 0.3470173187940988, + "grad_norm": 0.9732807122694793, + "learning_rate": 0.0001650728315092191, + "loss": 0.6588, + "step": 5410 + }, + { + "epoch": 0.34733803720333545, + "grad_norm": 1.2045887990242425, + "learning_rate": 0.0001649877845209972, + "loss": 0.5635, + "step": 5415 + }, + { + "epoch": 0.34765875561257215, + "grad_norm": 0.9098967972234847, + "learning_rate": 0.0001649026560821934, + "loss": 0.6877, + "step": 5420 + }, + { + "epoch": 0.34797947402180884, + "grad_norm": 0.8919518792507914, + "learning_rate": 0.000164817446299501, + "loss": 0.852, + "step": 5425 + }, + { + "epoch": 0.34830019243104554, + "grad_norm": 1.082286394388753, + "learning_rate": 0.00016473215527971528, + "loss": 0.6497, + "step": 5430 + }, + { + "epoch": 0.34862091084028224, + "grad_norm": 0.7681820908697059, + "learning_rate": 0.00016464678312973327, + "loss": 0.7075, + "step": 5435 + }, + { + "epoch": 0.34894162924951894, + "grad_norm": 0.8577629521944062, + "learning_rate": 0.00016456132995655372, + "loss": 0.6942, + "step": 5440 + }, + { + "epoch": 0.34926234765875563, + "grad_norm": 0.7981749008936162, + "learning_rate": 0.00016447579586727692, + "loss": 0.6658, + "step": 5445 + }, + { + "epoch": 0.3495830660679923, + "grad_norm": 0.6566080494812765, + "learning_rate": 0.0001643901809691046, + "loss": 0.6325, + "step": 5450 + }, + { + "epoch": 0.349903784477229, + "grad_norm": 0.7729498372329889, + "learning_rate": 0.00016430448536933965, + "loss": 0.5609, + "step": 5455 + }, + { + "epoch": 0.35022450288646567, + "grad_norm": 1.0464507162443157, + "learning_rate": 0.00016421870917538635, + "loss": 0.6353, + "step": 5460 + }, + { + "epoch": 0.35054522129570237, + "grad_norm": 1.3013839685098925, + "learning_rate": 0.00016413285249474975, + "loss": 0.5724, + "step": 5465 + }, + { + "epoch": 0.35086593970493907, + "grad_norm": 0.813558813259816, + "learning_rate": 0.00016404691543503588, + "loss": 0.7074, + "step": 5470 + }, + { + "epoch": 0.35118665811417576, + "grad_norm": 1.001748370098994, + "learning_rate": 0.0001639608981039515, + "loss": 0.7945, + "step": 5475 + }, + { + "epoch": 0.35150737652341246, + "grad_norm": 0.870149957049954, + "learning_rate": 0.00016387480060930395, + "loss": 0.689, + "step": 5480 + }, + { + "epoch": 0.35182809493264916, + "grad_norm": 0.8680578535676656, + "learning_rate": 0.00016378862305900112, + "loss": 0.6239, + "step": 5485 + }, + { + "epoch": 0.3521488133418858, + "grad_norm": 0.8274627515878666, + "learning_rate": 0.0001637023655610511, + "loss": 0.6437, + "step": 5490 + }, + { + "epoch": 0.3524695317511225, + "grad_norm": 0.8836905220838523, + "learning_rate": 0.00016361602822356232, + "loss": 0.581, + "step": 5495 + }, + { + "epoch": 0.3527902501603592, + "grad_norm": 0.645087928333498, + "learning_rate": 0.0001635296111547432, + "loss": 0.65, + "step": 5500 + }, + { + "epoch": 0.3531109685695959, + "grad_norm": 0.9138176884852274, + "learning_rate": 0.00016344311446290212, + "loss": 0.6039, + "step": 5505 + }, + { + "epoch": 0.3534316869788326, + "grad_norm": 0.8932196439321753, + "learning_rate": 0.00016335653825644717, + "loss": 0.6447, + "step": 5510 + }, + { + "epoch": 0.3537524053880693, + "grad_norm": 0.700814257534255, + "learning_rate": 0.00016326988264388624, + "loss": 0.634, + "step": 5515 + }, + { + "epoch": 0.354073123797306, + "grad_norm": 0.8079984489578869, + "learning_rate": 0.0001631831477338266, + "loss": 0.5378, + "step": 5520 + }, + { + "epoch": 0.35439384220654263, + "grad_norm": 1.0368102707808613, + "learning_rate": 0.00016309633363497503, + "loss": 0.6121, + "step": 5525 + }, + { + "epoch": 0.35471456061577933, + "grad_norm": 1.0720279870828384, + "learning_rate": 0.00016300944045613745, + "loss": 0.615, + "step": 5530 + }, + { + "epoch": 0.355035279025016, + "grad_norm": 0.6936759908598535, + "learning_rate": 0.00016292246830621897, + "loss": 0.7186, + "step": 5535 + }, + { + "epoch": 0.3553559974342527, + "grad_norm": 0.8578757956070833, + "learning_rate": 0.00016283541729422368, + "loss": 0.6859, + "step": 5540 + }, + { + "epoch": 0.3556767158434894, + "grad_norm": 0.6299846194893505, + "learning_rate": 0.0001627482875292544, + "loss": 0.7011, + "step": 5545 + }, + { + "epoch": 0.3559974342527261, + "grad_norm": 2.8465820906119697, + "learning_rate": 0.00016266107912051275, + "loss": 0.6824, + "step": 5550 + }, + { + "epoch": 0.3563181526619628, + "grad_norm": 0.8212652492805361, + "learning_rate": 0.00016257379217729897, + "loss": 0.7353, + "step": 5555 + }, + { + "epoch": 0.35663887107119946, + "grad_norm": 0.8592127708286107, + "learning_rate": 0.00016248642680901157, + "loss": 0.7493, + "step": 5560 + }, + { + "epoch": 0.35695958948043616, + "grad_norm": 1.5401896960046906, + "learning_rate": 0.00016239898312514747, + "loss": 0.6233, + "step": 5565 + }, + { + "epoch": 0.35728030788967285, + "grad_norm": 0.9880669672357292, + "learning_rate": 0.00016231146123530169, + "loss": 0.7483, + "step": 5570 + }, + { + "epoch": 0.35760102629890955, + "grad_norm": 1.0054106975653296, + "learning_rate": 0.00016222386124916733, + "loss": 0.7477, + "step": 5575 + }, + { + "epoch": 0.35792174470814625, + "grad_norm": 0.8851121102484797, + "learning_rate": 0.0001621361832765353, + "loss": 0.7338, + "step": 5580 + }, + { + "epoch": 0.35824246311738295, + "grad_norm": 0.7868381457390292, + "learning_rate": 0.0001620484274272943, + "loss": 0.8315, + "step": 5585 + }, + { + "epoch": 0.35856318152661965, + "grad_norm": 2.2302567668996907, + "learning_rate": 0.00016196059381143056, + "loss": 0.6057, + "step": 5590 + }, + { + "epoch": 0.35888389993585634, + "grad_norm": 0.8632558537630518, + "learning_rate": 0.0001618726825390279, + "loss": 0.6017, + "step": 5595 + }, + { + "epoch": 0.359204618345093, + "grad_norm": 0.9301897471057365, + "learning_rate": 0.0001617846937202674, + "loss": 0.7127, + "step": 5600 + }, + { + "epoch": 0.3595253367543297, + "grad_norm": 1.0314386924705863, + "learning_rate": 0.00016169662746542724, + "loss": 0.6471, + "step": 5605 + }, + { + "epoch": 0.3598460551635664, + "grad_norm": 0.7527220509268685, + "learning_rate": 0.00016160848388488283, + "loss": 0.5149, + "step": 5610 + }, + { + "epoch": 0.3601667735728031, + "grad_norm": 0.9964259981347259, + "learning_rate": 0.0001615202630891064, + "loss": 0.7551, + "step": 5615 + }, + { + "epoch": 0.3604874919820398, + "grad_norm": 0.9534877288363439, + "learning_rate": 0.0001614319651886669, + "loss": 0.7869, + "step": 5620 + }, + { + "epoch": 0.3608082103912765, + "grad_norm": 0.6624325233415048, + "learning_rate": 0.00016134359029423004, + "loss": 0.6187, + "step": 5625 + }, + { + "epoch": 0.36112892880051317, + "grad_norm": 1.1438885759745019, + "learning_rate": 0.000161255138516558, + "loss": 0.6818, + "step": 5630 + }, + { + "epoch": 0.3614496472097498, + "grad_norm": 1.0060076302436596, + "learning_rate": 0.00016116660996650918, + "loss": 0.7134, + "step": 5635 + }, + { + "epoch": 0.3617703656189865, + "grad_norm": 0.824054815580278, + "learning_rate": 0.0001610780047550384, + "loss": 0.6322, + "step": 5640 + }, + { + "epoch": 0.3620910840282232, + "grad_norm": 1.1593592610393137, + "learning_rate": 0.00016098932299319642, + "loss": 0.6549, + "step": 5645 + }, + { + "epoch": 0.3624118024374599, + "grad_norm": 1.3453462014445998, + "learning_rate": 0.00016090056479213, + "loss": 0.6626, + "step": 5650 + }, + { + "epoch": 0.3627325208466966, + "grad_norm": 0.6303823430985745, + "learning_rate": 0.00016081173026308168, + "loss": 0.6129, + "step": 5655 + }, + { + "epoch": 0.3630532392559333, + "grad_norm": 0.9682139214042652, + "learning_rate": 0.00016072281951738974, + "loss": 0.5327, + "step": 5660 + }, + { + "epoch": 0.36337395766517, + "grad_norm": 0.6265113009833752, + "learning_rate": 0.00016063383266648788, + "loss": 0.7972, + "step": 5665 + }, + { + "epoch": 0.3636946760744067, + "grad_norm": 1.0602611989591288, + "learning_rate": 0.0001605447698219052, + "loss": 0.7568, + "step": 5670 + }, + { + "epoch": 0.36401539448364334, + "grad_norm": 0.8085898565934937, + "learning_rate": 0.0001604556310952661, + "loss": 0.7088, + "step": 5675 + }, + { + "epoch": 0.36433611289288004, + "grad_norm": 0.9259612439090465, + "learning_rate": 0.00016036641659829005, + "loss": 0.6433, + "step": 5680 + }, + { + "epoch": 0.36465683130211674, + "grad_norm": 1.0560925548902709, + "learning_rate": 0.00016027712644279147, + "loss": 0.6389, + "step": 5685 + }, + { + "epoch": 0.36497754971135343, + "grad_norm": 0.9202003497456687, + "learning_rate": 0.00016018776074067965, + "loss": 0.6588, + "step": 5690 + }, + { + "epoch": 0.36529826812059013, + "grad_norm": 0.7606894269431724, + "learning_rate": 0.00016009831960395854, + "loss": 0.6249, + "step": 5695 + }, + { + "epoch": 0.36561898652982683, + "grad_norm": 1.0194051743569745, + "learning_rate": 0.00016000880314472662, + "loss": 0.7063, + "step": 5700 + }, + { + "epoch": 0.3659397049390635, + "grad_norm": 0.8971345599358044, + "learning_rate": 0.0001599192114751768, + "loss": 0.7758, + "step": 5705 + }, + { + "epoch": 0.36626042334830017, + "grad_norm": 0.8114509690004853, + "learning_rate": 0.0001598295447075962, + "loss": 0.687, + "step": 5710 + }, + { + "epoch": 0.36658114175753687, + "grad_norm": 1.1086821486366683, + "learning_rate": 0.00015973980295436613, + "loss": 0.7663, + "step": 5715 + }, + { + "epoch": 0.36690186016677356, + "grad_norm": 0.8305079494288046, + "learning_rate": 0.00015964998632796187, + "loss": 0.7841, + "step": 5720 + }, + { + "epoch": 0.36722257857601026, + "grad_norm": 0.9332565471912556, + "learning_rate": 0.00015956009494095245, + "loss": 0.7629, + "step": 5725 + }, + { + "epoch": 0.36754329698524696, + "grad_norm": 1.2026329331281138, + "learning_rate": 0.00015947012890600072, + "loss": 0.6034, + "step": 5730 + }, + { + "epoch": 0.36786401539448366, + "grad_norm": 0.8890367793012931, + "learning_rate": 0.00015938008833586307, + "loss": 0.673, + "step": 5735 + }, + { + "epoch": 0.36818473380372035, + "grad_norm": 1.1168519576569294, + "learning_rate": 0.00015928997334338924, + "loss": 0.7265, + "step": 5740 + }, + { + "epoch": 0.368505452212957, + "grad_norm": 0.7323689106049717, + "learning_rate": 0.00015919978404152225, + "loss": 0.5286, + "step": 5745 + }, + { + "epoch": 0.3688261706221937, + "grad_norm": 0.7491408637491445, + "learning_rate": 0.00015910952054329832, + "loss": 0.6603, + "step": 5750 + }, + { + "epoch": 0.3691468890314304, + "grad_norm": 0.5720787370255552, + "learning_rate": 0.00015901918296184664, + "loss": 0.7637, + "step": 5755 + }, + { + "epoch": 0.3694676074406671, + "grad_norm": 1.247050118094861, + "learning_rate": 0.00015892877141038917, + "loss": 0.6643, + "step": 5760 + }, + { + "epoch": 0.3697883258499038, + "grad_norm": 0.8428619170851901, + "learning_rate": 0.00015883828600224073, + "loss": 0.603, + "step": 5765 + }, + { + "epoch": 0.3701090442591405, + "grad_norm": 0.6414166600611392, + "learning_rate": 0.00015874772685080853, + "loss": 0.6775, + "step": 5770 + }, + { + "epoch": 0.3704297626683772, + "grad_norm": 1.39629472630112, + "learning_rate": 0.0001586570940695924, + "loss": 0.7512, + "step": 5775 + }, + { + "epoch": 0.3707504810776139, + "grad_norm": 1.0547557813661854, + "learning_rate": 0.00015856638777218422, + "loss": 0.7574, + "step": 5780 + }, + { + "epoch": 0.3710711994868505, + "grad_norm": 0.8689805862522758, + "learning_rate": 0.00015847560807226823, + "loss": 0.6427, + "step": 5785 + }, + { + "epoch": 0.3713919178960872, + "grad_norm": 1.068120678282078, + "learning_rate": 0.00015838475508362051, + "loss": 0.7343, + "step": 5790 + }, + { + "epoch": 0.3717126363053239, + "grad_norm": 0.8164191154263224, + "learning_rate": 0.00015829382892010912, + "loss": 0.7685, + "step": 5795 + }, + { + "epoch": 0.3720333547145606, + "grad_norm": 0.9769245060606544, + "learning_rate": 0.00015820282969569374, + "loss": 0.6804, + "step": 5800 + }, + { + "epoch": 0.3723540731237973, + "grad_norm": 0.676619842133273, + "learning_rate": 0.00015811175752442562, + "loss": 0.7244, + "step": 5805 + }, + { + "epoch": 0.372674791533034, + "grad_norm": 3.577185251797483, + "learning_rate": 0.00015802061252044748, + "loss": 0.7426, + "step": 5810 + }, + { + "epoch": 0.3729955099422707, + "grad_norm": 0.5176738358349613, + "learning_rate": 0.00015792939479799333, + "loss": 0.6545, + "step": 5815 + }, + { + "epoch": 0.37331622835150735, + "grad_norm": 0.9510093482774353, + "learning_rate": 0.00015783810447138826, + "loss": 0.6358, + "step": 5820 + }, + { + "epoch": 0.37363694676074405, + "grad_norm": 0.8940071414235186, + "learning_rate": 0.0001577467416550484, + "loss": 0.7573, + "step": 5825 + }, + { + "epoch": 0.37395766516998075, + "grad_norm": 0.8502887517010003, + "learning_rate": 0.0001576553064634807, + "loss": 0.6371, + "step": 5830 + }, + { + "epoch": 0.37427838357921744, + "grad_norm": 0.7260357322627535, + "learning_rate": 0.00015756379901128294, + "loss": 0.6106, + "step": 5835 + }, + { + "epoch": 0.37459910198845414, + "grad_norm": 0.5018237254264993, + "learning_rate": 0.00015747221941314325, + "loss": 0.6329, + "step": 5840 + }, + { + "epoch": 0.37491982039769084, + "grad_norm": 0.9130075924966622, + "learning_rate": 0.00015738056778384038, + "loss": 0.6868, + "step": 5845 + }, + { + "epoch": 0.37524053880692754, + "grad_norm": 0.803836499340597, + "learning_rate": 0.00015728884423824323, + "loss": 0.5845, + "step": 5850 + }, + { + "epoch": 0.37556125721616423, + "grad_norm": 0.7604942646833414, + "learning_rate": 0.0001571970488913109, + "loss": 0.6911, + "step": 5855 + }, + { + "epoch": 0.3758819756254009, + "grad_norm": 0.6458258737911328, + "learning_rate": 0.00015710518185809246, + "loss": 0.5681, + "step": 5860 + }, + { + "epoch": 0.3762026940346376, + "grad_norm": 1.4247194077938075, + "learning_rate": 0.00015701324325372688, + "loss": 0.7889, + "step": 5865 + }, + { + "epoch": 0.3765234124438743, + "grad_norm": 0.9972586435085499, + "learning_rate": 0.00015692123319344272, + "loss": 0.5962, + "step": 5870 + }, + { + "epoch": 0.37684413085311097, + "grad_norm": 0.8022131053222762, + "learning_rate": 0.0001568291517925582, + "loss": 0.7065, + "step": 5875 + }, + { + "epoch": 0.37716484926234767, + "grad_norm": 1.0767684802416355, + "learning_rate": 0.00015673699916648085, + "loss": 0.5781, + "step": 5880 + }, + { + "epoch": 0.37748556767158437, + "grad_norm": 0.9496229847137114, + "learning_rate": 0.00015664477543070757, + "loss": 0.7056, + "step": 5885 + }, + { + "epoch": 0.37780628608082106, + "grad_norm": 0.9633177655644503, + "learning_rate": 0.00015655248070082438, + "loss": 0.6939, + "step": 5890 + }, + { + "epoch": 0.3781270044900577, + "grad_norm": 1.115479048562113, + "learning_rate": 0.00015646011509250617, + "loss": 0.7378, + "step": 5895 + }, + { + "epoch": 0.3784477228992944, + "grad_norm": 0.7941062299537334, + "learning_rate": 0.0001563676787215168, + "loss": 0.5145, + "step": 5900 + }, + { + "epoch": 0.3787684413085311, + "grad_norm": 0.7169731124858206, + "learning_rate": 0.0001562751717037087, + "loss": 0.5164, + "step": 5905 + }, + { + "epoch": 0.3790891597177678, + "grad_norm": 0.844339179385339, + "learning_rate": 0.00015618259415502291, + "loss": 0.7001, + "step": 5910 + }, + { + "epoch": 0.3794098781270045, + "grad_norm": 0.8954099088632127, + "learning_rate": 0.00015608994619148886, + "loss": 0.7601, + "step": 5915 + }, + { + "epoch": 0.3797305965362412, + "grad_norm": 0.9177657222066289, + "learning_rate": 0.00015599722792922425, + "loss": 0.6568, + "step": 5920 + }, + { + "epoch": 0.3800513149454779, + "grad_norm": 0.6243318997688123, + "learning_rate": 0.00015590443948443482, + "loss": 0.696, + "step": 5925 + }, + { + "epoch": 0.38037203335471453, + "grad_norm": 1.4596235194468596, + "learning_rate": 0.00015581158097341435, + "loss": 0.5778, + "step": 5930 + }, + { + "epoch": 0.38069275176395123, + "grad_norm": 1.2871148477384242, + "learning_rate": 0.0001557186525125444, + "loss": 0.6818, + "step": 5935 + }, + { + "epoch": 0.38101347017318793, + "grad_norm": 0.999208910914117, + "learning_rate": 0.00015562565421829415, + "loss": 0.763, + "step": 5940 + }, + { + "epoch": 0.3813341885824246, + "grad_norm": 0.7364320418930576, + "learning_rate": 0.0001555325862072204, + "loss": 0.5347, + "step": 5945 + }, + { + "epoch": 0.3816549069916613, + "grad_norm": 1.2608428725949008, + "learning_rate": 0.0001554394485959673, + "loss": 0.7863, + "step": 5950 + }, + { + "epoch": 0.381975625400898, + "grad_norm": 1.1072984033964586, + "learning_rate": 0.00015534624150126617, + "loss": 0.6498, + "step": 5955 + }, + { + "epoch": 0.3822963438101347, + "grad_norm": 1.058590608018293, + "learning_rate": 0.00015525296503993548, + "loss": 0.5703, + "step": 5960 + }, + { + "epoch": 0.3826170622193714, + "grad_norm": 1.0908171799744935, + "learning_rate": 0.0001551596193288806, + "loss": 0.7091, + "step": 5965 + }, + { + "epoch": 0.38293778062860806, + "grad_norm": 0.947173201252904, + "learning_rate": 0.0001550662044850937, + "loss": 0.7283, + "step": 5970 + }, + { + "epoch": 0.38325849903784476, + "grad_norm": 1.3607073347278296, + "learning_rate": 0.00015497272062565362, + "loss": 0.6388, + "step": 5975 + }, + { + "epoch": 0.38357921744708146, + "grad_norm": 0.6419239829629664, + "learning_rate": 0.0001548791678677257, + "loss": 0.6622, + "step": 5980 + }, + { + "epoch": 0.38389993585631815, + "grad_norm": 0.7871877023929343, + "learning_rate": 0.0001547855463285616, + "loss": 0.6371, + "step": 5985 + }, + { + "epoch": 0.38422065426555485, + "grad_norm": 0.8230439216037291, + "learning_rate": 0.00015469185612549917, + "loss": 0.6582, + "step": 5990 + }, + { + "epoch": 0.38454137267479155, + "grad_norm": 0.9173584864551694, + "learning_rate": 0.00015459809737596237, + "loss": 0.6135, + "step": 5995 + }, + { + "epoch": 0.38486209108402825, + "grad_norm": 0.8582018157822806, + "learning_rate": 0.0001545042701974611, + "loss": 0.7084, + "step": 6000 + }, + { + "epoch": 0.3851828094932649, + "grad_norm": 1.2042820074202123, + "learning_rate": 0.0001544103747075909, + "loss": 0.8395, + "step": 6005 + }, + { + "epoch": 0.3855035279025016, + "grad_norm": 1.1522620963550567, + "learning_rate": 0.00015431641102403302, + "loss": 0.7, + "step": 6010 + }, + { + "epoch": 0.3858242463117383, + "grad_norm": 1.2293665111038254, + "learning_rate": 0.00015422237926455417, + "loss": 0.8011, + "step": 6015 + }, + { + "epoch": 0.386144964720975, + "grad_norm": 0.8550912373038257, + "learning_rate": 0.00015412827954700632, + "loss": 0.7712, + "step": 6020 + }, + { + "epoch": 0.3864656831302117, + "grad_norm": 1.216484260092555, + "learning_rate": 0.00015403411198932672, + "loss": 0.5951, + "step": 6025 + }, + { + "epoch": 0.3867864015394484, + "grad_norm": 1.0566678632706732, + "learning_rate": 0.00015393987670953756, + "loss": 0.6986, + "step": 6030 + }, + { + "epoch": 0.3871071199486851, + "grad_norm": 0.9868957913863856, + "learning_rate": 0.00015384557382574595, + "loss": 0.583, + "step": 6035 + }, + { + "epoch": 0.38742783835792177, + "grad_norm": 0.7263178308133398, + "learning_rate": 0.0001537512034561437, + "loss": 0.7377, + "step": 6040 + }, + { + "epoch": 0.3877485567671584, + "grad_norm": 0.8464297973296825, + "learning_rate": 0.00015365676571900725, + "loss": 0.6738, + "step": 6045 + }, + { + "epoch": 0.3880692751763951, + "grad_norm": 0.754402075351536, + "learning_rate": 0.00015356226073269736, + "loss": 0.8025, + "step": 6050 + }, + { + "epoch": 0.3883899935856318, + "grad_norm": 1.0080496408230515, + "learning_rate": 0.0001534676886156592, + "loss": 0.6925, + "step": 6055 + }, + { + "epoch": 0.3887107119948685, + "grad_norm": 0.859538871785963, + "learning_rate": 0.000153373049486422, + "loss": 0.6198, + "step": 6060 + }, + { + "epoch": 0.3890314304041052, + "grad_norm": 0.617907781420839, + "learning_rate": 0.0001532783434635991, + "loss": 0.708, + "step": 6065 + }, + { + "epoch": 0.3893521488133419, + "grad_norm": 0.9321179061358089, + "learning_rate": 0.00015318357066588747, + "loss": 0.8021, + "step": 6070 + }, + { + "epoch": 0.3896728672225786, + "grad_norm": 1.0543925706918078, + "learning_rate": 0.00015308873121206798, + "loss": 0.6394, + "step": 6075 + }, + { + "epoch": 0.38999358563181524, + "grad_norm": 0.7635204958993133, + "learning_rate": 0.00015299382522100484, + "loss": 0.7279, + "step": 6080 + }, + { + "epoch": 0.39031430404105194, + "grad_norm": 0.4808058715738424, + "learning_rate": 0.00015289885281164587, + "loss": 0.6074, + "step": 6085 + }, + { + "epoch": 0.39063502245028864, + "grad_norm": 0.8001578622671749, + "learning_rate": 0.00015280381410302197, + "loss": 0.7391, + "step": 6090 + }, + { + "epoch": 0.39095574085952534, + "grad_norm": 0.6800874640636567, + "learning_rate": 0.00015270870921424721, + "loss": 0.6633, + "step": 6095 + }, + { + "epoch": 0.39127645926876203, + "grad_norm": 1.6563190981003801, + "learning_rate": 0.00015261353826451858, + "loss": 0.5687, + "step": 6100 + }, + { + "epoch": 0.39159717767799873, + "grad_norm": 1.541052532833452, + "learning_rate": 0.00015251830137311587, + "loss": 0.7656, + "step": 6105 + }, + { + "epoch": 0.39191789608723543, + "grad_norm": 0.8841300600956734, + "learning_rate": 0.00015242299865940147, + "loss": 0.5984, + "step": 6110 + }, + { + "epoch": 0.39223861449647207, + "grad_norm": 1.2069800426391173, + "learning_rate": 0.00015232763024282034, + "loss": 0.8064, + "step": 6115 + }, + { + "epoch": 0.39255933290570877, + "grad_norm": 1.12582638671757, + "learning_rate": 0.00015223219624289978, + "loss": 0.7329, + "step": 6120 + }, + { + "epoch": 0.39288005131494547, + "grad_norm": 0.8200206186838468, + "learning_rate": 0.0001521366967792493, + "loss": 0.5894, + "step": 6125 + }, + { + "epoch": 0.39320076972418216, + "grad_norm": 0.8420632536848158, + "learning_rate": 0.0001520411319715603, + "loss": 0.7387, + "step": 6130 + }, + { + "epoch": 0.39352148813341886, + "grad_norm": 0.8067132371420835, + "learning_rate": 0.00015194550193960632, + "loss": 0.682, + "step": 6135 + }, + { + "epoch": 0.39384220654265556, + "grad_norm": 0.7708975305048692, + "learning_rate": 0.00015184980680324248, + "loss": 0.68, + "step": 6140 + }, + { + "epoch": 0.39416292495189226, + "grad_norm": 1.0673984272805985, + "learning_rate": 0.00015175404668240554, + "loss": 0.765, + "step": 6145 + }, + { + "epoch": 0.39448364336112896, + "grad_norm": 1.3041455682451786, + "learning_rate": 0.00015165822169711373, + "loss": 0.6576, + "step": 6150 + }, + { + "epoch": 0.3948043617703656, + "grad_norm": 0.6831544367344609, + "learning_rate": 0.00015156233196746653, + "loss": 0.7366, + "step": 6155 + }, + { + "epoch": 0.3951250801796023, + "grad_norm": 0.9906492347644728, + "learning_rate": 0.00015146637761364457, + "loss": 0.7104, + "step": 6160 + }, + { + "epoch": 0.395445798588839, + "grad_norm": 0.8542271989350849, + "learning_rate": 0.00015137035875590956, + "loss": 0.6678, + "step": 6165 + }, + { + "epoch": 0.3957665169980757, + "grad_norm": 1.542102394105923, + "learning_rate": 0.00015127427551460396, + "loss": 0.665, + "step": 6170 + }, + { + "epoch": 0.3960872354073124, + "grad_norm": 0.8016623705576872, + "learning_rate": 0.00015117812801015095, + "loss": 0.5812, + "step": 6175 + }, + { + "epoch": 0.3964079538165491, + "grad_norm": 1.2073109631978751, + "learning_rate": 0.00015108191636305427, + "loss": 0.7527, + "step": 6180 + }, + { + "epoch": 0.3967286722257858, + "grad_norm": 0.8328169453200382, + "learning_rate": 0.000150985640693898, + "loss": 0.6733, + "step": 6185 + }, + { + "epoch": 0.3970493906350224, + "grad_norm": 0.9951192780366616, + "learning_rate": 0.00015088930112334653, + "loss": 0.733, + "step": 6190 + }, + { + "epoch": 0.3973701090442591, + "grad_norm": 0.7405889864532202, + "learning_rate": 0.0001507928977721443, + "loss": 0.5478, + "step": 6195 + }, + { + "epoch": 0.3976908274534958, + "grad_norm": 1.080626962102723, + "learning_rate": 0.0001506964307611157, + "loss": 0.6115, + "step": 6200 + }, + { + "epoch": 0.3980115458627325, + "grad_norm": 0.7995884570597525, + "learning_rate": 0.0001505999002111649, + "loss": 0.5829, + "step": 6205 + }, + { + "epoch": 0.3983322642719692, + "grad_norm": 0.4992231946350308, + "learning_rate": 0.0001505033062432757, + "loss": 0.5649, + "step": 6210 + }, + { + "epoch": 0.3986529826812059, + "grad_norm": 0.8489355096183382, + "learning_rate": 0.00015040664897851138, + "loss": 0.7291, + "step": 6215 + }, + { + "epoch": 0.3989737010904426, + "grad_norm": 1.136002981763331, + "learning_rate": 0.00015030992853801454, + "loss": 0.7918, + "step": 6220 + }, + { + "epoch": 0.39929441949967925, + "grad_norm": 0.895880595156802, + "learning_rate": 0.00015021314504300704, + "loss": 0.5635, + "step": 6225 + }, + { + "epoch": 0.39961513790891595, + "grad_norm": 0.8226243605298355, + "learning_rate": 0.0001501162986147897, + "loss": 0.815, + "step": 6230 + }, + { + "epoch": 0.39993585631815265, + "grad_norm": 0.9921294907910895, + "learning_rate": 0.00015001938937474218, + "loss": 0.7156, + "step": 6235 + }, + { + "epoch": 0.40025657472738935, + "grad_norm": 0.9510451771447491, + "learning_rate": 0.0001499224174443229, + "loss": 0.681, + "step": 6240 + }, + { + "epoch": 0.40057729313662604, + "grad_norm": 0.9952627757450367, + "learning_rate": 0.0001498253829450689, + "loss": 0.712, + "step": 6245 + }, + { + "epoch": 0.40089801154586274, + "grad_norm": 0.6514927458391138, + "learning_rate": 0.00014972828599859556, + "loss": 0.633, + "step": 6250 + }, + { + "epoch": 0.40121872995509944, + "grad_norm": 0.9621219480196492, + "learning_rate": 0.0001496311267265966, + "loss": 0.6988, + "step": 6255 + }, + { + "epoch": 0.40153944836433614, + "grad_norm": 1.0155290688557055, + "learning_rate": 0.00014953390525084377, + "loss": 0.7093, + "step": 6260 + }, + { + "epoch": 0.4018601667735728, + "grad_norm": 0.6507458129551235, + "learning_rate": 0.00014943662169318686, + "loss": 0.6781, + "step": 6265 + }, + { + "epoch": 0.4021808851828095, + "grad_norm": 0.8206284722853324, + "learning_rate": 0.00014933927617555342, + "loss": 0.6472, + "step": 6270 + }, + { + "epoch": 0.4025016035920462, + "grad_norm": 0.969513442832448, + "learning_rate": 0.00014924186881994867, + "loss": 0.6322, + "step": 6275 + }, + { + "epoch": 0.4028223220012829, + "grad_norm": 1.0110426378326145, + "learning_rate": 0.00014914439974845532, + "loss": 0.6192, + "step": 6280 + }, + { + "epoch": 0.40314304041051957, + "grad_norm": 0.9182180122329154, + "learning_rate": 0.0001490468690832335, + "loss": 0.7624, + "step": 6285 + }, + { + "epoch": 0.40346375881975627, + "grad_norm": 1.0221754081762093, + "learning_rate": 0.00014894927694652046, + "loss": 0.5685, + "step": 6290 + }, + { + "epoch": 0.40378447722899297, + "grad_norm": 0.7951566985169003, + "learning_rate": 0.00014885162346063048, + "loss": 0.6114, + "step": 6295 + }, + { + "epoch": 0.4041051956382296, + "grad_norm": 0.9205666830852229, + "learning_rate": 0.00014875390874795482, + "loss": 0.6126, + "step": 6300 + }, + { + "epoch": 0.4044259140474663, + "grad_norm": 0.8495232187331296, + "learning_rate": 0.00014865613293096132, + "loss": 0.6743, + "step": 6305 + }, + { + "epoch": 0.404746632456703, + "grad_norm": 0.5863050150246784, + "learning_rate": 0.0001485582961321946, + "loss": 0.5965, + "step": 6310 + }, + { + "epoch": 0.4050673508659397, + "grad_norm": 0.732145223215556, + "learning_rate": 0.00014846039847427563, + "loss": 0.6549, + "step": 6315 + }, + { + "epoch": 0.4053880692751764, + "grad_norm": 0.7872248738108987, + "learning_rate": 0.00014836244007990156, + "loss": 0.675, + "step": 6320 + }, + { + "epoch": 0.4057087876844131, + "grad_norm": 0.6983906622235366, + "learning_rate": 0.0001482644210718458, + "loss": 0.6684, + "step": 6325 + }, + { + "epoch": 0.4060295060936498, + "grad_norm": 1.036082877660743, + "learning_rate": 0.0001481663415729576, + "loss": 0.6682, + "step": 6330 + }, + { + "epoch": 0.4063502245028865, + "grad_norm": 0.8176112608665335, + "learning_rate": 0.00014806820170616222, + "loss": 0.8555, + "step": 6335 + }, + { + "epoch": 0.40667094291212313, + "grad_norm": 0.7770154320072936, + "learning_rate": 0.00014797000159446038, + "loss": 0.557, + "step": 6340 + }, + { + "epoch": 0.40699166132135983, + "grad_norm": 1.5604043527138882, + "learning_rate": 0.00014787174136092837, + "loss": 0.5678, + "step": 6345 + }, + { + "epoch": 0.40731237973059653, + "grad_norm": 0.5000651713384456, + "learning_rate": 0.00014777342112871786, + "loss": 0.6323, + "step": 6350 + }, + { + "epoch": 0.4076330981398332, + "grad_norm": 0.7129539414804645, + "learning_rate": 0.0001476750410210557, + "loss": 0.6531, + "step": 6355 + }, + { + "epoch": 0.4079538165490699, + "grad_norm": 0.6838741535402209, + "learning_rate": 0.0001475766011612438, + "loss": 0.6734, + "step": 6360 + }, + { + "epoch": 0.4082745349583066, + "grad_norm": 0.6003288340459018, + "learning_rate": 0.00014747810167265894, + "loss": 0.5793, + "step": 6365 + }, + { + "epoch": 0.4085952533675433, + "grad_norm": 1.5754948140455838, + "learning_rate": 0.00014737954267875263, + "loss": 0.702, + "step": 6370 + }, + { + "epoch": 0.40891597177677996, + "grad_norm": 1.0150345516766142, + "learning_rate": 0.000147280924303051, + "loss": 0.8569, + "step": 6375 + }, + { + "epoch": 0.40923669018601666, + "grad_norm": 1.0034479899495579, + "learning_rate": 0.0001471822466691545, + "loss": 0.8446, + "step": 6380 + }, + { + "epoch": 0.40955740859525336, + "grad_norm": 0.9184425443953635, + "learning_rate": 0.00014708350990073798, + "loss": 0.6602, + "step": 6385 + }, + { + "epoch": 0.40987812700449006, + "grad_norm": 0.6284500041695303, + "learning_rate": 0.0001469847141215503, + "loss": 0.7291, + "step": 6390 + }, + { + "epoch": 0.41019884541372675, + "grad_norm": 0.9039981636058719, + "learning_rate": 0.0001468858594554144, + "loss": 0.8008, + "step": 6395 + }, + { + "epoch": 0.41051956382296345, + "grad_norm": 0.9662431864347534, + "learning_rate": 0.0001467869460262269, + "loss": 0.5989, + "step": 6400 + }, + { + "epoch": 0.41084028223220015, + "grad_norm": 0.6824016883811361, + "learning_rate": 0.00014668797395795812, + "loss": 0.7651, + "step": 6405 + }, + { + "epoch": 0.4111610006414368, + "grad_norm": 0.8325307304433841, + "learning_rate": 0.00014658894337465187, + "loss": 0.762, + "step": 6410 + }, + { + "epoch": 0.4114817190506735, + "grad_norm": 0.6873637896445222, + "learning_rate": 0.00014648985440042533, + "loss": 0.6868, + "step": 6415 + }, + { + "epoch": 0.4118024374599102, + "grad_norm": 0.8851369890257763, + "learning_rate": 0.0001463907071594688, + "loss": 0.719, + "step": 6420 + }, + { + "epoch": 0.4121231558691469, + "grad_norm": 0.8755806997147045, + "learning_rate": 0.00014629150177604565, + "loss": 0.6161, + "step": 6425 + }, + { + "epoch": 0.4124438742783836, + "grad_norm": 0.9956221559599793, + "learning_rate": 0.00014619223837449211, + "loss": 0.6246, + "step": 6430 + }, + { + "epoch": 0.4127645926876203, + "grad_norm": 0.9146462627716199, + "learning_rate": 0.00014609291707921713, + "loss": 0.665, + "step": 6435 + }, + { + "epoch": 0.413085311096857, + "grad_norm": 0.7096303973864491, + "learning_rate": 0.0001459935380147022, + "loss": 0.7379, + "step": 6440 + }, + { + "epoch": 0.4134060295060937, + "grad_norm": 0.8414445373385668, + "learning_rate": 0.00014589410130550124, + "loss": 0.7533, + "step": 6445 + }, + { + "epoch": 0.4137267479153303, + "grad_norm": 1.1009718984925583, + "learning_rate": 0.0001457946070762404, + "loss": 0.673, + "step": 6450 + }, + { + "epoch": 0.414047466324567, + "grad_norm": 0.9982085240192685, + "learning_rate": 0.000145695055451618, + "loss": 0.6951, + "step": 6455 + }, + { + "epoch": 0.4143681847338037, + "grad_norm": 0.7828125692520432, + "learning_rate": 0.00014559544655640412, + "loss": 0.7779, + "step": 6460 + }, + { + "epoch": 0.4146889031430404, + "grad_norm": 1.0323696606312884, + "learning_rate": 0.0001454957805154408, + "loss": 0.666, + "step": 6465 + }, + { + "epoch": 0.4150096215522771, + "grad_norm": 0.6618186643447491, + "learning_rate": 0.00014539605745364156, + "loss": 0.7354, + "step": 6470 + }, + { + "epoch": 0.4153303399615138, + "grad_norm": 1.3747337158411725, + "learning_rate": 0.00014529627749599146, + "loss": 0.7191, + "step": 6475 + }, + { + "epoch": 0.4156510583707505, + "grad_norm": 0.6208342823219867, + "learning_rate": 0.0001451964407675469, + "loss": 0.648, + "step": 6480 + }, + { + "epoch": 0.41597177677998715, + "grad_norm": 1.0319199547152835, + "learning_rate": 0.00014509654739343534, + "loss": 0.7808, + "step": 6485 + }, + { + "epoch": 0.41629249518922384, + "grad_norm": 1.1954322026767323, + "learning_rate": 0.0001449965974988553, + "loss": 0.7695, + "step": 6490 + }, + { + "epoch": 0.41661321359846054, + "grad_norm": 1.2496250479975701, + "learning_rate": 0.00014489659120907615, + "loss": 0.6214, + "step": 6495 + }, + { + "epoch": 0.41693393200769724, + "grad_norm": 0.6983577410015246, + "learning_rate": 0.00014479652864943788, + "loss": 0.6312, + "step": 6500 + }, + { + "epoch": 0.41725465041693394, + "grad_norm": 0.8629680447857923, + "learning_rate": 0.0001446964099453511, + "loss": 0.7508, + "step": 6505 + }, + { + "epoch": 0.41757536882617063, + "grad_norm": 1.0627571850045838, + "learning_rate": 0.00014459623522229662, + "loss": 0.7044, + "step": 6510 + }, + { + "epoch": 0.41789608723540733, + "grad_norm": 0.900748883113857, + "learning_rate": 0.00014449600460582563, + "loss": 0.7454, + "step": 6515 + }, + { + "epoch": 0.41821680564464403, + "grad_norm": 0.9690669274483888, + "learning_rate": 0.00014439571822155934, + "loss": 0.5726, + "step": 6520 + }, + { + "epoch": 0.41853752405388067, + "grad_norm": 1.0487104357417287, + "learning_rate": 0.00014429537619518873, + "loss": 0.799, + "step": 6525 + }, + { + "epoch": 0.41885824246311737, + "grad_norm": 1.2154258394073059, + "learning_rate": 0.0001441949786524747, + "loss": 0.5219, + "step": 6530 + }, + { + "epoch": 0.41917896087235407, + "grad_norm": 0.7794877423395151, + "learning_rate": 0.0001440945257192476, + "loss": 0.5707, + "step": 6535 + }, + { + "epoch": 0.41949967928159076, + "grad_norm": 0.5347206233594262, + "learning_rate": 0.00014399401752140728, + "loss": 0.55, + "step": 6540 + }, + { + "epoch": 0.41982039769082746, + "grad_norm": 0.6862664210890991, + "learning_rate": 0.00014389345418492272, + "loss": 0.7803, + "step": 6545 + }, + { + "epoch": 0.42014111610006416, + "grad_norm": 1.1652827614213763, + "learning_rate": 0.0001437928358358322, + "loss": 0.6907, + "step": 6550 + }, + { + "epoch": 0.42046183450930086, + "grad_norm": 1.0771142450313498, + "learning_rate": 0.00014369216260024282, + "loss": 0.5868, + "step": 6555 + }, + { + "epoch": 0.4207825529185375, + "grad_norm": 0.7384688516596317, + "learning_rate": 0.00014359143460433046, + "loss": 0.5754, + "step": 6560 + }, + { + "epoch": 0.4211032713277742, + "grad_norm": 0.7961839635706309, + "learning_rate": 0.00014349065197433977, + "loss": 0.6247, + "step": 6565 + }, + { + "epoch": 0.4214239897370109, + "grad_norm": 0.9321579239285109, + "learning_rate": 0.0001433898148365837, + "loss": 0.6856, + "step": 6570 + }, + { + "epoch": 0.4217447081462476, + "grad_norm": 0.7081449574427117, + "learning_rate": 0.00014328892331744362, + "loss": 0.5893, + "step": 6575 + }, + { + "epoch": 0.4220654265554843, + "grad_norm": 0.9200227580239932, + "learning_rate": 0.000143187977543369, + "loss": 0.661, + "step": 6580 + }, + { + "epoch": 0.422386144964721, + "grad_norm": 1.1330174896855054, + "learning_rate": 0.00014308697764087738, + "loss": 0.8342, + "step": 6585 + }, + { + "epoch": 0.4227068633739577, + "grad_norm": 0.851200673216541, + "learning_rate": 0.00014298592373655414, + "loss": 0.8357, + "step": 6590 + }, + { + "epoch": 0.42302758178319433, + "grad_norm": 0.6342829663427049, + "learning_rate": 0.00014288481595705217, + "loss": 0.4643, + "step": 6595 + }, + { + "epoch": 0.423348300192431, + "grad_norm": 0.5823246535632486, + "learning_rate": 0.00014278365442909214, + "loss": 0.6472, + "step": 6600 + }, + { + "epoch": 0.4236690186016677, + "grad_norm": 1.0907798326084035, + "learning_rate": 0.0001426824392794619, + "loss": 0.5667, + "step": 6605 + }, + { + "epoch": 0.4239897370109044, + "grad_norm": 0.6171485537285203, + "learning_rate": 0.00014258117063501658, + "loss": 0.7975, + "step": 6610 + }, + { + "epoch": 0.4243104554201411, + "grad_norm": 0.920203490173087, + "learning_rate": 0.00014247984862267833, + "loss": 0.5432, + "step": 6615 + }, + { + "epoch": 0.4246311738293778, + "grad_norm": 0.6838928556102262, + "learning_rate": 0.0001423784733694362, + "loss": 0.5982, + "step": 6620 + }, + { + "epoch": 0.4249518922386145, + "grad_norm": 1.2229051146263923, + "learning_rate": 0.00014227704500234599, + "loss": 0.8164, + "step": 6625 + }, + { + "epoch": 0.4252726106478512, + "grad_norm": 0.7990664572540562, + "learning_rate": 0.00014217556364853006, + "loss": 0.7974, + "step": 6630 + }, + { + "epoch": 0.42559332905708785, + "grad_norm": 1.439913710180236, + "learning_rate": 0.00014207402943517707, + "loss": 0.6574, + "step": 6635 + }, + { + "epoch": 0.42591404746632455, + "grad_norm": 1.5833188763841297, + "learning_rate": 0.0001419724424895421, + "loss": 0.6127, + "step": 6640 + }, + { + "epoch": 0.42623476587556125, + "grad_norm": 1.0972694183324532, + "learning_rate": 0.00014187080293894623, + "loss": 0.6384, + "step": 6645 + }, + { + "epoch": 0.42655548428479795, + "grad_norm": 0.7755437327444886, + "learning_rate": 0.0001417691109107765, + "loss": 0.6467, + "step": 6650 + }, + { + "epoch": 0.42687620269403465, + "grad_norm": 0.7307053147732903, + "learning_rate": 0.00014166736653248568, + "loss": 0.6857, + "step": 6655 + }, + { + "epoch": 0.42719692110327134, + "grad_norm": 1.1129466425839534, + "learning_rate": 0.00014156556993159215, + "loss": 0.6325, + "step": 6660 + }, + { + "epoch": 0.42751763951250804, + "grad_norm": 1.0829046562773215, + "learning_rate": 0.00014146372123567986, + "loss": 0.4627, + "step": 6665 + }, + { + "epoch": 0.4278383579217447, + "grad_norm": 0.7286117691400992, + "learning_rate": 0.00014136182057239788, + "loss": 0.7129, + "step": 6670 + }, + { + "epoch": 0.4281590763309814, + "grad_norm": 0.9030468850448815, + "learning_rate": 0.00014125986806946052, + "loss": 0.6249, + "step": 6675 + }, + { + "epoch": 0.4284797947402181, + "grad_norm": 1.0569038979072376, + "learning_rate": 0.00014115786385464704, + "loss": 0.5753, + "step": 6680 + }, + { + "epoch": 0.4288005131494548, + "grad_norm": 1.9939364349504531, + "learning_rate": 0.0001410558080558015, + "loss": 0.6928, + "step": 6685 + }, + { + "epoch": 0.4291212315586915, + "grad_norm": 0.7638304398434881, + "learning_rate": 0.00014095370080083262, + "loss": 0.7665, + "step": 6690 + }, + { + "epoch": 0.42944194996792817, + "grad_norm": 1.0470546825430735, + "learning_rate": 0.00014085154221771362, + "loss": 0.5786, + "step": 6695 + }, + { + "epoch": 0.42976266837716487, + "grad_norm": 1.122127513476166, + "learning_rate": 0.00014074933243448203, + "loss": 0.5162, + "step": 6700 + }, + { + "epoch": 0.43008338678640157, + "grad_norm": 0.9808616961072774, + "learning_rate": 0.00014064707157923956, + "loss": 0.5722, + "step": 6705 + }, + { + "epoch": 0.4304041051956382, + "grad_norm": 0.8653107924861354, + "learning_rate": 0.00014054475978015192, + "loss": 0.6378, + "step": 6710 + }, + { + "epoch": 0.4307248236048749, + "grad_norm": 0.8962127595984706, + "learning_rate": 0.00014044239716544868, + "loss": 0.6408, + "step": 6715 + }, + { + "epoch": 0.4310455420141116, + "grad_norm": 0.8365357084309853, + "learning_rate": 0.00014033998386342312, + "loss": 0.6256, + "step": 6720 + }, + { + "epoch": 0.4313662604233483, + "grad_norm": 1.0863245013957081, + "learning_rate": 0.000140237520002432, + "loss": 0.7068, + "step": 6725 + }, + { + "epoch": 0.431686978832585, + "grad_norm": 0.662268709969254, + "learning_rate": 0.0001401350057108955, + "loss": 0.7573, + "step": 6730 + }, + { + "epoch": 0.4320076972418217, + "grad_norm": 1.1715222453521494, + "learning_rate": 0.0001400324411172969, + "loss": 0.7574, + "step": 6735 + }, + { + "epoch": 0.4323284156510584, + "grad_norm": 0.9424425544184805, + "learning_rate": 0.0001399298263501827, + "loss": 0.8143, + "step": 6740 + }, + { + "epoch": 0.43264913406029504, + "grad_norm": 0.7955537139368879, + "learning_rate": 0.00013982716153816213, + "loss": 0.5263, + "step": 6745 + }, + { + "epoch": 0.43296985246953174, + "grad_norm": 0.9554382885880205, + "learning_rate": 0.00013972444680990722, + "loss": 0.6976, + "step": 6750 + }, + { + "epoch": 0.43329057087876843, + "grad_norm": 1.5328515613064213, + "learning_rate": 0.00013962168229415253, + "loss": 0.627, + "step": 6755 + }, + { + "epoch": 0.43361128928800513, + "grad_norm": 1.096222900496091, + "learning_rate": 0.00013951886811969501, + "loss": 0.8235, + "step": 6760 + }, + { + "epoch": 0.43393200769724183, + "grad_norm": 1.3093624744883847, + "learning_rate": 0.00013941600441539392, + "loss": 0.5996, + "step": 6765 + }, + { + "epoch": 0.4342527261064785, + "grad_norm": 0.9375701783813489, + "learning_rate": 0.00013931309131017046, + "loss": 0.8571, + "step": 6770 + }, + { + "epoch": 0.4345734445157152, + "grad_norm": 0.8981486814038466, + "learning_rate": 0.0001392101289330079, + "loss": 0.7036, + "step": 6775 + }, + { + "epoch": 0.43489416292495187, + "grad_norm": 1.030290589948871, + "learning_rate": 0.00013910711741295113, + "loss": 0.5523, + "step": 6780 + }, + { + "epoch": 0.43521488133418856, + "grad_norm": 0.7783354813811616, + "learning_rate": 0.00013900405687910676, + "loss": 0.6957, + "step": 6785 + }, + { + "epoch": 0.43553559974342526, + "grad_norm": 0.7762966668183622, + "learning_rate": 0.00013890094746064273, + "loss": 0.7249, + "step": 6790 + }, + { + "epoch": 0.43585631815266196, + "grad_norm": 1.0757163547744426, + "learning_rate": 0.0001387977892867883, + "loss": 0.7033, + "step": 6795 + }, + { + "epoch": 0.43617703656189866, + "grad_norm": 0.9414991837160046, + "learning_rate": 0.00013869458248683377, + "loss": 0.6503, + "step": 6800 + }, + { + "epoch": 0.43649775497113535, + "grad_norm": 1.1367581767585646, + "learning_rate": 0.0001385913271901305, + "loss": 0.6653, + "step": 6805 + }, + { + "epoch": 0.43681847338037205, + "grad_norm": 0.9718072928244804, + "learning_rate": 0.0001384880235260905, + "loss": 0.6126, + "step": 6810 + }, + { + "epoch": 0.43713919178960875, + "grad_norm": 1.051631260475179, + "learning_rate": 0.00013838467162418652, + "loss": 0.7529, + "step": 6815 + }, + { + "epoch": 0.4374599101988454, + "grad_norm": 1.1255123924704187, + "learning_rate": 0.00013828127161395165, + "loss": 0.7, + "step": 6820 + }, + { + "epoch": 0.4377806286080821, + "grad_norm": 0.6159074752294377, + "learning_rate": 0.00013817782362497938, + "loss": 0.7815, + "step": 6825 + }, + { + "epoch": 0.4381013470173188, + "grad_norm": 0.7651323158439101, + "learning_rate": 0.00013807432778692333, + "loss": 0.6508, + "step": 6830 + }, + { + "epoch": 0.4384220654265555, + "grad_norm": 1.49661820735196, + "learning_rate": 0.00013797078422949697, + "loss": 0.6949, + "step": 6835 + }, + { + "epoch": 0.4387427838357922, + "grad_norm": 0.9888853439915466, + "learning_rate": 0.0001378671930824737, + "loss": 0.6223, + "step": 6840 + }, + { + "epoch": 0.4390635022450289, + "grad_norm": 1.248537199848208, + "learning_rate": 0.00013776355447568648, + "loss": 0.8024, + "step": 6845 + }, + { + "epoch": 0.4393842206542656, + "grad_norm": 0.9575631631075234, + "learning_rate": 0.00013765986853902783, + "loss": 0.6739, + "step": 6850 + }, + { + "epoch": 0.4397049390635022, + "grad_norm": 0.9680343975909423, + "learning_rate": 0.00013755613540244958, + "loss": 0.6917, + "step": 6855 + }, + { + "epoch": 0.4400256574727389, + "grad_norm": 1.117269566951374, + "learning_rate": 0.00013745235519596263, + "loss": 0.7042, + "step": 6860 + }, + { + "epoch": 0.4403463758819756, + "grad_norm": 0.8619372703069825, + "learning_rate": 0.00013734852804963703, + "loss": 0.609, + "step": 6865 + }, + { + "epoch": 0.4406670942912123, + "grad_norm": 0.8117525588458958, + "learning_rate": 0.00013724465409360148, + "loss": 0.6981, + "step": 6870 + }, + { + "epoch": 0.440987812700449, + "grad_norm": 1.01398403519154, + "learning_rate": 0.0001371407334580434, + "loss": 0.6151, + "step": 6875 + }, + { + "epoch": 0.4413085311096857, + "grad_norm": 0.834092658374222, + "learning_rate": 0.00013703676627320886, + "loss": 0.7673, + "step": 6880 + }, + { + "epoch": 0.4416292495189224, + "grad_norm": 1.5311945048848135, + "learning_rate": 0.00013693275266940207, + "loss": 0.7119, + "step": 6885 + }, + { + "epoch": 0.44194996792815905, + "grad_norm": 1.527540376439275, + "learning_rate": 0.00013682869277698557, + "loss": 0.6265, + "step": 6890 + }, + { + "epoch": 0.44227068633739575, + "grad_norm": 0.7951368893260018, + "learning_rate": 0.00013672458672637984, + "loss": 0.8016, + "step": 6895 + }, + { + "epoch": 0.44259140474663244, + "grad_norm": 1.2763559389048758, + "learning_rate": 0.0001366204346480632, + "loss": 0.7206, + "step": 6900 + }, + { + "epoch": 0.44291212315586914, + "grad_norm": 0.8023255338282319, + "learning_rate": 0.00013651623667257164, + "loss": 0.7554, + "step": 6905 + }, + { + "epoch": 0.44323284156510584, + "grad_norm": 0.8695350841504818, + "learning_rate": 0.00013641199293049877, + "loss": 0.8358, + "step": 6910 + }, + { + "epoch": 0.44355355997434254, + "grad_norm": 0.9044131348318595, + "learning_rate": 0.0001363077035524955, + "loss": 0.6412, + "step": 6915 + }, + { + "epoch": 0.44387427838357923, + "grad_norm": 0.8127899752297872, + "learning_rate": 0.00013620336866926997, + "loss": 0.6957, + "step": 6920 + }, + { + "epoch": 0.44419499679281593, + "grad_norm": 0.8688512997555105, + "learning_rate": 0.00013609898841158725, + "loss": 0.724, + "step": 6925 + }, + { + "epoch": 0.4445157152020526, + "grad_norm": 0.8760877608220616, + "learning_rate": 0.0001359945629102694, + "loss": 0.5738, + "step": 6930 + }, + { + "epoch": 0.4448364336112893, + "grad_norm": 1.0325674004426306, + "learning_rate": 0.0001358900922961951, + "loss": 0.5873, + "step": 6935 + }, + { + "epoch": 0.44515715202052597, + "grad_norm": 0.8467908302129974, + "learning_rate": 0.00013578557670029966, + "loss": 0.7058, + "step": 6940 + }, + { + "epoch": 0.44547787042976267, + "grad_norm": 0.8131400613232301, + "learning_rate": 0.00013568101625357465, + "loss": 0.7422, + "step": 6945 + }, + { + "epoch": 0.44579858883899937, + "grad_norm": 0.724722516850653, + "learning_rate": 0.000135576411087068, + "loss": 0.6638, + "step": 6950 + }, + { + "epoch": 0.44611930724823606, + "grad_norm": 0.8948898208956525, + "learning_rate": 0.00013547176133188354, + "loss": 0.7129, + "step": 6955 + }, + { + "epoch": 0.44644002565747276, + "grad_norm": 1.0104789290655904, + "learning_rate": 0.00013536706711918107, + "loss": 0.7032, + "step": 6960 + }, + { + "epoch": 0.4467607440667094, + "grad_norm": 0.8414717932992289, + "learning_rate": 0.0001352623285801761, + "loss": 0.6836, + "step": 6965 + }, + { + "epoch": 0.4470814624759461, + "grad_norm": 1.1406826410807314, + "learning_rate": 0.00013515754584613962, + "loss": 0.6053, + "step": 6970 + }, + { + "epoch": 0.4474021808851828, + "grad_norm": 0.8742591243812547, + "learning_rate": 0.00013505271904839817, + "loss": 0.7431, + "step": 6975 + }, + { + "epoch": 0.4477228992944195, + "grad_norm": 0.6939509932441673, + "learning_rate": 0.00013494784831833337, + "loss": 0.6291, + "step": 6980 + }, + { + "epoch": 0.4480436177036562, + "grad_norm": 1.1945030623029917, + "learning_rate": 0.00013484293378738193, + "loss": 0.6403, + "step": 6985 + }, + { + "epoch": 0.4483643361128929, + "grad_norm": 1.2041604733537394, + "learning_rate": 0.0001347379755870355, + "loss": 0.7259, + "step": 6990 + }, + { + "epoch": 0.4486850545221296, + "grad_norm": 1.2915007724773113, + "learning_rate": 0.00013463297384884047, + "loss": 0.659, + "step": 6995 + }, + { + "epoch": 0.4490057729313663, + "grad_norm": 0.9604685032866782, + "learning_rate": 0.00013452792870439774, + "loss": 0.7607, + "step": 7000 + }, + { + "epoch": 0.44932649134060293, + "grad_norm": 0.683575690655945, + "learning_rate": 0.00013442284028536265, + "loss": 0.6597, + "step": 7005 + }, + { + "epoch": 0.4496472097498396, + "grad_norm": 0.8599337861042293, + "learning_rate": 0.0001343177087234447, + "loss": 0.6324, + "step": 7010 + }, + { + "epoch": 0.4499679281590763, + "grad_norm": 1.0590394622444155, + "learning_rate": 0.00013421253415040764, + "loss": 0.7187, + "step": 7015 + }, + { + "epoch": 0.450288646568313, + "grad_norm": 0.7304239044871675, + "learning_rate": 0.00013410731669806893, + "loss": 0.6951, + "step": 7020 + }, + { + "epoch": 0.4506093649775497, + "grad_norm": 0.6027716061436601, + "learning_rate": 0.00013400205649829986, + "loss": 0.6254, + "step": 7025 + }, + { + "epoch": 0.4509300833867864, + "grad_norm": 0.9290585913030099, + "learning_rate": 0.00013389675368302538, + "loss": 0.6395, + "step": 7030 + }, + { + "epoch": 0.4512508017960231, + "grad_norm": 0.6100444770178587, + "learning_rate": 0.00013379140838422368, + "loss": 0.6956, + "step": 7035 + }, + { + "epoch": 0.45157152020525976, + "grad_norm": 1.0560462270870308, + "learning_rate": 0.00013368602073392626, + "loss": 0.7217, + "step": 7040 + }, + { + "epoch": 0.45189223861449646, + "grad_norm": 0.9506970796048375, + "learning_rate": 0.00013358059086421777, + "loss": 0.7538, + "step": 7045 + }, + { + "epoch": 0.45221295702373315, + "grad_norm": 0.8472683366273123, + "learning_rate": 0.0001334751189072357, + "loss": 0.7699, + "step": 7050 + }, + { + "epoch": 0.45253367543296985, + "grad_norm": 0.8123297983190807, + "learning_rate": 0.00013336960499517035, + "loss": 0.7617, + "step": 7055 + }, + { + "epoch": 0.45285439384220655, + "grad_norm": 0.7432610908008688, + "learning_rate": 0.00013326404926026453, + "loss": 0.4966, + "step": 7060 + }, + { + "epoch": 0.45317511225144325, + "grad_norm": 1.9038556869996193, + "learning_rate": 0.00013315845183481352, + "loss": 0.7716, + "step": 7065 + }, + { + "epoch": 0.45349583066067994, + "grad_norm": 1.517420207283064, + "learning_rate": 0.0001330528128511648, + "loss": 0.7335, + "step": 7070 + }, + { + "epoch": 0.4538165490699166, + "grad_norm": 0.8901376925504432, + "learning_rate": 0.00013294713244171798, + "loss": 0.6803, + "step": 7075 + }, + { + "epoch": 0.4541372674791533, + "grad_norm": 0.9458291501306725, + "learning_rate": 0.0001328414107389246, + "loss": 0.8463, + "step": 7080 + }, + { + "epoch": 0.45445798588839, + "grad_norm": 0.771925264607674, + "learning_rate": 0.00013273564787528796, + "loss": 0.6271, + "step": 7085 + }, + { + "epoch": 0.4547787042976267, + "grad_norm": 0.9552006861914584, + "learning_rate": 0.00013262984398336287, + "loss": 0.6903, + "step": 7090 + }, + { + "epoch": 0.4550994227068634, + "grad_norm": 0.7912142730312611, + "learning_rate": 0.00013252399919575565, + "loss": 0.7355, + "step": 7095 + }, + { + "epoch": 0.4554201411161001, + "grad_norm": 0.8790500769675236, + "learning_rate": 0.0001324181136451238, + "loss": 0.6732, + "step": 7100 + }, + { + "epoch": 0.45574085952533677, + "grad_norm": 1.2386079454717946, + "learning_rate": 0.00013231218746417595, + "loss": 0.7522, + "step": 7105 + }, + { + "epoch": 0.45606157793457347, + "grad_norm": 0.7962051132713993, + "learning_rate": 0.0001322062207856717, + "loss": 0.8145, + "step": 7110 + }, + { + "epoch": 0.4563822963438101, + "grad_norm": 1.0329953407444796, + "learning_rate": 0.00013210021374242134, + "loss": 0.7769, + "step": 7115 + }, + { + "epoch": 0.4567030147530468, + "grad_norm": 0.9259650281367799, + "learning_rate": 0.00013199416646728573, + "loss": 0.6457, + "step": 7120 + }, + { + "epoch": 0.4570237331622835, + "grad_norm": 0.9088503892075743, + "learning_rate": 0.0001318880790931762, + "loss": 0.6294, + "step": 7125 + }, + { + "epoch": 0.4573444515715202, + "grad_norm": 0.8985892524046365, + "learning_rate": 0.00013178195175305438, + "loss": 0.6828, + "step": 7130 + }, + { + "epoch": 0.4576651699807569, + "grad_norm": 0.912515537663532, + "learning_rate": 0.00013167578457993188, + "loss": 0.7064, + "step": 7135 + }, + { + "epoch": 0.4579858883899936, + "grad_norm": 0.9729614181574077, + "learning_rate": 0.0001315695777068703, + "loss": 0.7272, + "step": 7140 + }, + { + "epoch": 0.4583066067992303, + "grad_norm": 0.6424734919666812, + "learning_rate": 0.00013146333126698103, + "loss": 0.6299, + "step": 7145 + }, + { + "epoch": 0.45862732520846694, + "grad_norm": 0.9359545383993509, + "learning_rate": 0.00013135704539342494, + "loss": 0.6424, + "step": 7150 + }, + { + "epoch": 0.45894804361770364, + "grad_norm": 0.7928212174336042, + "learning_rate": 0.00013125072021941248, + "loss": 0.6982, + "step": 7155 + }, + { + "epoch": 0.45926876202694034, + "grad_norm": 0.5352504172374731, + "learning_rate": 0.00013114435587820316, + "loss": 0.5291, + "step": 7160 + }, + { + "epoch": 0.45958948043617703, + "grad_norm": 0.7128732592198029, + "learning_rate": 0.00013103795250310577, + "loss": 0.7029, + "step": 7165 + }, + { + "epoch": 0.45991019884541373, + "grad_norm": 1.0850764381783637, + "learning_rate": 0.00013093151022747793, + "loss": 0.7707, + "step": 7170 + }, + { + "epoch": 0.46023091725465043, + "grad_norm": 1.0237223555264552, + "learning_rate": 0.000130825029184726, + "loss": 0.6769, + "step": 7175 + }, + { + "epoch": 0.4605516356638871, + "grad_norm": 1.1136242211182483, + "learning_rate": 0.00013071850950830492, + "loss": 0.5703, + "step": 7180 + }, + { + "epoch": 0.4608723540731238, + "grad_norm": 0.8143443059526504, + "learning_rate": 0.00013061195133171814, + "loss": 0.6334, + "step": 7185 + }, + { + "epoch": 0.46119307248236047, + "grad_norm": 0.9509973045912795, + "learning_rate": 0.00013050535478851728, + "loss": 0.6757, + "step": 7190 + }, + { + "epoch": 0.46151379089159716, + "grad_norm": 0.6191444236173257, + "learning_rate": 0.00013039872001230208, + "loss": 0.6217, + "step": 7195 + }, + { + "epoch": 0.46183450930083386, + "grad_norm": 0.7788953363838352, + "learning_rate": 0.00013029204713672015, + "loss": 0.7384, + "step": 7200 + }, + { + "epoch": 0.46215522771007056, + "grad_norm": 0.8450930304171778, + "learning_rate": 0.00013018533629546695, + "loss": 0.7298, + "step": 7205 + }, + { + "epoch": 0.46247594611930726, + "grad_norm": 1.0385186485500146, + "learning_rate": 0.0001300785876222854, + "loss": 0.6529, + "step": 7210 + }, + { + "epoch": 0.46279666452854396, + "grad_norm": 0.9152190048763487, + "learning_rate": 0.00012997180125096596, + "loss": 0.4276, + "step": 7215 + }, + { + "epoch": 0.46311738293778065, + "grad_norm": 0.9787836443016305, + "learning_rate": 0.00012986497731534618, + "loss": 0.63, + "step": 7220 + }, + { + "epoch": 0.4634381013470173, + "grad_norm": 0.9734043537474775, + "learning_rate": 0.00012975811594931094, + "loss": 0.7634, + "step": 7225 + }, + { + "epoch": 0.463758819756254, + "grad_norm": 0.9713910942202003, + "learning_rate": 0.00012965121728679175, + "loss": 0.757, + "step": 7230 + }, + { + "epoch": 0.4640795381654907, + "grad_norm": 0.9081157831943877, + "learning_rate": 0.00012954428146176703, + "loss": 0.7426, + "step": 7235 + }, + { + "epoch": 0.4644002565747274, + "grad_norm": 0.7116758820381245, + "learning_rate": 0.00012943730860826174, + "loss": 0.8052, + "step": 7240 + }, + { + "epoch": 0.4647209749839641, + "grad_norm": 0.8501864866133851, + "learning_rate": 0.00012933029886034723, + "loss": 0.7407, + "step": 7245 + }, + { + "epoch": 0.4650416933932008, + "grad_norm": 0.9701598818030126, + "learning_rate": 0.00012922325235214114, + "loss": 0.672, + "step": 7250 + }, + { + "epoch": 0.4653624118024375, + "grad_norm": 0.7147413441513334, + "learning_rate": 0.00012911616921780717, + "loss": 0.572, + "step": 7255 + }, + { + "epoch": 0.4656831302116741, + "grad_norm": 1.1031756310087157, + "learning_rate": 0.00012900904959155482, + "loss": 0.502, + "step": 7260 + }, + { + "epoch": 0.4660038486209108, + "grad_norm": 0.9549539883250536, + "learning_rate": 0.0001289018936076395, + "loss": 0.7697, + "step": 7265 + }, + { + "epoch": 0.4663245670301475, + "grad_norm": 0.7061368474604979, + "learning_rate": 0.00012879470140036205, + "loss": 0.77, + "step": 7270 + }, + { + "epoch": 0.4666452854393842, + "grad_norm": 0.8174054654625066, + "learning_rate": 0.00012868747310406875, + "loss": 0.644, + "step": 7275 + }, + { + "epoch": 0.4669660038486209, + "grad_norm": 1.0847763653058102, + "learning_rate": 0.00012858020885315118, + "loss": 0.6265, + "step": 7280 + }, + { + "epoch": 0.4672867222578576, + "grad_norm": 0.7498493863919715, + "learning_rate": 0.00012847290878204584, + "loss": 0.6246, + "step": 7285 + }, + { + "epoch": 0.4676074406670943, + "grad_norm": 0.981941482754815, + "learning_rate": 0.0001283655730252343, + "loss": 0.6622, + "step": 7290 + }, + { + "epoch": 0.467928159076331, + "grad_norm": 0.9518018861299121, + "learning_rate": 0.00012825820171724267, + "loss": 0.6284, + "step": 7295 + }, + { + "epoch": 0.46824887748556765, + "grad_norm": 0.8663834243061985, + "learning_rate": 0.00012815079499264178, + "loss": 0.5667, + "step": 7300 + }, + { + "epoch": 0.46856959589480435, + "grad_norm": 0.7672027770311252, + "learning_rate": 0.00012804335298604672, + "loss": 0.7221, + "step": 7305 + }, + { + "epoch": 0.46889031430404104, + "grad_norm": 0.8035416637587046, + "learning_rate": 0.00012793587583211693, + "loss": 0.5737, + "step": 7310 + }, + { + "epoch": 0.46921103271327774, + "grad_norm": 0.7309561664000054, + "learning_rate": 0.00012782836366555578, + "loss": 0.6313, + "step": 7315 + }, + { + "epoch": 0.46953175112251444, + "grad_norm": 0.6252749910832299, + "learning_rate": 0.00012772081662111053, + "loss": 0.6736, + "step": 7320 + }, + { + "epoch": 0.46985246953175114, + "grad_norm": 1.025835083057594, + "learning_rate": 0.00012761323483357227, + "loss": 0.5665, + "step": 7325 + }, + { + "epoch": 0.47017318794098784, + "grad_norm": 0.6525095712503345, + "learning_rate": 0.00012750561843777552, + "loss": 0.6443, + "step": 7330 + }, + { + "epoch": 0.4704939063502245, + "grad_norm": 0.7418969128305869, + "learning_rate": 0.00012739796756859825, + "loss": 0.8236, + "step": 7335 + }, + { + "epoch": 0.4708146247594612, + "grad_norm": 1.0413884397203683, + "learning_rate": 0.00012729028236096155, + "loss": 0.6624, + "step": 7340 + }, + { + "epoch": 0.4711353431686979, + "grad_norm": 0.9159067009468284, + "learning_rate": 0.0001271825629498296, + "loss": 0.6376, + "step": 7345 + }, + { + "epoch": 0.47145606157793457, + "grad_norm": 0.5992387879000995, + "learning_rate": 0.0001270748094702095, + "loss": 0.5685, + "step": 7350 + }, + { + "epoch": 0.47177677998717127, + "grad_norm": 1.7163402868588182, + "learning_rate": 0.00012696702205715088, + "loss": 0.5311, + "step": 7355 + }, + { + "epoch": 0.47209749839640797, + "grad_norm": 0.7926851445802399, + "learning_rate": 0.00012685920084574618, + "loss": 0.7548, + "step": 7360 + }, + { + "epoch": 0.47241821680564466, + "grad_norm": 0.9751658539863987, + "learning_rate": 0.0001267513459711299, + "loss": 0.6665, + "step": 7365 + }, + { + "epoch": 0.47273893521488136, + "grad_norm": 1.0752483823874541, + "learning_rate": 0.00012664345756847892, + "loss": 0.583, + "step": 7370 + }, + { + "epoch": 0.473059653624118, + "grad_norm": 1.0127918776763205, + "learning_rate": 0.00012653553577301202, + "loss": 0.749, + "step": 7375 + }, + { + "epoch": 0.4733803720333547, + "grad_norm": 0.9059323990908674, + "learning_rate": 0.00012642758071999, + "loss": 0.7049, + "step": 7380 + }, + { + "epoch": 0.4737010904425914, + "grad_norm": 0.8259800182390388, + "learning_rate": 0.00012631959254471515, + "loss": 0.6771, + "step": 7385 + }, + { + "epoch": 0.4740218088518281, + "grad_norm": 1.47432552983105, + "learning_rate": 0.00012621157138253142, + "loss": 0.5965, + "step": 7390 + }, + { + "epoch": 0.4743425272610648, + "grad_norm": 0.9830245238116091, + "learning_rate": 0.00012610351736882402, + "loss": 0.7302, + "step": 7395 + }, + { + "epoch": 0.4746632456703015, + "grad_norm": 0.9860227904680734, + "learning_rate": 0.00012599543063901935, + "loss": 0.6942, + "step": 7400 + }, + { + "epoch": 0.4749839640795382, + "grad_norm": 0.9011424798066042, + "learning_rate": 0.00012588731132858486, + "loss": 0.6456, + "step": 7405 + }, + { + "epoch": 0.47530468248877483, + "grad_norm": 0.9091580384346607, + "learning_rate": 0.00012577915957302872, + "loss": 0.6091, + "step": 7410 + }, + { + "epoch": 0.47562540089801153, + "grad_norm": 0.9741008974793179, + "learning_rate": 0.00012567097550789997, + "loss": 0.6012, + "step": 7415 + }, + { + "epoch": 0.4759461193072482, + "grad_norm": 0.9602884477063278, + "learning_rate": 0.00012556275926878789, + "loss": 0.6792, + "step": 7420 + }, + { + "epoch": 0.4762668377164849, + "grad_norm": 0.6210052131474215, + "learning_rate": 0.00012545451099132225, + "loss": 0.6193, + "step": 7425 + }, + { + "epoch": 0.4765875561257216, + "grad_norm": 0.8832670583789428, + "learning_rate": 0.000125346230811173, + "loss": 0.6106, + "step": 7430 + }, + { + "epoch": 0.4769082745349583, + "grad_norm": 0.851189577398919, + "learning_rate": 0.00012523791886404986, + "loss": 0.8305, + "step": 7435 + }, + { + "epoch": 0.477228992944195, + "grad_norm": 1.2879732211506167, + "learning_rate": 0.00012512957528570265, + "loss": 0.5887, + "step": 7440 + }, + { + "epoch": 0.47754971135343166, + "grad_norm": 0.5699068076911031, + "learning_rate": 0.0001250212002119207, + "loss": 0.5558, + "step": 7445 + }, + { + "epoch": 0.47787042976266836, + "grad_norm": 1.1918583269997756, + "learning_rate": 0.00012491279377853268, + "loss": 0.6408, + "step": 7450 + }, + { + "epoch": 0.47819114817190506, + "grad_norm": 1.4317720523654553, + "learning_rate": 0.0001248043561214068, + "loss": 0.6172, + "step": 7455 + }, + { + "epoch": 0.47851186658114175, + "grad_norm": 1.0666113380037154, + "learning_rate": 0.0001246958873764503, + "loss": 0.7485, + "step": 7460 + }, + { + "epoch": 0.47883258499037845, + "grad_norm": 1.2123844625766853, + "learning_rate": 0.00012458738767960937, + "loss": 0.7277, + "step": 7465 + }, + { + "epoch": 0.47915330339961515, + "grad_norm": 0.6850700187680755, + "learning_rate": 0.00012447885716686892, + "loss": 0.6412, + "step": 7470 + }, + { + "epoch": 0.47947402180885185, + "grad_norm": 0.7818905955159324, + "learning_rate": 0.00012437029597425268, + "loss": 0.6845, + "step": 7475 + }, + { + "epoch": 0.47979474021808854, + "grad_norm": 0.7985800895037933, + "learning_rate": 0.00012426170423782265, + "loss": 0.7376, + "step": 7480 + }, + { + "epoch": 0.4801154586273252, + "grad_norm": 1.4988959271026578, + "learning_rate": 0.0001241530820936792, + "loss": 0.6025, + "step": 7485 + }, + { + "epoch": 0.4804361770365619, + "grad_norm": 0.7532644364170019, + "learning_rate": 0.00012404442967796077, + "loss": 0.7597, + "step": 7490 + }, + { + "epoch": 0.4807568954457986, + "grad_norm": 0.9781127180520404, + "learning_rate": 0.0001239357471268438, + "loss": 0.7113, + "step": 7495 + }, + { + "epoch": 0.4810776138550353, + "grad_norm": 1.2808191157193494, + "learning_rate": 0.00012382703457654247, + "loss": 0.7197, + "step": 7500 + }, + { + "epoch": 0.481398332264272, + "grad_norm": 0.9577008167614253, + "learning_rate": 0.00012371829216330842, + "loss": 0.6633, + "step": 7505 + }, + { + "epoch": 0.4817190506735087, + "grad_norm": 0.9163574634981259, + "learning_rate": 0.000123609520023431, + "loss": 0.6577, + "step": 7510 + }, + { + "epoch": 0.4820397690827454, + "grad_norm": 0.9436379402563304, + "learning_rate": 0.00012350071829323657, + "loss": 0.665, + "step": 7515 + }, + { + "epoch": 0.482360487491982, + "grad_norm": 0.8955893724229462, + "learning_rate": 0.0001233918871090887, + "loss": 0.65, + "step": 7520 + }, + { + "epoch": 0.4826812059012187, + "grad_norm": 1.1039069837177617, + "learning_rate": 0.0001232830266073879, + "loss": 0.6262, + "step": 7525 + }, + { + "epoch": 0.4830019243104554, + "grad_norm": 0.8240710234420133, + "learning_rate": 0.00012317413692457125, + "loss": 0.7796, + "step": 7530 + }, + { + "epoch": 0.4833226427196921, + "grad_norm": 0.5672101461672577, + "learning_rate": 0.0001230652181971126, + "loss": 0.6606, + "step": 7535 + }, + { + "epoch": 0.4836433611289288, + "grad_norm": 0.6312799174708051, + "learning_rate": 0.00012295627056152205, + "loss": 0.6847, + "step": 7540 + }, + { + "epoch": 0.4839640795381655, + "grad_norm": 0.9279904903302523, + "learning_rate": 0.0001228472941543461, + "loss": 0.7298, + "step": 7545 + }, + { + "epoch": 0.4842847979474022, + "grad_norm": 1.0061624072103414, + "learning_rate": 0.00012273828911216715, + "loss": 0.688, + "step": 7550 + }, + { + "epoch": 0.48460551635663884, + "grad_norm": 0.9531338313200752, + "learning_rate": 0.00012262925557160362, + "loss": 0.7381, + "step": 7555 + }, + { + "epoch": 0.48492623476587554, + "grad_norm": 0.9084381778100004, + "learning_rate": 0.0001225201936693095, + "loss": 0.5676, + "step": 7560 + }, + { + "epoch": 0.48524695317511224, + "grad_norm": 1.0203436397332364, + "learning_rate": 0.00012241110354197448, + "loss": 0.571, + "step": 7565 + }, + { + "epoch": 0.48556767158434894, + "grad_norm": 0.9169062207127215, + "learning_rate": 0.00012230198532632347, + "loss": 0.6456, + "step": 7570 + }, + { + "epoch": 0.48588838999358563, + "grad_norm": 0.6002350728637655, + "learning_rate": 0.0001221928391591167, + "loss": 0.6998, + "step": 7575 + }, + { + "epoch": 0.48620910840282233, + "grad_norm": 0.5575094896397851, + "learning_rate": 0.00012208366517714946, + "loss": 0.6751, + "step": 7580 + }, + { + "epoch": 0.48652982681205903, + "grad_norm": 0.7309868460212633, + "learning_rate": 0.00012197446351725174, + "loss": 0.6152, + "step": 7585 + }, + { + "epoch": 0.4868505452212957, + "grad_norm": 0.9692168543018325, + "learning_rate": 0.0001218652343162884, + "loss": 0.6374, + "step": 7590 + }, + { + "epoch": 0.48717126363053237, + "grad_norm": 0.7189150002506619, + "learning_rate": 0.00012175597771115871, + "loss": 0.7784, + "step": 7595 + }, + { + "epoch": 0.48749198203976907, + "grad_norm": 0.8123916784425887, + "learning_rate": 0.0001216466938387963, + "loss": 0.5559, + "step": 7600 + }, + { + "epoch": 0.48781270044900576, + "grad_norm": 0.903323959073406, + "learning_rate": 0.00012153738283616897, + "loss": 0.6245, + "step": 7605 + }, + { + "epoch": 0.48813341885824246, + "grad_norm": 1.1841897784251287, + "learning_rate": 0.00012142804484027862, + "loss": 0.7076, + "step": 7610 + }, + { + "epoch": 0.48845413726747916, + "grad_norm": 0.96970852663879, + "learning_rate": 0.0001213186799881608, + "loss": 0.6394, + "step": 7615 + }, + { + "epoch": 0.48877485567671586, + "grad_norm": 0.9366182177279975, + "learning_rate": 0.00012120928841688486, + "loss": 0.6738, + "step": 7620 + }, + { + "epoch": 0.48909557408595256, + "grad_norm": 0.6547998596688648, + "learning_rate": 0.0001210998702635536, + "loss": 0.5484, + "step": 7625 + }, + { + "epoch": 0.4894162924951892, + "grad_norm": 0.61835825910844, + "learning_rate": 0.00012099042566530318, + "loss": 0.7106, + "step": 7630 + }, + { + "epoch": 0.4897370109044259, + "grad_norm": 0.9889648893113016, + "learning_rate": 0.00012088095475930281, + "loss": 0.6665, + "step": 7635 + }, + { + "epoch": 0.4900577293136626, + "grad_norm": 1.0009313158645148, + "learning_rate": 0.00012077145768275473, + "loss": 0.7342, + "step": 7640 + }, + { + "epoch": 0.4903784477228993, + "grad_norm": 1.207980433506984, + "learning_rate": 0.00012066193457289397, + "loss": 0.797, + "step": 7645 + }, + { + "epoch": 0.490699166132136, + "grad_norm": 0.7854979595695312, + "learning_rate": 0.00012055238556698816, + "loss": 0.6988, + "step": 7650 + }, + { + "epoch": 0.4910198845413727, + "grad_norm": 0.7188797039130606, + "learning_rate": 0.00012044281080233746, + "loss": 0.7325, + "step": 7655 + }, + { + "epoch": 0.4913406029506094, + "grad_norm": 0.9561317362271494, + "learning_rate": 0.00012033321041627425, + "loss": 0.6506, + "step": 7660 + }, + { + "epoch": 0.4916613213598461, + "grad_norm": 0.7528076899928123, + "learning_rate": 0.00012022358454616306, + "loss": 0.5609, + "step": 7665 + }, + { + "epoch": 0.4919820397690827, + "grad_norm": 0.8596601027470778, + "learning_rate": 0.0001201139333294003, + "loss": 0.6597, + "step": 7670 + }, + { + "epoch": 0.4923027581783194, + "grad_norm": 0.6508137207715219, + "learning_rate": 0.00012000425690341422, + "loss": 0.4953, + "step": 7675 + }, + { + "epoch": 0.4926234765875561, + "grad_norm": 0.8505276898684504, + "learning_rate": 0.00011989455540566462, + "loss": 0.6649, + "step": 7680 + }, + { + "epoch": 0.4929441949967928, + "grad_norm": 0.758748378012195, + "learning_rate": 0.00011978482897364273, + "loss": 0.7204, + "step": 7685 + }, + { + "epoch": 0.4932649134060295, + "grad_norm": 0.8242651845310669, + "learning_rate": 0.00011967507774487108, + "loss": 0.6598, + "step": 7690 + }, + { + "epoch": 0.4935856318152662, + "grad_norm": 0.8816627197677691, + "learning_rate": 0.0001195653018569032, + "loss": 0.8369, + "step": 7695 + }, + { + "epoch": 0.4939063502245029, + "grad_norm": 0.781020774879966, + "learning_rate": 0.00011945550144732354, + "loss": 0.7912, + "step": 7700 + }, + { + "epoch": 0.49422706863373955, + "grad_norm": 0.5912028419510443, + "learning_rate": 0.00011934567665374732, + "loss": 0.673, + "step": 7705 + }, + { + "epoch": 0.49454778704297625, + "grad_norm": 0.7852150600454825, + "learning_rate": 0.00011923582761382031, + "loss": 0.6989, + "step": 7710 + }, + { + "epoch": 0.49486850545221295, + "grad_norm": 0.8345934386959575, + "learning_rate": 0.00011912595446521868, + "loss": 0.6319, + "step": 7715 + }, + { + "epoch": 0.49518922386144965, + "grad_norm": 1.2815263854782484, + "learning_rate": 0.0001190160573456488, + "loss": 0.6247, + "step": 7720 + }, + { + "epoch": 0.49550994227068634, + "grad_norm": 1.1234841964502218, + "learning_rate": 0.00011890613639284704, + "loss": 0.653, + "step": 7725 + }, + { + "epoch": 0.49583066067992304, + "grad_norm": 0.9428012694473118, + "learning_rate": 0.00011879619174457976, + "loss": 0.9064, + "step": 7730 + }, + { + "epoch": 0.49615137908915974, + "grad_norm": 0.7822481283735353, + "learning_rate": 0.00011868622353864285, + "loss": 0.5887, + "step": 7735 + }, + { + "epoch": 0.4964720974983964, + "grad_norm": 0.6197300598147442, + "learning_rate": 0.00011857623191286186, + "loss": 0.5871, + "step": 7740 + }, + { + "epoch": 0.4967928159076331, + "grad_norm": 0.6742268900193886, + "learning_rate": 0.00011846621700509171, + "loss": 0.6153, + "step": 7745 + }, + { + "epoch": 0.4971135343168698, + "grad_norm": 1.0097074349573119, + "learning_rate": 0.00011835617895321633, + "loss": 0.726, + "step": 7750 + }, + { + "epoch": 0.4974342527261065, + "grad_norm": 0.7938742619155006, + "learning_rate": 0.00011824611789514881, + "loss": 0.7576, + "step": 7755 + }, + { + "epoch": 0.49775497113534317, + "grad_norm": 0.7594193522785816, + "learning_rate": 0.00011813603396883108, + "loss": 0.631, + "step": 7760 + }, + { + "epoch": 0.49807568954457987, + "grad_norm": 1.1449681048330884, + "learning_rate": 0.0001180259273122336, + "loss": 0.8346, + "step": 7765 + }, + { + "epoch": 0.49839640795381657, + "grad_norm": 0.6106704277152839, + "learning_rate": 0.00011791579806335547, + "loss": 0.7094, + "step": 7770 + }, + { + "epoch": 0.49871712636305326, + "grad_norm": 0.9764152562715487, + "learning_rate": 0.000117805646360224, + "loss": 0.7922, + "step": 7775 + }, + { + "epoch": 0.4990378447722899, + "grad_norm": 1.4581971435959649, + "learning_rate": 0.00011769547234089469, + "loss": 0.7598, + "step": 7780 + }, + { + "epoch": 0.4993585631815266, + "grad_norm": 1.1726593622900077, + "learning_rate": 0.00011758527614345097, + "loss": 0.6934, + "step": 7785 + }, + { + "epoch": 0.4996792815907633, + "grad_norm": 1.382229173196648, + "learning_rate": 0.00011747505790600412, + "loss": 0.6793, + "step": 7790 + }, + { + "epoch": 0.5, + "grad_norm": 0.7583044707535523, + "learning_rate": 0.00011736481776669306, + "loss": 0.7244, + "step": 7795 + }, + { + "epoch": 0.5003207184092366, + "grad_norm": 1.0327502481504163, + "learning_rate": 0.000117254555863684, + "loss": 0.7023, + "step": 7800 + }, + { + "epoch": 0.5006414368184734, + "grad_norm": 0.6928521319692996, + "learning_rate": 0.00011714427233517069, + "loss": 0.5508, + "step": 7805 + }, + { + "epoch": 0.50096215522771, + "grad_norm": 0.6645980452165248, + "learning_rate": 0.0001170339673193737, + "loss": 0.7463, + "step": 7810 + }, + { + "epoch": 0.5012828736369468, + "grad_norm": 0.6668044106727686, + "learning_rate": 0.00011692364095454076, + "loss": 0.6357, + "step": 7815 + }, + { + "epoch": 0.5016035920461834, + "grad_norm": 0.9287710383565055, + "learning_rate": 0.00011681329337894623, + "loss": 0.6308, + "step": 7820 + }, + { + "epoch": 0.5019243104554202, + "grad_norm": 1.3104043465513664, + "learning_rate": 0.0001167029247308911, + "loss": 0.5399, + "step": 7825 + }, + { + "epoch": 0.5022450288646568, + "grad_norm": 1.428373507944948, + "learning_rate": 0.00011659253514870276, + "loss": 0.7011, + "step": 7830 + }, + { + "epoch": 0.5025657472738935, + "grad_norm": 0.833100109623975, + "learning_rate": 0.00011648212477073484, + "loss": 0.7404, + "step": 7835 + }, + { + "epoch": 0.5028864656831302, + "grad_norm": 1.0751700158927022, + "learning_rate": 0.00011637169373536698, + "loss": 0.6389, + "step": 7840 + }, + { + "epoch": 0.5032071840923669, + "grad_norm": 0.9610389244865, + "learning_rate": 0.00011626124218100483, + "loss": 0.732, + "step": 7845 + }, + { + "epoch": 0.5035279025016036, + "grad_norm": 1.4064338381179782, + "learning_rate": 0.00011615077024607965, + "loss": 0.7248, + "step": 7850 + }, + { + "epoch": 0.5038486209108403, + "grad_norm": 1.0089167449788845, + "learning_rate": 0.00011604027806904833, + "loss": 0.6808, + "step": 7855 + }, + { + "epoch": 0.504169339320077, + "grad_norm": 0.8297282225570892, + "learning_rate": 0.00011592976578839303, + "loss": 0.7505, + "step": 7860 + }, + { + "epoch": 0.5044900577293137, + "grad_norm": 0.8562597418732677, + "learning_rate": 0.00011581923354262117, + "loss": 0.7069, + "step": 7865 + }, + { + "epoch": 0.5048107761385503, + "grad_norm": 1.1555443138727173, + "learning_rate": 0.00011570868147026517, + "loss": 0.6213, + "step": 7870 + }, + { + "epoch": 0.505131494547787, + "grad_norm": 1.4259877059174733, + "learning_rate": 0.00011559810970988232, + "loss": 0.6105, + "step": 7875 + }, + { + "epoch": 0.5054522129570237, + "grad_norm": 0.6183735071336424, + "learning_rate": 0.00011548751840005459, + "loss": 0.4662, + "step": 7880 + }, + { + "epoch": 0.5057729313662604, + "grad_norm": 0.9453435423443054, + "learning_rate": 0.00011537690767938843, + "loss": 0.6083, + "step": 7885 + }, + { + "epoch": 0.5060936497754971, + "grad_norm": 0.6729282582317203, + "learning_rate": 0.00011526627768651459, + "loss": 0.7553, + "step": 7890 + }, + { + "epoch": 0.5064143681847338, + "grad_norm": 0.8579324957843062, + "learning_rate": 0.00011515562856008808, + "loss": 0.7014, + "step": 7895 + }, + { + "epoch": 0.5067350865939705, + "grad_norm": 0.9652710068101304, + "learning_rate": 0.00011504496043878776, + "loss": 0.7203, + "step": 7900 + }, + { + "epoch": 0.5070558050032072, + "grad_norm": 1.3328325121052935, + "learning_rate": 0.00011493427346131636, + "loss": 0.7462, + "step": 7905 + }, + { + "epoch": 0.5073765234124439, + "grad_norm": 0.7750774157499563, + "learning_rate": 0.00011482356776640028, + "loss": 0.7554, + "step": 7910 + }, + { + "epoch": 0.5076972418216805, + "grad_norm": 0.7771858604565626, + "learning_rate": 0.00011471284349278928, + "loss": 0.7032, + "step": 7915 + }, + { + "epoch": 0.5080179602309173, + "grad_norm": 0.9990707053591126, + "learning_rate": 0.0001146021007792565, + "loss": 0.5966, + "step": 7920 + }, + { + "epoch": 0.5083386786401539, + "grad_norm": 0.9864579497103747, + "learning_rate": 0.00011449133976459816, + "loss": 0.701, + "step": 7925 + }, + { + "epoch": 0.5086593970493907, + "grad_norm": 0.9752505086126679, + "learning_rate": 0.0001143805605876334, + "loss": 0.6502, + "step": 7930 + }, + { + "epoch": 0.5089801154586273, + "grad_norm": 1.3306389404931571, + "learning_rate": 0.00011426976338720412, + "loss": 0.6592, + "step": 7935 + }, + { + "epoch": 0.5093008338678641, + "grad_norm": 0.6705402480174242, + "learning_rate": 0.00011415894830217486, + "loss": 0.6531, + "step": 7940 + }, + { + "epoch": 0.5096215522771007, + "grad_norm": 0.8130683741487627, + "learning_rate": 0.00011404811547143251, + "loss": 0.7333, + "step": 7945 + }, + { + "epoch": 0.5099422706863374, + "grad_norm": 1.1664159763922086, + "learning_rate": 0.0001139372650338862, + "loss": 0.8146, + "step": 7950 + }, + { + "epoch": 0.5102629890955741, + "grad_norm": 0.5999515830143689, + "learning_rate": 0.00011382639712846721, + "loss": 0.5825, + "step": 7955 + }, + { + "epoch": 0.5105837075048107, + "grad_norm": 1.1054727651684402, + "learning_rate": 0.00011371551189412868, + "loss": 0.7374, + "step": 7960 + }, + { + "epoch": 0.5109044259140475, + "grad_norm": 1.0319949146313503, + "learning_rate": 0.00011360460946984537, + "loss": 0.7562, + "step": 7965 + }, + { + "epoch": 0.5112251443232841, + "grad_norm": 0.6047170156572763, + "learning_rate": 0.00011349368999461374, + "loss": 0.7588, + "step": 7970 + }, + { + "epoch": 0.5115458627325209, + "grad_norm": 0.8725079332758466, + "learning_rate": 0.00011338275360745147, + "loss": 0.7421, + "step": 7975 + }, + { + "epoch": 0.5118665811417575, + "grad_norm": 0.784376771151006, + "learning_rate": 0.00011327180044739755, + "loss": 0.5837, + "step": 7980 + }, + { + "epoch": 0.5121872995509942, + "grad_norm": 0.8977359490481988, + "learning_rate": 0.00011316083065351195, + "loss": 0.7392, + "step": 7985 + }, + { + "epoch": 0.5125080179602309, + "grad_norm": 0.653772242009018, + "learning_rate": 0.00011304984436487551, + "loss": 0.6166, + "step": 7990 + }, + { + "epoch": 0.5128287363694676, + "grad_norm": 1.2310492343797879, + "learning_rate": 0.00011293884172058971, + "loss": 0.5507, + "step": 7995 + }, + { + "epoch": 0.5131494547787043, + "grad_norm": 1.0077531207139014, + "learning_rate": 0.00011282782285977649, + "loss": 0.6358, + "step": 8000 + }, + { + "epoch": 0.513470173187941, + "grad_norm": 1.19554249733326, + "learning_rate": 0.00011271678792157823, + "loss": 0.6614, + "step": 8005 + }, + { + "epoch": 0.5137908915971777, + "grad_norm": 0.8654028252618859, + "learning_rate": 0.00011260573704515734, + "loss": 0.6444, + "step": 8010 + }, + { + "epoch": 0.5141116100064144, + "grad_norm": 0.9637998906695273, + "learning_rate": 0.00011249467036969632, + "loss": 0.6859, + "step": 8015 + }, + { + "epoch": 0.514432328415651, + "grad_norm": 1.2621981138132725, + "learning_rate": 0.00011238358803439739, + "loss": 0.7247, + "step": 8020 + }, + { + "epoch": 0.5147530468248878, + "grad_norm": 0.6255230049474781, + "learning_rate": 0.0001122724901784824, + "loss": 0.7025, + "step": 8025 + }, + { + "epoch": 0.5150737652341244, + "grad_norm": 0.8124027597004405, + "learning_rate": 0.00011216137694119271, + "loss": 0.6465, + "step": 8030 + }, + { + "epoch": 0.5153944836433612, + "grad_norm": 0.7060753692578354, + "learning_rate": 0.00011205024846178886, + "loss": 0.5977, + "step": 8035 + }, + { + "epoch": 0.5157152020525978, + "grad_norm": 0.9066775542047206, + "learning_rate": 0.00011193910487955059, + "loss": 0.6407, + "step": 8040 + }, + { + "epoch": 0.5160359204618346, + "grad_norm": 0.6903326908804434, + "learning_rate": 0.00011182794633377653, + "loss": 0.6925, + "step": 8045 + }, + { + "epoch": 0.5163566388710712, + "grad_norm": 0.9472934152436594, + "learning_rate": 0.00011171677296378411, + "loss": 0.7609, + "step": 8050 + }, + { + "epoch": 0.5166773572803078, + "grad_norm": 1.0828907895794335, + "learning_rate": 0.0001116055849089092, + "loss": 0.7855, + "step": 8055 + }, + { + "epoch": 0.5169980756895446, + "grad_norm": 1.3155495321215651, + "learning_rate": 0.00011149438230850626, + "loss": 0.6561, + "step": 8060 + }, + { + "epoch": 0.5173187940987812, + "grad_norm": 0.7751536928800652, + "learning_rate": 0.00011138316530194782, + "loss": 0.6302, + "step": 8065 + }, + { + "epoch": 0.517639512508018, + "grad_norm": 1.278374102598091, + "learning_rate": 0.00011127193402862457, + "loss": 0.6741, + "step": 8070 + }, + { + "epoch": 0.5179602309172546, + "grad_norm": 0.7961067269873462, + "learning_rate": 0.00011116068862794506, + "loss": 0.7248, + "step": 8075 + }, + { + "epoch": 0.5182809493264914, + "grad_norm": 0.9325619210714818, + "learning_rate": 0.0001110494292393355, + "loss": 0.6036, + "step": 8080 + }, + { + "epoch": 0.518601667735728, + "grad_norm": 0.9427970552237784, + "learning_rate": 0.00011093815600223966, + "loss": 0.6906, + "step": 8085 + }, + { + "epoch": 0.5189223861449648, + "grad_norm": 0.9820235565256558, + "learning_rate": 0.00011082686905611872, + "loss": 0.6996, + "step": 8090 + }, + { + "epoch": 0.5192431045542014, + "grad_norm": 0.7847448260775505, + "learning_rate": 0.00011071556854045098, + "loss": 0.67, + "step": 8095 + }, + { + "epoch": 0.5195638229634381, + "grad_norm": 0.7114519312016215, + "learning_rate": 0.00011060425459473169, + "loss": 0.6844, + "step": 8100 + }, + { + "epoch": 0.5198845413726748, + "grad_norm": 0.6238373643554763, + "learning_rate": 0.00011049292735847312, + "loss": 0.5971, + "step": 8105 + }, + { + "epoch": 0.5202052597819115, + "grad_norm": 0.9399929160198239, + "learning_rate": 0.00011038158697120395, + "loss": 0.6189, + "step": 8110 + }, + { + "epoch": 0.5205259781911482, + "grad_norm": 1.1129758526237858, + "learning_rate": 0.00011027023357246955, + "loss": 0.7023, + "step": 8115 + }, + { + "epoch": 0.5208466966003849, + "grad_norm": 1.049212324811729, + "learning_rate": 0.00011015886730183152, + "loss": 0.7014, + "step": 8120 + }, + { + "epoch": 0.5211674150096216, + "grad_norm": 0.8599253114644705, + "learning_rate": 0.00011004748829886755, + "loss": 0.6835, + "step": 8125 + }, + { + "epoch": 0.5214881334188582, + "grad_norm": 0.6066610732008468, + "learning_rate": 0.0001099360967031714, + "loss": 0.5214, + "step": 8130 + }, + { + "epoch": 0.5218088518280949, + "grad_norm": 0.8343848602348406, + "learning_rate": 0.00010982469265435249, + "loss": 0.6169, + "step": 8135 + }, + { + "epoch": 0.5221295702373316, + "grad_norm": 0.4237175002588996, + "learning_rate": 0.00010971327629203587, + "loss": 0.5628, + "step": 8140 + }, + { + "epoch": 0.5224502886465683, + "grad_norm": 0.7612853893387608, + "learning_rate": 0.00010960184775586209, + "loss": 0.6496, + "step": 8145 + }, + { + "epoch": 0.522771007055805, + "grad_norm": 0.7090497030288603, + "learning_rate": 0.00010949040718548693, + "loss": 0.6699, + "step": 8150 + }, + { + "epoch": 0.5230917254650417, + "grad_norm": 0.8137233187040953, + "learning_rate": 0.00010937895472058126, + "loss": 0.7825, + "step": 8155 + }, + { + "epoch": 0.5234124438742784, + "grad_norm": 1.106458178679526, + "learning_rate": 0.0001092674905008308, + "loss": 0.5917, + "step": 8160 + }, + { + "epoch": 0.5237331622835151, + "grad_norm": 1.1023421333903827, + "learning_rate": 0.00010915601466593604, + "loss": 0.652, + "step": 8165 + }, + { + "epoch": 0.5240538806927517, + "grad_norm": 1.2339053368878727, + "learning_rate": 0.00010904452735561204, + "loss": 0.7531, + "step": 8170 + }, + { + "epoch": 0.5243745991019885, + "grad_norm": 0.8536672713520308, + "learning_rate": 0.00010893302870958824, + "loss": 0.6808, + "step": 8175 + }, + { + "epoch": 0.5246953175112251, + "grad_norm": 0.9072452347961674, + "learning_rate": 0.00010882151886760827, + "loss": 0.7883, + "step": 8180 + }, + { + "epoch": 0.5250160359204619, + "grad_norm": 0.705408047927468, + "learning_rate": 0.00010870999796942986, + "loss": 0.7448, + "step": 8185 + }, + { + "epoch": 0.5253367543296985, + "grad_norm": 0.84842819642806, + "learning_rate": 0.00010859846615482448, + "loss": 0.7873, + "step": 8190 + }, + { + "epoch": 0.5256574727389353, + "grad_norm": 0.9668127437981949, + "learning_rate": 0.00010848692356357735, + "loss": 0.6553, + "step": 8195 + }, + { + "epoch": 0.5259781911481719, + "grad_norm": 1.3910270737631052, + "learning_rate": 0.00010837537033548718, + "loss": 0.551, + "step": 8200 + }, + { + "epoch": 0.5262989095574085, + "grad_norm": 0.8934045053705592, + "learning_rate": 0.00010826380661036601, + "loss": 0.755, + "step": 8205 + }, + { + "epoch": 0.5266196279666453, + "grad_norm": 0.7580165266865208, + "learning_rate": 0.0001081522325280391, + "loss": 0.6785, + "step": 8210 + }, + { + "epoch": 0.5269403463758819, + "grad_norm": 0.895270436973056, + "learning_rate": 0.00010804064822834461, + "loss": 0.6188, + "step": 8215 + }, + { + "epoch": 0.5272610647851187, + "grad_norm": 0.8349917473129711, + "learning_rate": 0.0001079290538511335, + "loss": 0.5295, + "step": 8220 + }, + { + "epoch": 0.5275817831943553, + "grad_norm": 1.0937712586985149, + "learning_rate": 0.00010781744953626944, + "loss": 0.718, + "step": 8225 + }, + { + "epoch": 0.5279025016035921, + "grad_norm": 0.9776711832493594, + "learning_rate": 0.00010770583542362848, + "loss": 0.7394, + "step": 8230 + }, + { + "epoch": 0.5282232200128287, + "grad_norm": 0.9916244110681041, + "learning_rate": 0.00010759421165309898, + "loss": 0.6302, + "step": 8235 + }, + { + "epoch": 0.5285439384220654, + "grad_norm": 0.7709724576720045, + "learning_rate": 0.00010748257836458142, + "loss": 0.4377, + "step": 8240 + }, + { + "epoch": 0.5288646568313021, + "grad_norm": 0.9553016321868766, + "learning_rate": 0.00010737093569798815, + "loss": 0.5929, + "step": 8245 + }, + { + "epoch": 0.5291853752405388, + "grad_norm": 0.5921375135170813, + "learning_rate": 0.00010725928379324335, + "loss": 0.6308, + "step": 8250 + }, + { + "epoch": 0.5295060936497755, + "grad_norm": 0.9409908884682822, + "learning_rate": 0.00010714762279028275, + "loss": 0.6488, + "step": 8255 + }, + { + "epoch": 0.5298268120590122, + "grad_norm": 0.9164401991956044, + "learning_rate": 0.00010703595282905343, + "loss": 0.7185, + "step": 8260 + }, + { + "epoch": 0.5301475304682489, + "grad_norm": 0.7915811080548818, + "learning_rate": 0.00010692427404951379, + "loss": 0.7002, + "step": 8265 + }, + { + "epoch": 0.5304682488774856, + "grad_norm": 1.1633281858494344, + "learning_rate": 0.00010681258659163322, + "loss": 0.7142, + "step": 8270 + }, + { + "epoch": 0.5307889672867223, + "grad_norm": 1.1360488426032926, + "learning_rate": 0.00010670089059539201, + "loss": 0.6164, + "step": 8275 + }, + { + "epoch": 0.531109685695959, + "grad_norm": 0.9950081272171089, + "learning_rate": 0.0001065891862007811, + "loss": 0.5403, + "step": 8280 + }, + { + "epoch": 0.5314304041051956, + "grad_norm": 1.0499402732473173, + "learning_rate": 0.00010647747354780206, + "loss": 0.6409, + "step": 8285 + }, + { + "epoch": 0.5317511225144324, + "grad_norm": 0.9441134224109928, + "learning_rate": 0.00010636575277646672, + "loss": 0.5947, + "step": 8290 + }, + { + "epoch": 0.532071840923669, + "grad_norm": 1.3058395760608197, + "learning_rate": 0.00010625402402679712, + "loss": 0.6901, + "step": 8295 + }, + { + "epoch": 0.5323925593329057, + "grad_norm": 0.8650565306977751, + "learning_rate": 0.0001061422874388253, + "loss": 0.6536, + "step": 8300 + }, + { + "epoch": 0.5327132777421424, + "grad_norm": 1.1023501837328433, + "learning_rate": 0.0001060305431525931, + "loss": 0.7735, + "step": 8305 + }, + { + "epoch": 0.5330339961513791, + "grad_norm": 0.7402707462941108, + "learning_rate": 0.00010591879130815206, + "loss": 0.7746, + "step": 8310 + }, + { + "epoch": 0.5333547145606158, + "grad_norm": 1.0334014975634367, + "learning_rate": 0.0001058070320455631, + "loss": 0.6197, + "step": 8315 + }, + { + "epoch": 0.5336754329698524, + "grad_norm": 0.8973174424463937, + "learning_rate": 0.00010569526550489656, + "loss": 0.6662, + "step": 8320 + }, + { + "epoch": 0.5339961513790892, + "grad_norm": 1.1260137879030736, + "learning_rate": 0.00010558349182623182, + "loss": 0.7384, + "step": 8325 + }, + { + "epoch": 0.5343168697883258, + "grad_norm": 1.0775603650728314, + "learning_rate": 0.00010547171114965721, + "loss": 0.53, + "step": 8330 + }, + { + "epoch": 0.5346375881975626, + "grad_norm": 0.8657241626493881, + "learning_rate": 0.00010535992361526986, + "loss": 0.6597, + "step": 8335 + }, + { + "epoch": 0.5349583066067992, + "grad_norm": 0.7754986474145258, + "learning_rate": 0.00010524812936317545, + "loss": 0.7155, + "step": 8340 + }, + { + "epoch": 0.535279025016036, + "grad_norm": 0.7235913108295569, + "learning_rate": 0.00010513632853348817, + "loss": 0.63, + "step": 8345 + }, + { + "epoch": 0.5355997434252726, + "grad_norm": 1.0376021153773205, + "learning_rate": 0.00010502452126633033, + "loss": 0.7389, + "step": 8350 + }, + { + "epoch": 0.5359204618345093, + "grad_norm": 1.0736867388991156, + "learning_rate": 0.00010491270770183241, + "loss": 0.7524, + "step": 8355 + }, + { + "epoch": 0.536241180243746, + "grad_norm": 1.2875466262160882, + "learning_rate": 0.00010480088798013274, + "loss": 0.7637, + "step": 8360 + }, + { + "epoch": 0.5365618986529826, + "grad_norm": 1.0698179015991502, + "learning_rate": 0.00010468906224137736, + "loss": 0.7777, + "step": 8365 + }, + { + "epoch": 0.5368826170622194, + "grad_norm": 0.715308845951178, + "learning_rate": 0.00010457723062571984, + "loss": 0.581, + "step": 8370 + }, + { + "epoch": 0.537203335471456, + "grad_norm": 1.9992463200156003, + "learning_rate": 0.00010446539327332121, + "loss": 0.6813, + "step": 8375 + }, + { + "epoch": 0.5375240538806928, + "grad_norm": 0.9082670120549011, + "learning_rate": 0.00010435355032434958, + "loss": 0.8172, + "step": 8380 + }, + { + "epoch": 0.5378447722899294, + "grad_norm": 0.5039137526581597, + "learning_rate": 0.00010424170191898006, + "loss": 0.6443, + "step": 8385 + }, + { + "epoch": 0.5381654906991661, + "grad_norm": 0.8357611125226391, + "learning_rate": 0.00010412984819739473, + "loss": 0.6672, + "step": 8390 + }, + { + "epoch": 0.5384862091084028, + "grad_norm": 0.9107912987485977, + "learning_rate": 0.00010401798929978224, + "loss": 0.6107, + "step": 8395 + }, + { + "epoch": 0.5388069275176395, + "grad_norm": 0.8281442376194428, + "learning_rate": 0.0001039061253663377, + "loss": 0.6075, + "step": 8400 + }, + { + "epoch": 0.5391276459268762, + "grad_norm": 0.7249862380029812, + "learning_rate": 0.00010379425653726263, + "loss": 0.7265, + "step": 8405 + }, + { + "epoch": 0.5394483643361129, + "grad_norm": 0.9092092180370709, + "learning_rate": 0.00010368238295276455, + "loss": 0.6893, + "step": 8410 + }, + { + "epoch": 0.5397690827453496, + "grad_norm": 0.6540167568734936, + "learning_rate": 0.0001035705047530571, + "loss": 0.7305, + "step": 8415 + }, + { + "epoch": 0.5400898011545863, + "grad_norm": 0.7981383776198956, + "learning_rate": 0.00010345862207835957, + "loss": 0.6453, + "step": 8420 + }, + { + "epoch": 0.5404105195638229, + "grad_norm": 0.945104000015912, + "learning_rate": 0.00010334673506889696, + "loss": 0.7016, + "step": 8425 + }, + { + "epoch": 0.5407312379730597, + "grad_norm": 1.0547131113611765, + "learning_rate": 0.00010323484386489961, + "loss": 0.7347, + "step": 8430 + }, + { + "epoch": 0.5410519563822963, + "grad_norm": 0.8025281891388182, + "learning_rate": 0.00010312294860660319, + "loss": 0.5264, + "step": 8435 + }, + { + "epoch": 0.5413726747915331, + "grad_norm": 0.9019250163215435, + "learning_rate": 0.0001030110494342484, + "loss": 0.5963, + "step": 8440 + }, + { + "epoch": 0.5416933932007697, + "grad_norm": 0.6368675777184184, + "learning_rate": 0.00010289914648808088, + "loss": 0.5399, + "step": 8445 + }, + { + "epoch": 0.5420141116100065, + "grad_norm": 0.8008826667949324, + "learning_rate": 0.00010278723990835097, + "loss": 0.7476, + "step": 8450 + }, + { + "epoch": 0.5423348300192431, + "grad_norm": 0.7219125921723233, + "learning_rate": 0.0001026753298353136, + "loss": 0.5883, + "step": 8455 + }, + { + "epoch": 0.5426555484284797, + "grad_norm": 0.6992313736984004, + "learning_rate": 0.0001025634164092281, + "loss": 0.5797, + "step": 8460 + }, + { + "epoch": 0.5429762668377165, + "grad_norm": 0.44695714450265767, + "learning_rate": 0.00010245149977035792, + "loss": 0.6473, + "step": 8465 + }, + { + "epoch": 0.5432969852469531, + "grad_norm": 1.248682759415961, + "learning_rate": 0.00010233958005897058, + "loss": 0.5812, + "step": 8470 + }, + { + "epoch": 0.5436177036561899, + "grad_norm": 1.0568826134330056, + "learning_rate": 0.00010222765741533744, + "loss": 0.7862, + "step": 8475 + }, + { + "epoch": 0.5439384220654265, + "grad_norm": 0.8116820280676993, + "learning_rate": 0.00010211573197973356, + "loss": 0.6353, + "step": 8480 + }, + { + "epoch": 0.5442591404746633, + "grad_norm": 0.9997535811765578, + "learning_rate": 0.00010200380389243753, + "loss": 0.7229, + "step": 8485 + }, + { + "epoch": 0.5445798588838999, + "grad_norm": 0.8261136419022004, + "learning_rate": 0.00010189187329373113, + "loss": 0.6919, + "step": 8490 + }, + { + "epoch": 0.5449005772931367, + "grad_norm": 0.7977851457213406, + "learning_rate": 0.00010177994032389946, + "loss": 0.5777, + "step": 8495 + }, + { + "epoch": 0.5452212957023733, + "grad_norm": 1.211421213402399, + "learning_rate": 0.00010166800512323043, + "loss": 0.6434, + "step": 8500 + }, + { + "epoch": 0.54554201411161, + "grad_norm": 2.0722177427022244, + "learning_rate": 0.00010155606783201488, + "loss": 0.5933, + "step": 8505 + }, + { + "epoch": 0.5458627325208467, + "grad_norm": 0.7874345109274467, + "learning_rate": 0.00010144412859054617, + "loss": 0.8209, + "step": 8510 + }, + { + "epoch": 0.5461834509300834, + "grad_norm": 0.5164159774237933, + "learning_rate": 0.00010133218753912023, + "loss": 0.6337, + "step": 8515 + }, + { + "epoch": 0.5465041693393201, + "grad_norm": 0.9997324723951748, + "learning_rate": 0.00010122024481803509, + "loss": 0.7799, + "step": 8520 + }, + { + "epoch": 0.5468248877485568, + "grad_norm": 0.868379009704931, + "learning_rate": 0.000101108300567591, + "loss": 0.6205, + "step": 8525 + }, + { + "epoch": 0.5471456061577935, + "grad_norm": 0.7487726179830052, + "learning_rate": 0.00010099635492809007, + "loss": 0.7024, + "step": 8530 + }, + { + "epoch": 0.5474663245670301, + "grad_norm": 0.784320611343729, + "learning_rate": 0.00010088440803983616, + "loss": 0.765, + "step": 8535 + }, + { + "epoch": 0.5477870429762668, + "grad_norm": 0.7657678123947386, + "learning_rate": 0.00010077246004313472, + "loss": 0.6496, + "step": 8540 + }, + { + "epoch": 0.5481077613855035, + "grad_norm": 0.7225029829590283, + "learning_rate": 0.00010066051107829259, + "loss": 0.6885, + "step": 8545 + }, + { + "epoch": 0.5484284797947402, + "grad_norm": 0.8979772778090884, + "learning_rate": 0.00010054856128561778, + "loss": 0.7111, + "step": 8550 + }, + { + "epoch": 0.5487491982039769, + "grad_norm": 1.322201085524258, + "learning_rate": 0.00010043661080541936, + "loss": 0.6252, + "step": 8555 + }, + { + "epoch": 0.5490699166132136, + "grad_norm": 0.6743113052462498, + "learning_rate": 0.00010032465977800726, + "loss": 0.5282, + "step": 8560 + }, + { + "epoch": 0.5493906350224503, + "grad_norm": 0.8693068518513947, + "learning_rate": 0.00010021270834369211, + "loss": 0.6029, + "step": 8565 + }, + { + "epoch": 0.549711353431687, + "grad_norm": 1.1870868813911406, + "learning_rate": 0.00010010075664278507, + "loss": 0.6264, + "step": 8570 + }, + { + "epoch": 0.5500320718409236, + "grad_norm": 1.0567858782770287, + "learning_rate": 9.998880481559755e-05, + "loss": 0.8018, + "step": 8575 + }, + { + "epoch": 0.5503527902501604, + "grad_norm": 0.8137731229847819, + "learning_rate": 9.987685300244117e-05, + "loss": 0.614, + "step": 8580 + }, + { + "epoch": 0.550673508659397, + "grad_norm": 0.9599816781819811, + "learning_rate": 9.976490134362759e-05, + "loss": 0.687, + "step": 8585 + }, + { + "epoch": 0.5509942270686338, + "grad_norm": 0.6181246421982609, + "learning_rate": 9.965294997946815e-05, + "loss": 0.6866, + "step": 8590 + }, + { + "epoch": 0.5513149454778704, + "grad_norm": 1.1348648251746718, + "learning_rate": 9.954099905027396e-05, + "loss": 0.6416, + "step": 8595 + }, + { + "epoch": 0.5516356638871072, + "grad_norm": 1.6639502602729528, + "learning_rate": 9.94290486963555e-05, + "loss": 0.6715, + "step": 8600 + }, + { + "epoch": 0.5519563822963438, + "grad_norm": 0.7678034571145345, + "learning_rate": 9.931709905802252e-05, + "loss": 0.6886, + "step": 8605 + }, + { + "epoch": 0.5522771007055804, + "grad_norm": 1.4578465770643851, + "learning_rate": 9.92051502755839e-05, + "loss": 0.7689, + "step": 8610 + }, + { + "epoch": 0.5525978191148172, + "grad_norm": 0.7434972557340698, + "learning_rate": 9.909320248934747e-05, + "loss": 0.6374, + "step": 8615 + }, + { + "epoch": 0.5529185375240538, + "grad_norm": 0.8031136082718469, + "learning_rate": 9.898125583961977e-05, + "loss": 0.7055, + "step": 8620 + }, + { + "epoch": 0.5532392559332906, + "grad_norm": 1.000878821455057, + "learning_rate": 9.886931046670598e-05, + "loss": 0.6157, + "step": 8625 + }, + { + "epoch": 0.5535599743425272, + "grad_norm": 0.6524291495733984, + "learning_rate": 9.875736651090956e-05, + "loss": 0.561, + "step": 8630 + }, + { + "epoch": 0.553880692751764, + "grad_norm": 1.3537142167105929, + "learning_rate": 9.864542411253229e-05, + "loss": 0.6718, + "step": 8635 + }, + { + "epoch": 0.5542014111610006, + "grad_norm": 1.2775573591627376, + "learning_rate": 9.853348341187398e-05, + "loss": 0.6645, + "step": 8640 + }, + { + "epoch": 0.5545221295702373, + "grad_norm": 0.982975595575632, + "learning_rate": 9.842154454923236e-05, + "loss": 0.5919, + "step": 8645 + }, + { + "epoch": 0.554842847979474, + "grad_norm": 0.960094691754927, + "learning_rate": 9.830960766490274e-05, + "loss": 0.8113, + "step": 8650 + }, + { + "epoch": 0.5551635663887107, + "grad_norm": 0.7965375300164668, + "learning_rate": 9.819767289917802e-05, + "loss": 0.5782, + "step": 8655 + }, + { + "epoch": 0.5554842847979474, + "grad_norm": 1.1381902966011452, + "learning_rate": 9.808574039234843e-05, + "loss": 0.6242, + "step": 8660 + }, + { + "epoch": 0.5558050032071841, + "grad_norm": 0.8670424286605721, + "learning_rate": 9.79738102847014e-05, + "loss": 0.7355, + "step": 8665 + }, + { + "epoch": 0.5561257216164208, + "grad_norm": 0.8366621626207873, + "learning_rate": 9.786188271652133e-05, + "loss": 0.5744, + "step": 8670 + }, + { + "epoch": 0.5564464400256575, + "grad_norm": 0.8273685386138488, + "learning_rate": 9.774995782808943e-05, + "loss": 0.6414, + "step": 8675 + }, + { + "epoch": 0.5567671584348942, + "grad_norm": 0.9522831235441542, + "learning_rate": 9.763803575968357e-05, + "loss": 0.7632, + "step": 8680 + }, + { + "epoch": 0.5570878768441309, + "grad_norm": 0.75372169303836, + "learning_rate": 9.752611665157807e-05, + "loss": 0.6433, + "step": 8685 + }, + { + "epoch": 0.5574085952533675, + "grad_norm": 1.2109886710417286, + "learning_rate": 9.741420064404353e-05, + "loss": 0.63, + "step": 8690 + }, + { + "epoch": 0.5577293136626043, + "grad_norm": 0.5400874445069787, + "learning_rate": 9.730228787734669e-05, + "loss": 0.6789, + "step": 8695 + }, + { + "epoch": 0.5580500320718409, + "grad_norm": 0.7989657543785353, + "learning_rate": 9.719037849175023e-05, + "loss": 0.7407, + "step": 8700 + }, + { + "epoch": 0.5583707504810776, + "grad_norm": 0.7239899818926174, + "learning_rate": 9.707847262751257e-05, + "loss": 0.6029, + "step": 8705 + }, + { + "epoch": 0.5586914688903143, + "grad_norm": 1.1080694844841645, + "learning_rate": 9.696657042488774e-05, + "loss": 0.6841, + "step": 8710 + }, + { + "epoch": 0.559012187299551, + "grad_norm": 0.8668620206006121, + "learning_rate": 9.685467202412514e-05, + "loss": 0.8091, + "step": 8715 + }, + { + "epoch": 0.5593329057087877, + "grad_norm": 0.8263012333520392, + "learning_rate": 9.674277756546941e-05, + "loss": 0.5612, + "step": 8720 + }, + { + "epoch": 0.5596536241180243, + "grad_norm": 1.2272663628925047, + "learning_rate": 9.663088718916031e-05, + "loss": 0.6214, + "step": 8725 + }, + { + "epoch": 0.5599743425272611, + "grad_norm": 0.9766333412497376, + "learning_rate": 9.651900103543244e-05, + "loss": 0.7342, + "step": 8730 + }, + { + "epoch": 0.5602950609364977, + "grad_norm": 0.830624516454487, + "learning_rate": 9.640711924451514e-05, + "loss": 0.6718, + "step": 8735 + }, + { + "epoch": 0.5606157793457345, + "grad_norm": 0.4675831817637492, + "learning_rate": 9.629524195663219e-05, + "loss": 0.6039, + "step": 8740 + }, + { + "epoch": 0.5609364977549711, + "grad_norm": 0.6634840466913374, + "learning_rate": 9.618336931200182e-05, + "loss": 0.5964, + "step": 8745 + }, + { + "epoch": 0.5612572161642079, + "grad_norm": 0.9976406641974719, + "learning_rate": 9.607150145083642e-05, + "loss": 0.7166, + "step": 8750 + }, + { + "epoch": 0.5615779345734445, + "grad_norm": 0.9545013096296738, + "learning_rate": 9.595963851334237e-05, + "loss": 0.689, + "step": 8755 + }, + { + "epoch": 0.5618986529826812, + "grad_norm": 0.9634333696652287, + "learning_rate": 9.58477806397199e-05, + "loss": 0.8048, + "step": 8760 + }, + { + "epoch": 0.5622193713919179, + "grad_norm": 0.8057551483876174, + "learning_rate": 9.573592797016285e-05, + "loss": 0.672, + "step": 8765 + }, + { + "epoch": 0.5625400898011546, + "grad_norm": 1.0000169919459303, + "learning_rate": 9.562408064485858e-05, + "loss": 0.656, + "step": 8770 + }, + { + "epoch": 0.5628608082103913, + "grad_norm": 1.0059598561012926, + "learning_rate": 9.551223880398778e-05, + "loss": 0.6689, + "step": 8775 + }, + { + "epoch": 0.563181526619628, + "grad_norm": 0.7089352756337184, + "learning_rate": 9.540040258772413e-05, + "loss": 0.6104, + "step": 8780 + }, + { + "epoch": 0.5635022450288647, + "grad_norm": 0.9673260454868421, + "learning_rate": 9.528857213623441e-05, + "loss": 0.625, + "step": 8785 + }, + { + "epoch": 0.5638229634381013, + "grad_norm": 0.8425769011906392, + "learning_rate": 9.517674758967812e-05, + "loss": 0.6385, + "step": 8790 + }, + { + "epoch": 0.564143681847338, + "grad_norm": 0.8483079594314462, + "learning_rate": 9.506492908820737e-05, + "loss": 0.7091, + "step": 8795 + }, + { + "epoch": 0.5644644002565747, + "grad_norm": 1.1949041204777606, + "learning_rate": 9.495311677196663e-05, + "loss": 0.5583, + "step": 8800 + }, + { + "epoch": 0.5647851186658114, + "grad_norm": 1.1203988658358368, + "learning_rate": 9.484131078109272e-05, + "loss": 0.6491, + "step": 8805 + }, + { + "epoch": 0.5651058370750481, + "grad_norm": 0.7171168814679133, + "learning_rate": 9.472951125571447e-05, + "loss": 0.5704, + "step": 8810 + }, + { + "epoch": 0.5654265554842848, + "grad_norm": 0.43705154049643696, + "learning_rate": 9.461771833595263e-05, + "loss": 0.6235, + "step": 8815 + }, + { + "epoch": 0.5657472738935215, + "grad_norm": 0.5972509611997564, + "learning_rate": 9.450593216191962e-05, + "loss": 0.6011, + "step": 8820 + }, + { + "epoch": 0.5660679923027582, + "grad_norm": 0.6585353171844711, + "learning_rate": 9.439415287371949e-05, + "loss": 0.6338, + "step": 8825 + }, + { + "epoch": 0.5663887107119948, + "grad_norm": 1.182861072860639, + "learning_rate": 9.42823806114476e-05, + "loss": 0.6286, + "step": 8830 + }, + { + "epoch": 0.5667094291212316, + "grad_norm": 0.774985192783614, + "learning_rate": 9.417061551519051e-05, + "loss": 0.6362, + "step": 8835 + }, + { + "epoch": 0.5670301475304682, + "grad_norm": 1.6279736397998856, + "learning_rate": 9.405885772502582e-05, + "loss": 0.5434, + "step": 8840 + }, + { + "epoch": 0.567350865939705, + "grad_norm": 0.8603999240784707, + "learning_rate": 9.394710738102198e-05, + "loss": 0.7135, + "step": 8845 + }, + { + "epoch": 0.5676715843489416, + "grad_norm": 0.8326631481896093, + "learning_rate": 9.383536462323807e-05, + "loss": 0.6316, + "step": 8850 + }, + { + "epoch": 0.5679923027581784, + "grad_norm": 1.1396992210320314, + "learning_rate": 9.372362959172364e-05, + "loss": 0.6325, + "step": 8855 + }, + { + "epoch": 0.568313021167415, + "grad_norm": 0.6117345152175109, + "learning_rate": 9.361190242651864e-05, + "loss": 0.6159, + "step": 8860 + }, + { + "epoch": 0.5686337395766518, + "grad_norm": 0.9306563316596532, + "learning_rate": 9.350018326765311e-05, + "loss": 0.6533, + "step": 8865 + }, + { + "epoch": 0.5689544579858884, + "grad_norm": 0.8930767778362739, + "learning_rate": 9.338847225514708e-05, + "loss": 0.6675, + "step": 8870 + }, + { + "epoch": 0.569275176395125, + "grad_norm": 0.4141144493955828, + "learning_rate": 9.327676952901034e-05, + "loss": 0.5957, + "step": 8875 + }, + { + "epoch": 0.5695958948043618, + "grad_norm": 0.8888417335481001, + "learning_rate": 9.31650752292423e-05, + "loss": 0.5665, + "step": 8880 + }, + { + "epoch": 0.5699166132135984, + "grad_norm": 0.7603252238964692, + "learning_rate": 9.305338949583183e-05, + "loss": 0.6428, + "step": 8885 + }, + { + "epoch": 0.5702373316228352, + "grad_norm": 1.271342150118716, + "learning_rate": 9.294171246875705e-05, + "loss": 0.7219, + "step": 8890 + }, + { + "epoch": 0.5705580500320718, + "grad_norm": 0.9447555346689784, + "learning_rate": 9.283004428798519e-05, + "loss": 0.6965, + "step": 8895 + }, + { + "epoch": 0.5708787684413086, + "grad_norm": 0.8678646764049435, + "learning_rate": 9.271838509347233e-05, + "loss": 0.7673, + "step": 8900 + }, + { + "epoch": 0.5711994868505452, + "grad_norm": 0.7416908587434721, + "learning_rate": 9.260673502516333e-05, + "loss": 0.6081, + "step": 8905 + }, + { + "epoch": 0.5715202052597819, + "grad_norm": 0.939422337464896, + "learning_rate": 9.24950942229917e-05, + "loss": 0.6721, + "step": 8910 + }, + { + "epoch": 0.5718409236690186, + "grad_norm": 0.8506289909429936, + "learning_rate": 9.238346282687912e-05, + "loss": 0.7379, + "step": 8915 + }, + { + "epoch": 0.5721616420782553, + "grad_norm": 1.3927657753594376, + "learning_rate": 9.227184097673566e-05, + "loss": 0.7231, + "step": 8920 + }, + { + "epoch": 0.572482360487492, + "grad_norm": 0.6002814159409026, + "learning_rate": 9.21602288124594e-05, + "loss": 0.8172, + "step": 8925 + }, + { + "epoch": 0.5728030788967287, + "grad_norm": 0.7935777728563393, + "learning_rate": 9.204862647393625e-05, + "loss": 0.8086, + "step": 8930 + }, + { + "epoch": 0.5731237973059654, + "grad_norm": 1.0397353291637284, + "learning_rate": 9.193703410103978e-05, + "loss": 0.6631, + "step": 8935 + }, + { + "epoch": 0.573444515715202, + "grad_norm": 0.8367031156015087, + "learning_rate": 9.182545183363112e-05, + "loss": 0.5788, + "step": 8940 + }, + { + "epoch": 0.5737652341244387, + "grad_norm": 1.2325263908639137, + "learning_rate": 9.17138798115587e-05, + "loss": 0.7789, + "step": 8945 + }, + { + "epoch": 0.5740859525336754, + "grad_norm": 0.9464147249819552, + "learning_rate": 9.160231817465815e-05, + "loss": 0.5279, + "step": 8950 + }, + { + "epoch": 0.5744066709429121, + "grad_norm": 0.8158486660018726, + "learning_rate": 9.149076706275207e-05, + "loss": 0.7098, + "step": 8955 + }, + { + "epoch": 0.5747273893521488, + "grad_norm": 0.7825563949372556, + "learning_rate": 9.137922661564981e-05, + "loss": 0.6993, + "step": 8960 + }, + { + "epoch": 0.5750481077613855, + "grad_norm": 0.9955286924734048, + "learning_rate": 9.126769697314741e-05, + "loss": 0.6668, + "step": 8965 + }, + { + "epoch": 0.5753688261706222, + "grad_norm": 0.987888018064567, + "learning_rate": 9.11561782750274e-05, + "loss": 0.7683, + "step": 8970 + }, + { + "epoch": 0.5756895445798589, + "grad_norm": 0.9029264976754006, + "learning_rate": 9.104467066105855e-05, + "loss": 0.5976, + "step": 8975 + }, + { + "epoch": 0.5760102629890955, + "grad_norm": 1.2083151109064707, + "learning_rate": 9.093317427099567e-05, + "loss": 0.7444, + "step": 8980 + }, + { + "epoch": 0.5763309813983323, + "grad_norm": 0.627708721729255, + "learning_rate": 9.082168924457963e-05, + "loss": 0.5052, + "step": 8985 + }, + { + "epoch": 0.5766516998075689, + "grad_norm": 0.818341174384118, + "learning_rate": 9.071021572153699e-05, + "loss": 0.6956, + "step": 8990 + }, + { + "epoch": 0.5769724182168057, + "grad_norm": 0.7174427987431503, + "learning_rate": 9.05987538415799e-05, + "loss": 0.6537, + "step": 8995 + }, + { + "epoch": 0.5772931366260423, + "grad_norm": 1.0123101523225277, + "learning_rate": 9.048730374440593e-05, + "loss": 0.6298, + "step": 9000 + }, + { + "epoch": 0.5776138550352791, + "grad_norm": 1.4927380842347644, + "learning_rate": 9.037586556969785e-05, + "loss": 0.7866, + "step": 9005 + }, + { + "epoch": 0.5779345734445157, + "grad_norm": 1.1107550009988214, + "learning_rate": 9.026443945712355e-05, + "loss": 0.5272, + "step": 9010 + }, + { + "epoch": 0.5782552918537524, + "grad_norm": 1.042711051305287, + "learning_rate": 9.015302554633572e-05, + "loss": 0.6862, + "step": 9015 + }, + { + "epoch": 0.5785760102629891, + "grad_norm": 1.097565575641477, + "learning_rate": 9.004162397697183e-05, + "loss": 0.6653, + "step": 9020 + }, + { + "epoch": 0.5788967286722257, + "grad_norm": 0.7962187563904711, + "learning_rate": 8.993023488865384e-05, + "loss": 0.7807, + "step": 9025 + }, + { + "epoch": 0.5792174470814625, + "grad_norm": 0.8018799159927662, + "learning_rate": 8.981885842098807e-05, + "loss": 0.6755, + "step": 9030 + }, + { + "epoch": 0.5795381654906991, + "grad_norm": 1.0103385936451423, + "learning_rate": 8.970749471356508e-05, + "loss": 0.7498, + "step": 9035 + }, + { + "epoch": 0.5798588838999359, + "grad_norm": 0.8540199269462798, + "learning_rate": 8.959614390595933e-05, + "loss": 0.7041, + "step": 9040 + }, + { + "epoch": 0.5801796023091725, + "grad_norm": 1.1040345444470279, + "learning_rate": 8.948480613772923e-05, + "loss": 0.5949, + "step": 9045 + }, + { + "epoch": 0.5805003207184093, + "grad_norm": 1.0463417093934197, + "learning_rate": 8.93734815484167e-05, + "loss": 0.6716, + "step": 9050 + }, + { + "epoch": 0.5808210391276459, + "grad_norm": 0.9338670777982941, + "learning_rate": 8.92621702775473e-05, + "loss": 0.652, + "step": 9055 + }, + { + "epoch": 0.5811417575368826, + "grad_norm": 0.8605449857576016, + "learning_rate": 8.915087246462981e-05, + "loss": 0.6335, + "step": 9060 + }, + { + "epoch": 0.5814624759461193, + "grad_norm": 0.9482034036580209, + "learning_rate": 8.903958824915616e-05, + "loss": 0.7407, + "step": 9065 + }, + { + "epoch": 0.581783194355356, + "grad_norm": 0.9120660938985135, + "learning_rate": 8.892831777060128e-05, + "loss": 0.714, + "step": 9070 + }, + { + "epoch": 0.5821039127645927, + "grad_norm": 0.7546853050581628, + "learning_rate": 8.881706116842277e-05, + "loss": 0.6643, + "step": 9075 + }, + { + "epoch": 0.5824246311738294, + "grad_norm": 0.7217266514190624, + "learning_rate": 8.870581858206097e-05, + "loss": 0.6232, + "step": 9080 + }, + { + "epoch": 0.5827453495830661, + "grad_norm": 0.8122719551725256, + "learning_rate": 8.859459015093856e-05, + "loss": 0.753, + "step": 9085 + }, + { + "epoch": 0.5830660679923028, + "grad_norm": 0.6978194557670415, + "learning_rate": 8.848337601446056e-05, + "loss": 0.592, + "step": 9090 + }, + { + "epoch": 0.5833867864015394, + "grad_norm": 0.7490982355447477, + "learning_rate": 8.8372176312014e-05, + "loss": 0.6739, + "step": 9095 + }, + { + "epoch": 0.5837075048107762, + "grad_norm": 1.074058776492988, + "learning_rate": 8.826099118296781e-05, + "loss": 0.6831, + "step": 9100 + }, + { + "epoch": 0.5840282232200128, + "grad_norm": 0.7986527171477741, + "learning_rate": 8.814982076667274e-05, + "loss": 0.6572, + "step": 9105 + }, + { + "epoch": 0.5843489416292496, + "grad_norm": 0.9594556597631692, + "learning_rate": 8.803866520246111e-05, + "loss": 0.6968, + "step": 9110 + }, + { + "epoch": 0.5846696600384862, + "grad_norm": 0.8185832555992929, + "learning_rate": 8.792752462964643e-05, + "loss": 0.6396, + "step": 9115 + }, + { + "epoch": 0.584990378447723, + "grad_norm": 0.830230327348044, + "learning_rate": 8.781639918752364e-05, + "loss": 0.6288, + "step": 9120 + }, + { + "epoch": 0.5853110968569596, + "grad_norm": 1.260466190111766, + "learning_rate": 8.770528901536866e-05, + "loss": 0.6248, + "step": 9125 + }, + { + "epoch": 0.5856318152661962, + "grad_norm": 0.7805742440541377, + "learning_rate": 8.75941942524382e-05, + "loss": 0.726, + "step": 9130 + }, + { + "epoch": 0.585952533675433, + "grad_norm": 1.0612454515173708, + "learning_rate": 8.748311503796971e-05, + "loss": 0.6807, + "step": 9135 + }, + { + "epoch": 0.5862732520846696, + "grad_norm": 0.8808610696974422, + "learning_rate": 8.737205151118115e-05, + "loss": 0.7349, + "step": 9140 + }, + { + "epoch": 0.5865939704939064, + "grad_norm": 0.8397400084374878, + "learning_rate": 8.726100381127084e-05, + "loss": 0.677, + "step": 9145 + }, + { + "epoch": 0.586914688903143, + "grad_norm": 1.3081126728734789, + "learning_rate": 8.714997207741725e-05, + "loss": 0.7485, + "step": 9150 + }, + { + "epoch": 0.5872354073123798, + "grad_norm": 0.23647447615753048, + "learning_rate": 8.703895644877877e-05, + "loss": 0.5389, + "step": 9155 + }, + { + "epoch": 0.5875561257216164, + "grad_norm": 1.0035423360368345, + "learning_rate": 8.692795706449371e-05, + "loss": 0.6547, + "step": 9160 + }, + { + "epoch": 0.5878768441308531, + "grad_norm": 0.7176089252240778, + "learning_rate": 8.681697406367997e-05, + "loss": 0.6607, + "step": 9165 + }, + { + "epoch": 0.5881975625400898, + "grad_norm": 0.8342266954014463, + "learning_rate": 8.670600758543492e-05, + "loss": 0.6957, + "step": 9170 + }, + { + "epoch": 0.5885182809493265, + "grad_norm": 0.9577059909314858, + "learning_rate": 8.659505776883523e-05, + "loss": 0.7079, + "step": 9175 + }, + { + "epoch": 0.5888389993585632, + "grad_norm": 0.5591665135253571, + "learning_rate": 8.648412475293667e-05, + "loss": 0.4696, + "step": 9180 + }, + { + "epoch": 0.5891597177677999, + "grad_norm": 0.6612061534246185, + "learning_rate": 8.637320867677395e-05, + "loss": 0.8161, + "step": 9185 + }, + { + "epoch": 0.5894804361770366, + "grad_norm": 0.7364614135023326, + "learning_rate": 8.626230967936056e-05, + "loss": 0.584, + "step": 9190 + }, + { + "epoch": 0.5898011545862732, + "grad_norm": 1.1805347583614008, + "learning_rate": 8.615142789968862e-05, + "loss": 0.6749, + "step": 9195 + }, + { + "epoch": 0.5901218729955099, + "grad_norm": 0.8670374427365669, + "learning_rate": 8.604056347672862e-05, + "loss": 0.6273, + "step": 9200 + }, + { + "epoch": 0.5904425914047466, + "grad_norm": 0.9304848686764007, + "learning_rate": 8.592971654942934e-05, + "loss": 0.7438, + "step": 9205 + }, + { + "epoch": 0.5907633098139833, + "grad_norm": 0.9747134027393929, + "learning_rate": 8.581888725671756e-05, + "loss": 0.6131, + "step": 9210 + }, + { + "epoch": 0.59108402822322, + "grad_norm": 1.0129060114876993, + "learning_rate": 8.570807573749803e-05, + "loss": 0.7444, + "step": 9215 + }, + { + "epoch": 0.5914047466324567, + "grad_norm": 0.860206331729887, + "learning_rate": 8.559728213065322e-05, + "loss": 0.71, + "step": 9220 + }, + { + "epoch": 0.5917254650416934, + "grad_norm": 0.9817359438145173, + "learning_rate": 8.548650657504312e-05, + "loss": 0.6491, + "step": 9225 + }, + { + "epoch": 0.5920461834509301, + "grad_norm": 0.7544658228792815, + "learning_rate": 8.537574920950509e-05, + "loss": 0.6348, + "step": 9230 + }, + { + "epoch": 0.5923669018601668, + "grad_norm": 0.7630242666798073, + "learning_rate": 8.526501017285371e-05, + "loss": 0.6261, + "step": 9235 + }, + { + "epoch": 0.5926876202694035, + "grad_norm": 0.9267179536684838, + "learning_rate": 8.515428960388064e-05, + "loss": 0.8258, + "step": 9240 + }, + { + "epoch": 0.5930083386786401, + "grad_norm": 0.6784696630153367, + "learning_rate": 8.504358764135423e-05, + "loss": 0.707, + "step": 9245 + }, + { + "epoch": 0.5933290570878769, + "grad_norm": 0.6689426887073786, + "learning_rate": 8.49329044240197e-05, + "loss": 0.751, + "step": 9250 + }, + { + "epoch": 0.5936497754971135, + "grad_norm": 1.0074921827758931, + "learning_rate": 8.482224009059867e-05, + "loss": 0.7213, + "step": 9255 + }, + { + "epoch": 0.5939704939063503, + "grad_norm": 0.6037825152713899, + "learning_rate": 8.471159477978915e-05, + "loss": 0.621, + "step": 9260 + }, + { + "epoch": 0.5942912123155869, + "grad_norm": 0.6325399857778463, + "learning_rate": 8.460096863026523e-05, + "loss": 0.6925, + "step": 9265 + }, + { + "epoch": 0.5946119307248237, + "grad_norm": 0.9785164961672185, + "learning_rate": 8.449036178067706e-05, + "loss": 0.7721, + "step": 9270 + }, + { + "epoch": 0.5949326491340603, + "grad_norm": 0.8071126693831758, + "learning_rate": 8.437977436965057e-05, + "loss": 0.5628, + "step": 9275 + }, + { + "epoch": 0.5952533675432969, + "grad_norm": 1.093008483996882, + "learning_rate": 8.426920653578731e-05, + "loss": 0.5135, + "step": 9280 + }, + { + "epoch": 0.5955740859525337, + "grad_norm": 0.7334552943764545, + "learning_rate": 8.415865841766437e-05, + "loss": 0.6418, + "step": 9285 + }, + { + "epoch": 0.5958948043617703, + "grad_norm": 0.9720157753455849, + "learning_rate": 8.404813015383402e-05, + "loss": 0.6855, + "step": 9290 + }, + { + "epoch": 0.5962155227710071, + "grad_norm": 0.7988660585883463, + "learning_rate": 8.39376218828237e-05, + "loss": 0.5753, + "step": 9295 + }, + { + "epoch": 0.5965362411802437, + "grad_norm": 1.1413457984041735, + "learning_rate": 8.382713374313582e-05, + "loss": 0.6003, + "step": 9300 + }, + { + "epoch": 0.5968569595894805, + "grad_norm": 1.1011093623211472, + "learning_rate": 8.371666587324753e-05, + "loss": 0.7294, + "step": 9305 + }, + { + "epoch": 0.5971776779987171, + "grad_norm": 0.9285733358885891, + "learning_rate": 8.360621841161059e-05, + "loss": 0.5484, + "step": 9310 + }, + { + "epoch": 0.5974983964079538, + "grad_norm": 0.6748939404643401, + "learning_rate": 8.349579149665111e-05, + "loss": 0.6096, + "step": 9315 + }, + { + "epoch": 0.5978191148171905, + "grad_norm": 0.9020042133223751, + "learning_rate": 8.338538526676955e-05, + "loss": 0.6025, + "step": 9320 + }, + { + "epoch": 0.5981398332264272, + "grad_norm": 0.9270397135681554, + "learning_rate": 8.32749998603404e-05, + "loss": 0.7169, + "step": 9325 + }, + { + "epoch": 0.5984605516356639, + "grad_norm": 0.9890377973574781, + "learning_rate": 8.316463541571202e-05, + "loss": 0.6308, + "step": 9330 + }, + { + "epoch": 0.5987812700449006, + "grad_norm": 0.9865556224427305, + "learning_rate": 8.305429207120657e-05, + "loss": 0.6582, + "step": 9335 + }, + { + "epoch": 0.5991019884541373, + "grad_norm": 0.7178728991086797, + "learning_rate": 8.294396996511973e-05, + "loss": 0.6433, + "step": 9340 + }, + { + "epoch": 0.599422706863374, + "grad_norm": 0.9285152964545721, + "learning_rate": 8.283366923572054e-05, + "loss": 0.548, + "step": 9345 + }, + { + "epoch": 0.5997434252726106, + "grad_norm": 1.0943546547273215, + "learning_rate": 8.272339002125126e-05, + "loss": 0.5401, + "step": 9350 + }, + { + "epoch": 0.6000641436818474, + "grad_norm": 1.0722476752693422, + "learning_rate": 8.261313245992719e-05, + "loss": 0.7496, + "step": 9355 + }, + { + "epoch": 0.600384862091084, + "grad_norm": 0.7239338874930329, + "learning_rate": 8.250289668993651e-05, + "loss": 0.6294, + "step": 9360 + }, + { + "epoch": 0.6007055805003207, + "grad_norm": 0.8162856731878313, + "learning_rate": 8.239268284944008e-05, + "loss": 0.784, + "step": 9365 + }, + { + "epoch": 0.6010262989095574, + "grad_norm": 0.8529031580797097, + "learning_rate": 8.228249107657125e-05, + "loss": 0.7338, + "step": 9370 + }, + { + "epoch": 0.6013470173187941, + "grad_norm": 0.914197482847494, + "learning_rate": 8.217232150943575e-05, + "loss": 0.6738, + "step": 9375 + }, + { + "epoch": 0.6016677357280308, + "grad_norm": 0.561817894827455, + "learning_rate": 8.20621742861114e-05, + "loss": 0.4924, + "step": 9380 + }, + { + "epoch": 0.6019884541372674, + "grad_norm": 0.8679917658001024, + "learning_rate": 8.19520495446481e-05, + "loss": 0.8074, + "step": 9385 + }, + { + "epoch": 0.6023091725465042, + "grad_norm": 1.0120069230072926, + "learning_rate": 8.184194742306756e-05, + "loss": 0.7112, + "step": 9390 + }, + { + "epoch": 0.6026298909557408, + "grad_norm": 0.7356825859409829, + "learning_rate": 8.173186805936313e-05, + "loss": 0.6514, + "step": 9395 + }, + { + "epoch": 0.6029506093649776, + "grad_norm": 0.7794340302339006, + "learning_rate": 8.162181159149964e-05, + "loss": 0.7748, + "step": 9400 + }, + { + "epoch": 0.6032713277742142, + "grad_norm": 0.9190740265202144, + "learning_rate": 8.151177815741318e-05, + "loss": 0.6399, + "step": 9405 + }, + { + "epoch": 0.603592046183451, + "grad_norm": 1.1526131658530894, + "learning_rate": 8.140176789501102e-05, + "loss": 0.7519, + "step": 9410 + }, + { + "epoch": 0.6039127645926876, + "grad_norm": 0.8970675006265497, + "learning_rate": 8.129178094217141e-05, + "loss": 0.7025, + "step": 9415 + }, + { + "epoch": 0.6042334830019244, + "grad_norm": 1.16563982635486, + "learning_rate": 8.118181743674334e-05, + "loss": 0.6515, + "step": 9420 + }, + { + "epoch": 0.604554201411161, + "grad_norm": 1.009328430894082, + "learning_rate": 8.107187751654642e-05, + "loss": 0.8061, + "step": 9425 + }, + { + "epoch": 0.6048749198203976, + "grad_norm": 0.6431656020123224, + "learning_rate": 8.096196131937068e-05, + "loss": 0.7703, + "step": 9430 + }, + { + "epoch": 0.6051956382296344, + "grad_norm": 0.8022392814347792, + "learning_rate": 8.085206898297648e-05, + "loss": 0.4945, + "step": 9435 + }, + { + "epoch": 0.605516356638871, + "grad_norm": 0.8590402951031166, + "learning_rate": 8.074220064509428e-05, + "loss": 0.577, + "step": 9440 + }, + { + "epoch": 0.6058370750481078, + "grad_norm": 0.6529036302559359, + "learning_rate": 8.06323564434243e-05, + "loss": 0.6972, + "step": 9445 + }, + { + "epoch": 0.6061577934573444, + "grad_norm": 0.9053770255851836, + "learning_rate": 8.052253651563671e-05, + "loss": 0.6241, + "step": 9450 + }, + { + "epoch": 0.6064785118665812, + "grad_norm": 0.6968143227671041, + "learning_rate": 8.04127409993712e-05, + "loss": 0.7196, + "step": 9455 + }, + { + "epoch": 0.6067992302758178, + "grad_norm": 0.7907742358273027, + "learning_rate": 8.030297003223676e-05, + "loss": 0.6535, + "step": 9460 + }, + { + "epoch": 0.6071199486850545, + "grad_norm": 0.9043816519851674, + "learning_rate": 8.019322375181175e-05, + "loss": 0.7183, + "step": 9465 + }, + { + "epoch": 0.6074406670942912, + "grad_norm": 0.8583282541776323, + "learning_rate": 8.008350229564351e-05, + "loss": 0.7373, + "step": 9470 + }, + { + "epoch": 0.6077613855035279, + "grad_norm": 1.1639398571753123, + "learning_rate": 7.997380580124832e-05, + "loss": 0.6619, + "step": 9475 + }, + { + "epoch": 0.6080821039127646, + "grad_norm": 0.7363838290393571, + "learning_rate": 7.986413440611115e-05, + "loss": 0.5238, + "step": 9480 + }, + { + "epoch": 0.6084028223220013, + "grad_norm": 0.7361031316329811, + "learning_rate": 7.975448824768546e-05, + "loss": 0.7093, + "step": 9485 + }, + { + "epoch": 0.608723540731238, + "grad_norm": 0.8655976177215603, + "learning_rate": 7.964486746339315e-05, + "loss": 0.6699, + "step": 9490 + }, + { + "epoch": 0.6090442591404747, + "grad_norm": 0.7757949116609816, + "learning_rate": 7.95352721906243e-05, + "loss": 0.6457, + "step": 9495 + }, + { + "epoch": 0.6093649775497113, + "grad_norm": 1.0532442121286478, + "learning_rate": 7.942570256673704e-05, + "loss": 0.8266, + "step": 9500 + }, + { + "epoch": 0.6096856959589481, + "grad_norm": 0.8097807634079536, + "learning_rate": 7.931615872905727e-05, + "loss": 0.6542, + "step": 9505 + }, + { + "epoch": 0.6100064143681847, + "grad_norm": 1.170352424739306, + "learning_rate": 7.92066408148787e-05, + "loss": 0.6511, + "step": 9510 + }, + { + "epoch": 0.6103271327774215, + "grad_norm": 0.6465117473629731, + "learning_rate": 7.909714896146239e-05, + "loss": 0.6102, + "step": 9515 + }, + { + "epoch": 0.6106478511866581, + "grad_norm": 0.9562444288916828, + "learning_rate": 7.898768330603687e-05, + "loss": 0.7281, + "step": 9520 + }, + { + "epoch": 0.6109685695958949, + "grad_norm": 0.48629635257867143, + "learning_rate": 7.887824398579778e-05, + "loss": 0.5576, + "step": 9525 + }, + { + "epoch": 0.6112892880051315, + "grad_norm": 0.6187174821618042, + "learning_rate": 7.876883113790777e-05, + "loss": 0.4536, + "step": 9530 + }, + { + "epoch": 0.6116100064143681, + "grad_norm": 0.8491363897597337, + "learning_rate": 7.865944489949632e-05, + "loss": 0.5082, + "step": 9535 + }, + { + "epoch": 0.6119307248236049, + "grad_norm": 0.9489825766872471, + "learning_rate": 7.855008540765954e-05, + "loss": 0.8288, + "step": 9540 + }, + { + "epoch": 0.6122514432328415, + "grad_norm": 0.8247180962617905, + "learning_rate": 7.844075279945998e-05, + "loss": 0.7947, + "step": 9545 + }, + { + "epoch": 0.6125721616420783, + "grad_norm": 0.8487499152582451, + "learning_rate": 7.833144721192658e-05, + "loss": 0.4836, + "step": 9550 + }, + { + "epoch": 0.6128928800513149, + "grad_norm": 1.4749421151082263, + "learning_rate": 7.822216878205437e-05, + "loss": 0.6604, + "step": 9555 + }, + { + "epoch": 0.6132135984605517, + "grad_norm": 0.6439839118081867, + "learning_rate": 7.811291764680436e-05, + "loss": 0.5311, + "step": 9560 + }, + { + "epoch": 0.6135343168697883, + "grad_norm": 0.6948565188236483, + "learning_rate": 7.800369394310329e-05, + "loss": 0.7818, + "step": 9565 + }, + { + "epoch": 0.613855035279025, + "grad_norm": 0.5432098551962209, + "learning_rate": 7.789449780784361e-05, + "loss": 0.4817, + "step": 9570 + }, + { + "epoch": 0.6141757536882617, + "grad_norm": 0.8116998264643036, + "learning_rate": 7.778532937788319e-05, + "loss": 0.6809, + "step": 9575 + }, + { + "epoch": 0.6144964720974984, + "grad_norm": 0.927156766210116, + "learning_rate": 7.767618879004509e-05, + "loss": 0.6117, + "step": 9580 + }, + { + "epoch": 0.6148171905067351, + "grad_norm": 0.5580255415813408, + "learning_rate": 7.756707618111758e-05, + "loss": 0.5121, + "step": 9585 + }, + { + "epoch": 0.6151379089159718, + "grad_norm": 0.7697324881673694, + "learning_rate": 7.745799168785387e-05, + "loss": 0.7019, + "step": 9590 + }, + { + "epoch": 0.6154586273252085, + "grad_norm": 1.2533080746391783, + "learning_rate": 7.734893544697182e-05, + "loss": 0.6921, + "step": 9595 + }, + { + "epoch": 0.6157793457344451, + "grad_norm": 0.8591968885866408, + "learning_rate": 7.723990759515399e-05, + "loss": 0.6234, + "step": 9600 + }, + { + "epoch": 0.6161000641436819, + "grad_norm": 0.8144982447654572, + "learning_rate": 7.713090826904732e-05, + "loss": 0.6175, + "step": 9605 + }, + { + "epoch": 0.6164207825529185, + "grad_norm": 0.7852604055969639, + "learning_rate": 7.702193760526301e-05, + "loss": 0.538, + "step": 9610 + }, + { + "epoch": 0.6167415009621552, + "grad_norm": 0.82507022800839, + "learning_rate": 7.691299574037633e-05, + "loss": 0.5858, + "step": 9615 + }, + { + "epoch": 0.6170622193713919, + "grad_norm": 0.8977703001606776, + "learning_rate": 7.68040828109264e-05, + "loss": 0.6686, + "step": 9620 + }, + { + "epoch": 0.6173829377806286, + "grad_norm": 0.7575641120784353, + "learning_rate": 7.669519895341618e-05, + "loss": 0.6733, + "step": 9625 + }, + { + "epoch": 0.6177036561898653, + "grad_norm": 0.7782783108716851, + "learning_rate": 7.658634430431211e-05, + "loss": 0.6113, + "step": 9630 + }, + { + "epoch": 0.618024374599102, + "grad_norm": 0.8737688527317737, + "learning_rate": 7.647751900004408e-05, + "loss": 0.7703, + "step": 9635 + }, + { + "epoch": 0.6183450930083387, + "grad_norm": 0.7163537021531532, + "learning_rate": 7.63687231770052e-05, + "loss": 0.6687, + "step": 9640 + }, + { + "epoch": 0.6186658114175754, + "grad_norm": 0.7383194119362961, + "learning_rate": 7.625995697155153e-05, + "loss": 0.7192, + "step": 9645 + }, + { + "epoch": 0.618986529826812, + "grad_norm": 0.7818780084969111, + "learning_rate": 7.615122052000212e-05, + "loss": 0.4781, + "step": 9650 + }, + { + "epoch": 0.6193072482360488, + "grad_norm": 0.9549919791876611, + "learning_rate": 7.604251395863868e-05, + "loss": 0.5972, + "step": 9655 + }, + { + "epoch": 0.6196279666452854, + "grad_norm": 0.9266947067171263, + "learning_rate": 7.593383742370547e-05, + "loss": 0.7661, + "step": 9660 + }, + { + "epoch": 0.6199486850545222, + "grad_norm": 0.7815262374564014, + "learning_rate": 7.582519105140915e-05, + "loss": 0.844, + "step": 9665 + }, + { + "epoch": 0.6202694034637588, + "grad_norm": 0.9851958882202488, + "learning_rate": 7.571657497791855e-05, + "loss": 0.6573, + "step": 9670 + }, + { + "epoch": 0.6205901218729956, + "grad_norm": 0.863915136317819, + "learning_rate": 7.560798933936446e-05, + "loss": 0.6965, + "step": 9675 + }, + { + "epoch": 0.6209108402822322, + "grad_norm": 0.8169772635721835, + "learning_rate": 7.549943427183963e-05, + "loss": 0.6739, + "step": 9680 + }, + { + "epoch": 0.6212315586914688, + "grad_norm": 0.9621597430987586, + "learning_rate": 7.539090991139843e-05, + "loss": 0.7107, + "step": 9685 + }, + { + "epoch": 0.6215522771007056, + "grad_norm": 1.1682951488621962, + "learning_rate": 7.52824163940568e-05, + "loss": 0.7016, + "step": 9690 + }, + { + "epoch": 0.6218729955099422, + "grad_norm": 0.5988705115634277, + "learning_rate": 7.517395385579198e-05, + "loss": 0.5883, + "step": 9695 + }, + { + "epoch": 0.622193713919179, + "grad_norm": 0.6405875029114282, + "learning_rate": 7.506552243254235e-05, + "loss": 0.5632, + "step": 9700 + }, + { + "epoch": 0.6225144323284156, + "grad_norm": 0.9039124102611747, + "learning_rate": 7.49571222602074e-05, + "loss": 0.5569, + "step": 9705 + }, + { + "epoch": 0.6228351507376524, + "grad_norm": 1.1918655890149419, + "learning_rate": 7.484875347464731e-05, + "loss": 0.755, + "step": 9710 + }, + { + "epoch": 0.623155869146889, + "grad_norm": 2.014073968409583, + "learning_rate": 7.474041621168304e-05, + "loss": 0.6472, + "step": 9715 + }, + { + "epoch": 0.6234765875561257, + "grad_norm": 0.8921505648356219, + "learning_rate": 7.4632110607096e-05, + "loss": 0.8289, + "step": 9720 + }, + { + "epoch": 0.6237973059653624, + "grad_norm": 1.1073242240733232, + "learning_rate": 7.452383679662794e-05, + "loss": 0.6634, + "step": 9725 + }, + { + "epoch": 0.6241180243745991, + "grad_norm": 1.1492204881968546, + "learning_rate": 7.441559491598072e-05, + "loss": 0.6672, + "step": 9730 + }, + { + "epoch": 0.6244387427838358, + "grad_norm": 1.2072073594662214, + "learning_rate": 7.43073851008162e-05, + "loss": 0.6821, + "step": 9735 + }, + { + "epoch": 0.6247594611930725, + "grad_norm": 0.7796944953436583, + "learning_rate": 7.41992074867561e-05, + "loss": 0.5997, + "step": 9740 + }, + { + "epoch": 0.6250801796023092, + "grad_norm": 0.8744950902348806, + "learning_rate": 7.40910622093817e-05, + "loss": 0.8027, + "step": 9745 + }, + { + "epoch": 0.6254008980115459, + "grad_norm": 0.5663128313006088, + "learning_rate": 7.398294940423382e-05, + "loss": 0.6558, + "step": 9750 + }, + { + "epoch": 0.6257216164207825, + "grad_norm": 1.03786462429062, + "learning_rate": 7.387486920681251e-05, + "loss": 0.7204, + "step": 9755 + }, + { + "epoch": 0.6260423348300193, + "grad_norm": 1.0086514423501614, + "learning_rate": 7.376682175257703e-05, + "loss": 0.5726, + "step": 9760 + }, + { + "epoch": 0.6263630532392559, + "grad_norm": 0.7340138238860899, + "learning_rate": 7.365880717694558e-05, + "loss": 0.6003, + "step": 9765 + }, + { + "epoch": 0.6266837716484926, + "grad_norm": 1.0154279037896083, + "learning_rate": 7.355082561529511e-05, + "loss": 0.6518, + "step": 9770 + }, + { + "epoch": 0.6270044900577293, + "grad_norm": 1.1008265637631556, + "learning_rate": 7.344287720296128e-05, + "loss": 0.6493, + "step": 9775 + }, + { + "epoch": 0.627325208466966, + "grad_norm": 0.8136002565232989, + "learning_rate": 7.333496207523805e-05, + "loss": 0.7117, + "step": 9780 + }, + { + "epoch": 0.6276459268762027, + "grad_norm": 0.5762089560179455, + "learning_rate": 7.322708036737784e-05, + "loss": 0.4664, + "step": 9785 + }, + { + "epoch": 0.6279666452854393, + "grad_norm": 0.8389502685505456, + "learning_rate": 7.311923221459108e-05, + "loss": 0.6836, + "step": 9790 + }, + { + "epoch": 0.6282873636946761, + "grad_norm": 0.7980523725918469, + "learning_rate": 7.301141775204614e-05, + "loss": 0.6824, + "step": 9795 + }, + { + "epoch": 0.6286080821039127, + "grad_norm": 1.1727596107618312, + "learning_rate": 7.290363711486923e-05, + "loss": 0.6435, + "step": 9800 + }, + { + "epoch": 0.6289288005131495, + "grad_norm": 0.4755883693546517, + "learning_rate": 7.279589043814413e-05, + "loss": 0.7567, + "step": 9805 + }, + { + "epoch": 0.6292495189223861, + "grad_norm": 0.59249663501007, + "learning_rate": 7.268817785691204e-05, + "loss": 0.6907, + "step": 9810 + }, + { + "epoch": 0.6295702373316229, + "grad_norm": 0.848542013217018, + "learning_rate": 7.258049950617146e-05, + "loss": 0.6471, + "step": 9815 + }, + { + "epoch": 0.6298909557408595, + "grad_norm": 1.047981392744028, + "learning_rate": 7.247285552087797e-05, + "loss": 0.5712, + "step": 9820 + }, + { + "epoch": 0.6302116741500963, + "grad_norm": 0.8916612499406957, + "learning_rate": 7.236524603594406e-05, + "loss": 0.6496, + "step": 9825 + }, + { + "epoch": 0.6305323925593329, + "grad_norm": 0.810154490032121, + "learning_rate": 7.225767118623906e-05, + "loss": 0.5871, + "step": 9830 + }, + { + "epoch": 0.6308531109685696, + "grad_norm": 0.8722001341085496, + "learning_rate": 7.215013110658875e-05, + "loss": 0.643, + "step": 9835 + }, + { + "epoch": 0.6311738293778063, + "grad_norm": 0.6036268039451337, + "learning_rate": 7.204262593177551e-05, + "loss": 0.6787, + "step": 9840 + }, + { + "epoch": 0.631494547787043, + "grad_norm": 1.1616717351436967, + "learning_rate": 7.193515579653777e-05, + "loss": 0.5542, + "step": 9845 + }, + { + "epoch": 0.6318152661962797, + "grad_norm": 0.8131100593226482, + "learning_rate": 7.182772083557022e-05, + "loss": 0.7859, + "step": 9850 + }, + { + "epoch": 0.6321359846055163, + "grad_norm": 0.876808117538372, + "learning_rate": 7.172032118352338e-05, + "loss": 0.6484, + "step": 9855 + }, + { + "epoch": 0.6324567030147531, + "grad_norm": 0.8713054808471165, + "learning_rate": 7.161295697500353e-05, + "loss": 0.6265, + "step": 9860 + }, + { + "epoch": 0.6327774214239897, + "grad_norm": 1.023366348564304, + "learning_rate": 7.150562834457257e-05, + "loss": 0.5939, + "step": 9865 + }, + { + "epoch": 0.6330981398332264, + "grad_norm": 0.7588376669281691, + "learning_rate": 7.13983354267477e-05, + "loss": 0.7873, + "step": 9870 + }, + { + "epoch": 0.6334188582424631, + "grad_norm": 1.028561424510279, + "learning_rate": 7.129107835600149e-05, + "loss": 0.6212, + "step": 9875 + }, + { + "epoch": 0.6337395766516998, + "grad_norm": 0.5002948721851668, + "learning_rate": 7.118385726676148e-05, + "loss": 0.6269, + "step": 9880 + }, + { + "epoch": 0.6340602950609365, + "grad_norm": 0.6840341058593294, + "learning_rate": 7.10766722934102e-05, + "loss": 0.6232, + "step": 9885 + }, + { + "epoch": 0.6343810134701732, + "grad_norm": 1.1628940715108431, + "learning_rate": 7.096952357028486e-05, + "loss": 0.7978, + "step": 9890 + }, + { + "epoch": 0.6347017318794099, + "grad_norm": 0.8853939814346806, + "learning_rate": 7.086241123167722e-05, + "loss": 0.6057, + "step": 9895 + }, + { + "epoch": 0.6350224502886466, + "grad_norm": 0.7451557600335174, + "learning_rate": 7.07553354118335e-05, + "loss": 0.7038, + "step": 9900 + }, + { + "epoch": 0.6353431686978832, + "grad_norm": 1.40409713973294, + "learning_rate": 7.064829624495415e-05, + "loss": 0.6721, + "step": 9905 + }, + { + "epoch": 0.63566388710712, + "grad_norm": 0.8791535681920543, + "learning_rate": 7.054129386519356e-05, + "loss": 0.7629, + "step": 9910 + }, + { + "epoch": 0.6359846055163566, + "grad_norm": 0.6562938490531729, + "learning_rate": 7.043432840666015e-05, + "loss": 0.6885, + "step": 9915 + }, + { + "epoch": 0.6363053239255934, + "grad_norm": 0.8475306109482822, + "learning_rate": 7.032740000341604e-05, + "loss": 0.6528, + "step": 9920 + }, + { + "epoch": 0.63662604233483, + "grad_norm": 1.0340930274606936, + "learning_rate": 7.022050878947683e-05, + "loss": 0.5579, + "step": 9925 + }, + { + "epoch": 0.6369467607440668, + "grad_norm": 0.892410748846026, + "learning_rate": 7.011365489881164e-05, + "loss": 0.622, + "step": 9930 + }, + { + "epoch": 0.6372674791533034, + "grad_norm": 1.026899828920046, + "learning_rate": 7.000683846534268e-05, + "loss": 0.7173, + "step": 9935 + }, + { + "epoch": 0.63758819756254, + "grad_norm": 0.7906424850106287, + "learning_rate": 6.99000596229453e-05, + "loss": 0.6518, + "step": 9940 + }, + { + "epoch": 0.6379089159717768, + "grad_norm": 0.885516437560555, + "learning_rate": 6.979331850544772e-05, + "loss": 0.7629, + "step": 9945 + }, + { + "epoch": 0.6382296343810134, + "grad_norm": 1.2585108576804727, + "learning_rate": 6.968661524663085e-05, + "loss": 0.5346, + "step": 9950 + }, + { + "epoch": 0.6385503527902502, + "grad_norm": 0.6378216033005294, + "learning_rate": 6.957994998022817e-05, + "loss": 0.5599, + "step": 9955 + }, + { + "epoch": 0.6388710711994868, + "grad_norm": 1.0857649237283717, + "learning_rate": 6.947332283992553e-05, + "loss": 0.5546, + "step": 9960 + }, + { + "epoch": 0.6391917896087236, + "grad_norm": 0.7485103608812504, + "learning_rate": 6.936673395936103e-05, + "loss": 0.7607, + "step": 9965 + }, + { + "epoch": 0.6395125080179602, + "grad_norm": 0.6831137045570516, + "learning_rate": 6.926018347212482e-05, + "loss": 0.7246, + "step": 9970 + }, + { + "epoch": 0.6398332264271969, + "grad_norm": 0.8371300993555119, + "learning_rate": 6.915367151175887e-05, + "loss": 0.7647, + "step": 9975 + }, + { + "epoch": 0.6401539448364336, + "grad_norm": 0.6790794293309601, + "learning_rate": 6.904719821175691e-05, + "loss": 0.709, + "step": 9980 + }, + { + "epoch": 0.6404746632456703, + "grad_norm": 1.2809292980337206, + "learning_rate": 6.894076370556419e-05, + "loss": 0.7072, + "step": 9985 + }, + { + "epoch": 0.640795381654907, + "grad_norm": 0.6309070049475263, + "learning_rate": 6.883436812657736e-05, + "loss": 0.7517, + "step": 9990 + }, + { + "epoch": 0.6411161000641437, + "grad_norm": 0.7057857328226916, + "learning_rate": 6.872801160814429e-05, + "loss": 0.5892, + "step": 9995 + }, + { + "epoch": 0.6414368184733804, + "grad_norm": 0.6684609047663461, + "learning_rate": 6.862169428356391e-05, + "loss": 0.7041, + "step": 10000 + }, + { + "epoch": 0.641757536882617, + "grad_norm": 0.9825781560923286, + "learning_rate": 6.851541628608593e-05, + "loss": 0.5732, + "step": 10005 + }, + { + "epoch": 0.6420782552918538, + "grad_norm": 0.6656401212815036, + "learning_rate": 6.840917774891089e-05, + "loss": 0.6996, + "step": 10010 + }, + { + "epoch": 0.6423989737010904, + "grad_norm": 1.0284673996842317, + "learning_rate": 6.830297880518982e-05, + "loss": 0.6385, + "step": 10015 + }, + { + "epoch": 0.6427196921103271, + "grad_norm": 1.3813453443085013, + "learning_rate": 6.819681958802411e-05, + "loss": 0.8024, + "step": 10020 + }, + { + "epoch": 0.6430404105195638, + "grad_norm": 1.0439998261378045, + "learning_rate": 6.809070023046542e-05, + "loss": 0.7246, + "step": 10025 + }, + { + "epoch": 0.6433611289288005, + "grad_norm": 1.3726132291968678, + "learning_rate": 6.798462086551536e-05, + "loss": 0.7607, + "step": 10030 + }, + { + "epoch": 0.6436818473380372, + "grad_norm": 0.696112632783953, + "learning_rate": 6.78785816261255e-05, + "loss": 0.6657, + "step": 10035 + }, + { + "epoch": 0.6440025657472739, + "grad_norm": 0.9271308758677715, + "learning_rate": 6.777258264519712e-05, + "loss": 0.7089, + "step": 10040 + }, + { + "epoch": 0.6443232841565106, + "grad_norm": 0.971107223858267, + "learning_rate": 6.766662405558095e-05, + "loss": 0.7127, + "step": 10045 + }, + { + "epoch": 0.6446440025657473, + "grad_norm": 1.1077553805147324, + "learning_rate": 6.756070599007717e-05, + "loss": 0.6674, + "step": 10050 + }, + { + "epoch": 0.6449647209749839, + "grad_norm": 1.1241145720577337, + "learning_rate": 6.745482858143519e-05, + "loss": 0.6908, + "step": 10055 + }, + { + "epoch": 0.6452854393842207, + "grad_norm": 1.0311402231942566, + "learning_rate": 6.734899196235342e-05, + "loss": 0.5903, + "step": 10060 + }, + { + "epoch": 0.6456061577934573, + "grad_norm": 1.1164020984789884, + "learning_rate": 6.724319626547916e-05, + "loss": 0.7299, + "step": 10065 + }, + { + "epoch": 0.6459268762026941, + "grad_norm": 0.862577581408513, + "learning_rate": 6.71374416234084e-05, + "loss": 0.6447, + "step": 10070 + }, + { + "epoch": 0.6462475946119307, + "grad_norm": 0.6813994701366789, + "learning_rate": 6.703172816868575e-05, + "loss": 0.6327, + "step": 10075 + }, + { + "epoch": 0.6465683130211675, + "grad_norm": 0.8916563918460675, + "learning_rate": 6.69260560338041e-05, + "loss": 0.5921, + "step": 10080 + }, + { + "epoch": 0.6468890314304041, + "grad_norm": 0.9332137514439207, + "learning_rate": 6.682042535120463e-05, + "loss": 0.6558, + "step": 10085 + }, + { + "epoch": 0.6472097498396407, + "grad_norm": 0.83477107809383, + "learning_rate": 6.67148362532765e-05, + "loss": 0.6404, + "step": 10090 + }, + { + "epoch": 0.6475304682488775, + "grad_norm": 1.2218962185380584, + "learning_rate": 6.66092888723568e-05, + "loss": 0.6856, + "step": 10095 + }, + { + "epoch": 0.6478511866581141, + "grad_norm": 0.5613953193652488, + "learning_rate": 6.650378334073036e-05, + "loss": 0.5747, + "step": 10100 + }, + { + "epoch": 0.6481719050673509, + "grad_norm": 1.161315529719475, + "learning_rate": 6.639831979062952e-05, + "loss": 0.7714, + "step": 10105 + }, + { + "epoch": 0.6484926234765875, + "grad_norm": 1.2013466455307917, + "learning_rate": 6.629289835423393e-05, + "loss": 0.7067, + "step": 10110 + }, + { + "epoch": 0.6488133418858243, + "grad_norm": 0.8985970817080027, + "learning_rate": 6.618751916367061e-05, + "loss": 0.8022, + "step": 10115 + }, + { + "epoch": 0.6491340602950609, + "grad_norm": 1.2136972519623022, + "learning_rate": 6.608218235101352e-05, + "loss": 0.6141, + "step": 10120 + }, + { + "epoch": 0.6494547787042976, + "grad_norm": 0.9718583450791072, + "learning_rate": 6.597688804828353e-05, + "loss": 0.5938, + "step": 10125 + }, + { + "epoch": 0.6497754971135343, + "grad_norm": 0.9547734637829278, + "learning_rate": 6.587163638744827e-05, + "loss": 0.6992, + "step": 10130 + }, + { + "epoch": 0.650096215522771, + "grad_norm": 0.9151909021410464, + "learning_rate": 6.57664275004219e-05, + "loss": 0.7343, + "step": 10135 + }, + { + "epoch": 0.6504169339320077, + "grad_norm": 1.5971760196514397, + "learning_rate": 6.566126151906498e-05, + "loss": 0.7017, + "step": 10140 + }, + { + "epoch": 0.6507376523412444, + "grad_norm": 0.8126791037548418, + "learning_rate": 6.555613857518425e-05, + "loss": 0.6567, + "step": 10145 + }, + { + "epoch": 0.6510583707504811, + "grad_norm": 0.7571219128173635, + "learning_rate": 6.545105880053258e-05, + "loss": 0.6871, + "step": 10150 + }, + { + "epoch": 0.6513790891597178, + "grad_norm": 0.688497347517119, + "learning_rate": 6.534602232680869e-05, + "loss": 0.7347, + "step": 10155 + }, + { + "epoch": 0.6516998075689544, + "grad_norm": 0.8955793200079804, + "learning_rate": 6.524102928565706e-05, + "loss": 0.5972, + "step": 10160 + }, + { + "epoch": 0.6520205259781912, + "grad_norm": 0.9443767111598063, + "learning_rate": 6.513607980866768e-05, + "loss": 0.723, + "step": 10165 + }, + { + "epoch": 0.6523412443874278, + "grad_norm": 0.8214020012837946, + "learning_rate": 6.5031174027376e-05, + "loss": 0.7531, + "step": 10170 + }, + { + "epoch": 0.6526619627966646, + "grad_norm": 0.9405554364877039, + "learning_rate": 6.492631207326271e-05, + "loss": 0.6579, + "step": 10175 + }, + { + "epoch": 0.6529826812059012, + "grad_norm": 0.8528480386187783, + "learning_rate": 6.482149407775348e-05, + "loss": 0.6639, + "step": 10180 + }, + { + "epoch": 0.653303399615138, + "grad_norm": 1.0215536554217552, + "learning_rate": 6.471672017221897e-05, + "loss": 0.6788, + "step": 10185 + }, + { + "epoch": 0.6536241180243746, + "grad_norm": 1.0458906526223661, + "learning_rate": 6.461199048797457e-05, + "loss": 0.7466, + "step": 10190 + }, + { + "epoch": 0.6539448364336113, + "grad_norm": 0.7250104664732925, + "learning_rate": 6.450730515628025e-05, + "loss": 0.4862, + "step": 10195 + }, + { + "epoch": 0.654265554842848, + "grad_norm": 1.1562228223771571, + "learning_rate": 6.440266430834035e-05, + "loss": 0.7554, + "step": 10200 + }, + { + "epoch": 0.6545862732520846, + "grad_norm": 0.7656674676905709, + "learning_rate": 6.429806807530348e-05, + "loss": 0.6668, + "step": 10205 + }, + { + "epoch": 0.6549069916613214, + "grad_norm": 1.1136322722942007, + "learning_rate": 6.419351658826236e-05, + "loss": 0.7241, + "step": 10210 + }, + { + "epoch": 0.655227710070558, + "grad_norm": 1.0761146316049985, + "learning_rate": 6.40890099782536e-05, + "loss": 0.6501, + "step": 10215 + }, + { + "epoch": 0.6555484284797948, + "grad_norm": 0.9079430022905365, + "learning_rate": 6.398454837625761e-05, + "loss": 0.8384, + "step": 10220 + }, + { + "epoch": 0.6558691468890314, + "grad_norm": 0.8488475441393789, + "learning_rate": 6.388013191319829e-05, + "loss": 0.697, + "step": 10225 + }, + { + "epoch": 0.6561898652982682, + "grad_norm": 1.8731573144161795, + "learning_rate": 6.377576071994306e-05, + "loss": 0.5274, + "step": 10230 + }, + { + "epoch": 0.6565105837075048, + "grad_norm": 0.9597668865369915, + "learning_rate": 6.367143492730257e-05, + "loss": 0.5793, + "step": 10235 + }, + { + "epoch": 0.6568313021167415, + "grad_norm": 0.9184805187055093, + "learning_rate": 6.356715466603058e-05, + "loss": 0.7204, + "step": 10240 + }, + { + "epoch": 0.6571520205259782, + "grad_norm": 1.010481078501907, + "learning_rate": 6.346292006682375e-05, + "loss": 0.6568, + "step": 10245 + }, + { + "epoch": 0.6574727389352149, + "grad_norm": 1.2893595780329616, + "learning_rate": 6.335873126032155e-05, + "loss": 0.7476, + "step": 10250 + }, + { + "epoch": 0.6577934573444516, + "grad_norm": 0.7919851978335327, + "learning_rate": 6.325458837710603e-05, + "loss": 0.6681, + "step": 10255 + }, + { + "epoch": 0.6581141757536882, + "grad_norm": 0.7133876917502856, + "learning_rate": 6.31504915477017e-05, + "loss": 0.7879, + "step": 10260 + }, + { + "epoch": 0.658434894162925, + "grad_norm": 0.8067826322951818, + "learning_rate": 6.304644090257536e-05, + "loss": 0.64, + "step": 10265 + }, + { + "epoch": 0.6587556125721616, + "grad_norm": 0.7174409241967863, + "learning_rate": 6.294243657213587e-05, + "loss": 0.5671, + "step": 10270 + }, + { + "epoch": 0.6590763309813983, + "grad_norm": 0.7812465401233117, + "learning_rate": 6.283847868673417e-05, + "loss": 0.628, + "step": 10275 + }, + { + "epoch": 0.659397049390635, + "grad_norm": 0.565828308616574, + "learning_rate": 6.273456737666281e-05, + "loss": 0.621, + "step": 10280 + }, + { + "epoch": 0.6597177677998717, + "grad_norm": 1.0913219783317336, + "learning_rate": 6.26307027721561e-05, + "loss": 0.6341, + "step": 10285 + }, + { + "epoch": 0.6600384862091084, + "grad_norm": 0.812647700581263, + "learning_rate": 6.252688500338979e-05, + "loss": 0.6266, + "step": 10290 + }, + { + "epoch": 0.6603592046183451, + "grad_norm": 1.3344320513324446, + "learning_rate": 6.242311420048087e-05, + "loss": 0.697, + "step": 10295 + }, + { + "epoch": 0.6606799230275818, + "grad_norm": 0.8037339071262586, + "learning_rate": 6.231939049348756e-05, + "loss": 0.662, + "step": 10300 + }, + { + "epoch": 0.6610006414368185, + "grad_norm": 0.8348124914063436, + "learning_rate": 6.221571401240898e-05, + "loss": 0.5953, + "step": 10305 + }, + { + "epoch": 0.6613213598460551, + "grad_norm": 0.8007698372402566, + "learning_rate": 6.211208488718508e-05, + "loss": 0.7067, + "step": 10310 + }, + { + "epoch": 0.6616420782552919, + "grad_norm": 1.0240691382811138, + "learning_rate": 6.200850324769645e-05, + "loss": 0.6563, + "step": 10315 + }, + { + "epoch": 0.6619627966645285, + "grad_norm": 0.6245391951301155, + "learning_rate": 6.190496922376419e-05, + "loss": 0.566, + "step": 10320 + }, + { + "epoch": 0.6622835150737653, + "grad_norm": 0.9667633410108524, + "learning_rate": 6.180148294514969e-05, + "loss": 0.6114, + "step": 10325 + }, + { + "epoch": 0.6626042334830019, + "grad_norm": 0.7507271356005688, + "learning_rate": 6.169804454155457e-05, + "loss": 0.5604, + "step": 10330 + }, + { + "epoch": 0.6629249518922387, + "grad_norm": 1.3185339543060972, + "learning_rate": 6.159465414262034e-05, + "loss": 0.6832, + "step": 10335 + }, + { + "epoch": 0.6632456703014753, + "grad_norm": 1.1847306027291458, + "learning_rate": 6.14913118779284e-05, + "loss": 0.8276, + "step": 10340 + }, + { + "epoch": 0.6635663887107119, + "grad_norm": 0.645482702109424, + "learning_rate": 6.138801787699988e-05, + "loss": 0.7251, + "step": 10345 + }, + { + "epoch": 0.6638871071199487, + "grad_norm": 0.9170687001642995, + "learning_rate": 6.128477226929532e-05, + "loss": 0.5489, + "step": 10350 + }, + { + "epoch": 0.6642078255291853, + "grad_norm": 1.000806725934412, + "learning_rate": 6.118157518421468e-05, + "loss": 0.7246, + "step": 10355 + }, + { + "epoch": 0.6645285439384221, + "grad_norm": 0.8379511672470946, + "learning_rate": 6.107842675109703e-05, + "loss": 0.7874, + "step": 10360 + }, + { + "epoch": 0.6648492623476587, + "grad_norm": 0.7371509556636497, + "learning_rate": 6.097532709922054e-05, + "loss": 0.6244, + "step": 10365 + }, + { + "epoch": 0.6651699807568955, + "grad_norm": 0.9539665664045133, + "learning_rate": 6.087227635780225e-05, + "loss": 0.6107, + "step": 10370 + }, + { + "epoch": 0.6654906991661321, + "grad_norm": 0.7979555148132079, + "learning_rate": 6.0769274655997775e-05, + "loss": 0.5344, + "step": 10375 + }, + { + "epoch": 0.6658114175753689, + "grad_norm": 0.909657054573839, + "learning_rate": 6.0666322122901396e-05, + "loss": 0.6275, + "step": 10380 + }, + { + "epoch": 0.6661321359846055, + "grad_norm": 1.0313940290067696, + "learning_rate": 6.056341888754573e-05, + "loss": 0.6082, + "step": 10385 + }, + { + "epoch": 0.6664528543938422, + "grad_norm": 0.7489838245596225, + "learning_rate": 6.0460565078901633e-05, + "loss": 0.5819, + "step": 10390 + }, + { + "epoch": 0.6667735728030789, + "grad_norm": 1.1118413959198947, + "learning_rate": 6.035776082587794e-05, + "loss": 0.5196, + "step": 10395 + }, + { + "epoch": 0.6670942912123156, + "grad_norm": 0.8125706280287548, + "learning_rate": 6.025500625732142e-05, + "loss": 0.5352, + "step": 10400 + }, + { + "epoch": 0.6674150096215523, + "grad_norm": 0.9492211031254315, + "learning_rate": 6.015230150201661e-05, + "loss": 0.5139, + "step": 10405 + }, + { + "epoch": 0.667735728030789, + "grad_norm": 0.7268694268672965, + "learning_rate": 6.0049646688685567e-05, + "loss": 0.6442, + "step": 10410 + }, + { + "epoch": 0.6680564464400257, + "grad_norm": 0.7538411268384596, + "learning_rate": 5.994704194598775e-05, + "loss": 0.7771, + "step": 10415 + }, + { + "epoch": 0.6683771648492624, + "grad_norm": 0.732055273874663, + "learning_rate": 5.9844487402519886e-05, + "loss": 0.4246, + "step": 10420 + }, + { + "epoch": 0.668697883258499, + "grad_norm": 0.9282996799361855, + "learning_rate": 5.97419831868158e-05, + "loss": 0.6212, + "step": 10425 + }, + { + "epoch": 0.6690186016677357, + "grad_norm": 0.8160584484135337, + "learning_rate": 5.96395294273462e-05, + "loss": 0.5947, + "step": 10430 + }, + { + "epoch": 0.6693393200769724, + "grad_norm": 0.563899508227464, + "learning_rate": 5.9537126252518595e-05, + "loss": 0.6085, + "step": 10435 + }, + { + "epoch": 0.6696600384862091, + "grad_norm": 0.7096696600311123, + "learning_rate": 5.9434773790677076e-05, + "loss": 0.6623, + "step": 10440 + }, + { + "epoch": 0.6699807568954458, + "grad_norm": 1.0083725702632502, + "learning_rate": 5.933247217010216e-05, + "loss": 0.7533, + "step": 10445 + }, + { + "epoch": 0.6703014753046825, + "grad_norm": 0.8583730314996155, + "learning_rate": 5.9230221519010634e-05, + "loss": 0.6899, + "step": 10450 + }, + { + "epoch": 0.6706221937139192, + "grad_norm": 0.9948242533172998, + "learning_rate": 5.912802196555547e-05, + "loss": 0.6441, + "step": 10455 + }, + { + "epoch": 0.6709429121231558, + "grad_norm": 0.8416659287585814, + "learning_rate": 5.902587363782553e-05, + "loss": 0.52, + "step": 10460 + }, + { + "epoch": 0.6712636305323926, + "grad_norm": 0.7875617753719326, + "learning_rate": 5.892377666384552e-05, + "loss": 0.8289, + "step": 10465 + }, + { + "epoch": 0.6715843489416292, + "grad_norm": 1.3665322708300398, + "learning_rate": 5.882173117157579e-05, + "loss": 0.6931, + "step": 10470 + }, + { + "epoch": 0.671905067350866, + "grad_norm": 1.484703583509698, + "learning_rate": 5.871973728891207e-05, + "loss": 0.6282, + "step": 10475 + }, + { + "epoch": 0.6722257857601026, + "grad_norm": 0.6277171001704246, + "learning_rate": 5.861779514368552e-05, + "loss": 0.5476, + "step": 10480 + }, + { + "epoch": 0.6725465041693394, + "grad_norm": 0.893359208561377, + "learning_rate": 5.851590486366241e-05, + "loss": 0.5851, + "step": 10485 + }, + { + "epoch": 0.672867222578576, + "grad_norm": 0.7320275300041723, + "learning_rate": 5.841406657654402e-05, + "loss": 0.7706, + "step": 10490 + }, + { + "epoch": 0.6731879409878126, + "grad_norm": 0.8287094016340315, + "learning_rate": 5.831228040996643e-05, + "loss": 0.6782, + "step": 10495 + }, + { + "epoch": 0.6735086593970494, + "grad_norm": 0.668748966976369, + "learning_rate": 5.8210546491500416e-05, + "loss": 0.4843, + "step": 10500 + }, + { + "epoch": 0.673829377806286, + "grad_norm": 0.7774193196749479, + "learning_rate": 5.8108864948651385e-05, + "loss": 0.6915, + "step": 10505 + }, + { + "epoch": 0.6741500962155228, + "grad_norm": 0.7361276836480435, + "learning_rate": 5.8007235908858815e-05, + "loss": 0.6037, + "step": 10510 + }, + { + "epoch": 0.6744708146247594, + "grad_norm": 0.9273797610571103, + "learning_rate": 5.790565949949669e-05, + "loss": 0.6447, + "step": 10515 + }, + { + "epoch": 0.6747915330339962, + "grad_norm": 0.7357377379625472, + "learning_rate": 5.780413584787285e-05, + "loss": 0.6123, + "step": 10520 + }, + { + "epoch": 0.6751122514432328, + "grad_norm": 0.7349196129011529, + "learning_rate": 5.770266508122903e-05, + "loss": 0.6148, + "step": 10525 + }, + { + "epoch": 0.6754329698524695, + "grad_norm": 0.7228184809432814, + "learning_rate": 5.760124732674079e-05, + "loss": 0.7375, + "step": 10530 + }, + { + "epoch": 0.6757536882617062, + "grad_norm": 0.7245846277368149, + "learning_rate": 5.749988271151714e-05, + "loss": 0.8622, + "step": 10535 + }, + { + "epoch": 0.6760744066709429, + "grad_norm": 0.7864676224072312, + "learning_rate": 5.739857136260046e-05, + "loss": 0.712, + "step": 10540 + }, + { + "epoch": 0.6763951250801796, + "grad_norm": 1.645141716455399, + "learning_rate": 5.7297313406966534e-05, + "loss": 0.6939, + "step": 10545 + }, + { + "epoch": 0.6767158434894163, + "grad_norm": 0.5062488079743617, + "learning_rate": 5.719610897152405e-05, + "loss": 0.5611, + "step": 10550 + }, + { + "epoch": 0.677036561898653, + "grad_norm": 0.7048718325836721, + "learning_rate": 5.709495818311477e-05, + "loss": 0.7464, + "step": 10555 + }, + { + "epoch": 0.6773572803078897, + "grad_norm": 1.1659307946452016, + "learning_rate": 5.699386116851309e-05, + "loss": 0.7177, + "step": 10560 + }, + { + "epoch": 0.6776779987171264, + "grad_norm": 0.9170897775066968, + "learning_rate": 5.6892818054426035e-05, + "loss": 0.669, + "step": 10565 + }, + { + "epoch": 0.6779987171263631, + "grad_norm": 1.0508889718757837, + "learning_rate": 5.679182896749322e-05, + "loss": 0.6744, + "step": 10570 + }, + { + "epoch": 0.6783194355355997, + "grad_norm": 0.8259858656059345, + "learning_rate": 5.669089403428627e-05, + "loss": 0.6801, + "step": 10575 + }, + { + "epoch": 0.6786401539448365, + "grad_norm": 0.6629893516596802, + "learning_rate": 5.659001338130923e-05, + "loss": 0.6013, + "step": 10580 + }, + { + "epoch": 0.6789608723540731, + "grad_norm": 0.968488221191984, + "learning_rate": 5.648918713499787e-05, + "loss": 0.7905, + "step": 10585 + }, + { + "epoch": 0.6792815907633099, + "grad_norm": 0.7585559410962367, + "learning_rate": 5.6388415421719996e-05, + "loss": 0.5525, + "step": 10590 + }, + { + "epoch": 0.6796023091725465, + "grad_norm": 1.2745141606185377, + "learning_rate": 5.6287698367774897e-05, + "loss": 0.7167, + "step": 10595 + }, + { + "epoch": 0.6799230275817832, + "grad_norm": 0.6728914302123802, + "learning_rate": 5.6187036099393375e-05, + "loss": 0.6937, + "step": 10600 + }, + { + "epoch": 0.6802437459910199, + "grad_norm": 0.600819149081247, + "learning_rate": 5.608642874273771e-05, + "loss": 0.6316, + "step": 10605 + }, + { + "epoch": 0.6805644644002565, + "grad_norm": 0.6959088365991615, + "learning_rate": 5.598587642390114e-05, + "loss": 0.7457, + "step": 10610 + }, + { + "epoch": 0.6808851828094933, + "grad_norm": 0.7266824723699652, + "learning_rate": 5.5885379268908134e-05, + "loss": 0.6045, + "step": 10615 + }, + { + "epoch": 0.6812059012187299, + "grad_norm": 0.6681555688621381, + "learning_rate": 5.578493740371389e-05, + "loss": 0.6286, + "step": 10620 + }, + { + "epoch": 0.6815266196279667, + "grad_norm": 0.7610528413953269, + "learning_rate": 5.568455095420431e-05, + "loss": 0.5733, + "step": 10625 + }, + { + "epoch": 0.6818473380372033, + "grad_norm": 1.3214312132482846, + "learning_rate": 5.558422004619597e-05, + "loss": 0.6319, + "step": 10630 + }, + { + "epoch": 0.6821680564464401, + "grad_norm": 0.6966982078568826, + "learning_rate": 5.548394480543564e-05, + "loss": 0.4698, + "step": 10635 + }, + { + "epoch": 0.6824887748556767, + "grad_norm": 0.6367878363111128, + "learning_rate": 5.538372535760057e-05, + "loss": 0.662, + "step": 10640 + }, + { + "epoch": 0.6828094932649134, + "grad_norm": 0.5466987109462808, + "learning_rate": 5.528356182829777e-05, + "loss": 0.5193, + "step": 10645 + }, + { + "epoch": 0.6831302116741501, + "grad_norm": 0.8091665259225381, + "learning_rate": 5.518345434306444e-05, + "loss": 0.5853, + "step": 10650 + }, + { + "epoch": 0.6834509300833868, + "grad_norm": 0.5989345577351957, + "learning_rate": 5.508340302736743e-05, + "loss": 0.5997, + "step": 10655 + }, + { + "epoch": 0.6837716484926235, + "grad_norm": 0.8246700551716405, + "learning_rate": 5.498340800660313e-05, + "loss": 0.715, + "step": 10660 + }, + { + "epoch": 0.6840923669018601, + "grad_norm": 0.7999016646795889, + "learning_rate": 5.488346940609753e-05, + "loss": 0.7212, + "step": 10665 + }, + { + "epoch": 0.6844130853110969, + "grad_norm": 0.5763703153217136, + "learning_rate": 5.4783587351105734e-05, + "loss": 0.6361, + "step": 10670 + }, + { + "epoch": 0.6847338037203335, + "grad_norm": 1.3911645606934129, + "learning_rate": 5.4683761966812154e-05, + "loss": 0.7494, + "step": 10675 + }, + { + "epoch": 0.6850545221295702, + "grad_norm": 1.1526450545139104, + "learning_rate": 5.458399337833002e-05, + "loss": 0.5274, + "step": 10680 + }, + { + "epoch": 0.6853752405388069, + "grad_norm": 1.0168267129176949, + "learning_rate": 5.448428171070141e-05, + "loss": 0.8071, + "step": 10685 + }, + { + "epoch": 0.6856959589480436, + "grad_norm": 0.7598086971815275, + "learning_rate": 5.438462708889718e-05, + "loss": 0.676, + "step": 10690 + }, + { + "epoch": 0.6860166773572803, + "grad_norm": 1.056491176869749, + "learning_rate": 5.428502963781654e-05, + "loss": 0.591, + "step": 10695 + }, + { + "epoch": 0.686337395766517, + "grad_norm": 0.8433612740283131, + "learning_rate": 5.418548948228709e-05, + "loss": 0.6323, + "step": 10700 + }, + { + "epoch": 0.6866581141757537, + "grad_norm": 1.1399615640431888, + "learning_rate": 5.408600674706474e-05, + "loss": 0.6943, + "step": 10705 + }, + { + "epoch": 0.6869788325849904, + "grad_norm": 1.1427576567421822, + "learning_rate": 5.39865815568332e-05, + "loss": 0.6542, + "step": 10710 + }, + { + "epoch": 0.687299550994227, + "grad_norm": 0.8398449025370285, + "learning_rate": 5.3887214036204295e-05, + "loss": 0.6775, + "step": 10715 + }, + { + "epoch": 0.6876202694034638, + "grad_norm": 0.6183753226440165, + "learning_rate": 5.3787904309717365e-05, + "loss": 0.5856, + "step": 10720 + }, + { + "epoch": 0.6879409878127004, + "grad_norm": 0.7303097761926962, + "learning_rate": 5.368865250183952e-05, + "loss": 0.5393, + "step": 10725 + }, + { + "epoch": 0.6882617062219372, + "grad_norm": 1.042159531292707, + "learning_rate": 5.358945873696514e-05, + "loss": 0.598, + "step": 10730 + }, + { + "epoch": 0.6885824246311738, + "grad_norm": 0.8726534481321939, + "learning_rate": 5.3490323139415844e-05, + "loss": 0.6874, + "step": 10735 + }, + { + "epoch": 0.6889031430404106, + "grad_norm": 0.8279765934645724, + "learning_rate": 5.339124583344046e-05, + "loss": 0.7282, + "step": 10740 + }, + { + "epoch": 0.6892238614496472, + "grad_norm": 1.1033370234326692, + "learning_rate": 5.3292226943214666e-05, + "loss": 0.6647, + "step": 10745 + }, + { + "epoch": 0.689544579858884, + "grad_norm": 0.6731635406372563, + "learning_rate": 5.3193266592840994e-05, + "loss": 0.642, + "step": 10750 + }, + { + "epoch": 0.6898652982681206, + "grad_norm": 0.682406135632238, + "learning_rate": 5.309436490634855e-05, + "loss": 0.6876, + "step": 10755 + }, + { + "epoch": 0.6901860166773572, + "grad_norm": 0.6884304464201593, + "learning_rate": 5.299552200769289e-05, + "loss": 0.6405, + "step": 10760 + }, + { + "epoch": 0.690506735086594, + "grad_norm": 0.9303606786373573, + "learning_rate": 5.289673802075601e-05, + "loss": 0.5867, + "step": 10765 + }, + { + "epoch": 0.6908274534958306, + "grad_norm": 0.8966481917540933, + "learning_rate": 5.279801306934598e-05, + "loss": 0.7328, + "step": 10770 + }, + { + "epoch": 0.6911481719050674, + "grad_norm": 0.8301326693368314, + "learning_rate": 5.269934727719685e-05, + "loss": 0.673, + "step": 10775 + }, + { + "epoch": 0.691468890314304, + "grad_norm": 0.9231136482226949, + "learning_rate": 5.260074076796859e-05, + "loss": 0.8013, + "step": 10780 + }, + { + "epoch": 0.6917896087235408, + "grad_norm": 0.6344332487623263, + "learning_rate": 5.250219366524687e-05, + "loss": 0.6477, + "step": 10785 + }, + { + "epoch": 0.6921103271327774, + "grad_norm": 0.6184925377516596, + "learning_rate": 5.240370609254288e-05, + "loss": 0.5484, + "step": 10790 + }, + { + "epoch": 0.6924310455420141, + "grad_norm": 0.7946249563385892, + "learning_rate": 5.230527817329316e-05, + "loss": 0.7455, + "step": 10795 + }, + { + "epoch": 0.6927517639512508, + "grad_norm": 0.5532448902772473, + "learning_rate": 5.22069100308596e-05, + "loss": 0.5486, + "step": 10800 + }, + { + "epoch": 0.6930724823604875, + "grad_norm": 0.6171304782365078, + "learning_rate": 5.210860178852903e-05, + "loss": 0.681, + "step": 10805 + }, + { + "epoch": 0.6933932007697242, + "grad_norm": 1.2635876971136728, + "learning_rate": 5.201035356951334e-05, + "loss": 0.6736, + "step": 10810 + }, + { + "epoch": 0.6937139191789609, + "grad_norm": 0.5205480150437042, + "learning_rate": 5.191216549694909e-05, + "loss": 0.5153, + "step": 10815 + }, + { + "epoch": 0.6940346375881976, + "grad_norm": 0.9442523324184217, + "learning_rate": 5.1814037693897464e-05, + "loss": 0.6185, + "step": 10820 + }, + { + "epoch": 0.6943553559974343, + "grad_norm": 1.1934267268940544, + "learning_rate": 5.1715970283344205e-05, + "loss": 0.6677, + "step": 10825 + }, + { + "epoch": 0.6946760744066709, + "grad_norm": 0.7652562771619698, + "learning_rate": 5.161796338819924e-05, + "loss": 0.7638, + "step": 10830 + }, + { + "epoch": 0.6949967928159076, + "grad_norm": 0.8994137424891815, + "learning_rate": 5.152001713129677e-05, + "loss": 0.5898, + "step": 10835 + }, + { + "epoch": 0.6953175112251443, + "grad_norm": 1.1569578317709166, + "learning_rate": 5.142213163539491e-05, + "loss": 0.5728, + "step": 10840 + }, + { + "epoch": 0.695638229634381, + "grad_norm": 0.9567492023568471, + "learning_rate": 5.132430702317562e-05, + "loss": 0.6646, + "step": 10845 + }, + { + "epoch": 0.6959589480436177, + "grad_norm": 0.9942541719053858, + "learning_rate": 5.122654341724462e-05, + "loss": 0.7398, + "step": 10850 + }, + { + "epoch": 0.6962796664528544, + "grad_norm": 0.69345380130255, + "learning_rate": 5.1128840940131064e-05, + "loss": 0.5888, + "step": 10855 + }, + { + "epoch": 0.6966003848620911, + "grad_norm": 0.8276215026435204, + "learning_rate": 5.103119971428765e-05, + "loss": 0.6781, + "step": 10860 + }, + { + "epoch": 0.6969211032713277, + "grad_norm": 0.7245991079345528, + "learning_rate": 5.093361986209015e-05, + "loss": 0.7442, + "step": 10865 + }, + { + "epoch": 0.6972418216805645, + "grad_norm": 0.7885551527874833, + "learning_rate": 5.0836101505837494e-05, + "loss": 0.6788, + "step": 10870 + }, + { + "epoch": 0.6975625400898011, + "grad_norm": 0.857297702149309, + "learning_rate": 5.073864476775157e-05, + "loss": 0.6013, + "step": 10875 + }, + { + "epoch": 0.6978832584990379, + "grad_norm": 0.6348649341355659, + "learning_rate": 5.064124976997693e-05, + "loss": 0.6045, + "step": 10880 + }, + { + "epoch": 0.6982039769082745, + "grad_norm": 0.6585605551969316, + "learning_rate": 5.054391663458087e-05, + "loss": 0.6171, + "step": 10885 + }, + { + "epoch": 0.6985246953175113, + "grad_norm": 0.986468962885202, + "learning_rate": 5.044664548355307e-05, + "loss": 0.7186, + "step": 10890 + }, + { + "epoch": 0.6988454137267479, + "grad_norm": 0.9785918246000489, + "learning_rate": 5.0349436438805494e-05, + "loss": 0.7877, + "step": 10895 + }, + { + "epoch": 0.6991661321359846, + "grad_norm": 1.5065392603292607, + "learning_rate": 5.025228962217241e-05, + "loss": 0.6156, + "step": 10900 + }, + { + "epoch": 0.6994868505452213, + "grad_norm": 0.9224408618353005, + "learning_rate": 5.015520515540996e-05, + "loss": 0.5855, + "step": 10905 + }, + { + "epoch": 0.699807568954458, + "grad_norm": 0.8828715863784493, + "learning_rate": 5.005818316019618e-05, + "loss": 0.6038, + "step": 10910 + }, + { + "epoch": 0.7001282873636947, + "grad_norm": 0.9568291721616811, + "learning_rate": 4.996122375813079e-05, + "loss": 0.6317, + "step": 10915 + }, + { + "epoch": 0.7004490057729313, + "grad_norm": 1.4247569725340374, + "learning_rate": 4.986432707073515e-05, + "loss": 0.7097, + "step": 10920 + }, + { + "epoch": 0.7007697241821681, + "grad_norm": 0.5257863778727976, + "learning_rate": 4.976749321945191e-05, + "loss": 0.5316, + "step": 10925 + }, + { + "epoch": 0.7010904425914047, + "grad_norm": 0.7116948483921095, + "learning_rate": 4.9670722325644993e-05, + "loss": 0.6438, + "step": 10930 + }, + { + "epoch": 0.7014111610006415, + "grad_norm": 0.8934801180351521, + "learning_rate": 4.957401451059948e-05, + "loss": 0.6628, + "step": 10935 + }, + { + "epoch": 0.7017318794098781, + "grad_norm": 0.5554525116078812, + "learning_rate": 4.9477369895521284e-05, + "loss": 0.6803, + "step": 10940 + }, + { + "epoch": 0.7020525978191148, + "grad_norm": 1.115600134036066, + "learning_rate": 4.938078860153725e-05, + "loss": 0.582, + "step": 10945 + }, + { + "epoch": 0.7023733162283515, + "grad_norm": 1.04204980372642, + "learning_rate": 4.928427074969475e-05, + "loss": 0.6396, + "step": 10950 + }, + { + "epoch": 0.7026940346375882, + "grad_norm": 0.6952203258967746, + "learning_rate": 4.918781646096161e-05, + "loss": 0.609, + "step": 10955 + }, + { + "epoch": 0.7030147530468249, + "grad_norm": 0.8455941974814938, + "learning_rate": 4.909142585622616e-05, + "loss": 0.7442, + "step": 10960 + }, + { + "epoch": 0.7033354714560616, + "grad_norm": 0.9358056805840572, + "learning_rate": 4.899509905629671e-05, + "loss": 0.6163, + "step": 10965 + }, + { + "epoch": 0.7036561898652983, + "grad_norm": 0.8368567909279319, + "learning_rate": 4.889883618190184e-05, + "loss": 0.6729, + "step": 10970 + }, + { + "epoch": 0.703976908274535, + "grad_norm": 0.9626200217934863, + "learning_rate": 4.8802637353689694e-05, + "loss": 0.6208, + "step": 10975 + }, + { + "epoch": 0.7042976266837716, + "grad_norm": 1.423525816978348, + "learning_rate": 4.870650269222845e-05, + "loss": 0.6301, + "step": 10980 + }, + { + "epoch": 0.7046183450930084, + "grad_norm": 0.8943539539791406, + "learning_rate": 4.8610432318005705e-05, + "loss": 0.8259, + "step": 10985 + }, + { + "epoch": 0.704939063502245, + "grad_norm": 1.0047328070171035, + "learning_rate": 4.851442635142846e-05, + "loss": 0.6759, + "step": 10990 + }, + { + "epoch": 0.7052597819114818, + "grad_norm": 0.864965532206175, + "learning_rate": 4.841848491282315e-05, + "loss": 0.6722, + "step": 10995 + }, + { + "epoch": 0.7055805003207184, + "grad_norm": 0.7890255740216144, + "learning_rate": 4.832260812243513e-05, + "loss": 0.6922, + "step": 11000 + }, + { + "epoch": 0.7059012187299551, + "grad_norm": 1.2389180866062235, + "learning_rate": 4.822679610042894e-05, + "loss": 0.6051, + "step": 11005 + }, + { + "epoch": 0.7062219371391918, + "grad_norm": 0.6998283128694094, + "learning_rate": 4.813104896688777e-05, + "loss": 0.6615, + "step": 11010 + }, + { + "epoch": 0.7065426555484284, + "grad_norm": 0.8090143409111475, + "learning_rate": 4.803536684181354e-05, + "loss": 0.7387, + "step": 11015 + }, + { + "epoch": 0.7068633739576652, + "grad_norm": 1.0370968663682347, + "learning_rate": 4.793974984512677e-05, + "loss": 0.7072, + "step": 11020 + }, + { + "epoch": 0.7071840923669018, + "grad_norm": 0.7853945975713512, + "learning_rate": 4.7844198096666246e-05, + "loss": 0.686, + "step": 11025 + }, + { + "epoch": 0.7075048107761386, + "grad_norm": 0.702386626377002, + "learning_rate": 4.774871171618901e-05, + "loss": 0.7127, + "step": 11030 + }, + { + "epoch": 0.7078255291853752, + "grad_norm": 1.0108215460660506, + "learning_rate": 4.765329082337027e-05, + "loss": 0.6434, + "step": 11035 + }, + { + "epoch": 0.708146247594612, + "grad_norm": 0.9899048924342988, + "learning_rate": 4.755793553780292e-05, + "loss": 0.7323, + "step": 11040 + }, + { + "epoch": 0.7084669660038486, + "grad_norm": 0.9147032893585562, + "learning_rate": 4.746264597899792e-05, + "loss": 0.6739, + "step": 11045 + }, + { + "epoch": 0.7087876844130853, + "grad_norm": 1.0330004401132, + "learning_rate": 4.736742226638363e-05, + "loss": 0.8609, + "step": 11050 + }, + { + "epoch": 0.709108402822322, + "grad_norm": 0.6548738796277453, + "learning_rate": 4.727226451930604e-05, + "loss": 0.6734, + "step": 11055 + }, + { + "epoch": 0.7094291212315587, + "grad_norm": 0.81714120996019, + "learning_rate": 4.717717285702835e-05, + "loss": 0.7523, + "step": 11060 + }, + { + "epoch": 0.7097498396407954, + "grad_norm": 0.885017113426685, + "learning_rate": 4.708214739873096e-05, + "loss": 0.5943, + "step": 11065 + }, + { + "epoch": 0.710070558050032, + "grad_norm": 0.8620179894720568, + "learning_rate": 4.698718826351135e-05, + "loss": 0.593, + "step": 11070 + }, + { + "epoch": 0.7103912764592688, + "grad_norm": 0.7663377237340008, + "learning_rate": 4.689229557038379e-05, + "loss": 0.7649, + "step": 11075 + }, + { + "epoch": 0.7107119948685054, + "grad_norm": 0.779291905786263, + "learning_rate": 4.679746943827939e-05, + "loss": 0.6231, + "step": 11080 + }, + { + "epoch": 0.7110327132777421, + "grad_norm": 0.8488045821194506, + "learning_rate": 4.6702709986045745e-05, + "loss": 0.5658, + "step": 11085 + }, + { + "epoch": 0.7113534316869788, + "grad_norm": 0.7591544492497508, + "learning_rate": 4.660801733244685e-05, + "loss": 0.5434, + "step": 11090 + }, + { + "epoch": 0.7116741500962155, + "grad_norm": 0.9324567178402989, + "learning_rate": 4.651339159616312e-05, + "loss": 0.7694, + "step": 11095 + }, + { + "epoch": 0.7119948685054522, + "grad_norm": 0.614241285241644, + "learning_rate": 4.641883289579095e-05, + "loss": 0.573, + "step": 11100 + }, + { + "epoch": 0.7123155869146889, + "grad_norm": 0.7297521213628075, + "learning_rate": 4.632434134984288e-05, + "loss": 0.7862, + "step": 11105 + }, + { + "epoch": 0.7126363053239256, + "grad_norm": 0.8547500506968054, + "learning_rate": 4.6229917076747056e-05, + "loss": 0.6224, + "step": 11110 + }, + { + "epoch": 0.7129570237331623, + "grad_norm": 1.1207952262364815, + "learning_rate": 4.613556019484754e-05, + "loss": 0.7452, + "step": 11115 + }, + { + "epoch": 0.7132777421423989, + "grad_norm": 0.5122245150734959, + "learning_rate": 4.604127082240379e-05, + "loss": 0.6216, + "step": 11120 + }, + { + "epoch": 0.7135984605516357, + "grad_norm": 0.6841888313664231, + "learning_rate": 4.5947049077590664e-05, + "loss": 0.6031, + "step": 11125 + }, + { + "epoch": 0.7139191789608723, + "grad_norm": 0.8085851937507493, + "learning_rate": 4.585289507849838e-05, + "loss": 0.5983, + "step": 11130 + }, + { + "epoch": 0.7142398973701091, + "grad_norm": 0.8748340585570812, + "learning_rate": 4.575880894313207e-05, + "loss": 0.6462, + "step": 11135 + }, + { + "epoch": 0.7145606157793457, + "grad_norm": 0.5741182108460992, + "learning_rate": 4.566479078941198e-05, + "loss": 0.6313, + "step": 11140 + }, + { + "epoch": 0.7148813341885825, + "grad_norm": 1.3368271859986067, + "learning_rate": 4.557084073517305e-05, + "loss": 0.5434, + "step": 11145 + }, + { + "epoch": 0.7152020525978191, + "grad_norm": 0.7497857375686727, + "learning_rate": 4.547695889816485e-05, + "loss": 0.557, + "step": 11150 + }, + { + "epoch": 0.7155227710070559, + "grad_norm": 0.8178864612038674, + "learning_rate": 4.538314539605155e-05, + "loss": 0.6979, + "step": 11155 + }, + { + "epoch": 0.7158434894162925, + "grad_norm": 0.8969560105198988, + "learning_rate": 4.528940034641158e-05, + "loss": 0.765, + "step": 11160 + }, + { + "epoch": 0.7161642078255291, + "grad_norm": 1.2265503200288288, + "learning_rate": 4.519572386673768e-05, + "loss": 0.5296, + "step": 11165 + }, + { + "epoch": 0.7164849262347659, + "grad_norm": 0.611571817659739, + "learning_rate": 4.510211607443654e-05, + "loss": 0.6223, + "step": 11170 + }, + { + "epoch": 0.7168056446440025, + "grad_norm": 0.8641143822600184, + "learning_rate": 4.500857708682883e-05, + "loss": 0.7204, + "step": 11175 + }, + { + "epoch": 0.7171263630532393, + "grad_norm": 0.9563759174291445, + "learning_rate": 4.491510702114894e-05, + "loss": 0.6728, + "step": 11180 + }, + { + "epoch": 0.7174470814624759, + "grad_norm": 0.5814502110654781, + "learning_rate": 4.482170599454489e-05, + "loss": 0.6652, + "step": 11185 + }, + { + "epoch": 0.7177677998717127, + "grad_norm": 1.0858563785495055, + "learning_rate": 4.472837412407825e-05, + "loss": 0.5543, + "step": 11190 + }, + { + "epoch": 0.7180885182809493, + "grad_norm": 0.6644009179012256, + "learning_rate": 4.4635111526723826e-05, + "loss": 0.8072, + "step": 11195 + }, + { + "epoch": 0.718409236690186, + "grad_norm": 0.9031430293191645, + "learning_rate": 4.454191831936958e-05, + "loss": 0.7006, + "step": 11200 + }, + { + "epoch": 0.7187299550994227, + "grad_norm": 0.6707442290616978, + "learning_rate": 4.4448794618816634e-05, + "loss": 0.6081, + "step": 11205 + }, + { + "epoch": 0.7190506735086594, + "grad_norm": 0.4567339031728235, + "learning_rate": 4.4355740541778837e-05, + "loss": 0.5996, + "step": 11210 + }, + { + "epoch": 0.7193713919178961, + "grad_norm": 0.8456434286308311, + "learning_rate": 4.426275620488293e-05, + "loss": 0.5902, + "step": 11215 + }, + { + "epoch": 0.7196921103271328, + "grad_norm": 0.7375984313670896, + "learning_rate": 4.416984172466814e-05, + "loss": 0.5592, + "step": 11220 + }, + { + "epoch": 0.7200128287363695, + "grad_norm": 1.001285278455043, + "learning_rate": 4.407699721758614e-05, + "loss": 0.4883, + "step": 11225 + }, + { + "epoch": 0.7203335471456062, + "grad_norm": 1.2917508534051378, + "learning_rate": 4.398422280000101e-05, + "loss": 0.6768, + "step": 11230 + }, + { + "epoch": 0.7206542655548428, + "grad_norm": 0.9685204099266428, + "learning_rate": 4.3891518588188875e-05, + "loss": 0.5883, + "step": 11235 + }, + { + "epoch": 0.7209749839640796, + "grad_norm": 0.5295383592814902, + "learning_rate": 4.379888469833791e-05, + "loss": 0.6229, + "step": 11240 + }, + { + "epoch": 0.7212957023733162, + "grad_norm": 0.9573436890552846, + "learning_rate": 4.370632124654811e-05, + "loss": 0.7156, + "step": 11245 + }, + { + "epoch": 0.721616420782553, + "grad_norm": 0.741578858748363, + "learning_rate": 4.361382834883131e-05, + "loss": 0.6556, + "step": 11250 + }, + { + "epoch": 0.7219371391917896, + "grad_norm": 0.916633580201409, + "learning_rate": 4.3521406121110807e-05, + "loss": 0.676, + "step": 11255 + }, + { + "epoch": 0.7222578576010263, + "grad_norm": 0.3992983111166088, + "learning_rate": 4.342905467922133e-05, + "loss": 0.4788, + "step": 11260 + }, + { + "epoch": 0.722578576010263, + "grad_norm": 1.4519640203571154, + "learning_rate": 4.333677413890896e-05, + "loss": 0.7693, + "step": 11265 + }, + { + "epoch": 0.7228992944194996, + "grad_norm": 1.014341854127021, + "learning_rate": 4.324456461583084e-05, + "loss": 0.7161, + "step": 11270 + }, + { + "epoch": 0.7232200128287364, + "grad_norm": 0.5798440252008737, + "learning_rate": 4.315242622555518e-05, + "loss": 0.5319, + "step": 11275 + }, + { + "epoch": 0.723540731237973, + "grad_norm": 1.3961411697107977, + "learning_rate": 4.306035908356097e-05, + "loss": 0.7755, + "step": 11280 + }, + { + "epoch": 0.7238614496472098, + "grad_norm": 0.7989332199967835, + "learning_rate": 4.296836330523791e-05, + "loss": 0.6761, + "step": 11285 + }, + { + "epoch": 0.7241821680564464, + "grad_norm": 0.5432452037456782, + "learning_rate": 4.287643900588634e-05, + "loss": 0.5398, + "step": 11290 + }, + { + "epoch": 0.7245028864656832, + "grad_norm": 1.1422963762576541, + "learning_rate": 4.278458630071687e-05, + "loss": 0.5321, + "step": 11295 + }, + { + "epoch": 0.7248236048749198, + "grad_norm": 0.6668170639427147, + "learning_rate": 4.2692805304850545e-05, + "loss": 0.5796, + "step": 11300 + }, + { + "epoch": 0.7251443232841565, + "grad_norm": 0.8515640505208902, + "learning_rate": 4.260109613331842e-05, + "loss": 0.6569, + "step": 11305 + }, + { + "epoch": 0.7254650416933932, + "grad_norm": 0.7014693919060985, + "learning_rate": 4.250945890106156e-05, + "loss": 0.6856, + "step": 11310 + }, + { + "epoch": 0.7257857601026299, + "grad_norm": 1.067030988068662, + "learning_rate": 4.241789372293087e-05, + "loss": 0.7749, + "step": 11315 + }, + { + "epoch": 0.7261064785118666, + "grad_norm": 0.7479024679363765, + "learning_rate": 4.232640071368691e-05, + "loss": 0.5478, + "step": 11320 + }, + { + "epoch": 0.7264271969211032, + "grad_norm": 1.0084686752935972, + "learning_rate": 4.22349799879999e-05, + "loss": 0.7788, + "step": 11325 + }, + { + "epoch": 0.72674791533034, + "grad_norm": 0.6585878195188157, + "learning_rate": 4.214363166044932e-05, + "loss": 0.6133, + "step": 11330 + }, + { + "epoch": 0.7270686337395766, + "grad_norm": 0.6784141958893567, + "learning_rate": 4.205235584552407e-05, + "loss": 0.6019, + "step": 11335 + }, + { + "epoch": 0.7273893521488134, + "grad_norm": 0.993300088957976, + "learning_rate": 4.1961152657622024e-05, + "loss": 0.7166, + "step": 11340 + }, + { + "epoch": 0.72771007055805, + "grad_norm": 0.8874942343310022, + "learning_rate": 4.1870022211050074e-05, + "loss": 0.6981, + "step": 11345 + }, + { + "epoch": 0.7280307889672867, + "grad_norm": 1.4921657931640064, + "learning_rate": 4.177896462002402e-05, + "loss": 0.5832, + "step": 11350 + }, + { + "epoch": 0.7283515073765234, + "grad_norm": 0.7853192040977804, + "learning_rate": 4.168797999866827e-05, + "loss": 0.7185, + "step": 11355 + }, + { + "epoch": 0.7286722257857601, + "grad_norm": 0.7775032508697538, + "learning_rate": 4.159706846101574e-05, + "loss": 0.5868, + "step": 11360 + }, + { + "epoch": 0.7289929441949968, + "grad_norm": 0.8328166231193795, + "learning_rate": 4.1506230121007894e-05, + "loss": 0.6707, + "step": 11365 + }, + { + "epoch": 0.7293136626042335, + "grad_norm": 1.1556231103657886, + "learning_rate": 4.141546509249433e-05, + "loss": 0.602, + "step": 11370 + }, + { + "epoch": 0.7296343810134702, + "grad_norm": 0.6535692635433068, + "learning_rate": 4.1324773489232794e-05, + "loss": 0.7015, + "step": 11375 + }, + { + "epoch": 0.7299550994227069, + "grad_norm": 1.0308989718059964, + "learning_rate": 4.1234155424889e-05, + "loss": 0.6524, + "step": 11380 + }, + { + "epoch": 0.7302758178319435, + "grad_norm": 0.9042723107486375, + "learning_rate": 4.1143611013036556e-05, + "loss": 0.6932, + "step": 11385 + }, + { + "epoch": 0.7305965362411803, + "grad_norm": 1.045581159518661, + "learning_rate": 4.105314036715668e-05, + "loss": 0.598, + "step": 11390 + }, + { + "epoch": 0.7309172546504169, + "grad_norm": 0.720438985489428, + "learning_rate": 4.096274360063814e-05, + "loss": 0.6927, + "step": 11395 + }, + { + "epoch": 0.7312379730596537, + "grad_norm": 0.7837057060205996, + "learning_rate": 4.087242082677721e-05, + "loss": 0.6271, + "step": 11400 + }, + { + "epoch": 0.7315586914688903, + "grad_norm": 0.9277273501073059, + "learning_rate": 4.0782172158777296e-05, + "loss": 0.7232, + "step": 11405 + }, + { + "epoch": 0.731879409878127, + "grad_norm": 0.7663141809384151, + "learning_rate": 4.069199770974904e-05, + "loss": 0.5593, + "step": 11410 + }, + { + "epoch": 0.7322001282873637, + "grad_norm": 0.7732548069785231, + "learning_rate": 4.0601897592709984e-05, + "loss": 0.6973, + "step": 11415 + }, + { + "epoch": 0.7325208466966003, + "grad_norm": 1.0148083244026747, + "learning_rate": 4.0511871920584486e-05, + "loss": 0.8616, + "step": 11420 + }, + { + "epoch": 0.7328415651058371, + "grad_norm": 0.7789337008538708, + "learning_rate": 4.042192080620374e-05, + "loss": 0.7399, + "step": 11425 + }, + { + "epoch": 0.7331622835150737, + "grad_norm": 0.7411707815027391, + "learning_rate": 4.033204436230532e-05, + "loss": 0.7219, + "step": 11430 + }, + { + "epoch": 0.7334830019243105, + "grad_norm": 0.9973447184162525, + "learning_rate": 4.0242242701533396e-05, + "loss": 0.6579, + "step": 11435 + }, + { + "epoch": 0.7338037203335471, + "grad_norm": 0.5830094144343125, + "learning_rate": 4.015251593643818e-05, + "loss": 0.7666, + "step": 11440 + }, + { + "epoch": 0.7341244387427839, + "grad_norm": 0.9049494653802453, + "learning_rate": 4.006286417947627e-05, + "loss": 0.7362, + "step": 11445 + }, + { + "epoch": 0.7344451571520205, + "grad_norm": 1.1555455068409544, + "learning_rate": 3.9973287543010064e-05, + "loss": 0.7706, + "step": 11450 + }, + { + "epoch": 0.7347658755612572, + "grad_norm": 0.8236939327253207, + "learning_rate": 3.9883786139307864e-05, + "loss": 0.4883, + "step": 11455 + }, + { + "epoch": 0.7350865939704939, + "grad_norm": 0.7242616375495603, + "learning_rate": 3.979436008054377e-05, + "loss": 0.6765, + "step": 11460 + }, + { + "epoch": 0.7354073123797306, + "grad_norm": 0.8282782204794581, + "learning_rate": 3.97050094787973e-05, + "loss": 0.6393, + "step": 11465 + }, + { + "epoch": 0.7357280307889673, + "grad_norm": 0.5484580528486228, + "learning_rate": 3.9615734446053534e-05, + "loss": 0.6273, + "step": 11470 + }, + { + "epoch": 0.736048749198204, + "grad_norm": 0.8342001080027434, + "learning_rate": 3.952653509420277e-05, + "loss": 0.6517, + "step": 11475 + }, + { + "epoch": 0.7363694676074407, + "grad_norm": 0.8544406097793438, + "learning_rate": 3.9437411535040416e-05, + "loss": 0.5679, + "step": 11480 + }, + { + "epoch": 0.7366901860166774, + "grad_norm": 0.8001118287868482, + "learning_rate": 3.9348363880267006e-05, + "loss": 0.7448, + "step": 11485 + }, + { + "epoch": 0.737010904425914, + "grad_norm": 1.0049068620138881, + "learning_rate": 3.92593922414878e-05, + "loss": 0.5381, + "step": 11490 + }, + { + "epoch": 0.7373316228351507, + "grad_norm": 1.0836198813580136, + "learning_rate": 3.9170496730212944e-05, + "loss": 0.6346, + "step": 11495 + }, + { + "epoch": 0.7376523412443874, + "grad_norm": 0.4690219622238173, + "learning_rate": 3.9081677457857045e-05, + "loss": 0.5469, + "step": 11500 + }, + { + "epoch": 0.7379730596536241, + "grad_norm": 0.7653256546259366, + "learning_rate": 3.899293453573919e-05, + "loss": 0.6005, + "step": 11505 + }, + { + "epoch": 0.7382937780628608, + "grad_norm": 0.8939110106983141, + "learning_rate": 3.890426807508278e-05, + "loss": 0.6783, + "step": 11510 + }, + { + "epoch": 0.7386144964720975, + "grad_norm": 0.775603525768831, + "learning_rate": 3.881567818701538e-05, + "loss": 0.6916, + "step": 11515 + }, + { + "epoch": 0.7389352148813342, + "grad_norm": 1.3430493149234304, + "learning_rate": 3.872716498256863e-05, + "loss": 0.5578, + "step": 11520 + }, + { + "epoch": 0.7392559332905709, + "grad_norm": 0.715829315420304, + "learning_rate": 3.863872857267802e-05, + "loss": 0.7686, + "step": 11525 + }, + { + "epoch": 0.7395766516998076, + "grad_norm": 0.6732314863048653, + "learning_rate": 3.8550369068182735e-05, + "loss": 0.4974, + "step": 11530 + }, + { + "epoch": 0.7398973701090442, + "grad_norm": 0.5624440967305854, + "learning_rate": 3.846208657982572e-05, + "loss": 0.5765, + "step": 11535 + }, + { + "epoch": 0.740218088518281, + "grad_norm": 0.9351668361698933, + "learning_rate": 3.837388121825323e-05, + "loss": 0.6699, + "step": 11540 + }, + { + "epoch": 0.7405388069275176, + "grad_norm": 1.0442410475484458, + "learning_rate": 3.828575309401501e-05, + "loss": 0.5723, + "step": 11545 + }, + { + "epoch": 0.7408595253367544, + "grad_norm": 0.897573742077218, + "learning_rate": 3.819770231756389e-05, + "loss": 0.7723, + "step": 11550 + }, + { + "epoch": 0.741180243745991, + "grad_norm": 0.6333361868228848, + "learning_rate": 3.810972899925575e-05, + "loss": 0.5929, + "step": 11555 + }, + { + "epoch": 0.7415009621552278, + "grad_norm": 1.2414234428777005, + "learning_rate": 3.802183324934952e-05, + "loss": 0.6754, + "step": 11560 + }, + { + "epoch": 0.7418216805644644, + "grad_norm": 0.8678280206604037, + "learning_rate": 3.793401517800672e-05, + "loss": 0.434, + "step": 11565 + }, + { + "epoch": 0.742142398973701, + "grad_norm": 0.8589814705072975, + "learning_rate": 3.784627489529177e-05, + "loss": 0.7005, + "step": 11570 + }, + { + "epoch": 0.7424631173829378, + "grad_norm": 1.096069158153898, + "learning_rate": 3.775861251117128e-05, + "loss": 0.6066, + "step": 11575 + }, + { + "epoch": 0.7427838357921744, + "grad_norm": 0.8956575121848285, + "learning_rate": 3.76710281355145e-05, + "loss": 0.5453, + "step": 11580 + }, + { + "epoch": 0.7431045542014112, + "grad_norm": 0.9901238623869012, + "learning_rate": 3.7583521878092766e-05, + "loss": 0.6829, + "step": 11585 + }, + { + "epoch": 0.7434252726106478, + "grad_norm": 1.1556330315855146, + "learning_rate": 3.749609384857952e-05, + "loss": 0.6617, + "step": 11590 + }, + { + "epoch": 0.7437459910198846, + "grad_norm": 0.8946200380979793, + "learning_rate": 3.7408744156550235e-05, + "loss": 0.6454, + "step": 11595 + }, + { + "epoch": 0.7440667094291212, + "grad_norm": 0.6811470722359575, + "learning_rate": 3.73214729114821e-05, + "loss": 0.558, + "step": 11600 + }, + { + "epoch": 0.7443874278383579, + "grad_norm": 1.2129672803037883, + "learning_rate": 3.72342802227541e-05, + "loss": 0.6829, + "step": 11605 + }, + { + "epoch": 0.7447081462475946, + "grad_norm": 0.7287815359687029, + "learning_rate": 3.7147166199646665e-05, + "loss": 0.7291, + "step": 11610 + }, + { + "epoch": 0.7450288646568313, + "grad_norm": 0.7381906467511818, + "learning_rate": 3.706013095134162e-05, + "loss": 0.673, + "step": 11615 + }, + { + "epoch": 0.745349583066068, + "grad_norm": 1.2592430310132843, + "learning_rate": 3.697317458692219e-05, + "loss": 0.6236, + "step": 11620 + }, + { + "epoch": 0.7456703014753047, + "grad_norm": 0.6359130442368803, + "learning_rate": 3.688629721537256e-05, + "loss": 0.6774, + "step": 11625 + }, + { + "epoch": 0.7459910198845414, + "grad_norm": 0.9163313019367859, + "learning_rate": 3.679949894557808e-05, + "loss": 0.6353, + "step": 11630 + }, + { + "epoch": 0.7463117382937781, + "grad_norm": 0.66124758919148, + "learning_rate": 3.671277988632484e-05, + "loss": 0.6667, + "step": 11635 + }, + { + "epoch": 0.7466324567030147, + "grad_norm": 1.093053112833277, + "learning_rate": 3.6626140146299715e-05, + "loss": 0.6706, + "step": 11640 + }, + { + "epoch": 0.7469531751122515, + "grad_norm": 0.585918591610346, + "learning_rate": 3.653957983409012e-05, + "loss": 0.596, + "step": 11645 + }, + { + "epoch": 0.7472738935214881, + "grad_norm": 0.8785492282676739, + "learning_rate": 3.6453099058183936e-05, + "loss": 0.8345, + "step": 11650 + }, + { + "epoch": 0.7475946119307249, + "grad_norm": 1.0886821917358311, + "learning_rate": 3.6366697926969415e-05, + "loss": 0.7223, + "step": 11655 + }, + { + "epoch": 0.7479153303399615, + "grad_norm": 0.8352362172770396, + "learning_rate": 3.628037654873489e-05, + "loss": 0.7974, + "step": 11660 + }, + { + "epoch": 0.7482360487491982, + "grad_norm": 0.6846055972157917, + "learning_rate": 3.619413503166888e-05, + "loss": 0.7061, + "step": 11665 + }, + { + "epoch": 0.7485567671584349, + "grad_norm": 1.1651393765637517, + "learning_rate": 3.610797348385965e-05, + "loss": 0.6326, + "step": 11670 + }, + { + "epoch": 0.7488774855676715, + "grad_norm": 0.8887525600265255, + "learning_rate": 3.60218920132953e-05, + "loss": 0.6543, + "step": 11675 + }, + { + "epoch": 0.7491982039769083, + "grad_norm": 0.47701205334570973, + "learning_rate": 3.5935890727863653e-05, + "loss": 0.5758, + "step": 11680 + }, + { + "epoch": 0.7495189223861449, + "grad_norm": 1.0003500503360518, + "learning_rate": 3.5849969735351917e-05, + "loss": 0.7507, + "step": 11685 + }, + { + "epoch": 0.7498396407953817, + "grad_norm": 0.9203454434610632, + "learning_rate": 3.57641291434467e-05, + "loss": 0.7704, + "step": 11690 + }, + { + "epoch": 0.7501603592046183, + "grad_norm": 1.035485843783069, + "learning_rate": 3.5678369059733884e-05, + "loss": 0.7227, + "step": 11695 + }, + { + "epoch": 0.7504810776138551, + "grad_norm": 0.8574293258900955, + "learning_rate": 3.559268959169842e-05, + "loss": 0.5932, + "step": 11700 + }, + { + "epoch": 0.7508017960230917, + "grad_norm": 1.0713424994868566, + "learning_rate": 3.55070908467242e-05, + "loss": 0.7351, + "step": 11705 + }, + { + "epoch": 0.7511225144323285, + "grad_norm": 0.7637351663255856, + "learning_rate": 3.542157293209394e-05, + "loss": 0.5982, + "step": 11710 + }, + { + "epoch": 0.7514432328415651, + "grad_norm": 0.7283758639132564, + "learning_rate": 3.533613595498914e-05, + "loss": 0.6919, + "step": 11715 + }, + { + "epoch": 0.7517639512508018, + "grad_norm": 0.9199615101682994, + "learning_rate": 3.525078002248974e-05, + "loss": 0.834, + "step": 11720 + }, + { + "epoch": 0.7520846696600385, + "grad_norm": 0.685052311744196, + "learning_rate": 3.516550524157415e-05, + "loss": 0.7766, + "step": 11725 + }, + { + "epoch": 0.7524053880692751, + "grad_norm": 0.9557933778705214, + "learning_rate": 3.508031171911913e-05, + "loss": 0.7334, + "step": 11730 + }, + { + "epoch": 0.7527261064785119, + "grad_norm": 0.8217799938196116, + "learning_rate": 3.4995199561899496e-05, + "loss": 0.6719, + "step": 11735 + }, + { + "epoch": 0.7530468248877485, + "grad_norm": 0.8490165290571312, + "learning_rate": 3.491016887658819e-05, + "loss": 0.6352, + "step": 11740 + }, + { + "epoch": 0.7533675432969853, + "grad_norm": 1.0096737759482532, + "learning_rate": 3.4825219769755955e-05, + "loss": 0.6278, + "step": 11745 + }, + { + "epoch": 0.7536882617062219, + "grad_norm": 0.8116824311381272, + "learning_rate": 3.4740352347871294e-05, + "loss": 0.5794, + "step": 11750 + }, + { + "epoch": 0.7540089801154586, + "grad_norm": 1.0567664205528664, + "learning_rate": 3.4655566717300433e-05, + "loss": 0.5817, + "step": 11755 + }, + { + "epoch": 0.7543296985246953, + "grad_norm": 0.8458879335378663, + "learning_rate": 3.457086298430696e-05, + "loss": 0.5779, + "step": 11760 + }, + { + "epoch": 0.754650416933932, + "grad_norm": 0.8982863213171639, + "learning_rate": 3.448624125505194e-05, + "loss": 0.6697, + "step": 11765 + }, + { + "epoch": 0.7549711353431687, + "grad_norm": 0.8975989314029491, + "learning_rate": 3.440170163559355e-05, + "loss": 0.7032, + "step": 11770 + }, + { + "epoch": 0.7552918537524054, + "grad_norm": 0.8729443546989577, + "learning_rate": 3.4317244231887125e-05, + "loss": 0.8033, + "step": 11775 + }, + { + "epoch": 0.7556125721616421, + "grad_norm": 1.0239920545191055, + "learning_rate": 3.423286914978493e-05, + "loss": 0.672, + "step": 11780 + }, + { + "epoch": 0.7559332905708788, + "grad_norm": 0.7010189828092076, + "learning_rate": 3.414857649503602e-05, + "loss": 0.6409, + "step": 11785 + }, + { + "epoch": 0.7562540089801154, + "grad_norm": 0.8719062018189001, + "learning_rate": 3.4064366373286274e-05, + "loss": 0.7164, + "step": 11790 + }, + { + "epoch": 0.7565747273893522, + "grad_norm": 0.7198915627914316, + "learning_rate": 3.398023889007794e-05, + "loss": 0.6249, + "step": 11795 + }, + { + "epoch": 0.7568954457985888, + "grad_norm": 0.8718719431875859, + "learning_rate": 3.389619415084989e-05, + "loss": 0.6064, + "step": 11800 + }, + { + "epoch": 0.7572161642078256, + "grad_norm": 0.8120042747717762, + "learning_rate": 3.381223226093715e-05, + "loss": 0.5433, + "step": 11805 + }, + { + "epoch": 0.7575368826170622, + "grad_norm": 0.9647874073108456, + "learning_rate": 3.3728353325570915e-05, + "loss": 0.7064, + "step": 11810 + }, + { + "epoch": 0.757857601026299, + "grad_norm": 1.2538875949194586, + "learning_rate": 3.364455744987853e-05, + "loss": 0.5527, + "step": 11815 + }, + { + "epoch": 0.7581783194355356, + "grad_norm": 1.178257170426357, + "learning_rate": 3.35608447388831e-05, + "loss": 0.6565, + "step": 11820 + }, + { + "epoch": 0.7584990378447722, + "grad_norm": 0.8864713208910722, + "learning_rate": 3.3477215297503605e-05, + "loss": 0.5459, + "step": 11825 + }, + { + "epoch": 0.758819756254009, + "grad_norm": 0.81482691903865, + "learning_rate": 3.339366923055458e-05, + "loss": 0.6798, + "step": 11830 + }, + { + "epoch": 0.7591404746632456, + "grad_norm": 0.7808704507490104, + "learning_rate": 3.3310206642746125e-05, + "loss": 0.6767, + "step": 11835 + }, + { + "epoch": 0.7594611930724824, + "grad_norm": 0.6063874143510388, + "learning_rate": 3.3226827638683665e-05, + "loss": 0.7335, + "step": 11840 + }, + { + "epoch": 0.759781911481719, + "grad_norm": 0.9081154038511268, + "learning_rate": 3.3143532322867865e-05, + "loss": 0.7284, + "step": 11845 + }, + { + "epoch": 0.7601026298909558, + "grad_norm": 0.845045773951182, + "learning_rate": 3.306032079969459e-05, + "loss": 0.7782, + "step": 11850 + }, + { + "epoch": 0.7604233483001924, + "grad_norm": 0.8991436429034236, + "learning_rate": 3.29771931734546e-05, + "loss": 0.7148, + "step": 11855 + }, + { + "epoch": 0.7607440667094291, + "grad_norm": 0.9742693305593477, + "learning_rate": 3.2894149548333495e-05, + "loss": 0.6244, + "step": 11860 + }, + { + "epoch": 0.7610647851186658, + "grad_norm": 0.6773700996601912, + "learning_rate": 3.281119002841169e-05, + "loss": 0.5872, + "step": 11865 + }, + { + "epoch": 0.7613855035279025, + "grad_norm": 0.8384804126775537, + "learning_rate": 3.2728314717664055e-05, + "loss": 0.7845, + "step": 11870 + }, + { + "epoch": 0.7617062219371392, + "grad_norm": 1.1357544575552236, + "learning_rate": 3.264552371996008e-05, + "loss": 0.6953, + "step": 11875 + }, + { + "epoch": 0.7620269403463759, + "grad_norm": 0.8516566580601438, + "learning_rate": 3.256281713906343e-05, + "loss": 0.7256, + "step": 11880 + }, + { + "epoch": 0.7623476587556126, + "grad_norm": 1.2370541167396898, + "learning_rate": 3.248019507863203e-05, + "loss": 0.7604, + "step": 11885 + }, + { + "epoch": 0.7626683771648493, + "grad_norm": 0.9542563866917992, + "learning_rate": 3.2397657642217926e-05, + "loss": 0.5988, + "step": 11890 + }, + { + "epoch": 0.762989095574086, + "grad_norm": 1.0432964488893417, + "learning_rate": 3.2315204933266996e-05, + "loss": 0.6991, + "step": 11895 + }, + { + "epoch": 0.7633098139833226, + "grad_norm": 1.0011228778914865, + "learning_rate": 3.223283705511908e-05, + "loss": 0.7298, + "step": 11900 + }, + { + "epoch": 0.7636305323925593, + "grad_norm": 1.5274397488438434, + "learning_rate": 3.215055411100748e-05, + "loss": 0.6428, + "step": 11905 + }, + { + "epoch": 0.763951250801796, + "grad_norm": 0.876587920734237, + "learning_rate": 3.2068356204059255e-05, + "loss": 0.7244, + "step": 11910 + }, + { + "epoch": 0.7642719692110327, + "grad_norm": 0.6121339451327354, + "learning_rate": 3.198624343729479e-05, + "loss": 0.7324, + "step": 11915 + }, + { + "epoch": 0.7645926876202694, + "grad_norm": 0.8464048080490233, + "learning_rate": 3.190421591362772e-05, + "loss": 0.7464, + "step": 11920 + }, + { + "epoch": 0.7649134060295061, + "grad_norm": 0.9880557475834854, + "learning_rate": 3.1822273735864984e-05, + "loss": 0.71, + "step": 11925 + }, + { + "epoch": 0.7652341244387428, + "grad_norm": 1.0295342644337049, + "learning_rate": 3.174041700670638e-05, + "loss": 0.4895, + "step": 11930 + }, + { + "epoch": 0.7655548428479795, + "grad_norm": 0.7076312841936536, + "learning_rate": 3.165864582874477e-05, + "loss": 0.691, + "step": 11935 + }, + { + "epoch": 0.7658755612572161, + "grad_norm": 1.0135591193887252, + "learning_rate": 3.1576960304465705e-05, + "loss": 0.6266, + "step": 11940 + }, + { + "epoch": 0.7661962796664529, + "grad_norm": 1.0323761526191306, + "learning_rate": 3.149536053624735e-05, + "loss": 0.7654, + "step": 11945 + }, + { + "epoch": 0.7665169980756895, + "grad_norm": 1.55635605359068, + "learning_rate": 3.1413846626360536e-05, + "loss": 0.7714, + "step": 11950 + }, + { + "epoch": 0.7668377164849263, + "grad_norm": 0.9497662276751877, + "learning_rate": 3.133241867696829e-05, + "loss": 0.6683, + "step": 11955 + }, + { + "epoch": 0.7671584348941629, + "grad_norm": 0.8979757336357795, + "learning_rate": 3.1251076790126086e-05, + "loss": 0.7516, + "step": 11960 + }, + { + "epoch": 0.7674791533033997, + "grad_norm": 0.764820887022675, + "learning_rate": 3.1169821067781425e-05, + "loss": 0.5679, + "step": 11965 + }, + { + "epoch": 0.7677998717126363, + "grad_norm": 0.5942733392588654, + "learning_rate": 3.1088651611773834e-05, + "loss": 0.5194, + "step": 11970 + }, + { + "epoch": 0.768120590121873, + "grad_norm": 0.9490603016131256, + "learning_rate": 3.100756852383473e-05, + "loss": 0.5963, + "step": 11975 + }, + { + "epoch": 0.7684413085311097, + "grad_norm": 0.7616783689998372, + "learning_rate": 3.092657190558727e-05, + "loss": 0.6785, + "step": 11980 + }, + { + "epoch": 0.7687620269403463, + "grad_norm": 0.830417639785896, + "learning_rate": 3.084566185854628e-05, + "loss": 0.5892, + "step": 11985 + }, + { + "epoch": 0.7690827453495831, + "grad_norm": 1.0515557973724121, + "learning_rate": 3.076483848411803e-05, + "loss": 0.6846, + "step": 11990 + }, + { + "epoch": 0.7694034637588197, + "grad_norm": 0.9480637021643955, + "learning_rate": 3.068410188360022e-05, + "loss": 0.741, + "step": 11995 + }, + { + "epoch": 0.7697241821680565, + "grad_norm": 0.9435811108298884, + "learning_rate": 3.0603452158181744e-05, + "loss": 0.7019, + "step": 12000 + }, + { + "epoch": 0.7700449005772931, + "grad_norm": 0.7019989507064325, + "learning_rate": 3.052288940894259e-05, + "loss": 0.5835, + "step": 12005 + }, + { + "epoch": 0.7703656189865298, + "grad_norm": 0.6770008543875123, + "learning_rate": 3.0442413736853846e-05, + "loss": 0.6826, + "step": 12010 + }, + { + "epoch": 0.7706863373957665, + "grad_norm": 0.7178710129095005, + "learning_rate": 3.036202524277735e-05, + "loss": 0.7033, + "step": 12015 + }, + { + "epoch": 0.7710070558050032, + "grad_norm": 0.7298827842977621, + "learning_rate": 3.0281724027465708e-05, + "loss": 0.6847, + "step": 12020 + }, + { + "epoch": 0.7713277742142399, + "grad_norm": 1.2518124809303286, + "learning_rate": 3.020151019156221e-05, + "loss": 0.5659, + "step": 12025 + }, + { + "epoch": 0.7716484926234766, + "grad_norm": 0.7542697248961158, + "learning_rate": 3.0121383835600513e-05, + "loss": 0.7575, + "step": 12030 + }, + { + "epoch": 0.7719692110327133, + "grad_norm": 0.779461786694263, + "learning_rate": 3.0041345060004776e-05, + "loss": 0.7238, + "step": 12035 + }, + { + "epoch": 0.77228992944195, + "grad_norm": 1.0655675292269764, + "learning_rate": 2.9961393965089203e-05, + "loss": 0.7475, + "step": 12040 + }, + { + "epoch": 0.7726106478511866, + "grad_norm": 1.1044101389504177, + "learning_rate": 2.98815306510583e-05, + "loss": 0.6353, + "step": 12045 + }, + { + "epoch": 0.7729313662604234, + "grad_norm": 0.8533414942650657, + "learning_rate": 2.9801755218006433e-05, + "loss": 0.5867, + "step": 12050 + }, + { + "epoch": 0.77325208466966, + "grad_norm": 1.0958682723686255, + "learning_rate": 2.9722067765917838e-05, + "loss": 0.5739, + "step": 12055 + }, + { + "epoch": 0.7735728030788968, + "grad_norm": 0.7152332630816656, + "learning_rate": 2.9642468394666557e-05, + "loss": 0.6729, + "step": 12060 + }, + { + "epoch": 0.7738935214881334, + "grad_norm": 0.9986989562442445, + "learning_rate": 2.956295720401612e-05, + "loss": 0.6726, + "step": 12065 + }, + { + "epoch": 0.7742142398973701, + "grad_norm": 0.9811723796412208, + "learning_rate": 2.9483534293619685e-05, + "loss": 0.5619, + "step": 12070 + }, + { + "epoch": 0.7745349583066068, + "grad_norm": 0.9118000616924434, + "learning_rate": 2.9404199763019645e-05, + "loss": 0.6516, + "step": 12075 + }, + { + "epoch": 0.7748556767158435, + "grad_norm": 0.8942392291019036, + "learning_rate": 2.932495371164764e-05, + "loss": 0.7949, + "step": 12080 + }, + { + "epoch": 0.7751763951250802, + "grad_norm": 0.9745393445698103, + "learning_rate": 2.9245796238824496e-05, + "loss": 0.6836, + "step": 12085 + }, + { + "epoch": 0.7754971135343168, + "grad_norm": 0.624918898789372, + "learning_rate": 2.916672744375991e-05, + "loss": 0.5384, + "step": 12090 + }, + { + "epoch": 0.7758178319435536, + "grad_norm": 0.7577038101937041, + "learning_rate": 2.908774742555257e-05, + "loss": 0.7673, + "step": 12095 + }, + { + "epoch": 0.7761385503527902, + "grad_norm": 1.0261935822819983, + "learning_rate": 2.9008856283189778e-05, + "loss": 0.5503, + "step": 12100 + }, + { + "epoch": 0.776459268762027, + "grad_norm": 0.8962534874969645, + "learning_rate": 2.8930054115547488e-05, + "loss": 0.6463, + "step": 12105 + }, + { + "epoch": 0.7767799871712636, + "grad_norm": 0.70250181904508, + "learning_rate": 2.8851341021390155e-05, + "loss": 0.5889, + "step": 12110 + }, + { + "epoch": 0.7771007055805004, + "grad_norm": 0.6163717028953168, + "learning_rate": 2.877271709937056e-05, + "loss": 0.6057, + "step": 12115 + }, + { + "epoch": 0.777421423989737, + "grad_norm": 1.139236879333557, + "learning_rate": 2.8694182448029795e-05, + "loss": 0.6143, + "step": 12120 + }, + { + "epoch": 0.7777421423989737, + "grad_norm": 0.8597109154676085, + "learning_rate": 2.8615737165796974e-05, + "loss": 0.6156, + "step": 12125 + }, + { + "epoch": 0.7780628608082104, + "grad_norm": 1.0377068227971646, + "learning_rate": 2.8537381350989288e-05, + "loss": 0.7131, + "step": 12130 + }, + { + "epoch": 0.778383579217447, + "grad_norm": 0.9278713523838525, + "learning_rate": 2.8459115101811752e-05, + "loss": 0.5643, + "step": 12135 + }, + { + "epoch": 0.7787042976266838, + "grad_norm": 0.9111079193714665, + "learning_rate": 2.838093851635708e-05, + "loss": 0.7114, + "step": 12140 + }, + { + "epoch": 0.7790250160359204, + "grad_norm": 0.636013231630343, + "learning_rate": 2.8302851692605748e-05, + "loss": 0.5425, + "step": 12145 + }, + { + "epoch": 0.7793457344451572, + "grad_norm": 0.9437606048473691, + "learning_rate": 2.8224854728425555e-05, + "loss": 0.7358, + "step": 12150 + }, + { + "epoch": 0.7796664528543938, + "grad_norm": 0.9877250051200861, + "learning_rate": 2.814694772157184e-05, + "loss": 0.7881, + "step": 12155 + }, + { + "epoch": 0.7799871712636305, + "grad_norm": 0.6355892070558739, + "learning_rate": 2.806913076968709e-05, + "loss": 0.5765, + "step": 12160 + }, + { + "epoch": 0.7803078896728672, + "grad_norm": 0.8553618089212107, + "learning_rate": 2.7991403970300923e-05, + "loss": 0.6339, + "step": 12165 + }, + { + "epoch": 0.7806286080821039, + "grad_norm": 0.7956244875523378, + "learning_rate": 2.7913767420830105e-05, + "loss": 0.6316, + "step": 12170 + }, + { + "epoch": 0.7809493264913406, + "grad_norm": 0.74745099568378, + "learning_rate": 2.7836221218578052e-05, + "loss": 0.5178, + "step": 12175 + }, + { + "epoch": 0.7812700449005773, + "grad_norm": 2.797197105902477, + "learning_rate": 2.775876546073518e-05, + "loss": 0.7453, + "step": 12180 + }, + { + "epoch": 0.781590763309814, + "grad_norm": 0.8203117179056878, + "learning_rate": 2.768140024437842e-05, + "loss": 0.7123, + "step": 12185 + }, + { + "epoch": 0.7819114817190507, + "grad_norm": 0.8491800107534502, + "learning_rate": 2.7604125666471202e-05, + "loss": 0.6031, + "step": 12190 + }, + { + "epoch": 0.7822322001282873, + "grad_norm": 0.7920825834762689, + "learning_rate": 2.7526941823863494e-05, + "loss": 0.6918, + "step": 12195 + }, + { + "epoch": 0.7825529185375241, + "grad_norm": 0.8070095630772426, + "learning_rate": 2.744984881329139e-05, + "loss": 0.5921, + "step": 12200 + }, + { + "epoch": 0.7828736369467607, + "grad_norm": 0.6455255637368961, + "learning_rate": 2.7372846731377265e-05, + "loss": 0.6382, + "step": 12205 + }, + { + "epoch": 0.7831943553559975, + "grad_norm": 0.92556283214074, + "learning_rate": 2.7295935674629457e-05, + "loss": 0.5116, + "step": 12210 + }, + { + "epoch": 0.7835150737652341, + "grad_norm": 1.1170799846804207, + "learning_rate": 2.7219115739442215e-05, + "loss": 0.6566, + "step": 12215 + }, + { + "epoch": 0.7838357921744709, + "grad_norm": 0.5890009042735036, + "learning_rate": 2.7142387022095638e-05, + "loss": 0.6128, + "step": 12220 + }, + { + "epoch": 0.7841565105837075, + "grad_norm": 0.6327668177080631, + "learning_rate": 2.7065749618755455e-05, + "loss": 0.6366, + "step": 12225 + }, + { + "epoch": 0.7844772289929441, + "grad_norm": 0.8664538277798131, + "learning_rate": 2.698920362547299e-05, + "loss": 0.6013, + "step": 12230 + }, + { + "epoch": 0.7847979474021809, + "grad_norm": 0.7003044665428215, + "learning_rate": 2.6912749138184956e-05, + "loss": 0.7929, + "step": 12235 + }, + { + "epoch": 0.7851186658114175, + "grad_norm": 0.7853265661064053, + "learning_rate": 2.6836386252713396e-05, + "loss": 0.7137, + "step": 12240 + }, + { + "epoch": 0.7854393842206543, + "grad_norm": 0.909806347924112, + "learning_rate": 2.6760115064765568e-05, + "loss": 0.6994, + "step": 12245 + }, + { + "epoch": 0.7857601026298909, + "grad_norm": 0.8351806612159146, + "learning_rate": 2.6683935669933736e-05, + "loss": 0.6935, + "step": 12250 + }, + { + "epoch": 0.7860808210391277, + "grad_norm": 0.7611491943408887, + "learning_rate": 2.6607848163695227e-05, + "loss": 0.7319, + "step": 12255 + }, + { + "epoch": 0.7864015394483643, + "grad_norm": 1.122080599336026, + "learning_rate": 2.6531852641412082e-05, + "loss": 0.6022, + "step": 12260 + }, + { + "epoch": 0.7867222578576011, + "grad_norm": 1.1817121943287525, + "learning_rate": 2.645594919833119e-05, + "loss": 0.7494, + "step": 12265 + }, + { + "epoch": 0.7870429762668377, + "grad_norm": 0.7929071478719117, + "learning_rate": 2.6380137929583914e-05, + "loss": 0.7783, + "step": 12270 + }, + { + "epoch": 0.7873636946760744, + "grad_norm": 0.820309764452619, + "learning_rate": 2.6304418930186115e-05, + "loss": 0.6332, + "step": 12275 + }, + { + "epoch": 0.7876844130853111, + "grad_norm": 0.707291602928582, + "learning_rate": 2.6228792295038106e-05, + "loss": 0.537, + "step": 12280 + }, + { + "epoch": 0.7880051314945478, + "grad_norm": 0.8141400312776754, + "learning_rate": 2.6153258118924308e-05, + "loss": 0.6322, + "step": 12285 + }, + { + "epoch": 0.7883258499037845, + "grad_norm": 0.7187432563518902, + "learning_rate": 2.6077816496513363e-05, + "loss": 0.5032, + "step": 12290 + }, + { + "epoch": 0.7886465683130212, + "grad_norm": 0.921998673200194, + "learning_rate": 2.6002467522357867e-05, + "loss": 0.6134, + "step": 12295 + }, + { + "epoch": 0.7889672867222579, + "grad_norm": 1.4739251939697386, + "learning_rate": 2.592721129089427e-05, + "loss": 0.6579, + "step": 12300 + }, + { + "epoch": 0.7892880051314946, + "grad_norm": 0.7698494785751436, + "learning_rate": 2.5852047896442853e-05, + "loss": 0.6832, + "step": 12305 + }, + { + "epoch": 0.7896087235407312, + "grad_norm": 0.9676144058038108, + "learning_rate": 2.577697743320746e-05, + "loss": 0.6789, + "step": 12310 + }, + { + "epoch": 0.789929441949968, + "grad_norm": 0.7989952533967423, + "learning_rate": 2.570199999527557e-05, + "loss": 0.683, + "step": 12315 + }, + { + "epoch": 0.7902501603592046, + "grad_norm": 0.7540668642091226, + "learning_rate": 2.5627115676617953e-05, + "loss": 0.6137, + "step": 12320 + }, + { + "epoch": 0.7905708787684413, + "grad_norm": 1.2363573852579546, + "learning_rate": 2.555232457108879e-05, + "loss": 0.6497, + "step": 12325 + }, + { + "epoch": 0.790891597177678, + "grad_norm": 0.5683854501183521, + "learning_rate": 2.5477626772425356e-05, + "loss": 0.6996, + "step": 12330 + }, + { + "epoch": 0.7912123155869147, + "grad_norm": 0.5533412352742278, + "learning_rate": 2.5403022374247953e-05, + "loss": 0.7001, + "step": 12335 + }, + { + "epoch": 0.7915330339961514, + "grad_norm": 0.675236986686075, + "learning_rate": 2.5328511470059935e-05, + "loss": 0.5805, + "step": 12340 + }, + { + "epoch": 0.791853752405388, + "grad_norm": 0.7285390988297157, + "learning_rate": 2.5254094153247355e-05, + "loss": 0.6149, + "step": 12345 + }, + { + "epoch": 0.7921744708146248, + "grad_norm": 0.80400571870766, + "learning_rate": 2.5179770517079093e-05, + "loss": 0.6948, + "step": 12350 + }, + { + "epoch": 0.7924951892238614, + "grad_norm": 0.9377676574780994, + "learning_rate": 2.510554065470653e-05, + "loss": 0.7308, + "step": 12355 + }, + { + "epoch": 0.7928159076330982, + "grad_norm": 0.6446906934234106, + "learning_rate": 2.5031404659163492e-05, + "loss": 0.7255, + "step": 12360 + }, + { + "epoch": 0.7931366260423348, + "grad_norm": 0.8158537224973699, + "learning_rate": 2.495736262336632e-05, + "loss": 0.7016, + "step": 12365 + }, + { + "epoch": 0.7934573444515716, + "grad_norm": 0.9172314841106095, + "learning_rate": 2.4883414640113357e-05, + "loss": 0.6117, + "step": 12370 + }, + { + "epoch": 0.7937780628608082, + "grad_norm": 0.7437504326268314, + "learning_rate": 2.4809560802085274e-05, + "loss": 0.6409, + "step": 12375 + }, + { + "epoch": 0.7940987812700449, + "grad_norm": 0.6879611505056618, + "learning_rate": 2.4735801201844645e-05, + "loss": 0.6397, + "step": 12380 + }, + { + "epoch": 0.7944194996792816, + "grad_norm": 0.9926575009144855, + "learning_rate": 2.466213593183593e-05, + "loss": 0.6966, + "step": 12385 + }, + { + "epoch": 0.7947402180885182, + "grad_norm": 0.8127945292903275, + "learning_rate": 2.458856508438544e-05, + "loss": 0.7704, + "step": 12390 + }, + { + "epoch": 0.795060936497755, + "grad_norm": 0.8871371492144181, + "learning_rate": 2.451508875170104e-05, + "loss": 0.5606, + "step": 12395 + }, + { + "epoch": 0.7953816549069916, + "grad_norm": 0.8206919204372869, + "learning_rate": 2.444170702587226e-05, + "loss": 0.6932, + "step": 12400 + }, + { + "epoch": 0.7957023733162284, + "grad_norm": 0.6603633676196071, + "learning_rate": 2.436841999886994e-05, + "loss": 0.6109, + "step": 12405 + }, + { + "epoch": 0.796023091725465, + "grad_norm": 0.9151323413512733, + "learning_rate": 2.4295227762546267e-05, + "loss": 0.6631, + "step": 12410 + }, + { + "epoch": 0.7963438101347017, + "grad_norm": 0.9827343805814039, + "learning_rate": 2.422213040863468e-05, + "loss": 0.6563, + "step": 12415 + }, + { + "epoch": 0.7966645285439384, + "grad_norm": 0.9469619065977057, + "learning_rate": 2.414912802874961e-05, + "loss": 0.7412, + "step": 12420 + }, + { + "epoch": 0.7969852469531751, + "grad_norm": 1.3131843532103706, + "learning_rate": 2.4076220714386568e-05, + "loss": 0.6886, + "step": 12425 + }, + { + "epoch": 0.7973059653624118, + "grad_norm": 1.2148517258592102, + "learning_rate": 2.40034085569218e-05, + "loss": 0.6898, + "step": 12430 + }, + { + "epoch": 0.7976266837716485, + "grad_norm": 0.8095565024509138, + "learning_rate": 2.393069164761237e-05, + "loss": 0.6122, + "step": 12435 + }, + { + "epoch": 0.7979474021808852, + "grad_norm": 0.9467420200870824, + "learning_rate": 2.3858070077595908e-05, + "loss": 0.7174, + "step": 12440 + }, + { + "epoch": 0.7982681205901219, + "grad_norm": 0.6202794025655268, + "learning_rate": 2.3785543937890586e-05, + "loss": 0.66, + "step": 12445 + }, + { + "epoch": 0.7985888389993585, + "grad_norm": 1.0791006971385633, + "learning_rate": 2.3713113319394997e-05, + "loss": 0.5363, + "step": 12450 + }, + { + "epoch": 0.7989095574085953, + "grad_norm": 1.026500892588481, + "learning_rate": 2.3640778312887945e-05, + "loss": 0.7948, + "step": 12455 + }, + { + "epoch": 0.7992302758178319, + "grad_norm": 0.7967893717258743, + "learning_rate": 2.35685390090285e-05, + "loss": 0.6343, + "step": 12460 + }, + { + "epoch": 0.7995509942270687, + "grad_norm": 1.1948126480397625, + "learning_rate": 2.3496395498355694e-05, + "loss": 0.7174, + "step": 12465 + }, + { + "epoch": 0.7998717126363053, + "grad_norm": 0.8650772892603197, + "learning_rate": 2.34243478712885e-05, + "loss": 0.7018, + "step": 12470 + }, + { + "epoch": 0.800192431045542, + "grad_norm": 0.49196395624702055, + "learning_rate": 2.3352396218125827e-05, + "loss": 0.5881, + "step": 12475 + }, + { + "epoch": 0.8005131494547787, + "grad_norm": 0.7575733059076403, + "learning_rate": 2.3280540629046143e-05, + "loss": 0.7292, + "step": 12480 + }, + { + "epoch": 0.8008338678640154, + "grad_norm": 0.8513796572354395, + "learning_rate": 2.3208781194107664e-05, + "loss": 0.6286, + "step": 12485 + }, + { + "epoch": 0.8011545862732521, + "grad_norm": 0.734121779464679, + "learning_rate": 2.3137118003248004e-05, + "loss": 0.6818, + "step": 12490 + }, + { + "epoch": 0.8014753046824887, + "grad_norm": 0.5881243074608535, + "learning_rate": 2.306555114628415e-05, + "loss": 0.6553, + "step": 12495 + }, + { + "epoch": 0.8017960230917255, + "grad_norm": 0.6452008879569514, + "learning_rate": 2.2994080712912435e-05, + "loss": 0.705, + "step": 12500 + }, + { + "epoch": 0.8021167415009621, + "grad_norm": 1.409626103322556, + "learning_rate": 2.2922706792708194e-05, + "loss": 0.5859, + "step": 12505 + }, + { + "epoch": 0.8024374599101989, + "grad_norm": 0.7556485492806266, + "learning_rate": 2.2851429475125963e-05, + "loss": 0.6137, + "step": 12510 + }, + { + "epoch": 0.8027581783194355, + "grad_norm": 0.9809427245901448, + "learning_rate": 2.2780248849499088e-05, + "loss": 0.7344, + "step": 12515 + }, + { + "epoch": 0.8030788967286723, + "grad_norm": 0.38473648876347516, + "learning_rate": 2.2709165005039802e-05, + "loss": 0.4635, + "step": 12520 + }, + { + "epoch": 0.8033996151379089, + "grad_norm": 0.7409973296233345, + "learning_rate": 2.263817803083901e-05, + "loss": 0.6076, + "step": 12525 + }, + { + "epoch": 0.8037203335471456, + "grad_norm": 0.7165871670251992, + "learning_rate": 2.256728801586616e-05, + "loss": 0.6541, + "step": 12530 + }, + { + "epoch": 0.8040410519563823, + "grad_norm": 0.8518968659931285, + "learning_rate": 2.249649504896929e-05, + "loss": 0.7555, + "step": 12535 + }, + { + "epoch": 0.804361770365619, + "grad_norm": 0.9159683373230153, + "learning_rate": 2.242579921887471e-05, + "loss": 0.6843, + "step": 12540 + }, + { + "epoch": 0.8046824887748557, + "grad_norm": 0.6228826380501181, + "learning_rate": 2.2355200614186987e-05, + "loss": 0.5394, + "step": 12545 + }, + { + "epoch": 0.8050032071840924, + "grad_norm": 0.8002539057082869, + "learning_rate": 2.2284699323388923e-05, + "loss": 0.7345, + "step": 12550 + }, + { + "epoch": 0.8053239255933291, + "grad_norm": 0.9766455426961175, + "learning_rate": 2.2214295434841248e-05, + "loss": 0.7367, + "step": 12555 + }, + { + "epoch": 0.8056446440025657, + "grad_norm": 0.7046361659107024, + "learning_rate": 2.2143989036782707e-05, + "loss": 0.5187, + "step": 12560 + }, + { + "epoch": 0.8059653624118024, + "grad_norm": 0.8108273818757799, + "learning_rate": 2.2073780217329786e-05, + "loss": 0.6532, + "step": 12565 + }, + { + "epoch": 0.8062860808210391, + "grad_norm": 0.818379710541348, + "learning_rate": 2.2003669064476706e-05, + "loss": 0.6059, + "step": 12570 + }, + { + "epoch": 0.8066067992302758, + "grad_norm": 0.984654681269158, + "learning_rate": 2.1933655666095275e-05, + "loss": 0.6525, + "step": 12575 + }, + { + "epoch": 0.8069275176395125, + "grad_norm": 0.9567899833609597, + "learning_rate": 2.186374010993476e-05, + "loss": 0.7311, + "step": 12580 + }, + { + "epoch": 0.8072482360487492, + "grad_norm": 0.7463705769882709, + "learning_rate": 2.1793922483621876e-05, + "loss": 0.6196, + "step": 12585 + }, + { + "epoch": 0.8075689544579859, + "grad_norm": 0.9733520585461265, + "learning_rate": 2.1724202874660492e-05, + "loss": 0.7193, + "step": 12590 + }, + { + "epoch": 0.8078896728672226, + "grad_norm": 0.7681175464199929, + "learning_rate": 2.165458137043175e-05, + "loss": 0.6522, + "step": 12595 + }, + { + "epoch": 0.8082103912764592, + "grad_norm": 0.6886221085607587, + "learning_rate": 2.158505805819374e-05, + "loss": 0.6666, + "step": 12600 + }, + { + "epoch": 0.808531109685696, + "grad_norm": 0.603328263564938, + "learning_rate": 2.1515633025081484e-05, + "loss": 0.667, + "step": 12605 + }, + { + "epoch": 0.8088518280949326, + "grad_norm": 0.8470975793567042, + "learning_rate": 2.1446306358106927e-05, + "loss": 0.6453, + "step": 12610 + }, + { + "epoch": 0.8091725465041694, + "grad_norm": 1.0220077328521942, + "learning_rate": 2.1377078144158603e-05, + "loss": 0.6582, + "step": 12615 + }, + { + "epoch": 0.809493264913406, + "grad_norm": 0.7129620704949545, + "learning_rate": 2.1307948470001782e-05, + "loss": 0.5496, + "step": 12620 + }, + { + "epoch": 0.8098139833226428, + "grad_norm": 0.6343852911809139, + "learning_rate": 2.1238917422278116e-05, + "loss": 0.5455, + "step": 12625 + }, + { + "epoch": 0.8101347017318794, + "grad_norm": 0.36707540294038493, + "learning_rate": 2.1169985087505694e-05, + "loss": 0.6399, + "step": 12630 + }, + { + "epoch": 0.810455420141116, + "grad_norm": 0.813228299713834, + "learning_rate": 2.1101151552078944e-05, + "loss": 0.6842, + "step": 12635 + }, + { + "epoch": 0.8107761385503528, + "grad_norm": 0.6267132658473076, + "learning_rate": 2.1032416902268314e-05, + "loss": 0.5479, + "step": 12640 + }, + { + "epoch": 0.8110968569595894, + "grad_norm": 1.275645304461915, + "learning_rate": 2.0963781224220503e-05, + "loss": 0.6785, + "step": 12645 + }, + { + "epoch": 0.8114175753688262, + "grad_norm": 0.8576850457893269, + "learning_rate": 2.0895244603957998e-05, + "loss": 0.7868, + "step": 12650 + }, + { + "epoch": 0.8117382937780628, + "grad_norm": 0.5639578214670323, + "learning_rate": 2.082680712737929e-05, + "loss": 0.5559, + "step": 12655 + }, + { + "epoch": 0.8120590121872996, + "grad_norm": 1.1440696942831554, + "learning_rate": 2.0758468880258486e-05, + "loss": 0.7089, + "step": 12660 + }, + { + "epoch": 0.8123797305965362, + "grad_norm": 0.8070604839659317, + "learning_rate": 2.0690229948245365e-05, + "loss": 0.6695, + "step": 12665 + }, + { + "epoch": 0.812700449005773, + "grad_norm": 0.6244747169984161, + "learning_rate": 2.0622090416865293e-05, + "loss": 0.5854, + "step": 12670 + }, + { + "epoch": 0.8130211674150096, + "grad_norm": 0.506375535891638, + "learning_rate": 2.055405037151894e-05, + "loss": 0.6383, + "step": 12675 + }, + { + "epoch": 0.8133418858242463, + "grad_norm": 1.183001348716755, + "learning_rate": 2.0486109897482407e-05, + "loss": 0.6203, + "step": 12680 + }, + { + "epoch": 0.813662604233483, + "grad_norm": 0.6143509135493088, + "learning_rate": 2.0418269079906936e-05, + "loss": 0.5593, + "step": 12685 + }, + { + "epoch": 0.8139833226427197, + "grad_norm": 0.6234718472183463, + "learning_rate": 2.0350528003818825e-05, + "loss": 0.6459, + "step": 12690 + }, + { + "epoch": 0.8143040410519564, + "grad_norm": 1.8693845624658407, + "learning_rate": 2.0282886754119478e-05, + "loss": 0.7211, + "step": 12695 + }, + { + "epoch": 0.8146247594611931, + "grad_norm": 0.8258541488205007, + "learning_rate": 2.0215345415585107e-05, + "loss": 0.5976, + "step": 12700 + }, + { + "epoch": 0.8149454778704298, + "grad_norm": 0.914739265249098, + "learning_rate": 2.0147904072866695e-05, + "loss": 0.6308, + "step": 12705 + }, + { + "epoch": 0.8152661962796665, + "grad_norm": 0.7090505847389847, + "learning_rate": 2.0080562810489935e-05, + "loss": 0.727, + "step": 12710 + }, + { + "epoch": 0.8155869146889031, + "grad_norm": 0.9339182937300688, + "learning_rate": 2.001332171285505e-05, + "loss": 0.6809, + "step": 12715 + }, + { + "epoch": 0.8159076330981399, + "grad_norm": 0.925613865395883, + "learning_rate": 1.9946180864236797e-05, + "loss": 0.7004, + "step": 12720 + }, + { + "epoch": 0.8162283515073765, + "grad_norm": 0.874166373614285, + "learning_rate": 1.9879140348784177e-05, + "loss": 0.6623, + "step": 12725 + }, + { + "epoch": 0.8165490699166132, + "grad_norm": 0.8313132986404351, + "learning_rate": 1.981220025052056e-05, + "loss": 0.6177, + "step": 12730 + }, + { + "epoch": 0.8168697883258499, + "grad_norm": 0.6383078710564455, + "learning_rate": 1.9745360653343393e-05, + "loss": 0.6089, + "step": 12735 + }, + { + "epoch": 0.8171905067350866, + "grad_norm": 0.5929159065490891, + "learning_rate": 1.9678621641024132e-05, + "loss": 0.5833, + "step": 12740 + }, + { + "epoch": 0.8175112251443233, + "grad_norm": 0.6839908339425101, + "learning_rate": 1.961198329720827e-05, + "loss": 0.6513, + "step": 12745 + }, + { + "epoch": 0.8178319435535599, + "grad_norm": 0.43381578975254104, + "learning_rate": 1.9545445705415012e-05, + "loss": 0.655, + "step": 12750 + }, + { + "epoch": 0.8181526619627967, + "grad_norm": 0.666728316560307, + "learning_rate": 1.947900894903739e-05, + "loss": 0.5284, + "step": 12755 + }, + { + "epoch": 0.8184733803720333, + "grad_norm": 1.0911535549941562, + "learning_rate": 1.9412673111342018e-05, + "loss": 0.6534, + "step": 12760 + }, + { + "epoch": 0.8187940987812701, + "grad_norm": 0.8721963911370444, + "learning_rate": 1.934643827546899e-05, + "loss": 0.7718, + "step": 12765 + }, + { + "epoch": 0.8191148171905067, + "grad_norm": 0.9043104390757369, + "learning_rate": 1.928030452443187e-05, + "loss": 0.7249, + "step": 12770 + }, + { + "epoch": 0.8194355355997435, + "grad_norm": 0.6520308339900129, + "learning_rate": 1.9214271941117458e-05, + "loss": 0.569, + "step": 12775 + }, + { + "epoch": 0.8197562540089801, + "grad_norm": 1.0081351400932888, + "learning_rate": 1.9148340608285863e-05, + "loss": 0.6623, + "step": 12780 + }, + { + "epoch": 0.8200769724182168, + "grad_norm": 0.6541686083293314, + "learning_rate": 1.908251060857019e-05, + "loss": 0.6006, + "step": 12785 + }, + { + "epoch": 0.8203976908274535, + "grad_norm": 0.6996268349045872, + "learning_rate": 1.901678202447663e-05, + "loss": 0.6209, + "step": 12790 + }, + { + "epoch": 0.8207184092366901, + "grad_norm": 0.6137399071233165, + "learning_rate": 1.8951154938384207e-05, + "loss": 0.7341, + "step": 12795 + }, + { + "epoch": 0.8210391276459269, + "grad_norm": 0.6979894249139232, + "learning_rate": 1.8885629432544717e-05, + "loss": 0.6331, + "step": 12800 + }, + { + "epoch": 0.8213598460551635, + "grad_norm": 1.4876520614972237, + "learning_rate": 1.882020558908274e-05, + "loss": 0.5262, + "step": 12805 + }, + { + "epoch": 0.8216805644644003, + "grad_norm": 1.1310428300822517, + "learning_rate": 1.8754883489995335e-05, + "loss": 0.6548, + "step": 12810 + }, + { + "epoch": 0.8220012828736369, + "grad_norm": 0.819858534428383, + "learning_rate": 1.868966321715212e-05, + "loss": 0.6514, + "step": 12815 + }, + { + "epoch": 0.8223220012828736, + "grad_norm": 0.9699270159513138, + "learning_rate": 1.8624544852295046e-05, + "loss": 0.6668, + "step": 12820 + }, + { + "epoch": 0.8226427196921103, + "grad_norm": 1.1171340784169779, + "learning_rate": 1.8559528477038325e-05, + "loss": 0.7466, + "step": 12825 + }, + { + "epoch": 0.822963438101347, + "grad_norm": 0.9010920277558152, + "learning_rate": 1.849461417286843e-05, + "loss": 0.5722, + "step": 12830 + }, + { + "epoch": 0.8232841565105837, + "grad_norm": 0.8446632185572971, + "learning_rate": 1.8429802021143816e-05, + "loss": 0.7673, + "step": 12835 + }, + { + "epoch": 0.8236048749198204, + "grad_norm": 0.8445623736137308, + "learning_rate": 1.8365092103094938e-05, + "loss": 0.6343, + "step": 12840 + }, + { + "epoch": 0.8239255933290571, + "grad_norm": 1.3224256501204117, + "learning_rate": 1.83004844998241e-05, + "loss": 0.6446, + "step": 12845 + }, + { + "epoch": 0.8242463117382938, + "grad_norm": 1.2509505443818558, + "learning_rate": 1.8235979292305448e-05, + "loss": 0.5908, + "step": 12850 + }, + { + "epoch": 0.8245670301475305, + "grad_norm": 1.045236864985607, + "learning_rate": 1.8171576561384718e-05, + "loss": 0.6833, + "step": 12855 + }, + { + "epoch": 0.8248877485567672, + "grad_norm": 0.8131230488754208, + "learning_rate": 1.8107276387779194e-05, + "loss": 0.6713, + "step": 12860 + }, + { + "epoch": 0.8252084669660038, + "grad_norm": 0.9987203815522278, + "learning_rate": 1.8043078852077723e-05, + "loss": 0.6382, + "step": 12865 + }, + { + "epoch": 0.8255291853752406, + "grad_norm": 0.8378880198765352, + "learning_rate": 1.797898403474041e-05, + "loss": 0.651, + "step": 12870 + }, + { + "epoch": 0.8258499037844772, + "grad_norm": 0.69860101125052, + "learning_rate": 1.7914992016098652e-05, + "loss": 0.6678, + "step": 12875 + }, + { + "epoch": 0.826170622193714, + "grad_norm": 0.7906981356515638, + "learning_rate": 1.7851102876355064e-05, + "loss": 0.7724, + "step": 12880 + }, + { + "epoch": 0.8264913406029506, + "grad_norm": 0.7220660188316776, + "learning_rate": 1.778731669558322e-05, + "loss": 0.7528, + "step": 12885 + }, + { + "epoch": 0.8268120590121874, + "grad_norm": 0.8602114436332251, + "learning_rate": 1.772363355372776e-05, + "loss": 0.7355, + "step": 12890 + }, + { + "epoch": 0.827132777421424, + "grad_norm": 0.7936909578079667, + "learning_rate": 1.7660053530604103e-05, + "loss": 0.5939, + "step": 12895 + }, + { + "epoch": 0.8274534958306606, + "grad_norm": 0.7386556230325233, + "learning_rate": 1.759657670589844e-05, + "loss": 0.7065, + "step": 12900 + }, + { + "epoch": 0.8277742142398974, + "grad_norm": 0.7508393958424202, + "learning_rate": 1.7533203159167653e-05, + "loss": 0.7995, + "step": 12905 + }, + { + "epoch": 0.828094932649134, + "grad_norm": 1.484996895062748, + "learning_rate": 1.7469932969839133e-05, + "loss": 0.5822, + "step": 12910 + }, + { + "epoch": 0.8284156510583708, + "grad_norm": 0.7889368806667416, + "learning_rate": 1.7406766217210813e-05, + "loss": 0.6915, + "step": 12915 + }, + { + "epoch": 0.8287363694676074, + "grad_norm": 1.043078354293378, + "learning_rate": 1.7343702980450882e-05, + "loss": 0.6678, + "step": 12920 + }, + { + "epoch": 0.8290570878768442, + "grad_norm": 0.5235441869984315, + "learning_rate": 1.7280743338597903e-05, + "loss": 0.6732, + "step": 12925 + }, + { + "epoch": 0.8293778062860808, + "grad_norm": 0.9827303368182867, + "learning_rate": 1.7217887370560527e-05, + "loss": 0.5817, + "step": 12930 + }, + { + "epoch": 0.8296985246953175, + "grad_norm": 0.8919025135393817, + "learning_rate": 1.715513515511743e-05, + "loss": 0.5394, + "step": 12935 + }, + { + "epoch": 0.8300192431045542, + "grad_norm": 0.8422357074138689, + "learning_rate": 1.7092486770917382e-05, + "loss": 0.7755, + "step": 12940 + }, + { + "epoch": 0.8303399615137909, + "grad_norm": 0.9473245373995116, + "learning_rate": 1.7029942296478885e-05, + "loss": 0.6846, + "step": 12945 + }, + { + "epoch": 0.8306606799230276, + "grad_norm": 0.6373840068433619, + "learning_rate": 1.6967501810190323e-05, + "loss": 0.6543, + "step": 12950 + }, + { + "epoch": 0.8309813983322643, + "grad_norm": 0.7843610971634594, + "learning_rate": 1.6905165390309665e-05, + "loss": 0.6431, + "step": 12955 + }, + { + "epoch": 0.831302116741501, + "grad_norm": 1.1652096610055944, + "learning_rate": 1.6842933114964466e-05, + "loss": 0.8221, + "step": 12960 + }, + { + "epoch": 0.8316228351507376, + "grad_norm": 0.8194937278113069, + "learning_rate": 1.6780805062151816e-05, + "loss": 0.5232, + "step": 12965 + }, + { + "epoch": 0.8319435535599743, + "grad_norm": 1.188666287581691, + "learning_rate": 1.6718781309738073e-05, + "loss": 0.6604, + "step": 12970 + }, + { + "epoch": 0.832264271969211, + "grad_norm": 0.8641382912001553, + "learning_rate": 1.665686193545898e-05, + "loss": 0.5844, + "step": 12975 + }, + { + "epoch": 0.8325849903784477, + "grad_norm": 0.7062740744596516, + "learning_rate": 1.6595047016919373e-05, + "loss": 0.6843, + "step": 12980 + }, + { + "epoch": 0.8329057087876844, + "grad_norm": 1.7666107387397485, + "learning_rate": 1.6533336631593276e-05, + "loss": 0.5533, + "step": 12985 + }, + { + "epoch": 0.8332264271969211, + "grad_norm": 0.6713809127329562, + "learning_rate": 1.6471730856823587e-05, + "loss": 0.5803, + "step": 12990 + }, + { + "epoch": 0.8335471456061578, + "grad_norm": 0.789870715650865, + "learning_rate": 1.6410229769822137e-05, + "loss": 0.5722, + "step": 12995 + }, + { + "epoch": 0.8338678640153945, + "grad_norm": 0.694543681011162, + "learning_rate": 1.6348833447669596e-05, + "loss": 0.7518, + "step": 13000 + }, + { + "epoch": 0.8341885824246311, + "grad_norm": 0.9060155570486944, + "learning_rate": 1.6287541967315246e-05, + "loss": 0.6968, + "step": 13005 + }, + { + "epoch": 0.8345093008338679, + "grad_norm": 0.7521276185282114, + "learning_rate": 1.6226355405577052e-05, + "loss": 0.7398, + "step": 13010 + }, + { + "epoch": 0.8348300192431045, + "grad_norm": 0.6239824879078599, + "learning_rate": 1.6165273839141425e-05, + "loss": 0.5993, + "step": 13015 + }, + { + "epoch": 0.8351507376523413, + "grad_norm": 0.8788280197433859, + "learning_rate": 1.610429734456317e-05, + "loss": 0.5281, + "step": 13020 + }, + { + "epoch": 0.8354714560615779, + "grad_norm": 0.5708218830810341, + "learning_rate": 1.604342599826548e-05, + "loss": 0.6636, + "step": 13025 + }, + { + "epoch": 0.8357921744708147, + "grad_norm": 0.9995506015609548, + "learning_rate": 1.5982659876539706e-05, + "loss": 0.6224, + "step": 13030 + }, + { + "epoch": 0.8361128928800513, + "grad_norm": 0.6985670528256153, + "learning_rate": 1.5921999055545322e-05, + "loss": 0.7875, + "step": 13035 + }, + { + "epoch": 0.8364336112892881, + "grad_norm": 1.1017729058211603, + "learning_rate": 1.5861443611309836e-05, + "loss": 0.5689, + "step": 13040 + }, + { + "epoch": 0.8367543296985247, + "grad_norm": 0.6102105059220153, + "learning_rate": 1.5800993619728645e-05, + "loss": 0.6071, + "step": 13045 + }, + { + "epoch": 0.8370750481077613, + "grad_norm": 1.0918121069567406, + "learning_rate": 1.574064915656508e-05, + "loss": 0.6389, + "step": 13050 + }, + { + "epoch": 0.8373957665169981, + "grad_norm": 0.8119509757109902, + "learning_rate": 1.5680410297450097e-05, + "loss": 0.6904, + "step": 13055 + }, + { + "epoch": 0.8377164849262347, + "grad_norm": 1.0654010067070523, + "learning_rate": 1.56202771178824e-05, + "loss": 0.6806, + "step": 13060 + }, + { + "epoch": 0.8380372033354715, + "grad_norm": 1.003140917229182, + "learning_rate": 1.5560249693228167e-05, + "loss": 0.7506, + "step": 13065 + }, + { + "epoch": 0.8383579217447081, + "grad_norm": 0.8104009198927022, + "learning_rate": 1.5500328098721017e-05, + "loss": 0.6771, + "step": 13070 + }, + { + "epoch": 0.8386786401539449, + "grad_norm": 0.6505916854083006, + "learning_rate": 1.5440512409462027e-05, + "loss": 0.4606, + "step": 13075 + }, + { + "epoch": 0.8389993585631815, + "grad_norm": 0.8172274238106711, + "learning_rate": 1.5380802700419437e-05, + "loss": 0.6273, + "step": 13080 + }, + { + "epoch": 0.8393200769724182, + "grad_norm": 0.8412486560565198, + "learning_rate": 1.5321199046428748e-05, + "loss": 0.6232, + "step": 13085 + }, + { + "epoch": 0.8396407953816549, + "grad_norm": 1.2677355193498017, + "learning_rate": 1.526170152219246e-05, + "loss": 0.6965, + "step": 13090 + }, + { + "epoch": 0.8399615137908916, + "grad_norm": 1.1729148810404941, + "learning_rate": 1.520231020228008e-05, + "loss": 0.6742, + "step": 13095 + }, + { + "epoch": 0.8402822322001283, + "grad_norm": 0.9492910072998716, + "learning_rate": 1.51430251611281e-05, + "loss": 0.6427, + "step": 13100 + }, + { + "epoch": 0.840602950609365, + "grad_norm": 0.9485664054113067, + "learning_rate": 1.508384647303962e-05, + "loss": 0.7599, + "step": 13105 + }, + { + "epoch": 0.8409236690186017, + "grad_norm": 0.7710450909617227, + "learning_rate": 1.5024774212184644e-05, + "loss": 0.7211, + "step": 13110 + }, + { + "epoch": 0.8412443874278384, + "grad_norm": 1.4732302257890362, + "learning_rate": 1.496580845259965e-05, + "loss": 0.5757, + "step": 13115 + }, + { + "epoch": 0.841565105837075, + "grad_norm": 0.815748738677427, + "learning_rate": 1.4906949268187731e-05, + "loss": 0.7202, + "step": 13120 + }, + { + "epoch": 0.8418858242463118, + "grad_norm": 0.7569265134733956, + "learning_rate": 1.4848196732718333e-05, + "loss": 0.5067, + "step": 13125 + }, + { + "epoch": 0.8422065426555484, + "grad_norm": 0.7019350874014, + "learning_rate": 1.4789550919827255e-05, + "loss": 0.6555, + "step": 13130 + }, + { + "epoch": 0.8425272610647851, + "grad_norm": 1.070502908495116, + "learning_rate": 1.4731011903016589e-05, + "loss": 0.5612, + "step": 13135 + }, + { + "epoch": 0.8428479794740218, + "grad_norm": 0.8746378057344433, + "learning_rate": 1.4672579755654492e-05, + "loss": 0.6644, + "step": 13140 + }, + { + "epoch": 0.8431686978832585, + "grad_norm": 0.9069204901759049, + "learning_rate": 1.4614254550975282e-05, + "loss": 0.6041, + "step": 13145 + }, + { + "epoch": 0.8434894162924952, + "grad_norm": 0.6286045045253976, + "learning_rate": 1.455603636207915e-05, + "loss": 0.573, + "step": 13150 + }, + { + "epoch": 0.8438101347017318, + "grad_norm": 0.8046184953958996, + "learning_rate": 1.4497925261932188e-05, + "loss": 0.7031, + "step": 13155 + }, + { + "epoch": 0.8441308531109686, + "grad_norm": 0.9289022471342262, + "learning_rate": 1.4439921323366323e-05, + "loss": 0.6532, + "step": 13160 + }, + { + "epoch": 0.8444515715202052, + "grad_norm": 0.8155940315800527, + "learning_rate": 1.4382024619079105e-05, + "loss": 0.6537, + "step": 13165 + }, + { + "epoch": 0.844772289929442, + "grad_norm": 1.064740365786613, + "learning_rate": 1.432423522163372e-05, + "loss": 0.598, + "step": 13170 + }, + { + "epoch": 0.8450930083386786, + "grad_norm": 0.7962110283298796, + "learning_rate": 1.4266553203458831e-05, + "loss": 0.7714, + "step": 13175 + }, + { + "epoch": 0.8454137267479154, + "grad_norm": 0.9891349725088471, + "learning_rate": 1.4208978636848591e-05, + "loss": 0.666, + "step": 13180 + }, + { + "epoch": 0.845734445157152, + "grad_norm": 0.4894444101288945, + "learning_rate": 1.4151511593962418e-05, + "loss": 0.5697, + "step": 13185 + }, + { + "epoch": 0.8460551635663887, + "grad_norm": 0.9733970578229911, + "learning_rate": 1.4094152146824969e-05, + "loss": 0.639, + "step": 13190 + }, + { + "epoch": 0.8463758819756254, + "grad_norm": 0.7185939555951706, + "learning_rate": 1.40369003673261e-05, + "loss": 0.6608, + "step": 13195 + }, + { + "epoch": 0.846696600384862, + "grad_norm": 0.9770382341303654, + "learning_rate": 1.3979756327220683e-05, + "loss": 0.5714, + "step": 13200 + }, + { + "epoch": 0.8470173187940988, + "grad_norm": 0.9521286519290345, + "learning_rate": 1.3922720098128527e-05, + "loss": 0.7672, + "step": 13205 + }, + { + "epoch": 0.8473380372033354, + "grad_norm": 1.0026426650083589, + "learning_rate": 1.3865791751534418e-05, + "loss": 0.589, + "step": 13210 + }, + { + "epoch": 0.8476587556125722, + "grad_norm": 0.7945648077908503, + "learning_rate": 1.3808971358787837e-05, + "loss": 0.5791, + "step": 13215 + }, + { + "epoch": 0.8479794740218088, + "grad_norm": 0.6890626619071494, + "learning_rate": 1.3752258991103018e-05, + "loss": 0.7313, + "step": 13220 + }, + { + "epoch": 0.8483001924310456, + "grad_norm": 0.8523591274592248, + "learning_rate": 1.369565471955878e-05, + "loss": 0.7, + "step": 13225 + }, + { + "epoch": 0.8486209108402822, + "grad_norm": 0.6661468510777631, + "learning_rate": 1.3639158615098457e-05, + "loss": 0.681, + "step": 13230 + }, + { + "epoch": 0.8489416292495189, + "grad_norm": 0.498183926121059, + "learning_rate": 1.3582770748529839e-05, + "loss": 0.6238, + "step": 13235 + }, + { + "epoch": 0.8492623476587556, + "grad_norm": 0.9855613055277577, + "learning_rate": 1.3526491190525025e-05, + "loss": 0.7218, + "step": 13240 + }, + { + "epoch": 0.8495830660679923, + "grad_norm": 0.42718056670086024, + "learning_rate": 1.3470320011620418e-05, + "loss": 0.5768, + "step": 13245 + }, + { + "epoch": 0.849903784477229, + "grad_norm": 1.0362257024186183, + "learning_rate": 1.3414257282216535e-05, + "loss": 0.6332, + "step": 13250 + }, + { + "epoch": 0.8502245028864657, + "grad_norm": 0.8990446366365678, + "learning_rate": 1.3358303072578027e-05, + "loss": 0.6709, + "step": 13255 + }, + { + "epoch": 0.8505452212957024, + "grad_norm": 0.7211479323078617, + "learning_rate": 1.3302457452833484e-05, + "loss": 0.5878, + "step": 13260 + }, + { + "epoch": 0.8508659397049391, + "grad_norm": 1.1924552884788637, + "learning_rate": 1.3246720492975396e-05, + "loss": 0.7302, + "step": 13265 + }, + { + "epoch": 0.8511866581141757, + "grad_norm": 0.6589451039855936, + "learning_rate": 1.3191092262860127e-05, + "loss": 0.6891, + "step": 13270 + }, + { + "epoch": 0.8515073765234125, + "grad_norm": 0.6379938202383435, + "learning_rate": 1.3135572832207699e-05, + "loss": 0.5751, + "step": 13275 + }, + { + "epoch": 0.8518280949326491, + "grad_norm": 0.6107227064835382, + "learning_rate": 1.3080162270601826e-05, + "loss": 0.6705, + "step": 13280 + }, + { + "epoch": 0.8521488133418859, + "grad_norm": 0.7796857101023206, + "learning_rate": 1.3024860647489756e-05, + "loss": 0.595, + "step": 13285 + }, + { + "epoch": 0.8524695317511225, + "grad_norm": 1.067556813441523, + "learning_rate": 1.2969668032182147e-05, + "loss": 0.6906, + "step": 13290 + }, + { + "epoch": 0.8527902501603593, + "grad_norm": 0.7705240841097785, + "learning_rate": 1.2914584493853144e-05, + "loss": 0.6176, + "step": 13295 + }, + { + "epoch": 0.8531109685695959, + "grad_norm": 0.9227266859657003, + "learning_rate": 1.285961010154011e-05, + "loss": 0.6479, + "step": 13300 + }, + { + "epoch": 0.8534316869788325, + "grad_norm": 1.0189541311376396, + "learning_rate": 1.2804744924143608e-05, + "loss": 0.728, + "step": 13305 + }, + { + "epoch": 0.8537524053880693, + "grad_norm": 0.8401997316168908, + "learning_rate": 1.2749989030427344e-05, + "loss": 0.7617, + "step": 13310 + }, + { + "epoch": 0.8540731237973059, + "grad_norm": 0.9093797714776795, + "learning_rate": 1.269534248901807e-05, + "loss": 0.5851, + "step": 13315 + }, + { + "epoch": 0.8543938422065427, + "grad_norm": 1.1037034088263697, + "learning_rate": 1.2640805368405462e-05, + "loss": 0.6118, + "step": 13320 + }, + { + "epoch": 0.8547145606157793, + "grad_norm": 0.7177427685245759, + "learning_rate": 1.2586377736942034e-05, + "loss": 0.7042, + "step": 13325 + }, + { + "epoch": 0.8550352790250161, + "grad_norm": 0.9633359403241921, + "learning_rate": 1.2532059662843144e-05, + "loss": 0.7182, + "step": 13330 + }, + { + "epoch": 0.8553559974342527, + "grad_norm": 0.8564133887667676, + "learning_rate": 1.2477851214186754e-05, + "loss": 0.5807, + "step": 13335 + }, + { + "epoch": 0.8556767158434894, + "grad_norm": 1.0067512789243385, + "learning_rate": 1.2423752458913518e-05, + "loss": 0.6689, + "step": 13340 + }, + { + "epoch": 0.8559974342527261, + "grad_norm": 0.6740456644820353, + "learning_rate": 1.2369763464826533e-05, + "loss": 0.5505, + "step": 13345 + }, + { + "epoch": 0.8563181526619628, + "grad_norm": 0.7485205146558563, + "learning_rate": 1.2315884299591362e-05, + "loss": 0.7485, + "step": 13350 + }, + { + "epoch": 0.8566388710711995, + "grad_norm": 0.9943455478406926, + "learning_rate": 1.2262115030735944e-05, + "loss": 0.7464, + "step": 13355 + }, + { + "epoch": 0.8569595894804362, + "grad_norm": 0.7832997459113116, + "learning_rate": 1.2208455725650436e-05, + "loss": 0.6956, + "step": 13360 + }, + { + "epoch": 0.8572803078896729, + "grad_norm": 0.9569726126068407, + "learning_rate": 1.2154906451587189e-05, + "loss": 0.7132, + "step": 13365 + }, + { + "epoch": 0.8576010262989096, + "grad_norm": 0.7447552856015294, + "learning_rate": 1.2101467275660661e-05, + "loss": 0.4959, + "step": 13370 + }, + { + "epoch": 0.8579217447081462, + "grad_norm": 0.8033856598382162, + "learning_rate": 1.2048138264847297e-05, + "loss": 0.8208, + "step": 13375 + }, + { + "epoch": 0.858242463117383, + "grad_norm": 1.2548309542667209, + "learning_rate": 1.1994919485985522e-05, + "loss": 0.5933, + "step": 13380 + }, + { + "epoch": 0.8585631815266196, + "grad_norm": 0.8849084463562876, + "learning_rate": 1.1941811005775538e-05, + "loss": 0.7345, + "step": 13385 + }, + { + "epoch": 0.8588838999358563, + "grad_norm": 1.0662756941569218, + "learning_rate": 1.1888812890779377e-05, + "loss": 0.672, + "step": 13390 + }, + { + "epoch": 0.859204618345093, + "grad_norm": 1.4484403343446357, + "learning_rate": 1.1835925207420694e-05, + "loss": 0.606, + "step": 13395 + }, + { + "epoch": 0.8595253367543297, + "grad_norm": 0.8332555994611591, + "learning_rate": 1.1783148021984725e-05, + "loss": 0.692, + "step": 13400 + }, + { + "epoch": 0.8598460551635664, + "grad_norm": 0.7857634142558743, + "learning_rate": 1.1730481400618299e-05, + "loss": 0.8791, + "step": 13405 + }, + { + "epoch": 0.8601667735728031, + "grad_norm": 1.0726454797623632, + "learning_rate": 1.167792540932957e-05, + "loss": 0.6978, + "step": 13410 + }, + { + "epoch": 0.8604874919820398, + "grad_norm": 0.662627507867472, + "learning_rate": 1.162548011398814e-05, + "loss": 0.6655, + "step": 13415 + }, + { + "epoch": 0.8608082103912764, + "grad_norm": 0.7427411411925819, + "learning_rate": 1.1573145580324785e-05, + "loss": 0.7019, + "step": 13420 + }, + { + "epoch": 0.8611289288005132, + "grad_norm": 0.8465518983483786, + "learning_rate": 1.1520921873931489e-05, + "loss": 0.7452, + "step": 13425 + }, + { + "epoch": 0.8614496472097498, + "grad_norm": 0.5455286801662246, + "learning_rate": 1.1468809060261399e-05, + "loss": 0.652, + "step": 13430 + }, + { + "epoch": 0.8617703656189866, + "grad_norm": 0.8972113556345591, + "learning_rate": 1.1416807204628533e-05, + "loss": 0.5988, + "step": 13435 + }, + { + "epoch": 0.8620910840282232, + "grad_norm": 0.6854697322056322, + "learning_rate": 1.1364916372208e-05, + "loss": 0.696, + "step": 13440 + }, + { + "epoch": 0.86241180243746, + "grad_norm": 0.678150343614853, + "learning_rate": 1.1313136628035647e-05, + "loss": 0.5252, + "step": 13445 + }, + { + "epoch": 0.8627325208466966, + "grad_norm": 0.6285060401132421, + "learning_rate": 1.1261468037008172e-05, + "loss": 0.4725, + "step": 13450 + }, + { + "epoch": 0.8630532392559332, + "grad_norm": 0.6510845504498061, + "learning_rate": 1.1209910663882916e-05, + "loss": 0.5565, + "step": 13455 + }, + { + "epoch": 0.86337395766517, + "grad_norm": 1.2698183413935256, + "learning_rate": 1.1158464573277816e-05, + "loss": 0.7544, + "step": 13460 + }, + { + "epoch": 0.8636946760744066, + "grad_norm": 1.048484623181104, + "learning_rate": 1.1107129829671393e-05, + "loss": 0.6762, + "step": 13465 + }, + { + "epoch": 0.8640153944836434, + "grad_norm": 0.8197138470113798, + "learning_rate": 1.1055906497402534e-05, + "loss": 0.7671, + "step": 13470 + }, + { + "epoch": 0.86433611289288, + "grad_norm": 0.8060735013585868, + "learning_rate": 1.1004794640670602e-05, + "loss": 0.7412, + "step": 13475 + }, + { + "epoch": 0.8646568313021168, + "grad_norm": 0.5202202198681646, + "learning_rate": 1.0953794323535138e-05, + "loss": 0.617, + "step": 13480 + }, + { + "epoch": 0.8649775497113534, + "grad_norm": 0.9060221838859691, + "learning_rate": 1.0902905609915925e-05, + "loss": 0.6724, + "step": 13485 + }, + { + "epoch": 0.8652982681205901, + "grad_norm": 0.9948896143875089, + "learning_rate": 1.0852128563592911e-05, + "loss": 0.6916, + "step": 13490 + }, + { + "epoch": 0.8656189865298268, + "grad_norm": 0.6185205159442889, + "learning_rate": 1.0801463248206012e-05, + "loss": 0.6155, + "step": 13495 + }, + { + "epoch": 0.8659397049390635, + "grad_norm": 0.8621415617622489, + "learning_rate": 1.0750909727255231e-05, + "loss": 0.5641, + "step": 13500 + }, + { + "epoch": 0.8662604233483002, + "grad_norm": 1.0099987644568347, + "learning_rate": 1.0700468064100278e-05, + "loss": 0.5874, + "step": 13505 + }, + { + "epoch": 0.8665811417575369, + "grad_norm": 0.7860625683994522, + "learning_rate": 1.0650138321960834e-05, + "loss": 0.6447, + "step": 13510 + }, + { + "epoch": 0.8669018601667736, + "grad_norm": 1.0075130412273372, + "learning_rate": 1.0599920563916233e-05, + "loss": 0.6428, + "step": 13515 + }, + { + "epoch": 0.8672225785760103, + "grad_norm": 0.8433746537048423, + "learning_rate": 1.0549814852905427e-05, + "loss": 0.6156, + "step": 13520 + }, + { + "epoch": 0.8675432969852469, + "grad_norm": 0.6911458595910109, + "learning_rate": 1.0499821251727038e-05, + "loss": 0.7697, + "step": 13525 + }, + { + "epoch": 0.8678640153944837, + "grad_norm": 0.7261479775249019, + "learning_rate": 1.044993982303909e-05, + "loss": 0.7353, + "step": 13530 + }, + { + "epoch": 0.8681847338037203, + "grad_norm": 0.5256687873474478, + "learning_rate": 1.040017062935902e-05, + "loss": 0.5737, + "step": 13535 + }, + { + "epoch": 0.868505452212957, + "grad_norm": 1.0493206252194889, + "learning_rate": 1.035051373306366e-05, + "loss": 0.6215, + "step": 13540 + }, + { + "epoch": 0.8688261706221937, + "grad_norm": 0.665208544741004, + "learning_rate": 1.0300969196389033e-05, + "loss": 0.6073, + "step": 13545 + }, + { + "epoch": 0.8691468890314304, + "grad_norm": 0.6978534685649864, + "learning_rate": 1.0251537081430406e-05, + "loss": 0.5837, + "step": 13550 + }, + { + "epoch": 0.8694676074406671, + "grad_norm": 0.7579892159049441, + "learning_rate": 1.0202217450142082e-05, + "loss": 0.5604, + "step": 13555 + }, + { + "epoch": 0.8697883258499037, + "grad_norm": 0.6514517952782195, + "learning_rate": 1.015301036433739e-05, + "loss": 0.6971, + "step": 13560 + }, + { + "epoch": 0.8701090442591405, + "grad_norm": 0.4398371785948417, + "learning_rate": 1.0103915885688686e-05, + "loss": 0.5459, + "step": 13565 + }, + { + "epoch": 0.8704297626683771, + "grad_norm": 0.6924160948174624, + "learning_rate": 1.0054934075727062e-05, + "loss": 0.5386, + "step": 13570 + }, + { + "epoch": 0.8707504810776139, + "grad_norm": 0.9269090072648052, + "learning_rate": 1.0006064995842513e-05, + "loss": 0.7547, + "step": 13575 + }, + { + "epoch": 0.8710711994868505, + "grad_norm": 0.788185049843599, + "learning_rate": 9.957308707283675e-06, + "loss": 0.6128, + "step": 13580 + }, + { + "epoch": 0.8713919178960873, + "grad_norm": 0.7308595928706564, + "learning_rate": 9.90866527115788e-06, + "loss": 0.6036, + "step": 13585 + }, + { + "epoch": 0.8717126363053239, + "grad_norm": 0.7092354057653707, + "learning_rate": 9.860134748430972e-06, + "loss": 0.7038, + "step": 13590 + }, + { + "epoch": 0.8720333547145607, + "grad_norm": 1.0470346737728682, + "learning_rate": 9.811717199927273e-06, + "loss": 0.73, + "step": 13595 + }, + { + "epoch": 0.8723540731237973, + "grad_norm": 1.2863495939351028, + "learning_rate": 9.763412686329575e-06, + "loss": 0.7084, + "step": 13600 + }, + { + "epoch": 0.872674791533034, + "grad_norm": 0.768903275631644, + "learning_rate": 9.71522126817892e-06, + "loss": 0.7444, + "step": 13605 + }, + { + "epoch": 0.8729955099422707, + "grad_norm": 0.8561382895899066, + "learning_rate": 9.667143005874679e-06, + "loss": 0.6743, + "step": 13610 + }, + { + "epoch": 0.8733162283515074, + "grad_norm": 0.6255033102428371, + "learning_rate": 9.619177959674353e-06, + "loss": 0.6357, + "step": 13615 + }, + { + "epoch": 0.8736369467607441, + "grad_norm": 0.8367879131361138, + "learning_rate": 9.57132618969354e-06, + "loss": 0.7229, + "step": 13620 + }, + { + "epoch": 0.8739576651699807, + "grad_norm": 0.6292130728042913, + "learning_rate": 9.523587755905938e-06, + "loss": 0.6561, + "step": 13625 + }, + { + "epoch": 0.8742783835792175, + "grad_norm": 0.9860204738083063, + "learning_rate": 9.475962718143106e-06, + "loss": 0.6323, + "step": 13630 + }, + { + "epoch": 0.8745991019884541, + "grad_norm": 0.841887275726057, + "learning_rate": 9.428451136094541e-06, + "loss": 0.6762, + "step": 13635 + }, + { + "epoch": 0.8749198203976908, + "grad_norm": 0.7100122528682058, + "learning_rate": 9.381053069307499e-06, + "loss": 0.5494, + "step": 13640 + }, + { + "epoch": 0.8752405388069275, + "grad_norm": 1.1202599763010757, + "learning_rate": 9.33376857718703e-06, + "loss": 0.5936, + "step": 13645 + }, + { + "epoch": 0.8755612572161642, + "grad_norm": 1.0773135254923245, + "learning_rate": 9.286597718995783e-06, + "loss": 0.5523, + "step": 13650 + }, + { + "epoch": 0.8758819756254009, + "grad_norm": 0.7262011668633317, + "learning_rate": 9.239540553853987e-06, + "loss": 0.7559, + "step": 13655 + }, + { + "epoch": 0.8762026940346376, + "grad_norm": 1.1845562776611291, + "learning_rate": 9.192597140739445e-06, + "loss": 0.6214, + "step": 13660 + }, + { + "epoch": 0.8765234124438743, + "grad_norm": 1.0832215867500623, + "learning_rate": 9.145767538487282e-06, + "loss": 0.6363, + "step": 13665 + }, + { + "epoch": 0.876844130853111, + "grad_norm": 0.8384508766840872, + "learning_rate": 9.099051805790081e-06, + "loss": 0.7162, + "step": 13670 + }, + { + "epoch": 0.8771648492623476, + "grad_norm": 0.7886740113805487, + "learning_rate": 9.052450001197666e-06, + "loss": 0.5292, + "step": 13675 + }, + { + "epoch": 0.8774855676715844, + "grad_norm": 0.724073412445175, + "learning_rate": 9.005962183117055e-06, + "loss": 0.7159, + "step": 13680 + }, + { + "epoch": 0.877806286080821, + "grad_norm": 0.5059344342927663, + "learning_rate": 8.959588409812458e-06, + "loss": 0.6316, + "step": 13685 + }, + { + "epoch": 0.8781270044900578, + "grad_norm": 1.2097294273874917, + "learning_rate": 8.913328739405092e-06, + "loss": 0.7006, + "step": 13690 + }, + { + "epoch": 0.8784477228992944, + "grad_norm": 0.615032496760421, + "learning_rate": 8.867183229873211e-06, + "loss": 0.738, + "step": 13695 + }, + { + "epoch": 0.8787684413085312, + "grad_norm": 0.6135358966273193, + "learning_rate": 8.821151939051953e-06, + "loss": 0.6287, + "step": 13700 + }, + { + "epoch": 0.8790891597177678, + "grad_norm": 0.8910522096004475, + "learning_rate": 8.775234924633301e-06, + "loss": 0.7301, + "step": 13705 + }, + { + "epoch": 0.8794098781270044, + "grad_norm": 0.7815093977889225, + "learning_rate": 8.72943224416609e-06, + "loss": 0.6499, + "step": 13710 + }, + { + "epoch": 0.8797305965362412, + "grad_norm": 0.5607257491266542, + "learning_rate": 8.683743955055746e-06, + "loss": 0.6083, + "step": 13715 + }, + { + "epoch": 0.8800513149454778, + "grad_norm": 0.884214002379739, + "learning_rate": 8.638170114564414e-06, + "loss": 0.611, + "step": 13720 + }, + { + "epoch": 0.8803720333547146, + "grad_norm": 0.7528314170250561, + "learning_rate": 8.592710779810765e-06, + "loss": 0.6921, + "step": 13725 + }, + { + "epoch": 0.8806927517639512, + "grad_norm": 0.9161588988308113, + "learning_rate": 8.547366007769919e-06, + "loss": 0.652, + "step": 13730 + }, + { + "epoch": 0.881013470173188, + "grad_norm": 1.4044011636843894, + "learning_rate": 8.502135855273497e-06, + "loss": 0.6532, + "step": 13735 + }, + { + "epoch": 0.8813341885824246, + "grad_norm": 0.5311315649019397, + "learning_rate": 8.457020379009373e-06, + "loss": 0.5949, + "step": 13740 + }, + { + "epoch": 0.8816549069916613, + "grad_norm": 0.6747473256173435, + "learning_rate": 8.412019635521784e-06, + "loss": 0.5982, + "step": 13745 + }, + { + "epoch": 0.881975625400898, + "grad_norm": 0.6539295071967237, + "learning_rate": 8.367133681211103e-06, + "loss": 0.4702, + "step": 13750 + }, + { + "epoch": 0.8822963438101347, + "grad_norm": 0.403206890252452, + "learning_rate": 8.322362572333841e-06, + "loss": 0.5464, + "step": 13755 + }, + { + "epoch": 0.8826170622193714, + "grad_norm": 0.7780767642995721, + "learning_rate": 8.277706365002625e-06, + "loss": 0.6976, + "step": 13760 + }, + { + "epoch": 0.8829377806286081, + "grad_norm": 0.6272304201483566, + "learning_rate": 8.233165115186003e-06, + "loss": 0.6613, + "step": 13765 + }, + { + "epoch": 0.8832584990378448, + "grad_norm": 0.8343537172020628, + "learning_rate": 8.188738878708502e-06, + "loss": 0.7469, + "step": 13770 + }, + { + "epoch": 0.8835792174470815, + "grad_norm": 0.9345794017556924, + "learning_rate": 8.144427711250447e-06, + "loss": 0.7586, + "step": 13775 + }, + { + "epoch": 0.8838999358563181, + "grad_norm": 1.162828611729811, + "learning_rate": 8.100231668348002e-06, + "loss": 0.5382, + "step": 13780 + }, + { + "epoch": 0.8842206542655549, + "grad_norm": 1.1205395105885234, + "learning_rate": 8.056150805392993e-06, + "loss": 0.6138, + "step": 13785 + }, + { + "epoch": 0.8845413726747915, + "grad_norm": 0.5630057786543724, + "learning_rate": 8.012185177632914e-06, + "loss": 0.4977, + "step": 13790 + }, + { + "epoch": 0.8848620910840282, + "grad_norm": 0.8477848139037634, + "learning_rate": 7.968334840170843e-06, + "loss": 0.7394, + "step": 13795 + }, + { + "epoch": 0.8851828094932649, + "grad_norm": 0.7207695540829029, + "learning_rate": 7.92459984796532e-06, + "loss": 0.7108, + "step": 13800 + }, + { + "epoch": 0.8855035279025016, + "grad_norm": 0.9355747131091594, + "learning_rate": 7.880980255830372e-06, + "loss": 0.6971, + "step": 13805 + }, + { + "epoch": 0.8858242463117383, + "grad_norm": 0.8391670611046308, + "learning_rate": 7.83747611843536e-06, + "loss": 0.6618, + "step": 13810 + }, + { + "epoch": 0.886144964720975, + "grad_norm": 0.6940296556964382, + "learning_rate": 7.794087490304935e-06, + "loss": 0.7303, + "step": 13815 + }, + { + "epoch": 0.8864656831302117, + "grad_norm": 0.5463085826484815, + "learning_rate": 7.75081442581902e-06, + "loss": 0.7128, + "step": 13820 + }, + { + "epoch": 0.8867864015394483, + "grad_norm": 0.9553016730601827, + "learning_rate": 7.707656979212653e-06, + "loss": 0.5325, + "step": 13825 + }, + { + "epoch": 0.8871071199486851, + "grad_norm": 0.9151217967040441, + "learning_rate": 7.66461520457602e-06, + "loss": 0.7276, + "step": 13830 + }, + { + "epoch": 0.8874278383579217, + "grad_norm": 0.7869236135130984, + "learning_rate": 7.6216891558542395e-06, + "loss": 0.744, + "step": 13835 + }, + { + "epoch": 0.8877485567671585, + "grad_norm": 0.748585172606016, + "learning_rate": 7.578878886847507e-06, + "loss": 0.5891, + "step": 13840 + }, + { + "epoch": 0.8880692751763951, + "grad_norm": 0.7205402378107477, + "learning_rate": 7.536184451210815e-06, + "loss": 0.6715, + "step": 13845 + }, + { + "epoch": 0.8883899935856319, + "grad_norm": 0.6198613140638497, + "learning_rate": 7.493605902454004e-06, + "loss": 0.7581, + "step": 13850 + }, + { + "epoch": 0.8887107119948685, + "grad_norm": 0.84149727085621, + "learning_rate": 7.451143293941709e-06, + "loss": 0.746, + "step": 13855 + }, + { + "epoch": 0.8890314304041051, + "grad_norm": 0.8368846152026573, + "learning_rate": 7.408796678893226e-06, + "loss": 0.6687, + "step": 13860 + }, + { + "epoch": 0.8893521488133419, + "grad_norm": 0.7827898269521945, + "learning_rate": 7.366566110382445e-06, + "loss": 0.5832, + "step": 13865 + }, + { + "epoch": 0.8896728672225785, + "grad_norm": 0.7300699318830831, + "learning_rate": 7.324451641337882e-06, + "loss": 0.6294, + "step": 13870 + }, + { + "epoch": 0.8899935856318153, + "grad_norm": 0.9238241719407477, + "learning_rate": 7.28245332454246e-06, + "loss": 0.7083, + "step": 13875 + }, + { + "epoch": 0.8903143040410519, + "grad_norm": 0.7709293686153301, + "learning_rate": 7.240571212633618e-06, + "loss": 0.5686, + "step": 13880 + }, + { + "epoch": 0.8906350224502887, + "grad_norm": 1.1869349060713659, + "learning_rate": 7.198805358103067e-06, + "loss": 0.728, + "step": 13885 + }, + { + "epoch": 0.8909557408595253, + "grad_norm": 1.0851258551108929, + "learning_rate": 7.157155813296834e-06, + "loss": 0.7379, + "step": 13890 + }, + { + "epoch": 0.891276459268762, + "grad_norm": 0.7394708926504447, + "learning_rate": 7.115622630415253e-06, + "loss": 0.7321, + "step": 13895 + }, + { + "epoch": 0.8915971776779987, + "grad_norm": 0.816039779235774, + "learning_rate": 7.0742058615126726e-06, + "loss": 0.601, + "step": 13900 + }, + { + "epoch": 0.8919178960872354, + "grad_norm": 0.41244323070119415, + "learning_rate": 7.03290555849766e-06, + "loss": 0.5809, + "step": 13905 + }, + { + "epoch": 0.8922386144964721, + "grad_norm": 0.8918418533925353, + "learning_rate": 6.991721773132742e-06, + "loss": 0.7142, + "step": 13910 + }, + { + "epoch": 0.8925593329057088, + "grad_norm": 0.8732825568065812, + "learning_rate": 6.950654557034475e-06, + "loss": 0.6635, + "step": 13915 + }, + { + "epoch": 0.8928800513149455, + "grad_norm": 0.7358325355065991, + "learning_rate": 6.909703961673253e-06, + "loss": 0.6412, + "step": 13920 + }, + { + "epoch": 0.8932007697241822, + "grad_norm": 0.7111667197818642, + "learning_rate": 6.868870038373332e-06, + "loss": 0.6767, + "step": 13925 + }, + { + "epoch": 0.8935214881334188, + "grad_norm": 1.0721200112803682, + "learning_rate": 6.828152838312773e-06, + "loss": 0.5066, + "step": 13930 + }, + { + "epoch": 0.8938422065426556, + "grad_norm": 0.8310238983860934, + "learning_rate": 6.787552412523279e-06, + "loss": 0.6764, + "step": 13935 + }, + { + "epoch": 0.8941629249518922, + "grad_norm": 0.6872676077028719, + "learning_rate": 6.747068811890256e-06, + "loss": 0.6671, + "step": 13940 + }, + { + "epoch": 0.894483643361129, + "grad_norm": 0.9702633803545438, + "learning_rate": 6.706702087152661e-06, + "loss": 0.4624, + "step": 13945 + }, + { + "epoch": 0.8948043617703656, + "grad_norm": 1.375052365512822, + "learning_rate": 6.666452288902958e-06, + "loss": 0.7522, + "step": 13950 + }, + { + "epoch": 0.8951250801796024, + "grad_norm": 0.908667367564301, + "learning_rate": 6.626319467587106e-06, + "loss": 0.6602, + "step": 13955 + }, + { + "epoch": 0.895445798588839, + "grad_norm": 1.0327055092345554, + "learning_rate": 6.586303673504412e-06, + "loss": 0.6192, + "step": 13960 + }, + { + "epoch": 0.8957665169980756, + "grad_norm": 0.7913439515419154, + "learning_rate": 6.5464049568075615e-06, + "loss": 0.6883, + "step": 13965 + }, + { + "epoch": 0.8960872354073124, + "grad_norm": 0.9249759944838365, + "learning_rate": 6.506623367502418e-06, + "loss": 0.7207, + "step": 13970 + }, + { + "epoch": 0.896407953816549, + "grad_norm": 0.6185623923439777, + "learning_rate": 6.4669589554481325e-06, + "loss": 0.7935, + "step": 13975 + }, + { + "epoch": 0.8967286722257858, + "grad_norm": 0.9047502038967159, + "learning_rate": 6.4274117703569615e-06, + "loss": 0.523, + "step": 13980 + }, + { + "epoch": 0.8970493906350224, + "grad_norm": 0.5862791588591175, + "learning_rate": 6.387981861794212e-06, + "loss": 0.5767, + "step": 13985 + }, + { + "epoch": 0.8973701090442592, + "grad_norm": 0.9286416832372187, + "learning_rate": 6.348669279178277e-06, + "loss": 0.5952, + "step": 13990 + }, + { + "epoch": 0.8976908274534958, + "grad_norm": 0.9632286005822661, + "learning_rate": 6.309474071780408e-06, + "loss": 0.7512, + "step": 13995 + }, + { + "epoch": 0.8980115458627326, + "grad_norm": 0.6713818773459586, + "learning_rate": 6.2703962887248444e-06, + "loss": 0.8033, + "step": 14000 + }, + { + "epoch": 0.8983322642719692, + "grad_norm": 0.43098921146350616, + "learning_rate": 6.2314359789885756e-06, + "loss": 0.5506, + "step": 14005 + }, + { + "epoch": 0.8986529826812059, + "grad_norm": 1.029998963102262, + "learning_rate": 6.192593191401396e-06, + "loss": 0.6528, + "step": 14010 + }, + { + "epoch": 0.8989737010904426, + "grad_norm": 0.9428065435910548, + "learning_rate": 6.153867974645833e-06, + "loss": 0.6822, + "step": 14015 + }, + { + "epoch": 0.8992944194996793, + "grad_norm": 0.6275896637114994, + "learning_rate": 6.115260377257004e-06, + "loss": 0.556, + "step": 14020 + }, + { + "epoch": 0.899615137908916, + "grad_norm": 0.6094084941175278, + "learning_rate": 6.076770447622615e-06, + "loss": 0.5094, + "step": 14025 + }, + { + "epoch": 0.8999358563181526, + "grad_norm": 0.7526848860794296, + "learning_rate": 6.038398233982989e-06, + "loss": 0.678, + "step": 14030 + }, + { + "epoch": 0.9002565747273894, + "grad_norm": 0.5704944797751071, + "learning_rate": 6.000143784430756e-06, + "loss": 0.6822, + "step": 14035 + }, + { + "epoch": 0.900577293136626, + "grad_norm": 0.7525424440388754, + "learning_rate": 5.962007146911109e-06, + "loss": 0.7008, + "step": 14040 + }, + { + "epoch": 0.9008980115458627, + "grad_norm": 0.961888964093016, + "learning_rate": 5.923988369221456e-06, + "loss": 0.6805, + "step": 14045 + }, + { + "epoch": 0.9012187299550994, + "grad_norm": 0.8861288123930613, + "learning_rate": 5.886087499011594e-06, + "loss": 0.758, + "step": 14050 + }, + { + "epoch": 0.9015394483643361, + "grad_norm": 0.8032927310909407, + "learning_rate": 5.8483045837834705e-06, + "loss": 0.6607, + "step": 14055 + }, + { + "epoch": 0.9018601667735728, + "grad_norm": 0.8087075039644414, + "learning_rate": 5.810639670891216e-06, + "loss": 0.7027, + "step": 14060 + }, + { + "epoch": 0.9021808851828095, + "grad_norm": 0.8539578913251452, + "learning_rate": 5.773092807541092e-06, + "loss": 0.5801, + "step": 14065 + }, + { + "epoch": 0.9025016035920462, + "grad_norm": 0.7756452243315396, + "learning_rate": 5.735664040791367e-06, + "loss": 0.7103, + "step": 14070 + }, + { + "epoch": 0.9028223220012829, + "grad_norm": 1.222999060061691, + "learning_rate": 5.698353417552327e-06, + "loss": 0.6017, + "step": 14075 + }, + { + "epoch": 0.9031430404105195, + "grad_norm": 0.7983696291416744, + "learning_rate": 5.661160984586178e-06, + "loss": 0.6049, + "step": 14080 + }, + { + "epoch": 0.9034637588197563, + "grad_norm": 0.6490907871037943, + "learning_rate": 5.624086788506977e-06, + "loss": 0.5526, + "step": 14085 + }, + { + "epoch": 0.9037844772289929, + "grad_norm": 0.8508537947980717, + "learning_rate": 5.587130875780633e-06, + "loss": 0.7109, + "step": 14090 + }, + { + "epoch": 0.9041051956382297, + "grad_norm": 1.2192033565455072, + "learning_rate": 5.550293292724762e-06, + "loss": 0.7051, + "step": 14095 + }, + { + "epoch": 0.9044259140474663, + "grad_norm": 0.6587693265105345, + "learning_rate": 5.51357408550871e-06, + "loss": 0.6174, + "step": 14100 + }, + { + "epoch": 0.9047466324567031, + "grad_norm": 0.49748231304384327, + "learning_rate": 5.47697330015341e-06, + "loss": 0.5172, + "step": 14105 + }, + { + "epoch": 0.9050673508659397, + "grad_norm": 0.7884068967557873, + "learning_rate": 5.440490982531465e-06, + "loss": 0.6816, + "step": 14110 + }, + { + "epoch": 0.9053880692751763, + "grad_norm": 1.1069363022735697, + "learning_rate": 5.404127178366902e-06, + "loss": 0.6431, + "step": 14115 + }, + { + "epoch": 0.9057087876844131, + "grad_norm": 0.6826869882965035, + "learning_rate": 5.367881933235275e-06, + "loss": 0.5101, + "step": 14120 + }, + { + "epoch": 0.9060295060936497, + "grad_norm": 0.7273219199634979, + "learning_rate": 5.331755292563523e-06, + "loss": 0.637, + "step": 14125 + }, + { + "epoch": 0.9063502245028865, + "grad_norm": 0.8472794210673035, + "learning_rate": 5.295747301629917e-06, + "loss": 0.6022, + "step": 14130 + }, + { + "epoch": 0.9066709429121231, + "grad_norm": 0.8444956562341863, + "learning_rate": 5.259858005564089e-06, + "loss": 0.5334, + "step": 14135 + }, + { + "epoch": 0.9069916613213599, + "grad_norm": 0.6732860335353007, + "learning_rate": 5.224087449346826e-06, + "loss": 0.7202, + "step": 14140 + }, + { + "epoch": 0.9073123797305965, + "grad_norm": 0.9666322387828169, + "learning_rate": 5.188435677810133e-06, + "loss": 0.7559, + "step": 14145 + }, + { + "epoch": 0.9076330981398332, + "grad_norm": 0.9869737805273263, + "learning_rate": 5.152902735637166e-06, + "loss": 0.678, + "step": 14150 + }, + { + "epoch": 0.9079538165490699, + "grad_norm": 0.5732371579819191, + "learning_rate": 5.1174886673620805e-06, + "loss": 0.5993, + "step": 14155 + }, + { + "epoch": 0.9082745349583066, + "grad_norm": 0.8654988560178682, + "learning_rate": 5.082193517370127e-06, + "loss": 0.6813, + "step": 14160 + }, + { + "epoch": 0.9085952533675433, + "grad_norm": 1.0265921567687237, + "learning_rate": 5.047017329897463e-06, + "loss": 0.6737, + "step": 14165 + }, + { + "epoch": 0.90891597177678, + "grad_norm": 0.8248152748170539, + "learning_rate": 5.011960149031137e-06, + "loss": 0.5857, + "step": 14170 + }, + { + "epoch": 0.9092366901860167, + "grad_norm": 0.7956729093404309, + "learning_rate": 4.977022018709088e-06, + "loss": 0.6643, + "step": 14175 + }, + { + "epoch": 0.9095574085952534, + "grad_norm": 1.7578923486790687, + "learning_rate": 4.94220298271999e-06, + "loss": 0.7325, + "step": 14180 + }, + { + "epoch": 0.9098781270044901, + "grad_norm": 0.8908535862934428, + "learning_rate": 4.907503084703335e-06, + "loss": 0.7003, + "step": 14185 + }, + { + "epoch": 0.9101988454137268, + "grad_norm": 0.5989152273082363, + "learning_rate": 4.872922368149213e-06, + "loss": 0.6494, + "step": 14190 + }, + { + "epoch": 0.9105195638229634, + "grad_norm": 1.1947032610011639, + "learning_rate": 4.838460876398365e-06, + "loss": 0.712, + "step": 14195 + }, + { + "epoch": 0.9108402822322001, + "grad_norm": 0.8008113658697428, + "learning_rate": 4.804118652642164e-06, + "loss": 0.6607, + "step": 14200 + }, + { + "epoch": 0.9111610006414368, + "grad_norm": 0.9092451384048743, + "learning_rate": 4.769895739922403e-06, + "loss": 0.532, + "step": 14205 + }, + { + "epoch": 0.9114817190506735, + "grad_norm": 0.9642837868126427, + "learning_rate": 4.7357921811314374e-06, + "loss": 0.5875, + "step": 14210 + }, + { + "epoch": 0.9118024374599102, + "grad_norm": 1.2120942953279068, + "learning_rate": 4.701808019011966e-06, + "loss": 0.644, + "step": 14215 + }, + { + "epoch": 0.9121231558691469, + "grad_norm": 0.7731779356318255, + "learning_rate": 4.66794329615704e-06, + "loss": 0.7528, + "step": 14220 + }, + { + "epoch": 0.9124438742783836, + "grad_norm": 0.8452499221199778, + "learning_rate": 4.634198055010097e-06, + "loss": 0.7321, + "step": 14225 + }, + { + "epoch": 0.9127645926876202, + "grad_norm": 0.7660682093886364, + "learning_rate": 4.600572337864739e-06, + "loss": 0.58, + "step": 14230 + }, + { + "epoch": 0.913085311096857, + "grad_norm": 0.919577008788518, + "learning_rate": 4.567066186864799e-06, + "loss": 0.5792, + "step": 14235 + }, + { + "epoch": 0.9134060295060936, + "grad_norm": 0.7240560589023852, + "learning_rate": 4.53367964400423e-06, + "loss": 0.6382, + "step": 14240 + }, + { + "epoch": 0.9137267479153304, + "grad_norm": 0.9404018211860803, + "learning_rate": 4.500412751127148e-06, + "loss": 0.6983, + "step": 14245 + }, + { + "epoch": 0.914047466324567, + "grad_norm": 0.9226737613175637, + "learning_rate": 4.467265549927646e-06, + "loss": 0.7371, + "step": 14250 + }, + { + "epoch": 0.9143681847338038, + "grad_norm": 0.8674349211052579, + "learning_rate": 4.434238081949793e-06, + "loss": 0.715, + "step": 14255 + }, + { + "epoch": 0.9146889031430404, + "grad_norm": 1.0086095744064745, + "learning_rate": 4.401330388587655e-06, + "loss": 0.6359, + "step": 14260 + }, + { + "epoch": 0.915009621552277, + "grad_norm": 0.7399699212191572, + "learning_rate": 4.368542511085127e-06, + "loss": 0.6856, + "step": 14265 + }, + { + "epoch": 0.9153303399615138, + "grad_norm": 0.7837381511015072, + "learning_rate": 4.3358744905359845e-06, + "loss": 0.5355, + "step": 14270 + }, + { + "epoch": 0.9156510583707504, + "grad_norm": 0.7456554819958952, + "learning_rate": 4.303326367883742e-06, + "loss": 0.6506, + "step": 14275 + }, + { + "epoch": 0.9159717767799872, + "grad_norm": 0.7504015595604561, + "learning_rate": 4.2708981839216344e-06, + "loss": 0.7347, + "step": 14280 + }, + { + "epoch": 0.9162924951892238, + "grad_norm": 0.7872333950088334, + "learning_rate": 4.238589979292651e-06, + "loss": 0.7448, + "step": 14285 + }, + { + "epoch": 0.9166132135984606, + "grad_norm": 0.848658406503067, + "learning_rate": 4.206401794489301e-06, + "loss": 0.755, + "step": 14290 + }, + { + "epoch": 0.9169339320076972, + "grad_norm": 0.7157699993484576, + "learning_rate": 4.1743336698537805e-06, + "loss": 0.6877, + "step": 14295 + }, + { + "epoch": 0.9172546504169339, + "grad_norm": 0.920746793540226, + "learning_rate": 4.142385645577707e-06, + "loss": 0.6888, + "step": 14300 + }, + { + "epoch": 0.9175753688261706, + "grad_norm": 0.6845975702530432, + "learning_rate": 4.110557761702249e-06, + "loss": 0.754, + "step": 14305 + }, + { + "epoch": 0.9178960872354073, + "grad_norm": 1.1511196348448594, + "learning_rate": 4.078850058117978e-06, + "loss": 0.616, + "step": 14310 + }, + { + "epoch": 0.918216805644644, + "grad_norm": 0.6109287776036132, + "learning_rate": 4.0472625745648144e-06, + "loss": 0.5921, + "step": 14315 + }, + { + "epoch": 0.9185375240538807, + "grad_norm": 0.5799180489438701, + "learning_rate": 4.015795350632068e-06, + "loss": 0.6258, + "step": 14320 + }, + { + "epoch": 0.9188582424631174, + "grad_norm": 1.0588410053870487, + "learning_rate": 3.984448425758236e-06, + "loss": 0.6294, + "step": 14325 + }, + { + "epoch": 0.9191789608723541, + "grad_norm": 0.9656078510689677, + "learning_rate": 3.953221839231125e-06, + "loss": 0.7232, + "step": 14330 + }, + { + "epoch": 0.9194996792815907, + "grad_norm": 0.7627108781290338, + "learning_rate": 3.922115630187684e-06, + "loss": 0.7192, + "step": 14335 + }, + { + "epoch": 0.9198203976908275, + "grad_norm": 0.9118690797348065, + "learning_rate": 3.8911298376139604e-06, + "loss": 0.7131, + "step": 14340 + }, + { + "epoch": 0.9201411161000641, + "grad_norm": 0.6032629064325823, + "learning_rate": 3.860264500345145e-06, + "loss": 0.701, + "step": 14345 + }, + { + "epoch": 0.9204618345093009, + "grad_norm": 0.7887702725778526, + "learning_rate": 3.829519657065417e-06, + "loss": 0.4822, + "step": 14350 + }, + { + "epoch": 0.9207825529185375, + "grad_norm": 0.7138715411195988, + "learning_rate": 3.798895346307929e-06, + "loss": 0.6301, + "step": 14355 + }, + { + "epoch": 0.9211032713277743, + "grad_norm": 0.9024603895099268, + "learning_rate": 3.768391606454824e-06, + "loss": 0.7522, + "step": 14360 + }, + { + "epoch": 0.9214239897370109, + "grad_norm": 1.0280776294268867, + "learning_rate": 3.7380084757370427e-06, + "loss": 0.5146, + "step": 14365 + }, + { + "epoch": 0.9217447081462476, + "grad_norm": 1.2746584097883105, + "learning_rate": 3.707745992234446e-06, + "loss": 0.6437, + "step": 14370 + }, + { + "epoch": 0.9220654265554843, + "grad_norm": 0.7420480886663697, + "learning_rate": 3.677604193875639e-06, + "loss": 0.7434, + "step": 14375 + }, + { + "epoch": 0.9223861449647209, + "grad_norm": 0.7760260552269074, + "learning_rate": 3.647583118438003e-06, + "loss": 0.7314, + "step": 14380 + }, + { + "epoch": 0.9227068633739577, + "grad_norm": 0.5526340026602907, + "learning_rate": 3.617682803547573e-06, + "loss": 0.6684, + "step": 14385 + }, + { + "epoch": 0.9230275817831943, + "grad_norm": 0.8601770168248275, + "learning_rate": 3.587903286679051e-06, + "loss": 0.7048, + "step": 14390 + }, + { + "epoch": 0.9233483001924311, + "grad_norm": 0.768831329847095, + "learning_rate": 3.5582446051557694e-06, + "loss": 0.7109, + "step": 14395 + }, + { + "epoch": 0.9236690186016677, + "grad_norm": 0.7061972963645736, + "learning_rate": 3.5287067961495613e-06, + "loss": 0.7226, + "step": 14400 + }, + { + "epoch": 0.9239897370109045, + "grad_norm": 0.9718492483949128, + "learning_rate": 3.4992898966808128e-06, + "loss": 0.6096, + "step": 14405 + }, + { + "epoch": 0.9243104554201411, + "grad_norm": 0.6613307717148478, + "learning_rate": 3.4699939436183548e-06, + "loss": 0.6359, + "step": 14410 + }, + { + "epoch": 0.9246311738293778, + "grad_norm": 0.48853477777273874, + "learning_rate": 3.440818973679416e-06, + "loss": 0.5916, + "step": 14415 + }, + { + "epoch": 0.9249518922386145, + "grad_norm": 2.8872548788201846, + "learning_rate": 3.411765023429625e-06, + "loss": 0.6681, + "step": 14420 + }, + { + "epoch": 0.9252726106478512, + "grad_norm": 0.8605678505533776, + "learning_rate": 3.382832129282909e-06, + "loss": 0.7061, + "step": 14425 + }, + { + "epoch": 0.9255933290570879, + "grad_norm": 0.8152777611420922, + "learning_rate": 3.354020327501506e-06, + "loss": 0.7016, + "step": 14430 + }, + { + "epoch": 0.9259140474663246, + "grad_norm": 0.5720911855352934, + "learning_rate": 3.32532965419583e-06, + "loss": 0.6065, + "step": 14435 + }, + { + "epoch": 0.9262347658755613, + "grad_norm": 0.5729769215244488, + "learning_rate": 3.29676014532454e-06, + "loss": 0.6385, + "step": 14440 + }, + { + "epoch": 0.926555484284798, + "grad_norm": 0.7971168307254297, + "learning_rate": 3.2683118366944153e-06, + "loss": 0.7482, + "step": 14445 + }, + { + "epoch": 0.9268762026940346, + "grad_norm": 0.8082127626355636, + "learning_rate": 3.2399847639603132e-06, + "loss": 0.5749, + "step": 14450 + }, + { + "epoch": 0.9271969211032713, + "grad_norm": 0.986366425048449, + "learning_rate": 3.211778962625178e-06, + "loss": 0.814, + "step": 14455 + }, + { + "epoch": 0.927517639512508, + "grad_norm": 0.7974470102591675, + "learning_rate": 3.1836944680399215e-06, + "loss": 0.6845, + "step": 14460 + }, + { + "epoch": 0.9278383579217447, + "grad_norm": 0.9030012061093406, + "learning_rate": 3.155731315403465e-06, + "loss": 0.7462, + "step": 14465 + }, + { + "epoch": 0.9281590763309814, + "grad_norm": 0.8114451125831404, + "learning_rate": 3.1278895397626295e-06, + "loss": 0.7289, + "step": 14470 + }, + { + "epoch": 0.9284797947402181, + "grad_norm": 0.7580184369514217, + "learning_rate": 3.10016917601208e-06, + "loss": 0.8204, + "step": 14475 + }, + { + "epoch": 0.9288005131494548, + "grad_norm": 0.9028047332034969, + "learning_rate": 3.0725702588943693e-06, + "loss": 0.6502, + "step": 14480 + }, + { + "epoch": 0.9291212315586914, + "grad_norm": 0.5328705285389578, + "learning_rate": 3.0450928229997956e-06, + "loss": 0.6282, + "step": 14485 + }, + { + "epoch": 0.9294419499679282, + "grad_norm": 0.676301284723922, + "learning_rate": 3.0177369027664324e-06, + "loss": 0.6152, + "step": 14490 + }, + { + "epoch": 0.9297626683771648, + "grad_norm": 0.6911219963447808, + "learning_rate": 2.990502532480033e-06, + "loss": 0.7075, + "step": 14495 + }, + { + "epoch": 0.9300833867864016, + "grad_norm": 0.8158597361321028, + "learning_rate": 2.9633897462740035e-06, + "loss": 0.5278, + "step": 14500 + }, + { + "epoch": 0.9304041051956382, + "grad_norm": 0.8885816510360459, + "learning_rate": 2.936398578129407e-06, + "loss": 0.7842, + "step": 14505 + }, + { + "epoch": 0.930724823604875, + "grad_norm": 0.9090481734964072, + "learning_rate": 2.909529061874816e-06, + "loss": 0.6346, + "step": 14510 + }, + { + "epoch": 0.9310455420141116, + "grad_norm": 0.6271937382541385, + "learning_rate": 2.8827812311864044e-06, + "loss": 0.4965, + "step": 14515 + }, + { + "epoch": 0.9313662604233482, + "grad_norm": 0.8626519977341744, + "learning_rate": 2.856155119587789e-06, + "loss": 0.6916, + "step": 14520 + }, + { + "epoch": 0.931686978832585, + "grad_norm": 1.2378284751762905, + "learning_rate": 2.829650760450031e-06, + "loss": 0.6573, + "step": 14525 + }, + { + "epoch": 0.9320076972418216, + "grad_norm": 1.2677367998396853, + "learning_rate": 2.8032681869916366e-06, + "loss": 0.5755, + "step": 14530 + }, + { + "epoch": 0.9323284156510584, + "grad_norm": 0.5109336107393835, + "learning_rate": 2.7770074322784334e-06, + "loss": 0.5688, + "step": 14535 + }, + { + "epoch": 0.932649134060295, + "grad_norm": 0.7042004857736548, + "learning_rate": 2.7508685292235937e-06, + "loss": 0.7213, + "step": 14540 + }, + { + "epoch": 0.9329698524695318, + "grad_norm": 0.7309101698002372, + "learning_rate": 2.7248515105875673e-06, + "loss": 0.6667, + "step": 14545 + }, + { + "epoch": 0.9332905708787684, + "grad_norm": 0.6908743464424493, + "learning_rate": 2.6989564089780263e-06, + "loss": 0.6156, + "step": 14550 + }, + { + "epoch": 0.9336112892880052, + "grad_norm": 0.9549405672325, + "learning_rate": 2.673183256849876e-06, + "loss": 0.5705, + "step": 14555 + }, + { + "epoch": 0.9339320076972418, + "grad_norm": 0.8108069141144446, + "learning_rate": 2.6475320865051444e-06, + "loss": 0.6301, + "step": 14560 + }, + { + "epoch": 0.9342527261064785, + "grad_norm": 0.7542934406058188, + "learning_rate": 2.6220029300930037e-06, + "loss": 0.6081, + "step": 14565 + }, + { + "epoch": 0.9345734445157152, + "grad_norm": 0.8121008842739622, + "learning_rate": 2.5965958196096706e-06, + "loss": 0.7333, + "step": 14570 + }, + { + "epoch": 0.9348941629249519, + "grad_norm": 0.7044098978011041, + "learning_rate": 2.571310786898451e-06, + "loss": 0.6786, + "step": 14575 + }, + { + "epoch": 0.9352148813341886, + "grad_norm": 0.669296953567193, + "learning_rate": 2.5461478636496062e-06, + "loss": 0.6451, + "step": 14580 + }, + { + "epoch": 0.9355355997434253, + "grad_norm": 1.0134964970782947, + "learning_rate": 2.5211070814003536e-06, + "loss": 0.7071, + "step": 14585 + }, + { + "epoch": 0.935856318152662, + "grad_norm": 0.8079966960225432, + "learning_rate": 2.496188471534866e-06, + "loss": 0.6494, + "step": 14590 + }, + { + "epoch": 0.9361770365618987, + "grad_norm": 0.7980284916096867, + "learning_rate": 2.4713920652841394e-06, + "loss": 0.6966, + "step": 14595 + }, + { + "epoch": 0.9364977549711353, + "grad_norm": 1.4182606806536633, + "learning_rate": 2.4467178937260692e-06, + "loss": 0.5106, + "step": 14600 + }, + { + "epoch": 0.936818473380372, + "grad_norm": 1.1450293247030983, + "learning_rate": 2.4221659877853074e-06, + "loss": 0.6734, + "step": 14605 + }, + { + "epoch": 0.9371391917896087, + "grad_norm": 0.7955638461295016, + "learning_rate": 2.397736378233284e-06, + "loss": 0.725, + "step": 14610 + }, + { + "epoch": 0.9374599101988454, + "grad_norm": 0.7397520509486079, + "learning_rate": 2.3734290956881734e-06, + "loss": 0.6244, + "step": 14615 + }, + { + "epoch": 0.9377806286080821, + "grad_norm": 0.9732579754101209, + "learning_rate": 2.349244170614773e-06, + "loss": 0.6057, + "step": 14620 + }, + { + "epoch": 0.9381013470173188, + "grad_norm": 1.406456086581141, + "learning_rate": 2.3251816333246025e-06, + "loss": 0.7182, + "step": 14625 + }, + { + "epoch": 0.9384220654265555, + "grad_norm": 0.8952424347381697, + "learning_rate": 2.301241513975749e-06, + "loss": 0.7598, + "step": 14630 + }, + { + "epoch": 0.9387427838357921, + "grad_norm": 1.0218439096331748, + "learning_rate": 2.2774238425728677e-06, + "loss": 0.7246, + "step": 14635 + }, + { + "epoch": 0.9390635022450289, + "grad_norm": 0.7685781373474748, + "learning_rate": 2.2537286489671573e-06, + "loss": 0.5579, + "step": 14640 + }, + { + "epoch": 0.9393842206542655, + "grad_norm": 0.7182539188714678, + "learning_rate": 2.2301559628563062e-06, + "loss": 0.4816, + "step": 14645 + }, + { + "epoch": 0.9397049390635023, + "grad_norm": 0.7271338524133633, + "learning_rate": 2.206705813784471e-06, + "loss": 0.7117, + "step": 14650 + }, + { + "epoch": 0.9400256574727389, + "grad_norm": 0.9142892488291297, + "learning_rate": 2.18337823114223e-06, + "loss": 0.5035, + "step": 14655 + }, + { + "epoch": 0.9403463758819757, + "grad_norm": 1.1230106908678623, + "learning_rate": 2.160173244166541e-06, + "loss": 0.5692, + "step": 14660 + }, + { + "epoch": 0.9406670942912123, + "grad_norm": 0.40796226780607736, + "learning_rate": 2.1370908819407174e-06, + "loss": 0.5771, + "step": 14665 + }, + { + "epoch": 0.940987812700449, + "grad_norm": 0.9481608724103522, + "learning_rate": 2.1141311733943626e-06, + "loss": 0.5029, + "step": 14670 + }, + { + "epoch": 0.9413085311096857, + "grad_norm": 1.0000026556770782, + "learning_rate": 2.09129414730338e-06, + "loss": 0.6156, + "step": 14675 + }, + { + "epoch": 0.9416292495189224, + "grad_norm": 0.521971426032197, + "learning_rate": 2.0685798322899073e-06, + "loss": 0.6233, + "step": 14680 + }, + { + "epoch": 0.9419499679281591, + "grad_norm": 0.555113548672577, + "learning_rate": 2.045988256822273e-06, + "loss": 0.6226, + "step": 14685 + }, + { + "epoch": 0.9422706863373957, + "grad_norm": 1.0940970203612415, + "learning_rate": 2.0235194492149832e-06, + "loss": 0.6603, + "step": 14690 + }, + { + "epoch": 0.9425914047466325, + "grad_norm": 1.0787803604629624, + "learning_rate": 2.0011734376286896e-06, + "loss": 0.6915, + "step": 14695 + }, + { + "epoch": 0.9429121231558691, + "grad_norm": 0.603441598329727, + "learning_rate": 1.978950250070111e-06, + "loss": 0.7826, + "step": 14700 + }, + { + "epoch": 0.9432328415651058, + "grad_norm": 1.1933790532010597, + "learning_rate": 1.9568499143920336e-06, + "loss": 0.6277, + "step": 14705 + }, + { + "epoch": 0.9435535599743425, + "grad_norm": 0.5764914897220961, + "learning_rate": 1.9348724582933133e-06, + "loss": 0.6875, + "step": 14710 + }, + { + "epoch": 0.9438742783835792, + "grad_norm": 0.9696889870454197, + "learning_rate": 1.9130179093187484e-06, + "loss": 0.8159, + "step": 14715 + }, + { + "epoch": 0.9441949967928159, + "grad_norm": 1.174884517440042, + "learning_rate": 1.891286294859107e-06, + "loss": 0.7811, + "step": 14720 + }, + { + "epoch": 0.9445157152020526, + "grad_norm": 0.7432254800663841, + "learning_rate": 1.869677642151102e-06, + "loss": 0.8169, + "step": 14725 + }, + { + "epoch": 0.9448364336112893, + "grad_norm": 1.3451481683596176, + "learning_rate": 1.8481919782773138e-06, + "loss": 0.6386, + "step": 14730 + }, + { + "epoch": 0.945157152020526, + "grad_norm": 0.8999549303642768, + "learning_rate": 1.82682933016618e-06, + "loss": 0.6578, + "step": 14735 + }, + { + "epoch": 0.9454778704297627, + "grad_norm": 0.7535938047620351, + "learning_rate": 1.8055897245919718e-06, + "loss": 0.6345, + "step": 14740 + }, + { + "epoch": 0.9457985888389994, + "grad_norm": 0.9031933438522918, + "learning_rate": 1.78447318817474e-06, + "loss": 0.6979, + "step": 14745 + }, + { + "epoch": 0.946119307248236, + "grad_norm": 0.5909234139284275, + "learning_rate": 1.7634797473802922e-06, + "loss": 0.5283, + "step": 14750 + }, + { + "epoch": 0.9464400256574728, + "grad_norm": 0.7478929356403822, + "learning_rate": 1.7426094285201478e-06, + "loss": 0.7548, + "step": 14755 + }, + { + "epoch": 0.9467607440667094, + "grad_norm": 0.7939890902510196, + "learning_rate": 1.7218622577515496e-06, + "loss": 0.7005, + "step": 14760 + }, + { + "epoch": 0.9470814624759462, + "grad_norm": 0.6058878555015041, + "learning_rate": 1.7012382610773315e-06, + "loss": 0.6766, + "step": 14765 + }, + { + "epoch": 0.9474021808851828, + "grad_norm": 0.848486027790844, + "learning_rate": 1.6807374643460272e-06, + "loss": 0.7677, + "step": 14770 + }, + { + "epoch": 0.9477228992944196, + "grad_norm": 0.7595303087988711, + "learning_rate": 1.6603598932517061e-06, + "loss": 0.7407, + "step": 14775 + }, + { + "epoch": 0.9480436177036562, + "grad_norm": 0.7579789167134414, + "learning_rate": 1.6401055733340164e-06, + "loss": 0.669, + "step": 14780 + }, + { + "epoch": 0.9483643361128928, + "grad_norm": 1.2648466067379471, + "learning_rate": 1.61997452997813e-06, + "loss": 0.6469, + "step": 14785 + }, + { + "epoch": 0.9486850545221296, + "grad_norm": 0.797026657881511, + "learning_rate": 1.5999667884147196e-06, + "loss": 0.588, + "step": 14790 + }, + { + "epoch": 0.9490057729313662, + "grad_norm": 0.915174796254417, + "learning_rate": 1.5800823737199156e-06, + "loss": 0.7036, + "step": 14795 + }, + { + "epoch": 0.949326491340603, + "grad_norm": 0.7014564001359544, + "learning_rate": 1.5603213108152715e-06, + "loss": 0.604, + "step": 14800 + }, + { + "epoch": 0.9496472097498396, + "grad_norm": 1.0673933698941918, + "learning_rate": 1.5406836244677646e-06, + "loss": 0.6767, + "step": 14805 + }, + { + "epoch": 0.9499679281590764, + "grad_norm": 0.5974581758846627, + "learning_rate": 1.5211693392897185e-06, + "loss": 0.6277, + "step": 14810 + }, + { + "epoch": 0.950288646568313, + "grad_norm": 0.76752354413579, + "learning_rate": 1.5017784797388024e-06, + "loss": 0.6575, + "step": 14815 + }, + { + "epoch": 0.9506093649775497, + "grad_norm": 0.6302709486833972, + "learning_rate": 1.482511070118009e-06, + "loss": 0.5797, + "step": 14820 + }, + { + "epoch": 0.9509300833867864, + "grad_norm": 0.6408626471147529, + "learning_rate": 1.4633671345755884e-06, + "loss": 0.6938, + "step": 14825 + }, + { + "epoch": 0.9512508017960231, + "grad_norm": 1.147885938640683, + "learning_rate": 1.4443466971050367e-06, + "loss": 0.6631, + "step": 14830 + }, + { + "epoch": 0.9515715202052598, + "grad_norm": 1.2090975514637632, + "learning_rate": 1.4254497815450852e-06, + "loss": 0.5987, + "step": 14835 + }, + { + "epoch": 0.9518922386144965, + "grad_norm": 1.4462854589201612, + "learning_rate": 1.4066764115796328e-06, + "loss": 0.5496, + "step": 14840 + }, + { + "epoch": 0.9522129570237332, + "grad_norm": 2.2267736323891603, + "learning_rate": 1.3880266107377581e-06, + "loss": 0.6236, + "step": 14845 + }, + { + "epoch": 0.9525336754329699, + "grad_norm": 0.9767897268690148, + "learning_rate": 1.369500402393653e-06, + "loss": 0.6737, + "step": 14850 + }, + { + "epoch": 0.9528543938422065, + "grad_norm": 0.6597022287518994, + "learning_rate": 1.3510978097665994e-06, + "loss": 0.6009, + "step": 14855 + }, + { + "epoch": 0.9531751122514432, + "grad_norm": 0.8352297747099178, + "learning_rate": 1.332818855920981e-06, + "loss": 0.6206, + "step": 14860 + }, + { + "epoch": 0.9534958306606799, + "grad_norm": 0.3398468741414835, + "learning_rate": 1.314663563766172e-06, + "loss": 0.745, + "step": 14865 + }, + { + "epoch": 0.9538165490699166, + "grad_norm": 0.6650997138673455, + "learning_rate": 1.2966319560566264e-06, + "loss": 0.5189, + "step": 14870 + }, + { + "epoch": 0.9541372674791533, + "grad_norm": 0.8495035997423334, + "learning_rate": 1.2787240553917223e-06, + "loss": 0.5352, + "step": 14875 + }, + { + "epoch": 0.95445798588839, + "grad_norm": 0.6804679950864659, + "learning_rate": 1.2609398842158171e-06, + "loss": 0.5298, + "step": 14880 + }, + { + "epoch": 0.9547787042976267, + "grad_norm": 0.9011394842975389, + "learning_rate": 1.2432794648181922e-06, + "loss": 0.6416, + "step": 14885 + }, + { + "epoch": 0.9550994227068633, + "grad_norm": 0.8017624405517991, + "learning_rate": 1.225742819333031e-06, + "loss": 0.7683, + "step": 14890 + }, + { + "epoch": 0.9554201411161001, + "grad_norm": 1.0189493989237226, + "learning_rate": 1.2083299697393968e-06, + "loss": 0.6712, + "step": 14895 + }, + { + "epoch": 0.9557408595253367, + "grad_norm": 0.8632861800860692, + "learning_rate": 1.1910409378611653e-06, + "loss": 0.6677, + "step": 14900 + }, + { + "epoch": 0.9560615779345735, + "grad_norm": 0.8271377018484679, + "learning_rate": 1.17387574536707e-06, + "loss": 0.8435, + "step": 14905 + }, + { + "epoch": 0.9563822963438101, + "grad_norm": 1.090763241775662, + "learning_rate": 1.1568344137706133e-06, + "loss": 0.751, + "step": 14910 + }, + { + "epoch": 0.9567030147530469, + "grad_norm": 0.8533558406500173, + "learning_rate": 1.1399169644300323e-06, + "loss": 0.7627, + "step": 14915 + }, + { + "epoch": 0.9570237331622835, + "grad_norm": 0.7969691903367916, + "learning_rate": 1.1231234185483663e-06, + "loss": 0.6599, + "step": 14920 + }, + { + "epoch": 0.9573444515715203, + "grad_norm": 0.6892919393359965, + "learning_rate": 1.1064537971733124e-06, + "loss": 0.6862, + "step": 14925 + }, + { + "epoch": 0.9576651699807569, + "grad_norm": 0.8464857234158932, + "learning_rate": 1.0899081211972584e-06, + "loss": 0.8058, + "step": 14930 + }, + { + "epoch": 0.9579858883899935, + "grad_norm": 0.5019234017303561, + "learning_rate": 1.0734864113572606e-06, + "loss": 0.684, + "step": 14935 + }, + { + "epoch": 0.9583066067992303, + "grad_norm": 0.7995354303661617, + "learning_rate": 1.057188688234989e-06, + "loss": 0.577, + "step": 14940 + }, + { + "epoch": 0.9586273252084669, + "grad_norm": 1.053084388323032, + "learning_rate": 1.0410149722567376e-06, + "loss": 0.6179, + "step": 14945 + }, + { + "epoch": 0.9589480436177037, + "grad_norm": 0.9473025528524849, + "learning_rate": 1.0249652836933688e-06, + "loss": 0.6448, + "step": 14950 + }, + { + "epoch": 0.9592687620269403, + "grad_norm": 0.8867828551638389, + "learning_rate": 1.0090396426603143e-06, + "loss": 0.7081, + "step": 14955 + }, + { + "epoch": 0.9595894804361771, + "grad_norm": 0.579392165704179, + "learning_rate": 9.93238069117508e-07, + "loss": 0.6266, + "step": 14960 + }, + { + "epoch": 0.9599101988454137, + "grad_norm": 1.3419589121931794, + "learning_rate": 9.775605828693969e-07, + "loss": 0.6619, + "step": 14965 + }, + { + "epoch": 0.9602309172546504, + "grad_norm": 0.9125359836127329, + "learning_rate": 9.620072035649075e-07, + "loss": 0.6073, + "step": 14970 + }, + { + "epoch": 0.9605516356638871, + "grad_norm": 1.0860000796878035, + "learning_rate": 9.465779506974359e-07, + "loss": 0.5401, + "step": 14975 + }, + { + "epoch": 0.9608723540731238, + "grad_norm": 1.171824681775004, + "learning_rate": 9.312728436047913e-07, + "loss": 0.5753, + "step": 14980 + }, + { + "epoch": 0.9611930724823605, + "grad_norm": 0.5643018528812354, + "learning_rate": 9.160919014691848e-07, + "loss": 0.5638, + "step": 14985 + }, + { + "epoch": 0.9615137908915972, + "grad_norm": 0.9034235555165777, + "learning_rate": 9.010351433172304e-07, + "loss": 0.6334, + "step": 14990 + }, + { + "epoch": 0.9618345093008339, + "grad_norm": 1.1839905897068703, + "learning_rate": 8.86102588019877e-07, + "loss": 0.7153, + "step": 14995 + }, + { + "epoch": 0.9621552277100706, + "grad_norm": 0.8180578726272846, + "learning_rate": 8.712942542923986e-07, + "loss": 0.5817, + "step": 15000 + }, + { + "epoch": 0.9624759461193072, + "grad_norm": 1.0696335688074747, + "learning_rate": 8.566101606944266e-07, + "loss": 0.6736, + "step": 15005 + }, + { + "epoch": 0.962796664528544, + "grad_norm": 0.7303824338994761, + "learning_rate": 8.420503256298396e-07, + "loss": 0.6429, + "step": 15010 + }, + { + "epoch": 0.9631173829377806, + "grad_norm": 1.0294755318998579, + "learning_rate": 8.276147673467849e-07, + "loss": 0.7188, + "step": 15015 + }, + { + "epoch": 0.9634381013470174, + "grad_norm": 0.9556262852737702, + "learning_rate": 8.133035039376679e-07, + "loss": 0.5951, + "step": 15020 + }, + { + "epoch": 0.963758819756254, + "grad_norm": 0.9324693251087647, + "learning_rate": 7.991165533390854e-07, + "loss": 0.7127, + "step": 15025 + }, + { + "epoch": 0.9640795381654907, + "grad_norm": 0.9591152159542692, + "learning_rate": 7.850539333318585e-07, + "loss": 0.6322, + "step": 15030 + }, + { + "epoch": 0.9644002565747274, + "grad_norm": 0.6946002197246557, + "learning_rate": 7.711156615409665e-07, + "loss": 0.5755, + "step": 15035 + }, + { + "epoch": 0.964720974983964, + "grad_norm": 1.3334758098994104, + "learning_rate": 7.573017554355355e-07, + "loss": 0.6318, + "step": 15040 + }, + { + "epoch": 0.9650416933932008, + "grad_norm": 0.8978971885207064, + "learning_rate": 7.436122323288497e-07, + "loss": 0.6035, + "step": 15045 + }, + { + "epoch": 0.9653624118024374, + "grad_norm": 0.8103686748723528, + "learning_rate": 7.300471093782624e-07, + "loss": 0.6194, + "step": 15050 + }, + { + "epoch": 0.9656831302116742, + "grad_norm": 0.753034703476334, + "learning_rate": 7.166064035852405e-07, + "loss": 0.6241, + "step": 15055 + }, + { + "epoch": 0.9660038486209108, + "grad_norm": 0.8194295630630289, + "learning_rate": 7.032901317953089e-07, + "loss": 0.804, + "step": 15060 + }, + { + "epoch": 0.9663245670301476, + "grad_norm": 0.6380479125093319, + "learning_rate": 6.900983106980396e-07, + "loss": 0.4591, + "step": 15065 + }, + { + "epoch": 0.9666452854393842, + "grad_norm": 0.6010950679928249, + "learning_rate": 6.770309568270183e-07, + "loss": 0.5964, + "step": 15070 + }, + { + "epoch": 0.9669660038486209, + "grad_norm": 0.6142851169104145, + "learning_rate": 6.640880865598331e-07, + "loss": 0.515, + "step": 15075 + }, + { + "epoch": 0.9672867222578576, + "grad_norm": 0.5969279751540932, + "learning_rate": 6.512697161180859e-07, + "loss": 0.5795, + "step": 15080 + }, + { + "epoch": 0.9676074406670943, + "grad_norm": 1.1554904145083251, + "learning_rate": 6.38575861567281e-07, + "loss": 0.7483, + "step": 15085 + }, + { + "epoch": 0.967928159076331, + "grad_norm": 0.7865746542213344, + "learning_rate": 6.260065388169256e-07, + "loss": 0.5557, + "step": 15090 + }, + { + "epoch": 0.9682488774855676, + "grad_norm": 1.1050848806521416, + "learning_rate": 6.135617636204072e-07, + "loss": 0.5939, + "step": 15095 + }, + { + "epoch": 0.9685695958948044, + "grad_norm": 0.7070536160439901, + "learning_rate": 6.01241551575027e-07, + "loss": 0.6985, + "step": 15100 + }, + { + "epoch": 0.968890314304041, + "grad_norm": 1.105194184766872, + "learning_rate": 5.890459181219776e-07, + "loss": 0.7083, + "step": 15105 + }, + { + "epoch": 0.9692110327132777, + "grad_norm": 1.2744464352233527, + "learning_rate": 5.769748785463103e-07, + "loss": 0.6397, + "step": 15110 + }, + { + "epoch": 0.9695317511225144, + "grad_norm": 0.9272062316818276, + "learning_rate": 5.650284479769008e-07, + "loss": 0.7676, + "step": 15115 + }, + { + "epoch": 0.9698524695317511, + "grad_norm": 0.7995773908927787, + "learning_rate": 5.532066413864834e-07, + "loss": 0.6971, + "step": 15120 + }, + { + "epoch": 0.9701731879409878, + "grad_norm": 0.38586358236871543, + "learning_rate": 5.415094735915838e-07, + "loss": 0.6707, + "step": 15125 + }, + { + "epoch": 0.9704939063502245, + "grad_norm": 0.9134739108193013, + "learning_rate": 5.299369592524972e-07, + "loss": 0.7099, + "step": 15130 + }, + { + "epoch": 0.9708146247594612, + "grad_norm": 1.1214413150852183, + "learning_rate": 5.184891128733216e-07, + "loss": 0.5773, + "step": 15135 + }, + { + "epoch": 0.9711353431686979, + "grad_norm": 0.9080341063196368, + "learning_rate": 5.071659488018688e-07, + "loss": 0.5541, + "step": 15140 + }, + { + "epoch": 0.9714560615779346, + "grad_norm": 0.6396326113379124, + "learning_rate": 4.959674812297089e-07, + "loss": 0.7547, + "step": 15145 + }, + { + "epoch": 0.9717767799871713, + "grad_norm": 0.6247330527268826, + "learning_rate": 4.848937241921369e-07, + "loss": 0.7347, + "step": 15150 + }, + { + "epoch": 0.9720974983964079, + "grad_norm": 0.7413180396760661, + "learning_rate": 4.7394469156810674e-07, + "loss": 0.6324, + "step": 15155 + }, + { + "epoch": 0.9724182168056447, + "grad_norm": 0.8191285127812412, + "learning_rate": 4.6312039708028553e-07, + "loss": 0.6501, + "step": 15160 + }, + { + "epoch": 0.9727389352148813, + "grad_norm": 1.5646180696875727, + "learning_rate": 4.5242085429499923e-07, + "loss": 0.7018, + "step": 15165 + }, + { + "epoch": 0.9730596536241181, + "grad_norm": 1.05700452006374, + "learning_rate": 4.4184607662220987e-07, + "loss": 0.702, + "step": 15170 + }, + { + "epoch": 0.9733803720333547, + "grad_norm": 0.6341783140741876, + "learning_rate": 4.313960773155046e-07, + "loss": 0.636, + "step": 15175 + }, + { + "epoch": 0.9737010904425915, + "grad_norm": 0.7888859139283535, + "learning_rate": 4.2107086947209553e-07, + "loss": 0.6313, + "step": 15180 + }, + { + "epoch": 0.9740218088518281, + "grad_norm": 0.9191085670941561, + "learning_rate": 4.1087046603279777e-07, + "loss": 0.6221, + "step": 15185 + }, + { + "epoch": 0.9743425272610647, + "grad_norm": 0.747755641512419, + "learning_rate": 4.007948797819738e-07, + "loss": 0.7214, + "step": 15190 + }, + { + "epoch": 0.9746632456703015, + "grad_norm": 0.977703835187041, + "learning_rate": 3.90844123347589e-07, + "loss": 0.6226, + "step": 15195 + }, + { + "epoch": 0.9749839640795381, + "grad_norm": 1.0760333069724886, + "learning_rate": 3.8101820920114494e-07, + "loss": 0.5479, + "step": 15200 + }, + { + "epoch": 0.9753046824887749, + "grad_norm": 0.6944511489853861, + "learning_rate": 3.713171496576573e-07, + "loss": 0.5499, + "step": 15205 + }, + { + "epoch": 0.9756254008980115, + "grad_norm": 0.8427188819091052, + "learning_rate": 3.617409568756669e-07, + "loss": 0.7567, + "step": 15210 + }, + { + "epoch": 0.9759461193072483, + "grad_norm": 0.8552901758457413, + "learning_rate": 3.5228964285722864e-07, + "loss": 0.5683, + "step": 15215 + }, + { + "epoch": 0.9762668377164849, + "grad_norm": 1.3132456382472737, + "learning_rate": 3.429632194478782e-07, + "loss": 0.6284, + "step": 15220 + }, + { + "epoch": 0.9765875561257216, + "grad_norm": 0.7318279273617357, + "learning_rate": 3.337616983366321e-07, + "loss": 0.5582, + "step": 15225 + }, + { + "epoch": 0.9769082745349583, + "grad_norm": 0.6573550653291185, + "learning_rate": 3.246850910559318e-07, + "loss": 0.5491, + "step": 15230 + }, + { + "epoch": 0.977228992944195, + "grad_norm": 0.8242113768294678, + "learning_rate": 3.157334089816888e-07, + "loss": 0.7255, + "step": 15235 + }, + { + "epoch": 0.9775497113534317, + "grad_norm": 0.9030228435778539, + "learning_rate": 3.0690666333325067e-07, + "loss": 0.5873, + "step": 15240 + }, + { + "epoch": 0.9778704297626684, + "grad_norm": 0.565513303166446, + "learning_rate": 2.9820486517335713e-07, + "loss": 0.598, + "step": 15245 + }, + { + "epoch": 0.9781911481719051, + "grad_norm": 0.6147817142778307, + "learning_rate": 2.896280254081618e-07, + "loss": 0.7145, + "step": 15250 + }, + { + "epoch": 0.9785118665811418, + "grad_norm": 0.8743323298527471, + "learning_rate": 2.811761547871994e-07, + "loss": 0.6756, + "step": 15255 + }, + { + "epoch": 0.9788325849903784, + "grad_norm": 1.1307500659483494, + "learning_rate": 2.728492639033742e-07, + "loss": 0.6188, + "step": 15260 + }, + { + "epoch": 0.9791533033996151, + "grad_norm": 0.7125463266714677, + "learning_rate": 2.6464736319297136e-07, + "loss": 0.6278, + "step": 15265 + }, + { + "epoch": 0.9794740218088518, + "grad_norm": 0.5910469031411075, + "learning_rate": 2.5657046293560137e-07, + "loss": 0.6905, + "step": 15270 + }, + { + "epoch": 0.9797947402180885, + "grad_norm": 0.7878661937473239, + "learning_rate": 2.4861857325421123e-07, + "loss": 0.7325, + "step": 15275 + }, + { + "epoch": 0.9801154586273252, + "grad_norm": 0.8286733473521487, + "learning_rate": 2.4079170411507315e-07, + "loss": 0.7773, + "step": 15280 + }, + { + "epoch": 0.9804361770365619, + "grad_norm": 0.9685767265903029, + "learning_rate": 2.3308986532778464e-07, + "loss": 0.646, + "step": 15285 + }, + { + "epoch": 0.9807568954457986, + "grad_norm": 0.9361901486165769, + "learning_rate": 2.255130665452243e-07, + "loss": 0.6598, + "step": 15290 + }, + { + "epoch": 0.9810776138550352, + "grad_norm": 0.9598712869867538, + "learning_rate": 2.180613172635404e-07, + "loss": 0.5625, + "step": 15295 + }, + { + "epoch": 0.981398332264272, + "grad_norm": 0.53570267588639, + "learning_rate": 2.1073462682217325e-07, + "loss": 0.5784, + "step": 15300 + }, + { + "epoch": 0.9817190506735086, + "grad_norm": 0.7566088957917948, + "learning_rate": 2.0353300440382194e-07, + "loss": 0.6119, + "step": 15305 + }, + { + "epoch": 0.9820397690827454, + "grad_norm": 1.146329754716512, + "learning_rate": 1.9645645903444422e-07, + "loss": 0.7188, + "step": 15310 + }, + { + "epoch": 0.982360487491982, + "grad_norm": 0.8370973588336825, + "learning_rate": 1.895049995832232e-07, + "loss": 0.7563, + "step": 15315 + }, + { + "epoch": 0.9826812059012188, + "grad_norm": 0.9434580889772379, + "learning_rate": 1.8267863476255643e-07, + "loss": 0.7839, + "step": 15320 + }, + { + "epoch": 0.9830019243104554, + "grad_norm": 0.8804750628505544, + "learning_rate": 1.7597737312810004e-07, + "loss": 0.4332, + "step": 15325 + }, + { + "epoch": 0.9833226427196922, + "grad_norm": 0.7320489722005881, + "learning_rate": 1.694012230786579e-07, + "loss": 0.7652, + "step": 15330 + }, + { + "epoch": 0.9836433611289288, + "grad_norm": 0.7366362970085942, + "learning_rate": 1.6295019285628154e-07, + "loss": 0.7341, + "step": 15335 + }, + { + "epoch": 0.9839640795381654, + "grad_norm": 1.0140709106862729, + "learning_rate": 1.5662429054618122e-07, + "loss": 0.4945, + "step": 15340 + }, + { + "epoch": 0.9842847979474022, + "grad_norm": 1.9484887809729772, + "learning_rate": 1.504235240767371e-07, + "loss": 0.6308, + "step": 15345 + }, + { + "epoch": 0.9846055163566388, + "grad_norm": 0.9619117197899885, + "learning_rate": 1.4434790121951036e-07, + "loss": 0.6099, + "step": 15350 + }, + { + "epoch": 0.9849262347658756, + "grad_norm": 0.9949706333975902, + "learning_rate": 1.3839742958920987e-07, + "loss": 0.5725, + "step": 15355 + }, + { + "epoch": 0.9852469531751122, + "grad_norm": 0.9242186083511401, + "learning_rate": 1.3257211664368106e-07, + "loss": 0.6308, + "step": 15360 + }, + { + "epoch": 0.985567671584349, + "grad_norm": 1.0782239190960032, + "learning_rate": 1.2687196968392822e-07, + "loss": 0.6935, + "step": 15365 + }, + { + "epoch": 0.9858883899935856, + "grad_norm": 0.8111644243864005, + "learning_rate": 1.2129699585404774e-07, + "loss": 0.7241, + "step": 15370 + }, + { + "epoch": 0.9862091084028223, + "grad_norm": 0.7276347564310323, + "learning_rate": 1.1584720214129485e-07, + "loss": 0.6842, + "step": 15375 + }, + { + "epoch": 0.986529826812059, + "grad_norm": 1.036558622735431, + "learning_rate": 1.1052259537599474e-07, + "loss": 0.7109, + "step": 15380 + }, + { + "epoch": 0.9868505452212957, + "grad_norm": 0.8442723448288622, + "learning_rate": 1.053231822315981e-07, + "loss": 0.5197, + "step": 15385 + }, + { + "epoch": 0.9871712636305324, + "grad_norm": 0.7755592771907561, + "learning_rate": 1.0024896922464777e-07, + "loss": 0.5958, + "step": 15390 + }, + { + "epoch": 0.9874919820397691, + "grad_norm": 1.0235862204819772, + "learning_rate": 9.529996271475661e-08, + "loss": 0.7323, + "step": 15395 + }, + { + "epoch": 0.9878127004490058, + "grad_norm": 0.6802556392432448, + "learning_rate": 9.047616890461852e-08, + "loss": 0.6661, + "step": 15400 + }, + { + "epoch": 0.9881334188582425, + "grad_norm": 0.7642842609623561, + "learning_rate": 8.57775938399974e-08, + "loss": 0.6418, + "step": 15405 + }, + { + "epoch": 0.9884541372674791, + "grad_norm": 0.7629833080692018, + "learning_rate": 8.1204243409716e-08, + "loss": 0.71, + "step": 15410 + }, + { + "epoch": 0.9887748556767159, + "grad_norm": 0.8028551912844719, + "learning_rate": 7.675612334566706e-08, + "loss": 0.6261, + "step": 15415 + }, + { + "epoch": 0.9890955740859525, + "grad_norm": 0.8568280018874693, + "learning_rate": 7.24332392227578e-08, + "loss": 0.7818, + "step": 15420 + }, + { + "epoch": 0.9894162924951893, + "grad_norm": 0.9435010043749265, + "learning_rate": 6.823559645896538e-08, + "loss": 0.7135, + "step": 15425 + }, + { + "epoch": 0.9897370109044259, + "grad_norm": 0.8536947904193946, + "learning_rate": 6.416320031527035e-08, + "loss": 0.6909, + "step": 15430 + }, + { + "epoch": 0.9900577293136626, + "grad_norm": 0.6375751055715156, + "learning_rate": 6.02160558957121e-08, + "loss": 0.7567, + "step": 15435 + }, + { + "epoch": 0.9903784477228993, + "grad_norm": 0.722851635446421, + "learning_rate": 5.639416814731124e-08, + "loss": 0.595, + "step": 15440 + }, + { + "epoch": 0.9906991661321359, + "grad_norm": 0.6530835942019998, + "learning_rate": 5.269754186013609e-08, + "loss": 0.6185, + "step": 15445 + }, + { + "epoch": 0.9910198845413727, + "grad_norm": 1.0508657841447764, + "learning_rate": 4.912618166723615e-08, + "loss": 0.5615, + "step": 15450 + }, + { + "epoch": 0.9913406029506093, + "grad_norm": 0.89657789016663, + "learning_rate": 4.5680092044686486e-08, + "loss": 0.686, + "step": 15455 + }, + { + "epoch": 0.9916613213598461, + "grad_norm": 1.0049970608249212, + "learning_rate": 4.235927731153222e-08, + "loss": 0.5976, + "step": 15460 + }, + { + "epoch": 0.9919820397690827, + "grad_norm": 0.5955235189985802, + "learning_rate": 3.916374162983294e-08, + "loss": 0.4921, + "step": 15465 + }, + { + "epoch": 0.9923027581783195, + "grad_norm": 1.0006472782878193, + "learning_rate": 3.6093489004618286e-08, + "loss": 0.6268, + "step": 15470 + }, + { + "epoch": 0.9926234765875561, + "grad_norm": 0.7931648933621266, + "learning_rate": 3.314852328389906e-08, + "loss": 0.6005, + "step": 15475 + }, + { + "epoch": 0.9929441949967928, + "grad_norm": 0.9041277771423232, + "learning_rate": 3.032884815866721e-08, + "loss": 0.5324, + "step": 15480 + }, + { + "epoch": 0.9932649134060295, + "grad_norm": 0.9494072939119311, + "learning_rate": 2.7634467162873657e-08, + "loss": 0.7065, + "step": 15485 + }, + { + "epoch": 0.9935856318152662, + "grad_norm": 0.6280167222373476, + "learning_rate": 2.506538367345046e-08, + "loss": 0.6061, + "step": 15490 + }, + { + "epoch": 0.9939063502245029, + "grad_norm": 0.9951079606352037, + "learning_rate": 2.2621600910288644e-08, + "loss": 0.6444, + "step": 15495 + }, + { + "epoch": 0.9942270686337396, + "grad_norm": 0.5695988637172767, + "learning_rate": 2.0303121936227077e-08, + "loss": 0.5318, + "step": 15500 + }, + { + "epoch": 0.9945477870429763, + "grad_norm": 0.7104107796380682, + "learning_rate": 1.8109949657074687e-08, + "loss": 0.584, + "step": 15505 + }, + { + "epoch": 0.994868505452213, + "grad_norm": 0.9519239668806431, + "learning_rate": 1.6042086821566048e-08, + "loss": 0.6069, + "step": 15510 + }, + { + "epoch": 0.9951892238614497, + "grad_norm": 1.0531482909821168, + "learning_rate": 1.409953602140579e-08, + "loss": 0.6419, + "step": 15515 + }, + { + "epoch": 0.9955099422706863, + "grad_norm": 0.8960669638227693, + "learning_rate": 1.2282299691235289e-08, + "loss": 0.6139, + "step": 15520 + }, + { + "epoch": 0.995830660679923, + "grad_norm": 1.4364607207448494, + "learning_rate": 1.059038010863267e-08, + "loss": 0.557, + "step": 15525 + }, + { + "epoch": 0.9961513790891597, + "grad_norm": 0.7870715712258225, + "learning_rate": 9.02377939412391e-09, + "loss": 0.6829, + "step": 15530 + }, + { + "epoch": 0.9964720974983964, + "grad_norm": 0.6681758560958523, + "learning_rate": 7.582499511160635e-09, + "loss": 0.6894, + "step": 15535 + }, + { + "epoch": 0.9967928159076331, + "grad_norm": 0.7692932903463889, + "learning_rate": 6.266542266120112e-09, + "loss": 0.6775, + "step": 15540 + }, + { + "epoch": 0.9971135343168698, + "grad_norm": 1.2971219190629335, + "learning_rate": 5.0759093083385665e-09, + "loss": 0.6272, + "step": 15545 + }, + { + "epoch": 0.9974342527261065, + "grad_norm": 0.6500496471556959, + "learning_rate": 4.010602130033458e-09, + "loss": 0.6068, + "step": 15550 + }, + { + "epoch": 0.9977549711353432, + "grad_norm": 0.686167526298323, + "learning_rate": 3.0706220664034057e-09, + "loss": 0.6119, + "step": 15555 + }, + { + "epoch": 0.9980756895445798, + "grad_norm": 0.7818918449822959, + "learning_rate": 2.255970295539367e-09, + "loss": 0.6275, + "step": 15560 + }, + { + "epoch": 0.9983964079538166, + "grad_norm": 0.7349503006612832, + "learning_rate": 1.5666478384579464e-09, + "loss": 0.7661, + "step": 15565 + }, + { + "epoch": 0.9987171263630532, + "grad_norm": 0.7447321528022689, + "learning_rate": 1.0026555591013952e-09, + "loss": 0.7204, + "step": 15570 + }, + { + "epoch": 0.99903784477229, + "grad_norm": 0.5813111087659052, + "learning_rate": 5.639941643376112e-10, + "loss": 0.6803, + "step": 15575 + }, + { + "epoch": 0.9993585631815266, + "grad_norm": 0.7896016109360797, + "learning_rate": 2.5066420393793365e-10, + "loss": 0.7841, + "step": 15580 + }, + { + "epoch": 0.9996792815907634, + "grad_norm": 0.8103436457382939, + "learning_rate": 6.266607062155316e-11, + "loss": 0.5852, + "step": 15585 + }, + { + "epoch": 1.0, + "grad_norm": 0.9796619082932287, + "learning_rate": 0.0, + "loss": 0.7238, + "step": 15590 + }, + { + "epoch": 1.0, + "step": 15590, + "total_flos": 1.6764562374852608e+16, + "train_loss": 0.0, + "train_runtime": 0.0156, + "train_samples_per_second": 6662118.554, + "train_steps_per_second": 104099.609 + } + ], + "logging_steps": 5, + "max_steps": 1624, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6764562374852608e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}