{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 15590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.414368184733804e-05, "grad_norm": 2.122934421744886, "learning_rate": 1.2828736369467608e-07, "loss": 1.1536, "step": 1 }, { "epoch": 0.00032071840923669016, "grad_norm": 2.6120313710646594, "learning_rate": 6.414368184733804e-07, "loss": 0.8767, "step": 5 }, { "epoch": 0.0006414368184733803, "grad_norm": 1.9036320491341159, "learning_rate": 1.2828736369467608e-06, "loss": 1.1617, "step": 10 }, { "epoch": 0.0009621552277100705, "grad_norm": 2.2675442011112223, "learning_rate": 1.9243104554201413e-06, "loss": 0.9348, "step": 15 }, { "epoch": 0.0012828736369467607, "grad_norm": 1.8668180685612314, "learning_rate": 2.5657472738935216e-06, "loss": 0.9887, "step": 20 }, { "epoch": 0.001603592046183451, "grad_norm": 3.7598222149067344, "learning_rate": 3.2071840923669024e-06, "loss": 0.9933, "step": 25 }, { "epoch": 0.001924310455420141, "grad_norm": 0.7497789448952673, "learning_rate": 3.848620910840283e-06, "loss": 0.8951, "step": 30 }, { "epoch": 0.0022450288646568314, "grad_norm": 0.7497466999070768, "learning_rate": 4.490057729313663e-06, "loss": 1.1075, "step": 35 }, { "epoch": 0.0025657472738935213, "grad_norm": 0.6269808680953072, "learning_rate": 5.131494547787043e-06, "loss": 0.9902, "step": 40 }, { "epoch": 0.0028864656831302116, "grad_norm": 0.7670239135600633, "learning_rate": 5.7729313662604236e-06, "loss": 0.8813, "step": 45 }, { "epoch": 0.003207184092366902, "grad_norm": 2.7386818812948612, "learning_rate": 6.414368184733805e-06, "loss": 1.0156, "step": 50 }, { "epoch": 0.003527902501603592, "grad_norm": 0.6074882325288049, "learning_rate": 7.055805003207184e-06, "loss": 0.8396, "step": 55 }, { "epoch": 0.003848620910840282, "grad_norm": 0.5127156771589034, "learning_rate": 7.697241821680565e-06, "loss": 0.8716, "step": 60 }, { "epoch": 0.004169339320076972, "grad_norm": 1.9056440284966447, "learning_rate": 8.338678640153946e-06, "loss": 0.854, "step": 65 }, { "epoch": 0.004490057729313663, "grad_norm": 0.5220731546600494, "learning_rate": 8.980115458627326e-06, "loss": 0.7033, "step": 70 }, { "epoch": 0.004810776138550353, "grad_norm": 0.8528246103677898, "learning_rate": 9.621552277100706e-06, "loss": 0.7803, "step": 75 }, { "epoch": 0.005131494547787043, "grad_norm": 0.9553069537717978, "learning_rate": 1.0262989095574087e-05, "loss": 0.8577, "step": 80 }, { "epoch": 0.005452212957023733, "grad_norm": 1.089966886886208, "learning_rate": 1.0904425914047467e-05, "loss": 0.5766, "step": 85 }, { "epoch": 0.005772931366260423, "grad_norm": 1.3259601452666736, "learning_rate": 1.1545862732520847e-05, "loss": 0.6605, "step": 90 }, { "epoch": 0.006093649775497113, "grad_norm": 0.5501705053457395, "learning_rate": 1.2187299550994227e-05, "loss": 0.622, "step": 95 }, { "epoch": 0.006414368184733804, "grad_norm": 0.565393913728341, "learning_rate": 1.282873636946761e-05, "loss": 0.6598, "step": 100 }, { "epoch": 0.006735086593970494, "grad_norm": 0.8355349528605185, "learning_rate": 1.3470173187940988e-05, "loss": 0.57, "step": 105 }, { "epoch": 0.007055805003207184, "grad_norm": 0.7311107470147664, "learning_rate": 1.4111610006414368e-05, "loss": 0.7012, "step": 110 }, { "epoch": 0.0073765234124438745, "grad_norm": 0.7172503923642882, "learning_rate": 1.4753046824887749e-05, "loss": 0.6794, "step": 115 }, { "epoch": 0.007697241821680564, "grad_norm": 0.6517343625027339, "learning_rate": 1.539448364336113e-05, "loss": 0.7512, "step": 120 }, { "epoch": 0.008017960230917255, "grad_norm": 0.7506138412539792, "learning_rate": 1.603592046183451e-05, "loss": 0.6939, "step": 125 }, { "epoch": 0.008338678640153944, "grad_norm": 0.6731898912128177, "learning_rate": 1.667735728030789e-05, "loss": 0.6853, "step": 130 }, { "epoch": 0.008659397049390635, "grad_norm": 0.5526935784877048, "learning_rate": 1.731879409878127e-05, "loss": 0.7614, "step": 135 }, { "epoch": 0.008980115458627326, "grad_norm": 0.7407967682593112, "learning_rate": 1.7960230917254652e-05, "loss": 0.6847, "step": 140 }, { "epoch": 0.009300833867864015, "grad_norm": 1.0558900513241394, "learning_rate": 1.8601667735728032e-05, "loss": 0.6291, "step": 145 }, { "epoch": 0.009621552277100705, "grad_norm": 0.534720549756236, "learning_rate": 1.9243104554201412e-05, "loss": 0.6933, "step": 150 }, { "epoch": 0.009942270686337396, "grad_norm": 0.8533880407106053, "learning_rate": 1.9884541372674793e-05, "loss": 0.7405, "step": 155 }, { "epoch": 0.010262989095574085, "grad_norm": 2.2157811263492633, "learning_rate": 2.0525978191148173e-05, "loss": 0.6605, "step": 160 }, { "epoch": 0.010583707504810776, "grad_norm": 0.741553600188979, "learning_rate": 2.1167415009621553e-05, "loss": 0.6929, "step": 165 }, { "epoch": 0.010904425914047467, "grad_norm": 0.592672329081525, "learning_rate": 2.1808851828094934e-05, "loss": 0.7712, "step": 170 }, { "epoch": 0.011225144323284156, "grad_norm": 0.7143661642401767, "learning_rate": 2.2450288646568314e-05, "loss": 0.7264, "step": 175 }, { "epoch": 0.011545862732520847, "grad_norm": 0.7168820160805862, "learning_rate": 2.3091725465041694e-05, "loss": 0.7147, "step": 180 }, { "epoch": 0.011866581141757537, "grad_norm": 0.8106566714421187, "learning_rate": 2.3733162283515075e-05, "loss": 0.7091, "step": 185 }, { "epoch": 0.012187299550994226, "grad_norm": 1.131984585130431, "learning_rate": 2.4374599101988455e-05, "loss": 0.6725, "step": 190 }, { "epoch": 0.012508017960230917, "grad_norm": 0.5991057607118903, "learning_rate": 2.5016035920461832e-05, "loss": 0.5288, "step": 195 }, { "epoch": 0.012828736369467608, "grad_norm": 0.7441333776346593, "learning_rate": 2.565747273893522e-05, "loss": 0.6001, "step": 200 }, { "epoch": 0.013149454778704297, "grad_norm": 0.7177668887803592, "learning_rate": 2.6298909557408596e-05, "loss": 0.729, "step": 205 }, { "epoch": 0.013470173187940988, "grad_norm": 1.152356658408425, "learning_rate": 2.6940346375881976e-05, "loss": 0.649, "step": 210 }, { "epoch": 0.013790891597177678, "grad_norm": 0.8692844040434968, "learning_rate": 2.758178319435536e-05, "loss": 0.7514, "step": 215 }, { "epoch": 0.014111610006414367, "grad_norm": 0.7731506164196528, "learning_rate": 2.8223220012828737e-05, "loss": 0.7303, "step": 220 }, { "epoch": 0.014432328415651058, "grad_norm": 0.6675669855403799, "learning_rate": 2.8864656831302117e-05, "loss": 0.5974, "step": 225 }, { "epoch": 0.014753046824887749, "grad_norm": 0.6511258667141646, "learning_rate": 2.9506093649775497e-05, "loss": 0.6502, "step": 230 }, { "epoch": 0.015073765234124438, "grad_norm": 0.8153736796805081, "learning_rate": 3.014753046824888e-05, "loss": 0.7187, "step": 235 }, { "epoch": 0.015394483643361129, "grad_norm": 0.682020511101791, "learning_rate": 3.078896728672226e-05, "loss": 0.7687, "step": 240 }, { "epoch": 0.01571520205259782, "grad_norm": 0.9723518475601368, "learning_rate": 3.143040410519564e-05, "loss": 0.6333, "step": 245 }, { "epoch": 0.01603592046183451, "grad_norm": 0.6642430373016617, "learning_rate": 3.207184092366902e-05, "loss": 0.7503, "step": 250 }, { "epoch": 0.0163566388710712, "grad_norm": 1.0604072659225818, "learning_rate": 3.27132777421424e-05, "loss": 0.7296, "step": 255 }, { "epoch": 0.01667735728030789, "grad_norm": 0.5389238146909613, "learning_rate": 3.335471456061578e-05, "loss": 0.6449, "step": 260 }, { "epoch": 0.01699807568954458, "grad_norm": 1.0886777633244675, "learning_rate": 3.3996151379089166e-05, "loss": 0.6087, "step": 265 }, { "epoch": 0.01731879409878127, "grad_norm": 0.7740455363235514, "learning_rate": 3.463758819756254e-05, "loss": 0.7716, "step": 270 }, { "epoch": 0.01763951250801796, "grad_norm": 0.7842668340726671, "learning_rate": 3.527902501603592e-05, "loss": 0.6184, "step": 275 }, { "epoch": 0.01796023091725465, "grad_norm": 0.8724306321758412, "learning_rate": 3.5920461834509304e-05, "loss": 0.5647, "step": 280 }, { "epoch": 0.018280949326491342, "grad_norm": 0.6108159651722537, "learning_rate": 3.656189865298269e-05, "loss": 0.7748, "step": 285 }, { "epoch": 0.01860166773572803, "grad_norm": 0.554729905784846, "learning_rate": 3.7203335471456064e-05, "loss": 0.6969, "step": 290 }, { "epoch": 0.01892238614496472, "grad_norm": 0.5263100377774543, "learning_rate": 3.784477228992944e-05, "loss": 0.6331, "step": 295 }, { "epoch": 0.01924310455420141, "grad_norm": 0.7458575860438468, "learning_rate": 3.8486209108402825e-05, "loss": 0.7178, "step": 300 }, { "epoch": 0.0195638229634381, "grad_norm": 0.7209749688824592, "learning_rate": 3.912764592687621e-05, "loss": 0.7774, "step": 305 }, { "epoch": 0.019884541372674792, "grad_norm": 0.8894616503150261, "learning_rate": 3.9769082745349585e-05, "loss": 0.8354, "step": 310 }, { "epoch": 0.020205259781911483, "grad_norm": 0.6322923436990817, "learning_rate": 4.041051956382296e-05, "loss": 0.6009, "step": 315 }, { "epoch": 0.02052597819114817, "grad_norm": 0.9519419320088668, "learning_rate": 4.1051956382296346e-05, "loss": 0.61, "step": 320 }, { "epoch": 0.02084669660038486, "grad_norm": 0.654969001631436, "learning_rate": 4.169339320076972e-05, "loss": 0.602, "step": 325 }, { "epoch": 0.021167415009621552, "grad_norm": 0.6250956091655624, "learning_rate": 4.233483001924311e-05, "loss": 0.6451, "step": 330 }, { "epoch": 0.021488133418858243, "grad_norm": 0.7392153639819625, "learning_rate": 4.297626683771649e-05, "loss": 0.7724, "step": 335 }, { "epoch": 0.021808851828094934, "grad_norm": 0.7914340872699686, "learning_rate": 4.361770365618987e-05, "loss": 0.7245, "step": 340 }, { "epoch": 0.022129570237331624, "grad_norm": 0.5688389882467555, "learning_rate": 4.4259140474663244e-05, "loss": 0.5756, "step": 345 }, { "epoch": 0.02245028864656831, "grad_norm": 0.6860675746425041, "learning_rate": 4.490057729313663e-05, "loss": 0.6515, "step": 350 }, { "epoch": 0.022771007055805002, "grad_norm": 0.8497624484329163, "learning_rate": 4.554201411161001e-05, "loss": 0.742, "step": 355 }, { "epoch": 0.023091725465041693, "grad_norm": 0.9589070592978919, "learning_rate": 4.618345093008339e-05, "loss": 0.7261, "step": 360 }, { "epoch": 0.023412443874278384, "grad_norm": 0.5397605849852198, "learning_rate": 4.6824887748556765e-05, "loss": 0.721, "step": 365 }, { "epoch": 0.023733162283515075, "grad_norm": 0.4218758453965537, "learning_rate": 4.746632456703015e-05, "loss": 0.7008, "step": 370 }, { "epoch": 0.024053880692751765, "grad_norm": 0.4660237223228576, "learning_rate": 4.810776138550353e-05, "loss": 0.5954, "step": 375 }, { "epoch": 0.024374599101988453, "grad_norm": 1.1414044523272346, "learning_rate": 4.874919820397691e-05, "loss": 0.7092, "step": 380 }, { "epoch": 0.024695317511225143, "grad_norm": 0.7794538849217394, "learning_rate": 4.939063502245029e-05, "loss": 0.6556, "step": 385 }, { "epoch": 0.025016035920461834, "grad_norm": 0.6784254428885176, "learning_rate": 5.0032071840923663e-05, "loss": 0.6523, "step": 390 }, { "epoch": 0.025336754329698525, "grad_norm": 0.5550050199692612, "learning_rate": 5.0673508659397054e-05, "loss": 0.7065, "step": 395 }, { "epoch": 0.025657472738935216, "grad_norm": 1.3489642897531091, "learning_rate": 5.131494547787044e-05, "loss": 0.657, "step": 400 }, { "epoch": 0.025978191148171906, "grad_norm": 0.8799442657849393, "learning_rate": 5.195638229634381e-05, "loss": 0.7712, "step": 405 }, { "epoch": 0.026298909557408594, "grad_norm": 0.6211518086394292, "learning_rate": 5.259781911481719e-05, "loss": 0.6556, "step": 410 }, { "epoch": 0.026619627966645285, "grad_norm": 0.527786179579098, "learning_rate": 5.3239255933290575e-05, "loss": 0.6304, "step": 415 }, { "epoch": 0.026940346375881975, "grad_norm": 0.6225940856068456, "learning_rate": 5.388069275176395e-05, "loss": 0.7504, "step": 420 }, { "epoch": 0.027261064785118666, "grad_norm": 0.7472577597094603, "learning_rate": 5.4522129570237336e-05, "loss": 0.5737, "step": 425 }, { "epoch": 0.027581783194355357, "grad_norm": 0.9003123884674169, "learning_rate": 5.516356638871072e-05, "loss": 0.6751, "step": 430 }, { "epoch": 0.027902501603592048, "grad_norm": 1.193348964937134, "learning_rate": 5.580500320718409e-05, "loss": 0.6685, "step": 435 }, { "epoch": 0.028223220012828735, "grad_norm": 0.8207452374854483, "learning_rate": 5.644644002565747e-05, "loss": 0.5606, "step": 440 }, { "epoch": 0.028543938422065426, "grad_norm": 0.6253317338492933, "learning_rate": 5.7087876844130864e-05, "loss": 0.6848, "step": 445 }, { "epoch": 0.028864656831302116, "grad_norm": 0.5089340890778841, "learning_rate": 5.7729313662604234e-05, "loss": 0.5969, "step": 450 }, { "epoch": 0.029185375240538807, "grad_norm": 0.6403611822232731, "learning_rate": 5.837075048107762e-05, "loss": 0.6663, "step": 455 }, { "epoch": 0.029506093649775498, "grad_norm": 0.9017481128452324, "learning_rate": 5.9012187299550994e-05, "loss": 0.6253, "step": 460 }, { "epoch": 0.02982681205901219, "grad_norm": 0.7102935907261797, "learning_rate": 5.965362411802438e-05, "loss": 0.6032, "step": 465 }, { "epoch": 0.030147530468248876, "grad_norm": 0.572528044090495, "learning_rate": 6.029506093649776e-05, "loss": 0.7059, "step": 470 }, { "epoch": 0.030468248877485567, "grad_norm": 0.6507630672872388, "learning_rate": 6.093649775497113e-05, "loss": 0.551, "step": 475 }, { "epoch": 0.030788967286722257, "grad_norm": 0.4787872258590136, "learning_rate": 6.157793457344452e-05, "loss": 0.4953, "step": 480 }, { "epoch": 0.031109685695958948, "grad_norm": 0.6446626662145857, "learning_rate": 6.22193713919179e-05, "loss": 0.7073, "step": 485 }, { "epoch": 0.03143040410519564, "grad_norm": 0.46176975999305003, "learning_rate": 6.286080821039128e-05, "loss": 0.7031, "step": 490 }, { "epoch": 0.03175112251443233, "grad_norm": 0.6364571216466376, "learning_rate": 6.350224502886466e-05, "loss": 0.7208, "step": 495 }, { "epoch": 0.03207184092366902, "grad_norm": 0.6441271299481783, "learning_rate": 6.414368184733804e-05, "loss": 0.651, "step": 500 }, { "epoch": 0.03239255933290571, "grad_norm": 0.5277240516380076, "learning_rate": 6.478511866581141e-05, "loss": 0.7596, "step": 505 }, { "epoch": 0.0327132777421424, "grad_norm": 0.6102741778617242, "learning_rate": 6.54265554842848e-05, "loss": 0.8127, "step": 510 }, { "epoch": 0.033033996151379086, "grad_norm": 1.2909493866489476, "learning_rate": 6.606799230275818e-05, "loss": 0.6172, "step": 515 }, { "epoch": 0.03335471456061578, "grad_norm": 0.8290001510292774, "learning_rate": 6.670942912123156e-05, "loss": 0.7024, "step": 520 }, { "epoch": 0.03367543296985247, "grad_norm": 0.5082074367378367, "learning_rate": 6.735086593970495e-05, "loss": 0.5993, "step": 525 }, { "epoch": 0.03399615137908916, "grad_norm": 0.8948141239538124, "learning_rate": 6.799230275817833e-05, "loss": 0.6288, "step": 530 }, { "epoch": 0.03431686978832585, "grad_norm": 0.704188041016483, "learning_rate": 6.86337395766517e-05, "loss": 0.6173, "step": 535 }, { "epoch": 0.03463758819756254, "grad_norm": 0.8493617205406083, "learning_rate": 6.927517639512509e-05, "loss": 0.6472, "step": 540 }, { "epoch": 0.03495830660679923, "grad_norm": 0.6071336551640186, "learning_rate": 6.991661321359846e-05, "loss": 0.7066, "step": 545 }, { "epoch": 0.03527902501603592, "grad_norm": 0.6299761061285323, "learning_rate": 7.055805003207184e-05, "loss": 0.5004, "step": 550 }, { "epoch": 0.03559974342527261, "grad_norm": 0.36030076856010784, "learning_rate": 7.119948685054522e-05, "loss": 0.5939, "step": 555 }, { "epoch": 0.0359204618345093, "grad_norm": 0.5657747344505833, "learning_rate": 7.184092366901861e-05, "loss": 0.6394, "step": 560 }, { "epoch": 0.03624118024374599, "grad_norm": 0.5512464769253931, "learning_rate": 7.248236048749199e-05, "loss": 0.7496, "step": 565 }, { "epoch": 0.036561898652982684, "grad_norm": 1.6012481016769327, "learning_rate": 7.312379730596537e-05, "loss": 0.74, "step": 570 }, { "epoch": 0.03688261706221937, "grad_norm": 0.539931431422469, "learning_rate": 7.376523412443874e-05, "loss": 0.655, "step": 575 }, { "epoch": 0.03720333547145606, "grad_norm": 0.5792692922947517, "learning_rate": 7.440667094291213e-05, "loss": 0.6268, "step": 580 }, { "epoch": 0.03752405388069275, "grad_norm": 0.44904646394711184, "learning_rate": 7.504810776138551e-05, "loss": 0.621, "step": 585 }, { "epoch": 0.03784477228992944, "grad_norm": 0.38995414700568637, "learning_rate": 7.568954457985888e-05, "loss": 0.479, "step": 590 }, { "epoch": 0.03816549069916613, "grad_norm": 0.6100292909911376, "learning_rate": 7.633098139833227e-05, "loss": 0.5087, "step": 595 }, { "epoch": 0.03848620910840282, "grad_norm": 0.767232067956154, "learning_rate": 7.697241821680565e-05, "loss": 0.7094, "step": 600 }, { "epoch": 0.03880692751763951, "grad_norm": 0.5093223662182627, "learning_rate": 7.761385503527902e-05, "loss": 0.6216, "step": 605 }, { "epoch": 0.0391276459268762, "grad_norm": 0.604996949026468, "learning_rate": 7.825529185375242e-05, "loss": 0.6343, "step": 610 }, { "epoch": 0.039448364336112894, "grad_norm": 0.6313031887029451, "learning_rate": 7.88967286722258e-05, "loss": 0.6814, "step": 615 }, { "epoch": 0.039769082745349585, "grad_norm": 0.5515684818028812, "learning_rate": 7.953816549069917e-05, "loss": 0.6319, "step": 620 }, { "epoch": 0.040089801154586276, "grad_norm": 0.9067875561472081, "learning_rate": 8.017960230917255e-05, "loss": 0.626, "step": 625 }, { "epoch": 0.040410519563822966, "grad_norm": 0.4402348046376401, "learning_rate": 8.082103912764592e-05, "loss": 0.6581, "step": 630 }, { "epoch": 0.04073123797305965, "grad_norm": 0.6653624732467279, "learning_rate": 8.146247594611931e-05, "loss": 0.6266, "step": 635 }, { "epoch": 0.04105195638229634, "grad_norm": 0.7506028416479603, "learning_rate": 8.210391276459269e-05, "loss": 0.7304, "step": 640 }, { "epoch": 0.04137267479153303, "grad_norm": 0.43305772472870374, "learning_rate": 8.274534958306606e-05, "loss": 0.6272, "step": 645 }, { "epoch": 0.04169339320076972, "grad_norm": 0.7883927079167802, "learning_rate": 8.338678640153945e-05, "loss": 0.564, "step": 650 }, { "epoch": 0.04201411161000641, "grad_norm": 0.6406069976891953, "learning_rate": 8.402822322001283e-05, "loss": 0.6594, "step": 655 }, { "epoch": 0.042334830019243104, "grad_norm": 0.6650787540082842, "learning_rate": 8.466966003848621e-05, "loss": 0.6086, "step": 660 }, { "epoch": 0.042655548428479795, "grad_norm": 0.6280025445964529, "learning_rate": 8.53110968569596e-05, "loss": 0.6188, "step": 665 }, { "epoch": 0.042976266837716486, "grad_norm": 0.6181001304138187, "learning_rate": 8.595253367543298e-05, "loss": 0.6454, "step": 670 }, { "epoch": 0.043296985246953176, "grad_norm": 0.9164302121431295, "learning_rate": 8.659397049390635e-05, "loss": 0.7409, "step": 675 }, { "epoch": 0.04361770365618987, "grad_norm": 0.5146934352157929, "learning_rate": 8.723540731237973e-05, "loss": 0.7961, "step": 680 }, { "epoch": 0.04393842206542656, "grad_norm": 0.8884783771604745, "learning_rate": 8.787684413085312e-05, "loss": 0.7023, "step": 685 }, { "epoch": 0.04425914047466325, "grad_norm": 0.5972459928844025, "learning_rate": 8.851828094932649e-05, "loss": 0.6437, "step": 690 }, { "epoch": 0.04457985888389993, "grad_norm": 1.027137591537084, "learning_rate": 8.915971776779987e-05, "loss": 0.6461, "step": 695 }, { "epoch": 0.04490057729313662, "grad_norm": 0.684561126713197, "learning_rate": 8.980115458627326e-05, "loss": 0.6417, "step": 700 }, { "epoch": 0.045221295702373314, "grad_norm": 0.5791897637489775, "learning_rate": 9.044259140474664e-05, "loss": 0.6545, "step": 705 }, { "epoch": 0.045542014111610005, "grad_norm": 0.6093322265483176, "learning_rate": 9.108402822322002e-05, "loss": 0.5431, "step": 710 }, { "epoch": 0.045862732520846695, "grad_norm": 1.20412780035678, "learning_rate": 9.172546504169339e-05, "loss": 0.6122, "step": 715 }, { "epoch": 0.046183450930083386, "grad_norm": 0.4344736289735069, "learning_rate": 9.236690186016678e-05, "loss": 0.6896, "step": 720 }, { "epoch": 0.04650416933932008, "grad_norm": 0.479553471093618, "learning_rate": 9.300833867864016e-05, "loss": 0.7446, "step": 725 }, { "epoch": 0.04682488774855677, "grad_norm": 0.4175717995477323, "learning_rate": 9.364977549711353e-05, "loss": 0.5635, "step": 730 }, { "epoch": 0.04714560615779346, "grad_norm": 0.43527442203162864, "learning_rate": 9.429121231558691e-05, "loss": 0.5984, "step": 735 }, { "epoch": 0.04746632456703015, "grad_norm": 0.6764034597420034, "learning_rate": 9.49326491340603e-05, "loss": 0.6575, "step": 740 }, { "epoch": 0.04778704297626684, "grad_norm": 0.6994297524226791, "learning_rate": 9.557408595253368e-05, "loss": 0.6381, "step": 745 }, { "epoch": 0.04810776138550353, "grad_norm": 0.5924112864276749, "learning_rate": 9.621552277100707e-05, "loss": 0.6273, "step": 750 }, { "epoch": 0.04842847979474022, "grad_norm": 0.529839489096258, "learning_rate": 9.685695958948045e-05, "loss": 0.5524, "step": 755 }, { "epoch": 0.048749198203976905, "grad_norm": 0.5412474092793377, "learning_rate": 9.749839640795382e-05, "loss": 0.6584, "step": 760 }, { "epoch": 0.049069916613213596, "grad_norm": 0.62325178443721, "learning_rate": 9.81398332264272e-05, "loss": 0.7556, "step": 765 }, { "epoch": 0.04939063502245029, "grad_norm": 0.6185109985068113, "learning_rate": 9.878127004490059e-05, "loss": 0.6396, "step": 770 }, { "epoch": 0.04971135343168698, "grad_norm": 0.5650081284141024, "learning_rate": 9.942270686337396e-05, "loss": 0.6761, "step": 775 }, { "epoch": 0.05003207184092367, "grad_norm": 0.6838574740900004, "learning_rate": 0.00010006414368184733, "loss": 0.6228, "step": 780 }, { "epoch": 0.05035279025016036, "grad_norm": 0.6196830613093786, "learning_rate": 0.00010070558050032072, "loss": 0.6648, "step": 785 }, { "epoch": 0.05067350865939705, "grad_norm": 0.5504649558203162, "learning_rate": 0.00010134701731879411, "loss": 0.697, "step": 790 }, { "epoch": 0.05099422706863374, "grad_norm": 0.654837344932131, "learning_rate": 0.00010198845413726748, "loss": 0.6986, "step": 795 }, { "epoch": 0.05131494547787043, "grad_norm": 0.7011329232246133, "learning_rate": 0.00010262989095574088, "loss": 0.7206, "step": 800 }, { "epoch": 0.05163566388710712, "grad_norm": 0.6807528459174979, "learning_rate": 0.00010327132777421425, "loss": 0.6834, "step": 805 }, { "epoch": 0.05195638229634381, "grad_norm": 0.8856217259425705, "learning_rate": 0.00010391276459268762, "loss": 0.7028, "step": 810 }, { "epoch": 0.052277100705580504, "grad_norm": 0.5962908888781525, "learning_rate": 0.00010455420141116101, "loss": 0.5113, "step": 815 }, { "epoch": 0.05259781911481719, "grad_norm": 0.9014177998142, "learning_rate": 0.00010519563822963438, "loss": 0.6129, "step": 820 }, { "epoch": 0.05291853752405388, "grad_norm": 0.6753791164158136, "learning_rate": 0.00010583707504810775, "loss": 0.756, "step": 825 }, { "epoch": 0.05323925593329057, "grad_norm": 0.48791891735015575, "learning_rate": 0.00010647851186658115, "loss": 0.5352, "step": 830 }, { "epoch": 0.05355997434252726, "grad_norm": 0.7373582383544524, "learning_rate": 0.00010711994868505453, "loss": 0.7345, "step": 835 }, { "epoch": 0.05388069275176395, "grad_norm": 0.49964472362766127, "learning_rate": 0.0001077613855035279, "loss": 0.7314, "step": 840 }, { "epoch": 0.05420141116100064, "grad_norm": 0.48415921267506284, "learning_rate": 0.0001084028223220013, "loss": 0.5548, "step": 845 }, { "epoch": 0.05452212957023733, "grad_norm": 0.6197607704084165, "learning_rate": 0.00010904425914047467, "loss": 0.6271, "step": 850 }, { "epoch": 0.05484284797947402, "grad_norm": 0.677683386452661, "learning_rate": 0.00010968569595894804, "loss": 0.7739, "step": 855 }, { "epoch": 0.055163566388710714, "grad_norm": 0.7298215600744931, "learning_rate": 0.00011032713277742144, "loss": 0.6813, "step": 860 }, { "epoch": 0.055484284797947404, "grad_norm": 0.49556474863687744, "learning_rate": 0.00011096856959589481, "loss": 0.7165, "step": 865 }, { "epoch": 0.055805003207184095, "grad_norm": 0.4755941527376833, "learning_rate": 0.00011161000641436818, "loss": 0.7439, "step": 870 }, { "epoch": 0.056125721616420786, "grad_norm": 0.8183131489420952, "learning_rate": 0.00011225144323284158, "loss": 0.7741, "step": 875 }, { "epoch": 0.05644644002565747, "grad_norm": 0.577588746397813, "learning_rate": 0.00011289288005131495, "loss": 0.6951, "step": 880 }, { "epoch": 0.05676715843489416, "grad_norm": 0.3104626766912227, "learning_rate": 0.00011353431686978833, "loss": 0.6068, "step": 885 }, { "epoch": 0.05708787684413085, "grad_norm": 0.6364607751424182, "learning_rate": 0.00011417575368826173, "loss": 0.6601, "step": 890 }, { "epoch": 0.05740859525336754, "grad_norm": 0.5489548053878326, "learning_rate": 0.0001148171905067351, "loss": 0.6498, "step": 895 }, { "epoch": 0.05772931366260423, "grad_norm": 0.8290809901584166, "learning_rate": 0.00011545862732520847, "loss": 0.7598, "step": 900 }, { "epoch": 0.058050032071840924, "grad_norm": 0.9889805070312973, "learning_rate": 0.00011610006414368186, "loss": 0.6528, "step": 905 }, { "epoch": 0.058370750481077614, "grad_norm": 0.5034027315098741, "learning_rate": 0.00011674150096215524, "loss": 0.6916, "step": 910 }, { "epoch": 0.058691468890314305, "grad_norm": 0.5211514737547632, "learning_rate": 0.0001173829377806286, "loss": 0.6455, "step": 915 }, { "epoch": 0.059012187299550996, "grad_norm": 0.5915915443611912, "learning_rate": 0.00011802437459910199, "loss": 0.619, "step": 920 }, { "epoch": 0.05933290570878769, "grad_norm": 0.6356669965403786, "learning_rate": 0.00011866581141757537, "loss": 0.6339, "step": 925 }, { "epoch": 0.05965362411802438, "grad_norm": 0.5203747383599147, "learning_rate": 0.00011930724823604876, "loss": 0.7174, "step": 930 }, { "epoch": 0.05997434252726107, "grad_norm": 0.4400681567105204, "learning_rate": 0.00011994868505452213, "loss": 0.757, "step": 935 }, { "epoch": 0.06029506093649775, "grad_norm": 0.5134463977576896, "learning_rate": 0.00012059012187299552, "loss": 0.5941, "step": 940 }, { "epoch": 0.06061577934573444, "grad_norm": 2.1514572255404563, "learning_rate": 0.0001212315586914689, "loss": 0.5872, "step": 945 }, { "epoch": 0.06093649775497113, "grad_norm": 0.5533804918183362, "learning_rate": 0.00012187299550994226, "loss": 0.681, "step": 950 }, { "epoch": 0.061257216164207824, "grad_norm": 0.43736512301454394, "learning_rate": 0.00012251443232841566, "loss": 0.6241, "step": 955 }, { "epoch": 0.061577934573444515, "grad_norm": 0.7036625039029036, "learning_rate": 0.00012315586914688904, "loss": 0.7528, "step": 960 }, { "epoch": 0.061898652982681206, "grad_norm": 0.5883952786255479, "learning_rate": 0.0001237973059653624, "loss": 0.6132, "step": 965 }, { "epoch": 0.062219371391917896, "grad_norm": 0.593687347482467, "learning_rate": 0.0001244387427838358, "loss": 0.6453, "step": 970 }, { "epoch": 0.06254008980115458, "grad_norm": 0.8797836564455341, "learning_rate": 0.00012508017960230917, "loss": 0.6658, "step": 975 }, { "epoch": 0.06286080821039128, "grad_norm": 0.8231331839998992, "learning_rate": 0.00012572161642078255, "loss": 0.6615, "step": 980 }, { "epoch": 0.06318152661962796, "grad_norm": 0.5202568995405973, "learning_rate": 0.00012636305323925594, "loss": 0.8156, "step": 985 }, { "epoch": 0.06350224502886466, "grad_norm": 0.623580493806845, "learning_rate": 0.00012700449005772932, "loss": 0.6959, "step": 990 }, { "epoch": 0.06382296343810134, "grad_norm": 0.5798575607273242, "learning_rate": 0.0001276459268762027, "loss": 0.5538, "step": 995 }, { "epoch": 0.06414368184733804, "grad_norm": 0.6970653558425355, "learning_rate": 0.0001282873636946761, "loss": 0.7063, "step": 1000 }, { "epoch": 0.06446440025657472, "grad_norm": 0.8241115273976609, "learning_rate": 0.00012892880051314947, "loss": 0.6371, "step": 1005 }, { "epoch": 0.06478511866581142, "grad_norm": 0.7769868872755683, "learning_rate": 0.00012957023733162283, "loss": 0.6202, "step": 1010 }, { "epoch": 0.0651058370750481, "grad_norm": 0.4974832858382039, "learning_rate": 0.00013021167415009624, "loss": 0.652, "step": 1015 }, { "epoch": 0.0654265554842848, "grad_norm": 0.7988613498086312, "learning_rate": 0.0001308531109685696, "loss": 0.6179, "step": 1020 }, { "epoch": 0.06574727389352149, "grad_norm": 0.5975032929676001, "learning_rate": 0.00013149454778704298, "loss": 0.7551, "step": 1025 }, { "epoch": 0.06606799230275817, "grad_norm": 0.46478481189365806, "learning_rate": 0.00013213598460551636, "loss": 0.6643, "step": 1030 }, { "epoch": 0.06638871071199487, "grad_norm": 0.5467473022741837, "learning_rate": 0.00013277742142398975, "loss": 0.6786, "step": 1035 }, { "epoch": 0.06670942912123155, "grad_norm": 0.788511157965346, "learning_rate": 0.00013341885824246313, "loss": 0.699, "step": 1040 }, { "epoch": 0.06703014753046825, "grad_norm": 0.7378591658959022, "learning_rate": 0.0001340602950609365, "loss": 0.5498, "step": 1045 }, { "epoch": 0.06735086593970493, "grad_norm": 0.524580967213953, "learning_rate": 0.0001347017318794099, "loss": 0.7092, "step": 1050 }, { "epoch": 0.06767158434894163, "grad_norm": 10.11033461685559, "learning_rate": 0.00013534316869788325, "loss": 0.6694, "step": 1055 }, { "epoch": 0.06799230275817832, "grad_norm": 0.6039061177211199, "learning_rate": 0.00013598460551635666, "loss": 0.6105, "step": 1060 }, { "epoch": 0.06831302116741501, "grad_norm": 0.7863303522868051, "learning_rate": 0.00013662604233483002, "loss": 0.6867, "step": 1065 }, { "epoch": 0.0686337395766517, "grad_norm": 0.6197712573428509, "learning_rate": 0.0001372674791533034, "loss": 0.6893, "step": 1070 }, { "epoch": 0.0689544579858884, "grad_norm": 0.43888192750291055, "learning_rate": 0.0001379089159717768, "loss": 0.7157, "step": 1075 }, { "epoch": 0.06927517639512508, "grad_norm": 0.7306535592576365, "learning_rate": 0.00013855035279025017, "loss": 0.7648, "step": 1080 }, { "epoch": 0.06959589480436178, "grad_norm": 0.5833095869044383, "learning_rate": 0.00013919178960872356, "loss": 0.6655, "step": 1085 }, { "epoch": 0.06991661321359846, "grad_norm": 0.3330431009666685, "learning_rate": 0.0001398332264271969, "loss": 0.5681, "step": 1090 }, { "epoch": 0.07023733162283514, "grad_norm": 0.8485768964159431, "learning_rate": 0.00014047466324567032, "loss": 0.6139, "step": 1095 }, { "epoch": 0.07055805003207184, "grad_norm": 0.48935398848123357, "learning_rate": 0.00014111610006414368, "loss": 0.6591, "step": 1100 }, { "epoch": 0.07087876844130853, "grad_norm": 0.6694840056428312, "learning_rate": 0.00014175753688261706, "loss": 0.5986, "step": 1105 }, { "epoch": 0.07119948685054522, "grad_norm": 0.7907065672480846, "learning_rate": 0.00014239897370109045, "loss": 0.8477, "step": 1110 }, { "epoch": 0.07152020525978191, "grad_norm": 0.45721463553494507, "learning_rate": 0.00014304041051956383, "loss": 0.6511, "step": 1115 }, { "epoch": 0.0718409236690186, "grad_norm": 0.5932773719713492, "learning_rate": 0.00014368184733803721, "loss": 0.6205, "step": 1120 }, { "epoch": 0.07216164207825529, "grad_norm": 0.7933284443225256, "learning_rate": 0.0001443232841565106, "loss": 0.524, "step": 1125 }, { "epoch": 0.07248236048749199, "grad_norm": 0.4677884329123659, "learning_rate": 0.00014496472097498398, "loss": 0.555, "step": 1130 }, { "epoch": 0.07280307889672867, "grad_norm": 0.850254756515873, "learning_rate": 0.00014560615779345734, "loss": 0.7627, "step": 1135 }, { "epoch": 0.07312379730596537, "grad_norm": 0.522103651356661, "learning_rate": 0.00014624759461193075, "loss": 0.7255, "step": 1140 }, { "epoch": 0.07344451571520205, "grad_norm": 0.6063292373713933, "learning_rate": 0.0001468890314304041, "loss": 0.6222, "step": 1145 }, { "epoch": 0.07376523412443874, "grad_norm": 0.9713303841273095, "learning_rate": 0.0001475304682488775, "loss": 0.7341, "step": 1150 }, { "epoch": 0.07408595253367543, "grad_norm": 0.837884018201796, "learning_rate": 0.00014817190506735087, "loss": 0.6822, "step": 1155 }, { "epoch": 0.07440667094291212, "grad_norm": 0.39437246960153793, "learning_rate": 0.00014881334188582426, "loss": 0.7116, "step": 1160 }, { "epoch": 0.07472738935214882, "grad_norm": 0.6202094758512229, "learning_rate": 0.0001494547787042976, "loss": 0.6015, "step": 1165 }, { "epoch": 0.0750481077613855, "grad_norm": 0.8135054592447762, "learning_rate": 0.00015009621552277102, "loss": 0.6487, "step": 1170 }, { "epoch": 0.0753688261706222, "grad_norm": 0.5507524560111344, "learning_rate": 0.0001507376523412444, "loss": 0.5846, "step": 1175 }, { "epoch": 0.07568954457985888, "grad_norm": 0.5961939171868111, "learning_rate": 0.00015137908915971776, "loss": 0.6111, "step": 1180 }, { "epoch": 0.07601026298909558, "grad_norm": 0.5352884760699661, "learning_rate": 0.00015202052597819118, "loss": 0.6401, "step": 1185 }, { "epoch": 0.07633098139833226, "grad_norm": 0.6620834657515849, "learning_rate": 0.00015266196279666453, "loss": 0.7108, "step": 1190 }, { "epoch": 0.07665169980756896, "grad_norm": 0.24886726646481336, "learning_rate": 0.00015330339961513792, "loss": 0.465, "step": 1195 }, { "epoch": 0.07697241821680564, "grad_norm": 0.5949618384904851, "learning_rate": 0.0001539448364336113, "loss": 0.6872, "step": 1200 }, { "epoch": 0.07729313662604234, "grad_norm": 0.7888477619326826, "learning_rate": 0.00015458627325208468, "loss": 0.5609, "step": 1205 }, { "epoch": 0.07761385503527903, "grad_norm": 1.0669700966748508, "learning_rate": 0.00015522771007055804, "loss": 0.7743, "step": 1210 }, { "epoch": 0.07793457344451571, "grad_norm": 0.7068283314311553, "learning_rate": 0.00015586914688903145, "loss": 0.6263, "step": 1215 }, { "epoch": 0.0782552918537524, "grad_norm": 0.5841407337661559, "learning_rate": 0.00015651058370750483, "loss": 0.6187, "step": 1220 }, { "epoch": 0.07857601026298909, "grad_norm": 0.6229227132294815, "learning_rate": 0.0001571520205259782, "loss": 0.7183, "step": 1225 }, { "epoch": 0.07889672867222579, "grad_norm": 0.6002586833079545, "learning_rate": 0.0001577934573444516, "loss": 0.7077, "step": 1230 }, { "epoch": 0.07921744708146247, "grad_norm": 0.5383734940611982, "learning_rate": 0.00015843489416292496, "loss": 0.6251, "step": 1235 }, { "epoch": 0.07953816549069917, "grad_norm": 0.6051581628244698, "learning_rate": 0.00015907633098139834, "loss": 0.6742, "step": 1240 }, { "epoch": 0.07985888389993585, "grad_norm": 0.6524111511727346, "learning_rate": 0.0001597177677998717, "loss": 0.6258, "step": 1245 }, { "epoch": 0.08017960230917255, "grad_norm": 0.8452071724294624, "learning_rate": 0.0001603592046183451, "loss": 0.6583, "step": 1250 }, { "epoch": 0.08050032071840924, "grad_norm": 0.5380526459581828, "learning_rate": 0.00016100064143681847, "loss": 0.7976, "step": 1255 }, { "epoch": 0.08082103912764593, "grad_norm": 0.846065125270878, "learning_rate": 0.00016164207825529185, "loss": 0.5684, "step": 1260 }, { "epoch": 0.08114175753688262, "grad_norm": 1.2668855662638892, "learning_rate": 0.00016228351507376523, "loss": 0.6079, "step": 1265 }, { "epoch": 0.0814624759461193, "grad_norm": 0.7795964267281216, "learning_rate": 0.00016292495189223862, "loss": 0.646, "step": 1270 }, { "epoch": 0.081783194355356, "grad_norm": 0.7027735707273621, "learning_rate": 0.000163566388710712, "loss": 0.7358, "step": 1275 }, { "epoch": 0.08210391276459268, "grad_norm": 0.6792816013615487, "learning_rate": 0.00016420782552918538, "loss": 0.6695, "step": 1280 }, { "epoch": 0.08242463117382938, "grad_norm": 0.6182179483058359, "learning_rate": 0.00016484926234765877, "loss": 0.6096, "step": 1285 }, { "epoch": 0.08274534958306606, "grad_norm": 0.7701573171054498, "learning_rate": 0.00016549069916613212, "loss": 0.6467, "step": 1290 }, { "epoch": 0.08306606799230276, "grad_norm": 0.8699247842006342, "learning_rate": 0.00016613213598460554, "loss": 0.5635, "step": 1295 }, { "epoch": 0.08338678640153944, "grad_norm": 1.5815526952211336, "learning_rate": 0.0001667735728030789, "loss": 0.7091, "step": 1300 }, { "epoch": 0.08370750481077614, "grad_norm": 1.1184328365345817, "learning_rate": 0.00016741500962155228, "loss": 0.6598, "step": 1305 }, { "epoch": 0.08402822322001283, "grad_norm": 0.5795213958251844, "learning_rate": 0.00016805644644002566, "loss": 0.6638, "step": 1310 }, { "epoch": 0.08434894162924952, "grad_norm": 0.9373149156332843, "learning_rate": 0.00016869788325849904, "loss": 0.6091, "step": 1315 }, { "epoch": 0.08466966003848621, "grad_norm": 1.581754110063961, "learning_rate": 0.00016933932007697243, "loss": 0.6641, "step": 1320 }, { "epoch": 0.0849903784477229, "grad_norm": 0.8932544552326179, "learning_rate": 0.0001699807568954458, "loss": 0.7052, "step": 1325 }, { "epoch": 0.08531109685695959, "grad_norm": 0.7663989496912428, "learning_rate": 0.0001706221937139192, "loss": 0.6517, "step": 1330 }, { "epoch": 0.08563181526619627, "grad_norm": 0.46405474836741084, "learning_rate": 0.00017126363053239255, "loss": 0.6357, "step": 1335 }, { "epoch": 0.08595253367543297, "grad_norm": 0.6382018266002509, "learning_rate": 0.00017190506735086596, "loss": 0.5614, "step": 1340 }, { "epoch": 0.08627325208466965, "grad_norm": 0.43085923514031815, "learning_rate": 0.00017254650416933932, "loss": 0.6499, "step": 1345 }, { "epoch": 0.08659397049390635, "grad_norm": 0.8362450675258178, "learning_rate": 0.0001731879409878127, "loss": 0.7166, "step": 1350 }, { "epoch": 0.08691468890314304, "grad_norm": 0.6383324045212788, "learning_rate": 0.00017382937780628609, "loss": 0.6072, "step": 1355 }, { "epoch": 0.08723540731237973, "grad_norm": 0.8935601954358443, "learning_rate": 0.00017447081462475947, "loss": 0.6483, "step": 1360 }, { "epoch": 0.08755612572161642, "grad_norm": 0.7202566228037989, "learning_rate": 0.00017511225144323285, "loss": 0.5967, "step": 1365 }, { "epoch": 0.08787684413085312, "grad_norm": 0.5230697995372986, "learning_rate": 0.00017575368826170624, "loss": 0.7513, "step": 1370 }, { "epoch": 0.0881975625400898, "grad_norm": 0.816705171178794, "learning_rate": 0.00017639512508017962, "loss": 0.6651, "step": 1375 }, { "epoch": 0.0885182809493265, "grad_norm": 0.5342326962298032, "learning_rate": 0.00017703656189865298, "loss": 0.5963, "step": 1380 }, { "epoch": 0.08883899935856318, "grad_norm": 0.5984545509333057, "learning_rate": 0.0001776779987171264, "loss": 0.6455, "step": 1385 }, { "epoch": 0.08915971776779986, "grad_norm": 0.6477898184624558, "learning_rate": 0.00017831943553559974, "loss": 0.7328, "step": 1390 }, { "epoch": 0.08948043617703656, "grad_norm": 0.5092110599627088, "learning_rate": 0.00017896087235407313, "loss": 0.6279, "step": 1395 }, { "epoch": 0.08980115458627325, "grad_norm": 0.7029802255673286, "learning_rate": 0.0001796023091725465, "loss": 0.7776, "step": 1400 }, { "epoch": 0.09012187299550994, "grad_norm": 0.8832343335799728, "learning_rate": 0.0001802437459910199, "loss": 0.6111, "step": 1405 }, { "epoch": 0.09044259140474663, "grad_norm": 0.8016746694750925, "learning_rate": 0.00018088518280949328, "loss": 0.6695, "step": 1410 }, { "epoch": 0.09076330981398333, "grad_norm": 1.1634306884211862, "learning_rate": 0.00018152661962796664, "loss": 0.7114, "step": 1415 }, { "epoch": 0.09108402822322001, "grad_norm": 0.6624473023568856, "learning_rate": 0.00018216805644644005, "loss": 0.7559, "step": 1420 }, { "epoch": 0.09140474663245671, "grad_norm": 0.8042759336949421, "learning_rate": 0.0001828094932649134, "loss": 0.7583, "step": 1425 }, { "epoch": 0.09172546504169339, "grad_norm": 0.9772784468918035, "learning_rate": 0.00018345093008338679, "loss": 0.6853, "step": 1430 }, { "epoch": 0.09204618345093009, "grad_norm": 0.5803428867246113, "learning_rate": 0.00018409236690186017, "loss": 0.6201, "step": 1435 }, { "epoch": 0.09236690186016677, "grad_norm": 1.0135605254585267, "learning_rate": 0.00018473380372033355, "loss": 0.5897, "step": 1440 }, { "epoch": 0.09268762026940347, "grad_norm": 1.0146665351265378, "learning_rate": 0.00018537524053880694, "loss": 0.7739, "step": 1445 }, { "epoch": 0.09300833867864015, "grad_norm": 0.6409550994368336, "learning_rate": 0.00018601667735728032, "loss": 0.652, "step": 1450 }, { "epoch": 0.09332905708787684, "grad_norm": 0.9063409381829404, "learning_rate": 0.0001866581141757537, "loss": 0.5091, "step": 1455 }, { "epoch": 0.09364977549711354, "grad_norm": 0.6035311851346433, "learning_rate": 0.00018729955099422706, "loss": 0.5951, "step": 1460 }, { "epoch": 0.09397049390635022, "grad_norm": 0.4305914555852047, "learning_rate": 0.00018794098781270047, "loss": 0.6979, "step": 1465 }, { "epoch": 0.09429121231558692, "grad_norm": 0.592322337116948, "learning_rate": 0.00018858242463117383, "loss": 0.7894, "step": 1470 }, { "epoch": 0.0946119307248236, "grad_norm": 0.7019994823024447, "learning_rate": 0.0001892238614496472, "loss": 0.6685, "step": 1475 }, { "epoch": 0.0949326491340603, "grad_norm": 0.6511984672543305, "learning_rate": 0.0001898652982681206, "loss": 0.7993, "step": 1480 }, { "epoch": 0.09525336754329698, "grad_norm": 0.7220123377652353, "learning_rate": 0.00019050673508659398, "loss": 0.6424, "step": 1485 }, { "epoch": 0.09557408595253368, "grad_norm": 0.569165004645741, "learning_rate": 0.00019114817190506736, "loss": 0.5879, "step": 1490 }, { "epoch": 0.09589480436177036, "grad_norm": 0.6841283140830406, "learning_rate": 0.00019178960872354075, "loss": 0.6944, "step": 1495 }, { "epoch": 0.09621552277100706, "grad_norm": 0.5806780565962407, "learning_rate": 0.00019243104554201413, "loss": 0.7039, "step": 1500 }, { "epoch": 0.09653624118024375, "grad_norm": 1.0231588558162683, "learning_rate": 0.0001930724823604875, "loss": 0.7447, "step": 1505 }, { "epoch": 0.09685695958948044, "grad_norm": 0.6513202839027658, "learning_rate": 0.0001937139191789609, "loss": 0.6013, "step": 1510 }, { "epoch": 0.09717767799871713, "grad_norm": 0.7845659853361092, "learning_rate": 0.00019435535599743425, "loss": 0.6069, "step": 1515 }, { "epoch": 0.09749839640795381, "grad_norm": 0.7194048768316849, "learning_rate": 0.00019499679281590764, "loss": 0.7641, "step": 1520 }, { "epoch": 0.09781911481719051, "grad_norm": 0.6191788469641755, "learning_rate": 0.00019563822963438102, "loss": 0.7448, "step": 1525 }, { "epoch": 0.09813983322642719, "grad_norm": 0.7426546495672112, "learning_rate": 0.0001962796664528544, "loss": 0.7761, "step": 1530 }, { "epoch": 0.09846055163566389, "grad_norm": 0.7572762314827131, "learning_rate": 0.0001969211032713278, "loss": 0.8618, "step": 1535 }, { "epoch": 0.09878127004490057, "grad_norm": 0.6372317781767599, "learning_rate": 0.00019756254008980117, "loss": 0.6666, "step": 1540 }, { "epoch": 0.09910198845413727, "grad_norm": 0.6326871836739665, "learning_rate": 0.00019820397690827456, "loss": 0.6547, "step": 1545 }, { "epoch": 0.09942270686337396, "grad_norm": 1.1563371750862326, "learning_rate": 0.0001988454137267479, "loss": 0.6453, "step": 1550 }, { "epoch": 0.09974342527261065, "grad_norm": 0.9479492008239019, "learning_rate": 0.00019948685054522132, "loss": 0.651, "step": 1555 }, { "epoch": 0.10006414368184734, "grad_norm": 0.7535422723224012, "learning_rate": 0.00019999999749335695, "loss": 0.7093, "step": 1560 }, { "epoch": 0.10038486209108403, "grad_norm": 0.5932112107729582, "learning_rate": 0.00019999990976086248, "loss": 0.7182, "step": 1565 }, { "epoch": 0.10070558050032072, "grad_norm": 0.6993778331415806, "learning_rate": 0.00019999969669633985, "loss": 0.6146, "step": 1570 }, { "epoch": 0.1010262989095574, "grad_norm": 0.7283971397341802, "learning_rate": 0.00019999935830005615, "loss": 0.6496, "step": 1575 }, { "epoch": 0.1013470173187941, "grad_norm": 1.0242803435192598, "learning_rate": 0.00019999889457243545, "loss": 0.8042, "step": 1580 }, { "epoch": 0.10166773572803078, "grad_norm": 0.6322290861624766, "learning_rate": 0.000199998305514059, "loss": 0.7667, "step": 1585 }, { "epoch": 0.10198845413726748, "grad_norm": 0.6155965395909687, "learning_rate": 0.00019999759112566498, "loss": 0.6363, "step": 1590 }, { "epoch": 0.10230917254650417, "grad_norm": 0.4557230080410517, "learning_rate": 0.00019999675140814887, "loss": 0.5606, "step": 1595 }, { "epoch": 0.10262989095574086, "grad_norm": 0.6477761450960091, "learning_rate": 0.00019999578636256302, "loss": 0.6693, "step": 1600 }, { "epoch": 0.10295060936497755, "grad_norm": 0.8654904236010101, "learning_rate": 0.000199994695990117, "loss": 0.6314, "step": 1605 }, { "epoch": 0.10327132777421424, "grad_norm": 0.6903326653951578, "learning_rate": 0.00019999348029217732, "loss": 0.6179, "step": 1610 }, { "epoch": 0.10359204618345093, "grad_norm": 0.8840202106741641, "learning_rate": 0.00019999213927026775, "loss": 0.6985, "step": 1615 }, { "epoch": 0.10391276459268763, "grad_norm": 0.668673191642802, "learning_rate": 0.00019999067292606894, "loss": 0.6218, "step": 1620 }, { "epoch": 0.10423348300192431, "grad_norm": 0.8731375253636731, "learning_rate": 0.00019998908126141868, "loss": 0.6898, "step": 1625 }, { "epoch": 0.10455420141116101, "grad_norm": 1.0983344411460778, "learning_rate": 0.00019998736427831194, "loss": 0.7532, "step": 1630 }, { "epoch": 0.10487491982039769, "grad_norm": 0.5721731282366914, "learning_rate": 0.00019998552197890052, "loss": 0.6003, "step": 1635 }, { "epoch": 0.10519563822963438, "grad_norm": 0.679795611170959, "learning_rate": 0.0001999835543654935, "loss": 0.7003, "step": 1640 }, { "epoch": 0.10551635663887107, "grad_norm": 0.7659746917304108, "learning_rate": 0.0001999814614405569, "loss": 0.6359, "step": 1645 }, { "epoch": 0.10583707504810776, "grad_norm": 1.1962756283471876, "learning_rate": 0.00019997924320671383, "loss": 0.6308, "step": 1650 }, { "epoch": 0.10615779345734445, "grad_norm": 0.7621683185763631, "learning_rate": 0.00019997689966674446, "loss": 0.7957, "step": 1655 }, { "epoch": 0.10647851186658114, "grad_norm": 0.7338531701197929, "learning_rate": 0.00019997443082358601, "loss": 0.6757, "step": 1660 }, { "epoch": 0.10679923027581784, "grad_norm": 0.7150664806057576, "learning_rate": 0.00019997183668033267, "loss": 0.694, "step": 1665 }, { "epoch": 0.10711994868505452, "grad_norm": 0.7869356473972234, "learning_rate": 0.0001999691172402358, "loss": 0.719, "step": 1670 }, { "epoch": 0.10744066709429122, "grad_norm": 0.611503667039071, "learning_rate": 0.00019996627250670374, "loss": 0.6343, "step": 1675 }, { "epoch": 0.1077613855035279, "grad_norm": 0.7766135920581687, "learning_rate": 0.00019996330248330183, "loss": 0.693, "step": 1680 }, { "epoch": 0.1080821039127646, "grad_norm": 0.4786388847248821, "learning_rate": 0.00019996020717375247, "loss": 0.6194, "step": 1685 }, { "epoch": 0.10840282232200128, "grad_norm": 0.6991936018277035, "learning_rate": 0.000199956986581935, "loss": 0.7263, "step": 1690 }, { "epoch": 0.10872354073123797, "grad_norm": 0.7205841321201338, "learning_rate": 0.000199953640711886, "loss": 0.4831, "step": 1695 }, { "epoch": 0.10904425914047466, "grad_norm": 0.9131191032401795, "learning_rate": 0.00019995016956779886, "loss": 0.5177, "step": 1700 }, { "epoch": 0.10936497754971135, "grad_norm": 0.5536147800325968, "learning_rate": 0.000199946573154024, "loss": 0.6789, "step": 1705 }, { "epoch": 0.10968569595894805, "grad_norm": 0.6451976876558219, "learning_rate": 0.00019994285147506888, "loss": 0.7275, "step": 1710 }, { "epoch": 0.11000641436818473, "grad_norm": 0.9579506214333907, "learning_rate": 0.00019993900453559805, "loss": 0.6589, "step": 1715 }, { "epoch": 0.11032713277742143, "grad_norm": 0.9260040237199151, "learning_rate": 0.00019993503234043284, "loss": 0.6823, "step": 1720 }, { "epoch": 0.11064785118665811, "grad_norm": 0.9505358223036796, "learning_rate": 0.00019993093489455182, "loss": 0.7616, "step": 1725 }, { "epoch": 0.11096856959589481, "grad_norm": 0.7825553328319829, "learning_rate": 0.0001999267122030903, "loss": 0.6443, "step": 1730 }, { "epoch": 0.11128928800513149, "grad_norm": 1.277608679789176, "learning_rate": 0.00019992236427134069, "loss": 0.6155, "step": 1735 }, { "epoch": 0.11161000641436819, "grad_norm": 0.5889261013180431, "learning_rate": 0.00019991789110475238, "loss": 0.6994, "step": 1740 }, { "epoch": 0.11193072482360487, "grad_norm": 0.8029959511201281, "learning_rate": 0.00019991329270893163, "loss": 0.5902, "step": 1745 }, { "epoch": 0.11225144323284157, "grad_norm": 0.8303612970994603, "learning_rate": 0.00019990856908964178, "loss": 0.783, "step": 1750 }, { "epoch": 0.11257216164207826, "grad_norm": 0.7054559375502497, "learning_rate": 0.00019990372025280304, "loss": 0.6792, "step": 1755 }, { "epoch": 0.11289288005131494, "grad_norm": 0.7420987703476908, "learning_rate": 0.0001998987462044925, "loss": 0.6013, "step": 1760 }, { "epoch": 0.11321359846055164, "grad_norm": 0.7094425366646243, "learning_rate": 0.00019989364695094426, "loss": 0.5688, "step": 1765 }, { "epoch": 0.11353431686978832, "grad_norm": 0.569373653159604, "learning_rate": 0.00019988842249854934, "loss": 0.58, "step": 1770 }, { "epoch": 0.11385503527902502, "grad_norm": 0.46978550262066865, "learning_rate": 0.00019988307285385566, "loss": 0.7256, "step": 1775 }, { "epoch": 0.1141757536882617, "grad_norm": 0.6612438373633108, "learning_rate": 0.00019987759802356803, "loss": 0.7488, "step": 1780 }, { "epoch": 0.1144964720974984, "grad_norm": 0.7309333682103005, "learning_rate": 0.00019987199801454816, "loss": 0.7284, "step": 1785 }, { "epoch": 0.11481719050673508, "grad_norm": 0.9460563497076551, "learning_rate": 0.00019986627283381472, "loss": 0.6057, "step": 1790 }, { "epoch": 0.11513790891597178, "grad_norm": 0.6266870049609108, "learning_rate": 0.00019986042248854312, "loss": 0.6476, "step": 1795 }, { "epoch": 0.11545862732520847, "grad_norm": 0.7739200309128734, "learning_rate": 0.0001998544469860658, "loss": 0.6622, "step": 1800 }, { "epoch": 0.11577934573444516, "grad_norm": 0.7469556806210228, "learning_rate": 0.00019984834633387193, "loss": 0.5747, "step": 1805 }, { "epoch": 0.11610006414368185, "grad_norm": 0.5367955199234249, "learning_rate": 0.00019984212053960763, "loss": 0.671, "step": 1810 }, { "epoch": 0.11642078255291853, "grad_norm": 1.0704497861003814, "learning_rate": 0.00019983576961107576, "loss": 0.6748, "step": 1815 }, { "epoch": 0.11674150096215523, "grad_norm": 0.6669764759339204, "learning_rate": 0.00019982929355623615, "loss": 0.7167, "step": 1820 }, { "epoch": 0.11706221937139191, "grad_norm": 0.6039796198063991, "learning_rate": 0.00019982269238320532, "loss": 0.6067, "step": 1825 }, { "epoch": 0.11738293778062861, "grad_norm": 1.7098001118613075, "learning_rate": 0.00019981596610025668, "loss": 0.7805, "step": 1830 }, { "epoch": 0.1177036561898653, "grad_norm": 0.7398855694010563, "learning_rate": 0.00019980911471582043, "loss": 0.6427, "step": 1835 }, { "epoch": 0.11802437459910199, "grad_norm": 0.8354800121875872, "learning_rate": 0.0001998021382384836, "loss": 0.7408, "step": 1840 }, { "epoch": 0.11834509300833868, "grad_norm": 0.6722235019789473, "learning_rate": 0.00019979503667698985, "loss": 0.6435, "step": 1845 }, { "epoch": 0.11866581141757537, "grad_norm": 0.717593721397057, "learning_rate": 0.00019978781004023982, "loss": 0.6241, "step": 1850 }, { "epoch": 0.11898652982681206, "grad_norm": 0.7195515776738803, "learning_rate": 0.00019978045833729074, "loss": 0.5947, "step": 1855 }, { "epoch": 0.11930724823604875, "grad_norm": 0.8882886022840869, "learning_rate": 0.00019977298157735672, "loss": 0.7388, "step": 1860 }, { "epoch": 0.11962796664528544, "grad_norm": 0.989988319302347, "learning_rate": 0.0001997653797698085, "loss": 0.7599, "step": 1865 }, { "epoch": 0.11994868505452214, "grad_norm": 0.8403633651058144, "learning_rate": 0.00019975765292417358, "loss": 0.6432, "step": 1870 }, { "epoch": 0.12026940346375882, "grad_norm": 1.2049771636877937, "learning_rate": 0.00019974980105013623, "loss": 0.7333, "step": 1875 }, { "epoch": 0.1205901218729955, "grad_norm": 0.8525983520687547, "learning_rate": 0.00019974182415753732, "loss": 0.6699, "step": 1880 }, { "epoch": 0.1209108402822322, "grad_norm": 0.5716659731530915, "learning_rate": 0.00019973372225637453, "loss": 0.5793, "step": 1885 }, { "epoch": 0.12123155869146889, "grad_norm": 0.6060632420377923, "learning_rate": 0.00019972549535680206, "loss": 0.671, "step": 1890 }, { "epoch": 0.12155227710070558, "grad_norm": 0.7561918292328402, "learning_rate": 0.00019971714346913086, "loss": 0.5316, "step": 1895 }, { "epoch": 0.12187299550994227, "grad_norm": 0.9824211285333242, "learning_rate": 0.00019970866660382863, "loss": 0.5868, "step": 1900 }, { "epoch": 0.12219371391917896, "grad_norm": 0.7951038927386893, "learning_rate": 0.00019970006477151953, "loss": 0.7, "step": 1905 }, { "epoch": 0.12251443232841565, "grad_norm": 0.747912075117886, "learning_rate": 0.0001996913379829844, "loss": 0.5798, "step": 1910 }, { "epoch": 0.12283515073765235, "grad_norm": 1.2254454430699995, "learning_rate": 0.00019968248624916077, "loss": 0.6667, "step": 1915 }, { "epoch": 0.12315586914688903, "grad_norm": 1.1768102485963885, "learning_rate": 0.00019967350958114267, "loss": 0.5774, "step": 1920 }, { "epoch": 0.12347658755612573, "grad_norm": 0.6310183951664794, "learning_rate": 0.0001996644079901808, "loss": 0.4486, "step": 1925 }, { "epoch": 0.12379730596536241, "grad_norm": 0.8260925792950813, "learning_rate": 0.00019965518148768233, "loss": 0.5623, "step": 1930 }, { "epoch": 0.1241180243745991, "grad_norm": 0.9150306074218141, "learning_rate": 0.000199645830085211, "loss": 0.83, "step": 1935 }, { "epoch": 0.12443874278383579, "grad_norm": 0.9369210275043979, "learning_rate": 0.00019963635379448722, "loss": 0.7223, "step": 1940 }, { "epoch": 0.12475946119307248, "grad_norm": 0.748894355642791, "learning_rate": 0.00019962675262738774, "loss": 0.6919, "step": 1945 }, { "epoch": 0.12508017960230916, "grad_norm": 1.1961745083017192, "learning_rate": 0.00019961702659594598, "loss": 0.5536, "step": 1950 }, { "epoch": 0.12540089801154586, "grad_norm": 0.568991855421978, "learning_rate": 0.00019960717571235173, "loss": 0.639, "step": 1955 }, { "epoch": 0.12572161642078256, "grad_norm": 1.0900526061976745, "learning_rate": 0.00019959719998895135, "loss": 0.6571, "step": 1960 }, { "epoch": 0.12604233483001925, "grad_norm": 0.7953938211319622, "learning_rate": 0.00019958709943824758, "loss": 0.7077, "step": 1965 }, { "epoch": 0.12636305323925592, "grad_norm": 1.0090362549424627, "learning_rate": 0.0001995768740728997, "loss": 0.629, "step": 1970 }, { "epoch": 0.12668377164849262, "grad_norm": 0.7822194115921188, "learning_rate": 0.0001995665239057234, "loss": 0.7948, "step": 1975 }, { "epoch": 0.12700449005772932, "grad_norm": 0.82569207599097, "learning_rate": 0.00019955604894969067, "loss": 0.6823, "step": 1980 }, { "epoch": 0.12732520846696602, "grad_norm": 0.5455388809406508, "learning_rate": 0.0001995454492179301, "loss": 0.5594, "step": 1985 }, { "epoch": 0.1276459268762027, "grad_norm": 0.7695218529222057, "learning_rate": 0.00019953472472372647, "loss": 0.7198, "step": 1990 }, { "epoch": 0.12796664528543938, "grad_norm": 0.8673513110262479, "learning_rate": 0.00019952387548052112, "loss": 0.7148, "step": 1995 }, { "epoch": 0.12828736369467608, "grad_norm": 0.919881076337375, "learning_rate": 0.00019951290150191158, "loss": 0.6439, "step": 2000 }, { "epoch": 0.12860808210391275, "grad_norm": 0.9262998866124367, "learning_rate": 0.00019950180280165175, "loss": 0.5764, "step": 2005 }, { "epoch": 0.12892880051314945, "grad_norm": 0.6765034342263078, "learning_rate": 0.00019949057939365193, "loss": 0.4096, "step": 2010 }, { "epoch": 0.12924951892238615, "grad_norm": 0.7219277816800387, "learning_rate": 0.00019947923129197862, "loss": 0.7127, "step": 2015 }, { "epoch": 0.12957023733162285, "grad_norm": 0.8406570776216719, "learning_rate": 0.0001994677585108546, "loss": 0.6191, "step": 2020 }, { "epoch": 0.12989095574085952, "grad_norm": 0.7458490203268737, "learning_rate": 0.00019945616106465904, "loss": 0.5701, "step": 2025 }, { "epoch": 0.1302116741500962, "grad_norm": 1.293735176011679, "learning_rate": 0.0001994444389679272, "loss": 0.6852, "step": 2030 }, { "epoch": 0.1305323925593329, "grad_norm": 0.9148850105541353, "learning_rate": 0.00019943259223535067, "loss": 0.7057, "step": 2035 }, { "epoch": 0.1308531109685696, "grad_norm": 0.6641079479178653, "learning_rate": 0.0001994206208817772, "loss": 0.7629, "step": 2040 }, { "epoch": 0.13117382937780628, "grad_norm": 0.791984066260629, "learning_rate": 0.00019940852492221075, "loss": 0.6992, "step": 2045 }, { "epoch": 0.13149454778704298, "grad_norm": 0.849479398893481, "learning_rate": 0.00019939630437181143, "loss": 0.6966, "step": 2050 }, { "epoch": 0.13181526619627967, "grad_norm": 0.8367106501858504, "learning_rate": 0.00019938395924589552, "loss": 0.5852, "step": 2055 }, { "epoch": 0.13213598460551634, "grad_norm": 0.6790358847768235, "learning_rate": 0.00019937148955993545, "loss": 0.6393, "step": 2060 }, { "epoch": 0.13245670301475304, "grad_norm": 0.9502499514885022, "learning_rate": 0.00019935889532955977, "loss": 0.6777, "step": 2065 }, { "epoch": 0.13277742142398974, "grad_norm": 0.8134631960781032, "learning_rate": 0.000199346176570553, "loss": 0.6862, "step": 2070 }, { "epoch": 0.13309813983322644, "grad_norm": 0.6366664689319048, "learning_rate": 0.00019933333329885593, "loss": 0.604, "step": 2075 }, { "epoch": 0.1334188582424631, "grad_norm": 0.8465634973529981, "learning_rate": 0.00019932036553056524, "loss": 0.7162, "step": 2080 }, { "epoch": 0.1337395766516998, "grad_norm": 0.8425039370601171, "learning_rate": 0.00019930727328193378, "loss": 0.6855, "step": 2085 }, { "epoch": 0.1340602950609365, "grad_norm": 1.14970228361299, "learning_rate": 0.00019929405656937032, "loss": 0.7191, "step": 2090 }, { "epoch": 0.1343810134701732, "grad_norm": 1.0969227215850126, "learning_rate": 0.0001992807154094396, "loss": 0.728, "step": 2095 }, { "epoch": 0.13470173187940987, "grad_norm": 0.5634883710558874, "learning_rate": 0.00019926724981886244, "loss": 0.6794, "step": 2100 }, { "epoch": 0.13502245028864657, "grad_norm": 0.9532151941436401, "learning_rate": 0.0001992536598145155, "loss": 0.6422, "step": 2105 }, { "epoch": 0.13534316869788326, "grad_norm": 0.8529397357920244, "learning_rate": 0.0001992399454134315, "loss": 0.8323, "step": 2110 }, { "epoch": 0.13566388710711993, "grad_norm": 0.5995161683553816, "learning_rate": 0.00019922610663279894, "loss": 0.6443, "step": 2115 }, { "epoch": 0.13598460551635663, "grad_norm": 1.1645114047730085, "learning_rate": 0.00019921214348996228, "loss": 0.638, "step": 2120 }, { "epoch": 0.13630532392559333, "grad_norm": 0.7254426735765782, "learning_rate": 0.00019919805600242176, "loss": 0.6233, "step": 2125 }, { "epoch": 0.13662604233483003, "grad_norm": 1.2630556570142795, "learning_rate": 0.00019918384418783362, "loss": 0.7941, "step": 2130 }, { "epoch": 0.1369467607440667, "grad_norm": 0.5842349667453849, "learning_rate": 0.00019916950806400983, "loss": 0.7714, "step": 2135 }, { "epoch": 0.1372674791533034, "grad_norm": 0.5918400976970277, "learning_rate": 0.00019915504764891808, "loss": 0.7118, "step": 2140 }, { "epoch": 0.1375881975625401, "grad_norm": 0.8666504796220831, "learning_rate": 0.000199140462960682, "loss": 0.7462, "step": 2145 }, { "epoch": 0.1379089159717768, "grad_norm": 0.7764199666330917, "learning_rate": 0.00019912575401758082, "loss": 0.6395, "step": 2150 }, { "epoch": 0.13822963438101346, "grad_norm": 0.9186504138753783, "learning_rate": 0.00019911092083804962, "loss": 0.6289, "step": 2155 }, { "epoch": 0.13855035279025016, "grad_norm": 0.8035713423211853, "learning_rate": 0.00019909596344067914, "loss": 0.7541, "step": 2160 }, { "epoch": 0.13887107119948686, "grad_norm": 0.7189520752077799, "learning_rate": 0.00019908088184421578, "loss": 0.6826, "step": 2165 }, { "epoch": 0.13919178960872355, "grad_norm": 0.6655350088157191, "learning_rate": 0.00019906567606756167, "loss": 0.7409, "step": 2170 }, { "epoch": 0.13951250801796022, "grad_norm": 0.3224597929224718, "learning_rate": 0.0001990503461297745, "loss": 0.5904, "step": 2175 }, { "epoch": 0.13983322642719692, "grad_norm": 0.8267424045917116, "learning_rate": 0.00019903489205006764, "loss": 0.6894, "step": 2180 }, { "epoch": 0.14015394483643362, "grad_norm": 0.6123341217762982, "learning_rate": 0.00019901931384780995, "loss": 0.703, "step": 2185 }, { "epoch": 0.1404746632456703, "grad_norm": 0.45163827780119753, "learning_rate": 0.00019900361154252602, "loss": 0.59, "step": 2190 }, { "epoch": 0.140795381654907, "grad_norm": 0.9556170145817368, "learning_rate": 0.00019898778515389584, "loss": 0.5857, "step": 2195 }, { "epoch": 0.14111610006414368, "grad_norm": 0.7479105122087544, "learning_rate": 0.00019897183470175494, "loss": 0.6585, "step": 2200 }, { "epoch": 0.14143681847338038, "grad_norm": 1.0326719597420064, "learning_rate": 0.0001989557602060944, "loss": 0.7534, "step": 2205 }, { "epoch": 0.14175753688261705, "grad_norm": 0.8658293920784573, "learning_rate": 0.00019893956168706066, "loss": 0.7002, "step": 2210 }, { "epoch": 0.14207825529185375, "grad_norm": 0.8622344203075765, "learning_rate": 0.00019892323916495582, "loss": 0.7086, "step": 2215 }, { "epoch": 0.14239897370109045, "grad_norm": 0.7259813554322444, "learning_rate": 0.00019890679266023709, "loss": 0.4999, "step": 2220 }, { "epoch": 0.14271969211032715, "grad_norm": 0.6647794000879613, "learning_rate": 0.0001988902221935173, "loss": 0.7005, "step": 2225 }, { "epoch": 0.14304041051956382, "grad_norm": 0.8451399712054074, "learning_rate": 0.00019887352778556454, "loss": 0.6435, "step": 2230 }, { "epoch": 0.1433611289288005, "grad_norm": 0.7567525634116421, "learning_rate": 0.0001988567094573023, "loss": 0.7609, "step": 2235 }, { "epoch": 0.1436818473380372, "grad_norm": 0.8106441964345322, "learning_rate": 0.00019883976722980936, "loss": 0.6969, "step": 2240 }, { "epoch": 0.14400256574727388, "grad_norm": 0.6312440700944748, "learning_rate": 0.00019882270112431974, "loss": 0.6787, "step": 2245 }, { "epoch": 0.14432328415651058, "grad_norm": 0.8698670635315567, "learning_rate": 0.00019880551116222277, "loss": 0.79, "step": 2250 }, { "epoch": 0.14464400256574728, "grad_norm": 0.5675337075202405, "learning_rate": 0.00019878819736506297, "loss": 0.6922, "step": 2255 }, { "epoch": 0.14496472097498397, "grad_norm": 0.8080748220001619, "learning_rate": 0.00019877075975454015, "loss": 0.6723, "step": 2260 }, { "epoch": 0.14528543938422064, "grad_norm": 1.18598966284805, "learning_rate": 0.00019875319835250922, "loss": 0.6078, "step": 2265 }, { "epoch": 0.14560615779345734, "grad_norm": 0.7396735588781944, "learning_rate": 0.00019873551318098026, "loss": 0.6555, "step": 2270 }, { "epoch": 0.14592687620269404, "grad_norm": 0.9421384978371221, "learning_rate": 0.00019871770426211843, "loss": 0.6763, "step": 2275 }, { "epoch": 0.14624759461193074, "grad_norm": 1.3557865695262534, "learning_rate": 0.0001986997716182441, "loss": 0.6517, "step": 2280 }, { "epoch": 0.1465683130211674, "grad_norm": 0.7620628179190014, "learning_rate": 0.0001986817152718326, "loss": 0.8213, "step": 2285 }, { "epoch": 0.1468890314304041, "grad_norm": 1.1665229535256436, "learning_rate": 0.0001986635352455143, "loss": 0.6593, "step": 2290 }, { "epoch": 0.1472097498396408, "grad_norm": 0.549262325529975, "learning_rate": 0.0001986452315620747, "loss": 0.5682, "step": 2295 }, { "epoch": 0.14753046824887747, "grad_norm": 0.6290840720109729, "learning_rate": 0.00019862680424445413, "loss": 0.5891, "step": 2300 }, { "epoch": 0.14785118665811417, "grad_norm": 0.6806013181414412, "learning_rate": 0.00019860825331574798, "loss": 0.7814, "step": 2305 }, { "epoch": 0.14817190506735087, "grad_norm": 0.9105112621167852, "learning_rate": 0.00019858957879920647, "loss": 0.6707, "step": 2310 }, { "epoch": 0.14849262347658757, "grad_norm": 0.8528821816779594, "learning_rate": 0.00019857078071823484, "loss": 0.664, "step": 2315 }, { "epoch": 0.14881334188582424, "grad_norm": 0.7181914153458827, "learning_rate": 0.0001985518590963931, "loss": 0.6854, "step": 2320 }, { "epoch": 0.14913406029506093, "grad_norm": 0.7397278453494517, "learning_rate": 0.00019853281395739613, "loss": 0.6665, "step": 2325 }, { "epoch": 0.14945477870429763, "grad_norm": 0.8745968398949746, "learning_rate": 0.00019851364532511362, "loss": 0.5766, "step": 2330 }, { "epoch": 0.14977549711353433, "grad_norm": 1.2088886679730004, "learning_rate": 0.00019849435322356995, "loss": 0.7018, "step": 2335 }, { "epoch": 0.150096215522771, "grad_norm": 1.0443479254100274, "learning_rate": 0.00019847493767694444, "loss": 0.5986, "step": 2340 }, { "epoch": 0.1504169339320077, "grad_norm": 1.0921241128817574, "learning_rate": 0.00019845539870957092, "loss": 0.5923, "step": 2345 }, { "epoch": 0.1507376523412444, "grad_norm": 0.9646802917631114, "learning_rate": 0.00019843573634593806, "loss": 0.7926, "step": 2350 }, { "epoch": 0.1510583707504811, "grad_norm": 0.7656847484095911, "learning_rate": 0.00019841595061068906, "loss": 0.7207, "step": 2355 }, { "epoch": 0.15137908915971776, "grad_norm": 0.5049528849051477, "learning_rate": 0.0001983960415286219, "loss": 0.6228, "step": 2360 }, { "epoch": 0.15169980756895446, "grad_norm": 0.9068993192806996, "learning_rate": 0.00019837600912468893, "loss": 0.5693, "step": 2365 }, { "epoch": 0.15202052597819116, "grad_norm": 0.8676250105736654, "learning_rate": 0.00019835585342399732, "loss": 0.5743, "step": 2370 }, { "epoch": 0.15234124438742783, "grad_norm": 0.5246385631697503, "learning_rate": 0.00019833557445180855, "loss": 0.7401, "step": 2375 }, { "epoch": 0.15266196279666452, "grad_norm": 0.7016792226152242, "learning_rate": 0.0001983151722335387, "loss": 0.6403, "step": 2380 }, { "epoch": 0.15298268120590122, "grad_norm": 0.7722496289657441, "learning_rate": 0.00019829464679475836, "loss": 0.5484, "step": 2385 }, { "epoch": 0.15330339961513792, "grad_norm": 1.2298123662291214, "learning_rate": 0.00019827399816119243, "loss": 0.7674, "step": 2390 }, { "epoch": 0.1536241180243746, "grad_norm": 0.7861238282945989, "learning_rate": 0.00019825322635872036, "loss": 0.619, "step": 2395 }, { "epoch": 0.1539448364336113, "grad_norm": 0.9211911752664865, "learning_rate": 0.00019823233141337584, "loss": 0.6211, "step": 2400 }, { "epoch": 0.15426555484284799, "grad_norm": 0.7151255909037463, "learning_rate": 0.00019821131335134696, "loss": 0.548, "step": 2405 }, { "epoch": 0.15458627325208468, "grad_norm": 0.9458426635711992, "learning_rate": 0.00019819017219897613, "loss": 0.6482, "step": 2410 }, { "epoch": 0.15490699166132135, "grad_norm": 1.0258204800171964, "learning_rate": 0.00019816890798276, "loss": 0.6717, "step": 2415 }, { "epoch": 0.15522771007055805, "grad_norm": 1.2116376507078799, "learning_rate": 0.00019814752072934945, "loss": 0.6242, "step": 2420 }, { "epoch": 0.15554842847979475, "grad_norm": 0.7799968415850017, "learning_rate": 0.00019812601046554962, "loss": 0.6257, "step": 2425 }, { "epoch": 0.15586914688903142, "grad_norm": 0.4916761578519649, "learning_rate": 0.00019810437721831976, "loss": 0.7221, "step": 2430 }, { "epoch": 0.15618986529826812, "grad_norm": 0.9089669003206741, "learning_rate": 0.00019808262101477328, "loss": 0.6457, "step": 2435 }, { "epoch": 0.1565105837075048, "grad_norm": 0.5752941624716728, "learning_rate": 0.00019806074188217766, "loss": 0.5367, "step": 2440 }, { "epoch": 0.1568313021167415, "grad_norm": 0.7889396413468497, "learning_rate": 0.0001980387398479546, "loss": 0.5704, "step": 2445 }, { "epoch": 0.15715202052597818, "grad_norm": 0.7974301152247996, "learning_rate": 0.00019801661493967955, "loss": 0.7438, "step": 2450 }, { "epoch": 0.15747273893521488, "grad_norm": 0.9099718674001662, "learning_rate": 0.00019799436718508228, "loss": 0.7057, "step": 2455 }, { "epoch": 0.15779345734445158, "grad_norm": 0.7460789907183486, "learning_rate": 0.0001979719966120463, "loss": 0.6769, "step": 2460 }, { "epoch": 0.15811417575368827, "grad_norm": 0.9026682063218279, "learning_rate": 0.00019794950324860918, "loss": 0.6677, "step": 2465 }, { "epoch": 0.15843489416292494, "grad_norm": 0.706813388972018, "learning_rate": 0.0001979268871229623, "loss": 0.652, "step": 2470 }, { "epoch": 0.15875561257216164, "grad_norm": 0.7951893501420781, "learning_rate": 0.00019790414826345094, "loss": 0.7231, "step": 2475 }, { "epoch": 0.15907633098139834, "grad_norm": 0.9695064104615378, "learning_rate": 0.0001978812866985742, "loss": 0.6308, "step": 2480 }, { "epoch": 0.159397049390635, "grad_norm": 0.5344509876021667, "learning_rate": 0.00019785830245698497, "loss": 0.6997, "step": 2485 }, { "epoch": 0.1597177677998717, "grad_norm": 0.834051661967047, "learning_rate": 0.00019783519556748987, "loss": 0.6783, "step": 2490 }, { "epoch": 0.1600384862091084, "grad_norm": 0.9723305146917021, "learning_rate": 0.0001978119660590493, "loss": 0.7798, "step": 2495 }, { "epoch": 0.1603592046183451, "grad_norm": 0.8859242414039744, "learning_rate": 0.00019778861396077725, "loss": 0.793, "step": 2500 }, { "epoch": 0.16067992302758177, "grad_norm": 0.7241777810857887, "learning_rate": 0.00019776513930194148, "loss": 0.504, "step": 2505 }, { "epoch": 0.16100064143681847, "grad_norm": 1.054121315907452, "learning_rate": 0.00019774154211196318, "loss": 0.7509, "step": 2510 }, { "epoch": 0.16132135984605517, "grad_norm": 0.8701449793412597, "learning_rate": 0.0001977178224204173, "loss": 0.7875, "step": 2515 }, { "epoch": 0.16164207825529187, "grad_norm": 0.7757819809049686, "learning_rate": 0.00019769398025703224, "loss": 0.6047, "step": 2520 }, { "epoch": 0.16196279666452854, "grad_norm": 1.0713357367053484, "learning_rate": 0.00019767001565168982, "loss": 0.7384, "step": 2525 }, { "epoch": 0.16228351507376523, "grad_norm": 0.43793306094407475, "learning_rate": 0.00019764592863442544, "loss": 0.6156, "step": 2530 }, { "epoch": 0.16260423348300193, "grad_norm": 0.9034469617213254, "learning_rate": 0.0001976217192354279, "loss": 0.6383, "step": 2535 }, { "epoch": 0.1629249518922386, "grad_norm": 0.7090465404578327, "learning_rate": 0.0001975973874850393, "loss": 0.59, "step": 2540 }, { "epoch": 0.1632456703014753, "grad_norm": 0.7781025944113742, "learning_rate": 0.00019757293341375517, "loss": 0.6829, "step": 2545 }, { "epoch": 0.163566388710712, "grad_norm": 0.701765797555506, "learning_rate": 0.00019754835705222435, "loss": 0.6682, "step": 2550 }, { "epoch": 0.1638871071199487, "grad_norm": 0.8486110822681391, "learning_rate": 0.00019752365843124885, "loss": 0.7107, "step": 2555 }, { "epoch": 0.16420782552918536, "grad_norm": 1.2183183484648679, "learning_rate": 0.00019749883758178404, "loss": 0.7092, "step": 2560 }, { "epoch": 0.16452854393842206, "grad_norm": 0.5747438190450085, "learning_rate": 0.0001974738945349384, "loss": 0.5635, "step": 2565 }, { "epoch": 0.16484926234765876, "grad_norm": 0.754766366798954, "learning_rate": 0.0001974488293219736, "loss": 0.7561, "step": 2570 }, { "epoch": 0.16516998075689546, "grad_norm": 0.9579439740753294, "learning_rate": 0.00019742364197430443, "loss": 0.6015, "step": 2575 }, { "epoch": 0.16549069916613213, "grad_norm": 0.6786544154968012, "learning_rate": 0.00019739833252349867, "loss": 0.5112, "step": 2580 }, { "epoch": 0.16581141757536882, "grad_norm": 0.7934214823629537, "learning_rate": 0.00019737290100127722, "loss": 0.7203, "step": 2585 }, { "epoch": 0.16613213598460552, "grad_norm": 1.33220621050734, "learning_rate": 0.00019734734743951396, "loss": 0.6863, "step": 2590 }, { "epoch": 0.16645285439384222, "grad_norm": 0.8267900862256077, "learning_rate": 0.00019732167187023572, "loss": 0.6449, "step": 2595 }, { "epoch": 0.1667735728030789, "grad_norm": 0.7287938245757929, "learning_rate": 0.0001972958743256222, "loss": 0.7308, "step": 2600 }, { "epoch": 0.1670942912123156, "grad_norm": 0.5363094807734924, "learning_rate": 0.00019726995483800613, "loss": 0.6403, "step": 2605 }, { "epoch": 0.16741500962155229, "grad_norm": 0.7277617239159246, "learning_rate": 0.00019724391343987284, "loss": 0.6777, "step": 2610 }, { "epoch": 0.16773572803078896, "grad_norm": 0.9462519719607535, "learning_rate": 0.00019721775016386057, "loss": 0.6895, "step": 2615 }, { "epoch": 0.16805644644002565, "grad_norm": 0.8528897030121969, "learning_rate": 0.0001971914650427604, "loss": 0.5536, "step": 2620 }, { "epoch": 0.16837716484926235, "grad_norm": 0.9319172497451002, "learning_rate": 0.000197165058109516, "loss": 0.5724, "step": 2625 }, { "epoch": 0.16869788325849905, "grad_norm": 0.7410196474628663, "learning_rate": 0.0001971385293972237, "loss": 0.6785, "step": 2630 }, { "epoch": 0.16901860166773572, "grad_norm": 0.9192207798068145, "learning_rate": 0.00019711187893913255, "loss": 0.7219, "step": 2635 }, { "epoch": 0.16933932007697242, "grad_norm": 0.5750937169325536, "learning_rate": 0.00019708510676864414, "loss": 0.482, "step": 2640 }, { "epoch": 0.1696600384862091, "grad_norm": 0.7158603995106417, "learning_rate": 0.0001970582129193126, "loss": 0.577, "step": 2645 }, { "epoch": 0.1699807568954458, "grad_norm": 0.9152254783119084, "learning_rate": 0.00019703119742484453, "loss": 0.6657, "step": 2650 }, { "epoch": 0.17030147530468248, "grad_norm": 0.7435319188039847, "learning_rate": 0.00019700406031909905, "loss": 0.6779, "step": 2655 }, { "epoch": 0.17062219371391918, "grad_norm": 1.504228508241335, "learning_rate": 0.0001969768016360877, "loss": 0.7278, "step": 2660 }, { "epoch": 0.17094291212315588, "grad_norm": 1.2092049917834673, "learning_rate": 0.00019694942140997435, "loss": 0.7341, "step": 2665 }, { "epoch": 0.17126363053239255, "grad_norm": 0.6080302726719192, "learning_rate": 0.00019692191967507524, "loss": 0.6543, "step": 2670 }, { "epoch": 0.17158434894162924, "grad_norm": 0.7373008700852878, "learning_rate": 0.0001968942964658589, "loss": 0.6152, "step": 2675 }, { "epoch": 0.17190506735086594, "grad_norm": 0.9214476765346659, "learning_rate": 0.000196866551816946, "loss": 0.6878, "step": 2680 }, { "epoch": 0.17222578576010264, "grad_norm": 0.7450194855735123, "learning_rate": 0.0001968386857631096, "loss": 0.6173, "step": 2685 }, { "epoch": 0.1725465041693393, "grad_norm": 0.6242054305521421, "learning_rate": 0.00019681069833927476, "loss": 0.6746, "step": 2690 }, { "epoch": 0.172867222578576, "grad_norm": 0.711220248168634, "learning_rate": 0.00019678258958051877, "loss": 0.6821, "step": 2695 }, { "epoch": 0.1731879409878127, "grad_norm": 0.7496584977206721, "learning_rate": 0.00019675435952207088, "loss": 0.5238, "step": 2700 }, { "epoch": 0.1735086593970494, "grad_norm": 0.7084413643635924, "learning_rate": 0.00019672600819931247, "loss": 0.7056, "step": 2705 }, { "epoch": 0.17382937780628607, "grad_norm": 1.0439027628488613, "learning_rate": 0.00019669753564777688, "loss": 0.6513, "step": 2710 }, { "epoch": 0.17415009621552277, "grad_norm": 0.71498067288977, "learning_rate": 0.0001966689419031493, "loss": 0.7406, "step": 2715 }, { "epoch": 0.17447081462475947, "grad_norm": 0.7033452927937216, "learning_rate": 0.00019664022700126695, "loss": 0.6923, "step": 2720 }, { "epoch": 0.17479153303399614, "grad_norm": 0.8919976779446186, "learning_rate": 0.00019661139097811877, "loss": 0.6326, "step": 2725 }, { "epoch": 0.17511225144323284, "grad_norm": 0.9493437873661492, "learning_rate": 0.00019658243386984562, "loss": 0.5783, "step": 2730 }, { "epoch": 0.17543296985246953, "grad_norm": 0.9860728443591087, "learning_rate": 0.00019655335571274003, "loss": 0.7279, "step": 2735 }, { "epoch": 0.17575368826170623, "grad_norm": 0.6352021684421743, "learning_rate": 0.0001965241565432463, "loss": 0.6397, "step": 2740 }, { "epoch": 0.1760744066709429, "grad_norm": 1.099016920497353, "learning_rate": 0.00019649483639796032, "loss": 0.6756, "step": 2745 }, { "epoch": 0.1763951250801796, "grad_norm": 0.7058834343210731, "learning_rate": 0.00019646539531362973, "loss": 0.7218, "step": 2750 }, { "epoch": 0.1767158434894163, "grad_norm": 0.8020832284905198, "learning_rate": 0.00019643583332715366, "loss": 0.5708, "step": 2755 }, { "epoch": 0.177036561898653, "grad_norm": 0.8014855578510585, "learning_rate": 0.0001964061504755827, "loss": 0.7843, "step": 2760 }, { "epoch": 0.17735728030788966, "grad_norm": 1.0134184586337234, "learning_rate": 0.0001963763467961191, "loss": 0.6599, "step": 2765 }, { "epoch": 0.17767799871712636, "grad_norm": 0.6050193347531744, "learning_rate": 0.0001963464223261164, "loss": 0.7984, "step": 2770 }, { "epoch": 0.17799871712636306, "grad_norm": 0.7479913165773774, "learning_rate": 0.0001963163771030796, "loss": 0.7469, "step": 2775 }, { "epoch": 0.17831943553559973, "grad_norm": 1.091278392341476, "learning_rate": 0.00019628621116466502, "loss": 0.6991, "step": 2780 }, { "epoch": 0.17864015394483643, "grad_norm": 1.0105012542968526, "learning_rate": 0.00019625592454868026, "loss": 0.6867, "step": 2785 }, { "epoch": 0.17896087235407312, "grad_norm": 0.8032083651463552, "learning_rate": 0.0001962255172930842, "loss": 0.7184, "step": 2790 }, { "epoch": 0.17928159076330982, "grad_norm": 0.8193497605449357, "learning_rate": 0.00019619498943598688, "loss": 0.5785, "step": 2795 }, { "epoch": 0.1796023091725465, "grad_norm": 0.7772046040254091, "learning_rate": 0.00019616434101564956, "loss": 0.7471, "step": 2800 }, { "epoch": 0.1799230275817832, "grad_norm": 1.224565960941351, "learning_rate": 0.00019613357207048452, "loss": 0.856, "step": 2805 }, { "epoch": 0.1802437459910199, "grad_norm": 0.6591412427417273, "learning_rate": 0.00019610268263905515, "loss": 0.5893, "step": 2810 }, { "epoch": 0.18056446440025659, "grad_norm": 0.8875976837711199, "learning_rate": 0.00019607167276007587, "loss": 0.7161, "step": 2815 }, { "epoch": 0.18088518280949326, "grad_norm": 0.8225479052301773, "learning_rate": 0.00019604054247241193, "loss": 0.5873, "step": 2820 }, { "epoch": 0.18120590121872995, "grad_norm": 1.2087539785527361, "learning_rate": 0.00019600929181507972, "loss": 0.6542, "step": 2825 }, { "epoch": 0.18152661962796665, "grad_norm": 0.8050140113302814, "learning_rate": 0.00019597792082724625, "loss": 0.5778, "step": 2830 }, { "epoch": 0.18184733803720335, "grad_norm": 1.321288241534433, "learning_rate": 0.00019594642954822952, "loss": 0.5994, "step": 2835 }, { "epoch": 0.18216805644644002, "grad_norm": 0.9376939681240336, "learning_rate": 0.00019591481801749816, "loss": 0.5046, "step": 2840 }, { "epoch": 0.18248877485567672, "grad_norm": 0.6185458970009285, "learning_rate": 0.00019588308627467162, "loss": 0.6859, "step": 2845 }, { "epoch": 0.18280949326491341, "grad_norm": 0.7801762201714135, "learning_rate": 0.00019585123435952, "loss": 0.7015, "step": 2850 }, { "epoch": 0.18313021167415008, "grad_norm": 0.7265831165052501, "learning_rate": 0.00019581926231196391, "loss": 0.823, "step": 2855 }, { "epoch": 0.18345093008338678, "grad_norm": 0.8151220320154888, "learning_rate": 0.00019578717017207467, "loss": 0.689, "step": 2860 }, { "epoch": 0.18377164849262348, "grad_norm": 0.9213195972340709, "learning_rate": 0.000195754957980074, "loss": 0.7382, "step": 2865 }, { "epoch": 0.18409236690186018, "grad_norm": 0.782822592817081, "learning_rate": 0.0001957226257763342, "loss": 0.6929, "step": 2870 }, { "epoch": 0.18441308531109685, "grad_norm": 0.980335474676683, "learning_rate": 0.0001956901736013778, "loss": 0.6156, "step": 2875 }, { "epoch": 0.18473380372033354, "grad_norm": 0.9039810035947186, "learning_rate": 0.00019565760149587794, "loss": 0.7664, "step": 2880 }, { "epoch": 0.18505452212957024, "grad_norm": 0.000701834979829147, "learning_rate": 0.0001956249095006578, "loss": 0.5249, "step": 2885 }, { "epoch": 0.18537524053880694, "grad_norm": 1.0237955976436885, "learning_rate": 0.00019559209765669105, "loss": 0.6839, "step": 2890 }, { "epoch": 0.1856959589480436, "grad_norm": 0.6769833810242086, "learning_rate": 0.00019555916600510145, "loss": 0.6537, "step": 2895 }, { "epoch": 0.1860166773572803, "grad_norm": 0.6462485885713231, "learning_rate": 0.00019552611458716296, "loss": 0.723, "step": 2900 }, { "epoch": 0.186337395766517, "grad_norm": 0.8722147531755802, "learning_rate": 0.0001954929434442996, "loss": 0.6837, "step": 2905 }, { "epoch": 0.18665811417575368, "grad_norm": 0.6906487731551919, "learning_rate": 0.0001954596526180855, "loss": 0.6678, "step": 2910 }, { "epoch": 0.18697883258499037, "grad_norm": 0.8754536117451718, "learning_rate": 0.00019542624215024474, "loss": 0.7607, "step": 2915 }, { "epoch": 0.18729955099422707, "grad_norm": 0.7481215119155424, "learning_rate": 0.0001953927120826514, "loss": 0.7354, "step": 2920 }, { "epoch": 0.18762026940346377, "grad_norm": 0.7173045174318763, "learning_rate": 0.0001953590624573294, "loss": 0.6889, "step": 2925 }, { "epoch": 0.18794098781270044, "grad_norm": 0.688657494500447, "learning_rate": 0.00019532529331645258, "loss": 0.7716, "step": 2930 }, { "epoch": 0.18826170622193714, "grad_norm": 0.8542179699315836, "learning_rate": 0.0001952914047023445, "loss": 0.6846, "step": 2935 }, { "epoch": 0.18858242463117383, "grad_norm": 0.6693936334963977, "learning_rate": 0.0001952573966574785, "loss": 0.6893, "step": 2940 }, { "epoch": 0.18890314304041053, "grad_norm": 1.1047249058364512, "learning_rate": 0.00019522326922447755, "loss": 0.7203, "step": 2945 }, { "epoch": 0.1892238614496472, "grad_norm": 0.6082855408476369, "learning_rate": 0.00019518902244611435, "loss": 0.7069, "step": 2950 }, { "epoch": 0.1895445798588839, "grad_norm": 0.5867678432004605, "learning_rate": 0.00019515465636531107, "loss": 0.7485, "step": 2955 }, { "epoch": 0.1898652982681206, "grad_norm": 0.6389524482986783, "learning_rate": 0.0001951201710251395, "loss": 0.6291, "step": 2960 }, { "epoch": 0.19018601667735727, "grad_norm": 0.40852828777296263, "learning_rate": 0.00019508556646882083, "loss": 0.6572, "step": 2965 }, { "epoch": 0.19050673508659396, "grad_norm": 0.6625359401782684, "learning_rate": 0.00019505084273972568, "loss": 0.6905, "step": 2970 }, { "epoch": 0.19082745349583066, "grad_norm": 0.6733266631590418, "learning_rate": 0.00019501599988137406, "loss": 0.6065, "step": 2975 }, { "epoch": 0.19114817190506736, "grad_norm": 0.8217762217578838, "learning_rate": 0.00019498103793743528, "loss": 0.6843, "step": 2980 }, { "epoch": 0.19146889031430403, "grad_norm": 1.220514466724885, "learning_rate": 0.00019494595695172787, "loss": 0.604, "step": 2985 }, { "epoch": 0.19178960872354073, "grad_norm": 0.792446196427873, "learning_rate": 0.00019491075696821962, "loss": 0.6326, "step": 2990 }, { "epoch": 0.19211032713277743, "grad_norm": 0.8158356531364367, "learning_rate": 0.00019487543803102736, "loss": 0.7795, "step": 2995 }, { "epoch": 0.19243104554201412, "grad_norm": 1.3297681323714916, "learning_rate": 0.00019484000018441715, "loss": 0.6776, "step": 3000 }, { "epoch": 0.1927517639512508, "grad_norm": 1.1206878255004398, "learning_rate": 0.00019480444347280392, "loss": 0.7425, "step": 3005 }, { "epoch": 0.1930724823604875, "grad_norm": 0.5668482553685025, "learning_rate": 0.00019476876794075168, "loss": 0.6463, "step": 3010 }, { "epoch": 0.1933932007697242, "grad_norm": 0.9274228876056752, "learning_rate": 0.0001947329736329734, "loss": 0.7253, "step": 3015 }, { "epoch": 0.19371391917896089, "grad_norm": 0.8934110376365801, "learning_rate": 0.0001946970605943308, "loss": 0.8008, "step": 3020 }, { "epoch": 0.19403463758819756, "grad_norm": 0.7054346176332205, "learning_rate": 0.00019466102886983445, "loss": 0.6421, "step": 3025 }, { "epoch": 0.19435535599743425, "grad_norm": 1.112312708275422, "learning_rate": 0.0001946248785046437, "loss": 0.5448, "step": 3030 }, { "epoch": 0.19467607440667095, "grad_norm": 0.9514480454813623, "learning_rate": 0.00019458860954406655, "loss": 0.8921, "step": 3035 }, { "epoch": 0.19499679281590762, "grad_norm": 0.8289559763958162, "learning_rate": 0.00019455222203355974, "loss": 0.6384, "step": 3040 }, { "epoch": 0.19531751122514432, "grad_norm": 1.6772904982725059, "learning_rate": 0.00019451571601872842, "loss": 0.593, "step": 3045 }, { "epoch": 0.19563822963438102, "grad_norm": 0.933959150583705, "learning_rate": 0.00019447909154532642, "loss": 0.7033, "step": 3050 }, { "epoch": 0.19595894804361771, "grad_norm": 0.9836848697506737, "learning_rate": 0.00019444234865925597, "loss": 0.694, "step": 3055 }, { "epoch": 0.19627966645285438, "grad_norm": 0.752058149609346, "learning_rate": 0.00019440548740656772, "loss": 0.8419, "step": 3060 }, { "epoch": 0.19660038486209108, "grad_norm": 0.5564595991041628, "learning_rate": 0.00019436850783346063, "loss": 0.5868, "step": 3065 }, { "epoch": 0.19692110327132778, "grad_norm": 1.1233031900082198, "learning_rate": 0.00019433140998628202, "loss": 0.7213, "step": 3070 }, { "epoch": 0.19724182168056448, "grad_norm": 0.9846847511141703, "learning_rate": 0.00019429419391152743, "loss": 0.6083, "step": 3075 }, { "epoch": 0.19756254008980115, "grad_norm": 0.9133697850179805, "learning_rate": 0.00019425685965584056, "loss": 0.7509, "step": 3080 }, { "epoch": 0.19788325849903785, "grad_norm": 1.1268873349974773, "learning_rate": 0.0001942194072660132, "loss": 0.6734, "step": 3085 }, { "epoch": 0.19820397690827454, "grad_norm": 0.663450697814864, "learning_rate": 0.00019418183678898525, "loss": 0.7093, "step": 3090 }, { "epoch": 0.1985246953175112, "grad_norm": 0.6245075928754343, "learning_rate": 0.0001941441482718446, "loss": 0.7194, "step": 3095 }, { "epoch": 0.1988454137267479, "grad_norm": 0.9587885835266485, "learning_rate": 0.00019410634176182705, "loss": 0.6995, "step": 3100 }, { "epoch": 0.1991661321359846, "grad_norm": 0.8163502504890695, "learning_rate": 0.00019406841730631636, "loss": 0.7503, "step": 3105 }, { "epoch": 0.1994868505452213, "grad_norm": 0.9426439782405206, "learning_rate": 0.00019403037495284398, "loss": 0.7404, "step": 3110 }, { "epoch": 0.19980756895445798, "grad_norm": 0.8220300785309613, "learning_rate": 0.00019399221474908932, "loss": 0.6744, "step": 3115 }, { "epoch": 0.20012828736369467, "grad_norm": 0.9955681688037235, "learning_rate": 0.00019395393674287927, "loss": 0.6852, "step": 3120 }, { "epoch": 0.20044900577293137, "grad_norm": 1.1278721654085937, "learning_rate": 0.00019391554098218853, "loss": 0.8426, "step": 3125 }, { "epoch": 0.20076972418216807, "grad_norm": 1.289322139002122, "learning_rate": 0.00019387702751513932, "loss": 0.7352, "step": 3130 }, { "epoch": 0.20109044259140474, "grad_norm": 1.4969951218148942, "learning_rate": 0.0001938383963900014, "loss": 0.7202, "step": 3135 }, { "epoch": 0.20141116100064144, "grad_norm": 0.8939306827167222, "learning_rate": 0.000193799647655192, "loss": 0.6326, "step": 3140 }, { "epoch": 0.20173187940987813, "grad_norm": 1.038193039895127, "learning_rate": 0.00019376078135927566, "loss": 0.5945, "step": 3145 }, { "epoch": 0.2020525978191148, "grad_norm": 0.8466700431352269, "learning_rate": 0.00019372179755096448, "loss": 0.4709, "step": 3150 }, { "epoch": 0.2023733162283515, "grad_norm": 0.8353167491615692, "learning_rate": 0.00019368269627911757, "loss": 0.6145, "step": 3155 }, { "epoch": 0.2026940346375882, "grad_norm": 0.5826569638112876, "learning_rate": 0.00019364347759274144, "loss": 0.6798, "step": 3160 }, { "epoch": 0.2030147530468249, "grad_norm": 0.6596971126256945, "learning_rate": 0.0001936041415409897, "loss": 0.7164, "step": 3165 }, { "epoch": 0.20333547145606157, "grad_norm": 1.1459761657771013, "learning_rate": 0.00019356468817316311, "loss": 0.6503, "step": 3170 }, { "epoch": 0.20365618986529826, "grad_norm": 0.6795054057142108, "learning_rate": 0.0001935251175387094, "loss": 0.624, "step": 3175 }, { "epoch": 0.20397690827453496, "grad_norm": 0.740763733162126, "learning_rate": 0.00019348542968722324, "loss": 0.6297, "step": 3180 }, { "epoch": 0.20429762668377166, "grad_norm": 0.7064796503029271, "learning_rate": 0.00019344562466844635, "loss": 0.6003, "step": 3185 }, { "epoch": 0.20461834509300833, "grad_norm": 1.6506358182547065, "learning_rate": 0.00019340570253226712, "loss": 0.4787, "step": 3190 }, { "epoch": 0.20493906350224503, "grad_norm": 1.1332295207671033, "learning_rate": 0.0001933656633287209, "loss": 0.7126, "step": 3195 }, { "epoch": 0.20525978191148173, "grad_norm": 0.617200353783866, "learning_rate": 0.00019332550710798966, "loss": 0.598, "step": 3200 }, { "epoch": 0.2055805003207184, "grad_norm": 0.868513802069887, "learning_rate": 0.000193285233920402, "loss": 0.7152, "step": 3205 }, { "epoch": 0.2059012187299551, "grad_norm": 1.1852925025104672, "learning_rate": 0.00019324484381643325, "loss": 0.7774, "step": 3210 }, { "epoch": 0.2062219371391918, "grad_norm": 1.0280680170586727, "learning_rate": 0.00019320433684670514, "loss": 0.7043, "step": 3215 }, { "epoch": 0.2065426555484285, "grad_norm": 0.6987881012001924, "learning_rate": 0.00019316371306198592, "loss": 0.7619, "step": 3220 }, { "epoch": 0.20686337395766516, "grad_norm": 0.8392027535004901, "learning_rate": 0.00019312297251319026, "loss": 0.6781, "step": 3225 }, { "epoch": 0.20718409236690186, "grad_norm": 1.2842078269698645, "learning_rate": 0.00019308211525137915, "loss": 0.7145, "step": 3230 }, { "epoch": 0.20750481077613855, "grad_norm": 0.6603411917591546, "learning_rate": 0.0001930411413277599, "loss": 0.6411, "step": 3235 }, { "epoch": 0.20782552918537525, "grad_norm": 1.3159150838945801, "learning_rate": 0.000193000050793686, "loss": 0.7067, "step": 3240 }, { "epoch": 0.20814624759461192, "grad_norm": 1.2826837962016335, "learning_rate": 0.0001929588437006571, "loss": 0.657, "step": 3245 }, { "epoch": 0.20846696600384862, "grad_norm": 0.7429467281992763, "learning_rate": 0.00019291752010031887, "loss": 0.6783, "step": 3250 }, { "epoch": 0.20878768441308532, "grad_norm": 0.9388767995389723, "learning_rate": 0.00019287608004446314, "loss": 0.6873, "step": 3255 }, { "epoch": 0.20910840282232201, "grad_norm": 0.8840070141339184, "learning_rate": 0.0001928345235850276, "loss": 0.6159, "step": 3260 }, { "epoch": 0.20942912123155868, "grad_norm": 1.0732885802726535, "learning_rate": 0.00019279285077409582, "loss": 0.6713, "step": 3265 }, { "epoch": 0.20974983964079538, "grad_norm": 0.7289657532988314, "learning_rate": 0.00019275106166389725, "loss": 0.6831, "step": 3270 }, { "epoch": 0.21007055805003208, "grad_norm": 0.6492856906135663, "learning_rate": 0.00019270915630680707, "loss": 0.7126, "step": 3275 }, { "epoch": 0.21039127645926875, "grad_norm": 0.8073736143636202, "learning_rate": 0.0001926671347553462, "loss": 0.7527, "step": 3280 }, { "epoch": 0.21071199486850545, "grad_norm": 0.8682418292741673, "learning_rate": 0.0001926249970621811, "loss": 0.5924, "step": 3285 }, { "epoch": 0.21103271327774215, "grad_norm": 0.553914766273313, "learning_rate": 0.00019258274328012384, "loss": 0.5456, "step": 3290 }, { "epoch": 0.21135343168697884, "grad_norm": 0.9718939215705609, "learning_rate": 0.00019254037346213204, "loss": 0.5976, "step": 3295 }, { "epoch": 0.2116741500962155, "grad_norm": 0.9064065621099515, "learning_rate": 0.00019249788766130863, "loss": 0.7424, "step": 3300 }, { "epoch": 0.2119948685054522, "grad_norm": 0.6693670165919959, "learning_rate": 0.00019245528593090204, "loss": 0.7834, "step": 3305 }, { "epoch": 0.2123155869146889, "grad_norm": 0.68000110275399, "learning_rate": 0.0001924125683243059, "loss": 0.8261, "step": 3310 }, { "epoch": 0.2126363053239256, "grad_norm": 0.8936655552945705, "learning_rate": 0.0001923697348950591, "loss": 0.7315, "step": 3315 }, { "epoch": 0.21295702373316228, "grad_norm": 0.9370537429273521, "learning_rate": 0.0001923267856968457, "loss": 0.6054, "step": 3320 }, { "epoch": 0.21327774214239897, "grad_norm": 1.5321045176308976, "learning_rate": 0.00019228372078349486, "loss": 0.6995, "step": 3325 }, { "epoch": 0.21359846055163567, "grad_norm": 0.8164083897600656, "learning_rate": 0.00019224054020898073, "loss": 0.7217, "step": 3330 }, { "epoch": 0.21391917896087234, "grad_norm": 0.9360751302506096, "learning_rate": 0.00019219724402742247, "loss": 0.7071, "step": 3335 }, { "epoch": 0.21423989737010904, "grad_norm": 1.1474158049320227, "learning_rate": 0.00019215383229308412, "loss": 0.696, "step": 3340 }, { "epoch": 0.21456061577934574, "grad_norm": 0.6286443687036616, "learning_rate": 0.0001921103050603745, "loss": 0.6582, "step": 3345 }, { "epoch": 0.21488133418858243, "grad_norm": 0.930008180786893, "learning_rate": 0.00019206666238384728, "loss": 0.7267, "step": 3350 }, { "epoch": 0.2152020525978191, "grad_norm": 0.8966235538817937, "learning_rate": 0.0001920229043182007, "loss": 0.7461, "step": 3355 }, { "epoch": 0.2155227710070558, "grad_norm": 0.6075118442836386, "learning_rate": 0.0001919790309182777, "loss": 0.6218, "step": 3360 }, { "epoch": 0.2158434894162925, "grad_norm": 1.120521483944113, "learning_rate": 0.00019193504223906577, "loss": 0.7854, "step": 3365 }, { "epoch": 0.2161642078255292, "grad_norm": 0.7536443555714086, "learning_rate": 0.00019189093833569686, "loss": 0.6665, "step": 3370 }, { "epoch": 0.21648492623476587, "grad_norm": 0.7306155955546904, "learning_rate": 0.00019184671926344732, "loss": 0.5562, "step": 3375 }, { "epoch": 0.21680564464400257, "grad_norm": 1.4066089443224215, "learning_rate": 0.00019180238507773788, "loss": 0.7206, "step": 3380 }, { "epoch": 0.21712636305323926, "grad_norm": 1.0420087314885336, "learning_rate": 0.0001917579358341335, "loss": 0.8488, "step": 3385 }, { "epoch": 0.21744708146247593, "grad_norm": 1.24092779077047, "learning_rate": 0.0001917133715883434, "loss": 0.7737, "step": 3390 }, { "epoch": 0.21776779987171263, "grad_norm": 1.2683256948043233, "learning_rate": 0.00019166869239622085, "loss": 0.5991, "step": 3395 }, { "epoch": 0.21808851828094933, "grad_norm": 1.0154708506536307, "learning_rate": 0.0001916238983137633, "loss": 0.6921, "step": 3400 }, { "epoch": 0.21840923669018603, "grad_norm": 1.250860867590444, "learning_rate": 0.00019157898939711212, "loss": 0.772, "step": 3405 }, { "epoch": 0.2187299550994227, "grad_norm": 1.0205976247637063, "learning_rate": 0.0001915339657025526, "loss": 0.6262, "step": 3410 }, { "epoch": 0.2190506735086594, "grad_norm": 0.6808470166264919, "learning_rate": 0.0001914888272865139, "loss": 0.5628, "step": 3415 }, { "epoch": 0.2193713919178961, "grad_norm": 1.0460679318245396, "learning_rate": 0.00019144357420556893, "loss": 0.6497, "step": 3420 }, { "epoch": 0.2196921103271328, "grad_norm": 0.8912439646989759, "learning_rate": 0.00019139820651643442, "loss": 0.5868, "step": 3425 }, { "epoch": 0.22001282873636946, "grad_norm": 0.6690277429678054, "learning_rate": 0.00019135272427597063, "loss": 0.6833, "step": 3430 }, { "epoch": 0.22033354714560616, "grad_norm": 1.0200781753500376, "learning_rate": 0.00019130712754118138, "loss": 0.6225, "step": 3435 }, { "epoch": 0.22065426555484285, "grad_norm": 1.0186432727769665, "learning_rate": 0.00019126141636921414, "loss": 0.769, "step": 3440 }, { "epoch": 0.22097498396407952, "grad_norm": 0.671761473616358, "learning_rate": 0.0001912155908173596, "loss": 0.6917, "step": 3445 }, { "epoch": 0.22129570237331622, "grad_norm": 0.7493482108843831, "learning_rate": 0.00019116965094305197, "loss": 0.7762, "step": 3450 }, { "epoch": 0.22161642078255292, "grad_norm": 0.9676529237022933, "learning_rate": 0.00019112359680386863, "loss": 0.6426, "step": 3455 }, { "epoch": 0.22193713919178962, "grad_norm": 0.7117654744699794, "learning_rate": 0.00019107742845753025, "loss": 0.6968, "step": 3460 }, { "epoch": 0.2222578576010263, "grad_norm": 1.0489562483489054, "learning_rate": 0.0001910311459619006, "loss": 0.7852, "step": 3465 }, { "epoch": 0.22257857601026299, "grad_norm": 0.7103830582474117, "learning_rate": 0.00019098474937498652, "loss": 0.6496, "step": 3470 }, { "epoch": 0.22289929441949968, "grad_norm": 1.1088261693908699, "learning_rate": 0.00019093823875493784, "loss": 0.7313, "step": 3475 }, { "epoch": 0.22322001282873638, "grad_norm": 1.1659589438084368, "learning_rate": 0.00019089161416004733, "loss": 0.6526, "step": 3480 }, { "epoch": 0.22354073123797305, "grad_norm": 0.7493230462026259, "learning_rate": 0.0001908448756487506, "loss": 0.6629, "step": 3485 }, { "epoch": 0.22386144964720975, "grad_norm": 0.8650060759204274, "learning_rate": 0.000190798023279626, "loss": 0.7321, "step": 3490 }, { "epoch": 0.22418216805644645, "grad_norm": 0.8002336983221607, "learning_rate": 0.0001907510571113946, "loss": 0.7816, "step": 3495 }, { "epoch": 0.22450288646568314, "grad_norm": 0.6840069838552998, "learning_rate": 0.00019070397720292014, "loss": 0.6472, "step": 3500 }, { "epoch": 0.2248236048749198, "grad_norm": 0.9253534124109082, "learning_rate": 0.0001906567836132089, "loss": 0.7952, "step": 3505 }, { "epoch": 0.2251443232841565, "grad_norm": 0.8707427934510977, "learning_rate": 0.0001906094764014095, "loss": 0.6403, "step": 3510 }, { "epoch": 0.2254650416933932, "grad_norm": 0.8952137846177877, "learning_rate": 0.00019056205562681324, "loss": 0.7713, "step": 3515 }, { "epoch": 0.22578576010262988, "grad_norm": 1.2157321282590767, "learning_rate": 0.00019051452134885346, "loss": 0.7791, "step": 3520 }, { "epoch": 0.22610647851186658, "grad_norm": 1.1942747630269164, "learning_rate": 0.000190466873627106, "loss": 0.7107, "step": 3525 }, { "epoch": 0.22642719692110327, "grad_norm": 0.7534228887260359, "learning_rate": 0.00019041911252128864, "loss": 0.7748, "step": 3530 }, { "epoch": 0.22674791533033997, "grad_norm": 0.7020738108193582, "learning_rate": 0.0001903712380912615, "loss": 0.641, "step": 3535 }, { "epoch": 0.22706863373957664, "grad_norm": 0.8822584692031392, "learning_rate": 0.0001903232503970266, "loss": 0.7302, "step": 3540 }, { "epoch": 0.22738935214881334, "grad_norm": 0.7669563154301963, "learning_rate": 0.00019027514949872794, "loss": 0.6305, "step": 3545 }, { "epoch": 0.22771007055805004, "grad_norm": 0.75341665833547, "learning_rate": 0.0001902269354566514, "loss": 0.5966, "step": 3550 }, { "epoch": 0.22803078896728673, "grad_norm": 1.3621102982113154, "learning_rate": 0.00019017860833122466, "loss": 0.7256, "step": 3555 }, { "epoch": 0.2283515073765234, "grad_norm": 0.6413371506739955, "learning_rate": 0.00019013016818301718, "loss": 0.7576, "step": 3560 }, { "epoch": 0.2286722257857601, "grad_norm": 0.9240762303756279, "learning_rate": 0.00019008161507274004, "loss": 0.6412, "step": 3565 }, { "epoch": 0.2289929441949968, "grad_norm": 0.600216888507175, "learning_rate": 0.0001900329490612458, "loss": 0.6077, "step": 3570 }, { "epoch": 0.22931366260423347, "grad_norm": 0.7764633127488129, "learning_rate": 0.0001899841702095287, "loss": 0.7296, "step": 3575 }, { "epoch": 0.22963438101347017, "grad_norm": 0.8982484209272996, "learning_rate": 0.00018993527857872437, "loss": 0.7016, "step": 3580 }, { "epoch": 0.22995509942270687, "grad_norm": 1.0720659350142319, "learning_rate": 0.0001898862742301096, "loss": 0.7538, "step": 3585 }, { "epoch": 0.23027581783194356, "grad_norm": 1.1146855770453603, "learning_rate": 0.00018983715722510267, "loss": 0.7336, "step": 3590 }, { "epoch": 0.23059653624118023, "grad_norm": 1.0183157286000422, "learning_rate": 0.00018978792762526297, "loss": 0.7608, "step": 3595 }, { "epoch": 0.23091725465041693, "grad_norm": 0.5987067875621542, "learning_rate": 0.000189738585492291, "loss": 0.7482, "step": 3600 }, { "epoch": 0.23123797305965363, "grad_norm": 1.2051854914953493, "learning_rate": 0.0001896891308880283, "loss": 0.6866, "step": 3605 }, { "epoch": 0.23155869146889033, "grad_norm": 0.6469997389423526, "learning_rate": 0.00018963956387445743, "loss": 0.5533, "step": 3610 }, { "epoch": 0.231879409878127, "grad_norm": 0.751435050187464, "learning_rate": 0.00018958988451370172, "loss": 0.5345, "step": 3615 }, { "epoch": 0.2322001282873637, "grad_norm": 0.9296699512717883, "learning_rate": 0.00018954009286802545, "loss": 0.6395, "step": 3620 }, { "epoch": 0.2325208466966004, "grad_norm": 0.8523320100136826, "learning_rate": 0.0001894901889998335, "loss": 0.6699, "step": 3625 }, { "epoch": 0.23284156510583706, "grad_norm": 0.8927205659717501, "learning_rate": 0.0001894401729716715, "loss": 0.7016, "step": 3630 }, { "epoch": 0.23316228351507376, "grad_norm": 0.9773519130062428, "learning_rate": 0.00018939004484622556, "loss": 0.5938, "step": 3635 }, { "epoch": 0.23348300192431046, "grad_norm": 1.205672119851859, "learning_rate": 0.00018933980468632236, "loss": 0.6659, "step": 3640 }, { "epoch": 0.23380372033354715, "grad_norm": 0.7579640404532227, "learning_rate": 0.00018928945255492898, "loss": 0.6189, "step": 3645 }, { "epoch": 0.23412443874278382, "grad_norm": 0.7167559954703847, "learning_rate": 0.0001892389885151528, "loss": 0.7174, "step": 3650 }, { "epoch": 0.23444515715202052, "grad_norm": 0.9211676236510546, "learning_rate": 0.0001891884126302415, "loss": 0.7194, "step": 3655 }, { "epoch": 0.23476587556125722, "grad_norm": 1.0264289808335763, "learning_rate": 0.00018913772496358293, "loss": 0.7518, "step": 3660 }, { "epoch": 0.23508659397049392, "grad_norm": 0.7037785727516465, "learning_rate": 0.000189086925578705, "loss": 0.6463, "step": 3665 }, { "epoch": 0.2354073123797306, "grad_norm": 0.7939519982595736, "learning_rate": 0.0001890360145392757, "loss": 0.6679, "step": 3670 }, { "epoch": 0.23572803078896729, "grad_norm": 0.9346634485226615, "learning_rate": 0.00018898499190910285, "loss": 0.6707, "step": 3675 }, { "epoch": 0.23604874919820398, "grad_norm": 0.9205144038862676, "learning_rate": 0.00018893385775213428, "loss": 0.5932, "step": 3680 }, { "epoch": 0.23636946760744068, "grad_norm": 0.7662986014450179, "learning_rate": 0.00018888261213245751, "loss": 0.626, "step": 3685 }, { "epoch": 0.23669018601667735, "grad_norm": 0.9540864146877855, "learning_rate": 0.00018883125511429976, "loss": 0.6775, "step": 3690 }, { "epoch": 0.23701090442591405, "grad_norm": 0.8236472390358622, "learning_rate": 0.0001887797867620279, "loss": 0.5783, "step": 3695 }, { "epoch": 0.23733162283515075, "grad_norm": 1.1046319576589374, "learning_rate": 0.00018872820714014828, "loss": 0.7178, "step": 3700 }, { "epoch": 0.23765234124438742, "grad_norm": 0.8687058181792315, "learning_rate": 0.0001886765163133068, "loss": 0.7188, "step": 3705 }, { "epoch": 0.2379730596536241, "grad_norm": 0.8074055463421766, "learning_rate": 0.0001886247143462886, "loss": 0.6839, "step": 3710 }, { "epoch": 0.2382937780628608, "grad_norm": 0.9477091526553252, "learning_rate": 0.0001885728013040183, "loss": 0.694, "step": 3715 }, { "epoch": 0.2386144964720975, "grad_norm": 1.4070444194213776, "learning_rate": 0.00018852077725155955, "loss": 0.6443, "step": 3720 }, { "epoch": 0.23893521488133418, "grad_norm": 0.7885481772614231, "learning_rate": 0.00018846864225411522, "loss": 0.6975, "step": 3725 }, { "epoch": 0.23925593329057088, "grad_norm": 1.416662073982706, "learning_rate": 0.0001884163963770272, "loss": 0.5101, "step": 3730 }, { "epoch": 0.23957665169980757, "grad_norm": 1.1458969994696415, "learning_rate": 0.00018836403968577642, "loss": 0.6615, "step": 3735 }, { "epoch": 0.23989737010904427, "grad_norm": 0.8353107592687541, "learning_rate": 0.00018831157224598265, "loss": 0.6361, "step": 3740 }, { "epoch": 0.24021808851828094, "grad_norm": 0.9588837283118316, "learning_rate": 0.0001882589941234044, "loss": 0.6013, "step": 3745 }, { "epoch": 0.24053880692751764, "grad_norm": 0.9378372320194371, "learning_rate": 0.00018820630538393896, "loss": 0.6638, "step": 3750 }, { "epoch": 0.24085952533675434, "grad_norm": 0.657630819098, "learning_rate": 0.0001881535060936223, "loss": 0.6291, "step": 3755 }, { "epoch": 0.241180243745991, "grad_norm": 0.8483718480641205, "learning_rate": 0.00018810059631862885, "loss": 0.7489, "step": 3760 }, { "epoch": 0.2415009621552277, "grad_norm": 0.6502718844446955, "learning_rate": 0.0001880475761252716, "loss": 0.7414, "step": 3765 }, { "epoch": 0.2418216805644644, "grad_norm": 1.1168778404379636, "learning_rate": 0.00018799444558000188, "loss": 0.5148, "step": 3770 }, { "epoch": 0.2421423989737011, "grad_norm": 0.7913864245267141, "learning_rate": 0.00018794120474940936, "loss": 0.7854, "step": 3775 }, { "epoch": 0.24246311738293777, "grad_norm": 0.6448828952136001, "learning_rate": 0.00018788785370022187, "loss": 0.7078, "step": 3780 }, { "epoch": 0.24278383579217447, "grad_norm": 1.5060141096885609, "learning_rate": 0.00018783439249930544, "loss": 0.6149, "step": 3785 }, { "epoch": 0.24310455420141117, "grad_norm": 1.1449759900992198, "learning_rate": 0.00018778082121366415, "loss": 0.6848, "step": 3790 }, { "epoch": 0.24342527261064786, "grad_norm": 0.8978384550293506, "learning_rate": 0.00018772713991044006, "loss": 0.5786, "step": 3795 }, { "epoch": 0.24374599101988453, "grad_norm": 1.0307173194583823, "learning_rate": 0.0001876733486569131, "loss": 0.6089, "step": 3800 }, { "epoch": 0.24406670942912123, "grad_norm": 1.0460496173819018, "learning_rate": 0.00018761944752050092, "loss": 0.7205, "step": 3805 }, { "epoch": 0.24438742783835793, "grad_norm": 0.7905784500183457, "learning_rate": 0.00018756543656875903, "loss": 0.6866, "step": 3810 }, { "epoch": 0.2447081462475946, "grad_norm": 0.8146037687112702, "learning_rate": 0.0001875113158693805, "loss": 0.6722, "step": 3815 }, { "epoch": 0.2450288646568313, "grad_norm": 0.6700527883378358, "learning_rate": 0.00018745708549019598, "loss": 0.69, "step": 3820 }, { "epoch": 0.245349583066068, "grad_norm": 0.86059539710882, "learning_rate": 0.00018740274549917355, "loss": 0.6951, "step": 3825 }, { "epoch": 0.2456703014753047, "grad_norm": 0.754486021920581, "learning_rate": 0.00018734829596441869, "loss": 0.669, "step": 3830 }, { "epoch": 0.24599101988454136, "grad_norm": 1.2671234138000913, "learning_rate": 0.00018729373695417411, "loss": 0.53, "step": 3835 }, { "epoch": 0.24631173829377806, "grad_norm": 0.6932982987761634, "learning_rate": 0.0001872390685368199, "loss": 0.6588, "step": 3840 }, { "epoch": 0.24663245670301476, "grad_norm": 0.8973942648351731, "learning_rate": 0.00018718429078087306, "loss": 0.759, "step": 3845 }, { "epoch": 0.24695317511225146, "grad_norm": 0.8232879633687452, "learning_rate": 0.00018712940375498777, "loss": 0.7228, "step": 3850 }, { "epoch": 0.24727389352148813, "grad_norm": 0.6326649992249508, "learning_rate": 0.0001870744075279551, "loss": 0.7392, "step": 3855 }, { "epoch": 0.24759461193072482, "grad_norm": 1.097141467166474, "learning_rate": 0.000187019302168703, "loss": 0.6787, "step": 3860 }, { "epoch": 0.24791533033996152, "grad_norm": 0.3009107744843191, "learning_rate": 0.00018696408774629623, "loss": 0.5101, "step": 3865 }, { "epoch": 0.2482360487491982, "grad_norm": 0.8763665765416497, "learning_rate": 0.00018690876432993616, "loss": 0.6693, "step": 3870 }, { "epoch": 0.2485567671584349, "grad_norm": 0.8358957515633696, "learning_rate": 0.00018685333198896085, "loss": 0.4624, "step": 3875 }, { "epoch": 0.24887748556767159, "grad_norm": 0.7954157351888587, "learning_rate": 0.00018679779079284478, "loss": 0.6448, "step": 3880 }, { "epoch": 0.24919820397690828, "grad_norm": 0.8015671945298257, "learning_rate": 0.00018674214081119899, "loss": 0.7378, "step": 3885 }, { "epoch": 0.24951892238614495, "grad_norm": 0.4176253877935304, "learning_rate": 0.00018668638211377075, "loss": 0.6243, "step": 3890 }, { "epoch": 0.24983964079538165, "grad_norm": 0.9442754652275936, "learning_rate": 0.00018663051477044363, "loss": 0.7179, "step": 3895 }, { "epoch": 0.2501603592046183, "grad_norm": 0.4823245844586911, "learning_rate": 0.00018657453885123743, "loss": 0.6911, "step": 3900 }, { "epoch": 0.250481077613855, "grad_norm": 1.2379921804802545, "learning_rate": 0.00018651845442630788, "loss": 0.7287, "step": 3905 }, { "epoch": 0.2508017960230917, "grad_norm": 0.8025900155844875, "learning_rate": 0.00018646226156594683, "loss": 0.6996, "step": 3910 }, { "epoch": 0.2511225144323284, "grad_norm": 0.7107570481507937, "learning_rate": 0.00018640596034058202, "loss": 0.6547, "step": 3915 }, { "epoch": 0.2514432328415651, "grad_norm": 1.0641358272949475, "learning_rate": 0.00018634955082077694, "loss": 0.6644, "step": 3920 }, { "epoch": 0.2517639512508018, "grad_norm": 0.47480734009901776, "learning_rate": 0.00018629303307723087, "loss": 0.573, "step": 3925 }, { "epoch": 0.2520846696600385, "grad_norm": 0.793188561410365, "learning_rate": 0.0001862364071807787, "loss": 0.5214, "step": 3930 }, { "epoch": 0.25240538806927515, "grad_norm": 1.0592935580458442, "learning_rate": 0.00018617967320239088, "loss": 0.7271, "step": 3935 }, { "epoch": 0.25272610647851185, "grad_norm": 1.2256726599433683, "learning_rate": 0.00018612283121317334, "loss": 0.6422, "step": 3940 }, { "epoch": 0.25304682488774854, "grad_norm": 0.7519903384129473, "learning_rate": 0.00018606588128436733, "loss": 0.5867, "step": 3945 }, { "epoch": 0.25336754329698524, "grad_norm": 0.7245403184900441, "learning_rate": 0.00018600882348734942, "loss": 0.595, "step": 3950 }, { "epoch": 0.25368826170622194, "grad_norm": 0.8118238034713691, "learning_rate": 0.0001859516578936314, "loss": 0.6789, "step": 3955 }, { "epoch": 0.25400898011545864, "grad_norm": 0.94671989401086, "learning_rate": 0.0001858943845748601, "loss": 0.5563, "step": 3960 }, { "epoch": 0.25432969852469534, "grad_norm": 1.2366250568429358, "learning_rate": 0.00018583700360281743, "loss": 0.7508, "step": 3965 }, { "epoch": 0.25465041693393203, "grad_norm": 0.79253106009907, "learning_rate": 0.00018577951504942014, "loss": 0.8067, "step": 3970 }, { "epoch": 0.2549711353431687, "grad_norm": 0.8702530726486416, "learning_rate": 0.0001857219189867199, "loss": 0.617, "step": 3975 }, { "epoch": 0.2552918537524054, "grad_norm": 1.0941049074741396, "learning_rate": 0.0001856642154869031, "loss": 0.6722, "step": 3980 }, { "epoch": 0.25561257216164207, "grad_norm": 0.8439431895631772, "learning_rate": 0.00018560640462229072, "loss": 0.4939, "step": 3985 }, { "epoch": 0.25593329057087877, "grad_norm": 0.6351905484581176, "learning_rate": 0.00018554848646533842, "loss": 0.6447, "step": 3990 }, { "epoch": 0.25625400898011547, "grad_norm": 0.5405523691592523, "learning_rate": 0.00018549046108863623, "loss": 0.619, "step": 3995 }, { "epoch": 0.25657472738935216, "grad_norm": 0.9663208760661458, "learning_rate": 0.00018543232856490857, "loss": 0.7077, "step": 4000 }, { "epoch": 0.25689544579858886, "grad_norm": 1.1847646315539586, "learning_rate": 0.00018537408896701426, "loss": 0.645, "step": 4005 }, { "epoch": 0.2572161642078255, "grad_norm": 0.9615403982388305, "learning_rate": 0.00018531574236794614, "loss": 0.6811, "step": 4010 }, { "epoch": 0.2575368826170622, "grad_norm": 0.8358212875135942, "learning_rate": 0.0001852572888408313, "loss": 0.7614, "step": 4015 }, { "epoch": 0.2578576010262989, "grad_norm": 0.654849517944886, "learning_rate": 0.00018519872845893084, "loss": 0.7217, "step": 4020 }, { "epoch": 0.2581783194355356, "grad_norm": 1.2575079996892056, "learning_rate": 0.00018514006129563966, "loss": 0.6607, "step": 4025 }, { "epoch": 0.2584990378447723, "grad_norm": 0.9922068320402926, "learning_rate": 0.00018508128742448664, "loss": 0.837, "step": 4030 }, { "epoch": 0.258819756254009, "grad_norm": 0.6769732353504583, "learning_rate": 0.00018502240691913423, "loss": 0.5391, "step": 4035 }, { "epoch": 0.2591404746632457, "grad_norm": 1.0085400425349142, "learning_rate": 0.00018496341985337872, "loss": 0.6348, "step": 4040 }, { "epoch": 0.2594611930724824, "grad_norm": 1.0848700957447277, "learning_rate": 0.00018490432630114987, "loss": 0.6778, "step": 4045 }, { "epoch": 0.25978191148171903, "grad_norm": 2.0271957707532953, "learning_rate": 0.00018484512633651083, "loss": 0.654, "step": 4050 }, { "epoch": 0.2601026298909557, "grad_norm": 0.7805695373329654, "learning_rate": 0.00018478582003365822, "loss": 0.7096, "step": 4055 }, { "epoch": 0.2604233483001924, "grad_norm": 0.9870035129297559, "learning_rate": 0.0001847264074669219, "loss": 0.6384, "step": 4060 }, { "epoch": 0.2607440667094291, "grad_norm": 1.4231275295206969, "learning_rate": 0.00018466688871076492, "loss": 0.7516, "step": 4065 }, { "epoch": 0.2610647851186658, "grad_norm": 0.9526984436593213, "learning_rate": 0.00018460726383978337, "loss": 0.7593, "step": 4070 }, { "epoch": 0.2613855035279025, "grad_norm": 0.8092373561884175, "learning_rate": 0.00018454753292870645, "loss": 0.7056, "step": 4075 }, { "epoch": 0.2617062219371392, "grad_norm": 1.0372403017182314, "learning_rate": 0.0001844876960523961, "loss": 0.7301, "step": 4080 }, { "epoch": 0.26202694034637586, "grad_norm": 1.0864230414581424, "learning_rate": 0.0001844277532858472, "loss": 0.7108, "step": 4085 }, { "epoch": 0.26234765875561256, "grad_norm": 1.1180610427980169, "learning_rate": 0.00018436770470418734, "loss": 0.6945, "step": 4090 }, { "epoch": 0.26266837716484925, "grad_norm": 0.7213205274182185, "learning_rate": 0.00018430755038267664, "loss": 0.5532, "step": 4095 }, { "epoch": 0.26298909557408595, "grad_norm": 1.1163686122257008, "learning_rate": 0.00018424729039670786, "loss": 0.6516, "step": 4100 }, { "epoch": 0.26330981398332265, "grad_norm": 1.2583036183921432, "learning_rate": 0.00018418692482180605, "loss": 0.6414, "step": 4105 }, { "epoch": 0.26363053239255935, "grad_norm": 0.9930140372439703, "learning_rate": 0.0001841264537336287, "loss": 0.6207, "step": 4110 }, { "epoch": 0.26395125080179604, "grad_norm": 1.0089622154428168, "learning_rate": 0.00018406587720796555, "loss": 0.584, "step": 4115 }, { "epoch": 0.2642719692110327, "grad_norm": 0.7458841041229098, "learning_rate": 0.00018400519532073845, "loss": 0.5883, "step": 4120 }, { "epoch": 0.2645926876202694, "grad_norm": 0.8089823917563255, "learning_rate": 0.0001839444081480013, "loss": 0.7034, "step": 4125 }, { "epoch": 0.2649134060295061, "grad_norm": 0.6692062310802624, "learning_rate": 0.00018388351576594, "loss": 0.6344, "step": 4130 }, { "epoch": 0.2652341244387428, "grad_norm": 1.1933403776576017, "learning_rate": 0.0001838225182508722, "loss": 0.6661, "step": 4135 }, { "epoch": 0.2655548428479795, "grad_norm": 0.8440572180162611, "learning_rate": 0.00018376141567924746, "loss": 0.748, "step": 4140 }, { "epoch": 0.2658755612572162, "grad_norm": 0.8186841087339073, "learning_rate": 0.0001837002081276469, "loss": 0.7713, "step": 4145 }, { "epoch": 0.2661962796664529, "grad_norm": 1.0666433490645642, "learning_rate": 0.0001836388956727833, "loss": 0.8609, "step": 4150 }, { "epoch": 0.26651699807568957, "grad_norm": 1.1355241254608384, "learning_rate": 0.00018357747839150082, "loss": 0.6469, "step": 4155 }, { "epoch": 0.2668377164849262, "grad_norm": 0.7464964673319473, "learning_rate": 0.00018351595636077509, "loss": 0.5979, "step": 4160 }, { "epoch": 0.2671584348941629, "grad_norm": 0.8983502422541593, "learning_rate": 0.00018345432965771296, "loss": 0.6956, "step": 4165 }, { "epoch": 0.2674791533033996, "grad_norm": 1.0667530685360391, "learning_rate": 0.00018339259835955252, "loss": 0.613, "step": 4170 }, { "epoch": 0.2677998717126363, "grad_norm": 0.9132017699113576, "learning_rate": 0.00018333076254366292, "loss": 0.7377, "step": 4175 }, { "epoch": 0.268120590121873, "grad_norm": 0.820877622590415, "learning_rate": 0.0001832688222875443, "loss": 0.6287, "step": 4180 }, { "epoch": 0.2684413085311097, "grad_norm": 1.118619920969021, "learning_rate": 0.00018320677766882777, "loss": 0.6384, "step": 4185 }, { "epoch": 0.2687620269403464, "grad_norm": 1.4366554572404993, "learning_rate": 0.00018314462876527508, "loss": 0.6833, "step": 4190 }, { "epoch": 0.26908274534958304, "grad_norm": 1.0835964639148083, "learning_rate": 0.00018308237565477887, "loss": 0.5727, "step": 4195 }, { "epoch": 0.26940346375881974, "grad_norm": 0.9256686315486947, "learning_rate": 0.00018302001841536222, "loss": 0.6766, "step": 4200 }, { "epoch": 0.26972418216805644, "grad_norm": 0.9133924374197757, "learning_rate": 0.00018295755712517887, "loss": 0.6114, "step": 4205 }, { "epoch": 0.27004490057729313, "grad_norm": 0.9886601065235708, "learning_rate": 0.00018289499186251282, "loss": 0.6487, "step": 4210 }, { "epoch": 0.27036561898652983, "grad_norm": 0.7921503565458989, "learning_rate": 0.00018283232270577854, "loss": 0.5979, "step": 4215 }, { "epoch": 0.27068633739576653, "grad_norm": 0.6150099468882971, "learning_rate": 0.00018276954973352053, "loss": 0.6981, "step": 4220 }, { "epoch": 0.2710070558050032, "grad_norm": 1.0834800425960802, "learning_rate": 0.00018270667302441355, "loss": 0.5754, "step": 4225 }, { "epoch": 0.27132777421423987, "grad_norm": 1.6569395813805736, "learning_rate": 0.00018264369265726232, "loss": 0.6754, "step": 4230 }, { "epoch": 0.27164849262347657, "grad_norm": 1.1904706994873762, "learning_rate": 0.0001825806087110015, "loss": 0.6955, "step": 4235 }, { "epoch": 0.27196921103271327, "grad_norm": 0.9036845887010689, "learning_rate": 0.00018251742126469553, "loss": 0.6245, "step": 4240 }, { "epoch": 0.27228992944194996, "grad_norm": 1.2154289806047023, "learning_rate": 0.00018245413039753858, "loss": 0.6966, "step": 4245 }, { "epoch": 0.27261064785118666, "grad_norm": 0.7781670764658554, "learning_rate": 0.00018239073618885447, "loss": 0.5014, "step": 4250 }, { "epoch": 0.27293136626042336, "grad_norm": 0.9312674308580604, "learning_rate": 0.00018232723871809654, "loss": 0.7177, "step": 4255 }, { "epoch": 0.27325208466966006, "grad_norm": 0.7997579086131462, "learning_rate": 0.00018226363806484749, "loss": 0.6622, "step": 4260 }, { "epoch": 0.27357280307889675, "grad_norm": 1.1414064891921076, "learning_rate": 0.00018219993430881935, "loss": 0.7326, "step": 4265 }, { "epoch": 0.2738935214881334, "grad_norm": 0.8488220516302005, "learning_rate": 0.00018213612752985346, "loss": 0.6111, "step": 4270 }, { "epoch": 0.2742142398973701, "grad_norm": 0.6785943182404776, "learning_rate": 0.00018207221780792022, "loss": 0.568, "step": 4275 }, { "epoch": 0.2745349583066068, "grad_norm": 0.7407135493281501, "learning_rate": 0.00018200820522311907, "loss": 0.9428, "step": 4280 }, { "epoch": 0.2748556767158435, "grad_norm": 0.7785838981084623, "learning_rate": 0.00018194408985567826, "loss": 0.6602, "step": 4285 }, { "epoch": 0.2751763951250802, "grad_norm": 1.3274741440702664, "learning_rate": 0.00018187987178595506, "loss": 0.6326, "step": 4290 }, { "epoch": 0.2754971135343169, "grad_norm": 0.7698326162883183, "learning_rate": 0.00018181555109443527, "loss": 0.7828, "step": 4295 }, { "epoch": 0.2758178319435536, "grad_norm": 0.9874438661020553, "learning_rate": 0.00018175112786173345, "loss": 0.6177, "step": 4300 }, { "epoch": 0.2761385503527902, "grad_norm": 1.2983806783457539, "learning_rate": 0.0001816866021685926, "loss": 0.5931, "step": 4305 }, { "epoch": 0.2764592687620269, "grad_norm": 0.6650133276949847, "learning_rate": 0.00018162197409588414, "loss": 0.6065, "step": 4310 }, { "epoch": 0.2767799871712636, "grad_norm": 0.6615532414642794, "learning_rate": 0.0001815572437246078, "loss": 0.6777, "step": 4315 }, { "epoch": 0.2771007055805003, "grad_norm": 0.9856674878658384, "learning_rate": 0.00018149241113589158, "loss": 0.7992, "step": 4320 }, { "epoch": 0.277421423989737, "grad_norm": 0.9736624117716728, "learning_rate": 0.00018142747641099156, "loss": 0.6433, "step": 4325 }, { "epoch": 0.2777421423989737, "grad_norm": 0.6411826659070557, "learning_rate": 0.00018136243963129176, "loss": 0.6934, "step": 4330 }, { "epoch": 0.2780628608082104, "grad_norm": 1.1535749419623087, "learning_rate": 0.00018129730087830423, "loss": 0.6763, "step": 4335 }, { "epoch": 0.2783835792174471, "grad_norm": 0.9545043501616219, "learning_rate": 0.00018123206023366875, "loss": 0.6913, "step": 4340 }, { "epoch": 0.27870429762668375, "grad_norm": 0.8726709507710128, "learning_rate": 0.00018116671777915279, "loss": 0.6719, "step": 4345 }, { "epoch": 0.27902501603592045, "grad_norm": 0.8365717106126314, "learning_rate": 0.00018110127359665144, "loss": 0.8124, "step": 4350 }, { "epoch": 0.27934573444515715, "grad_norm": 1.2549482014888076, "learning_rate": 0.00018103572776818734, "loss": 0.6818, "step": 4355 }, { "epoch": 0.27966645285439384, "grad_norm": 1.0842835676700455, "learning_rate": 0.00018097008037591046, "loss": 0.6671, "step": 4360 }, { "epoch": 0.27998717126363054, "grad_norm": 0.9380406537541407, "learning_rate": 0.00018090433150209809, "loss": 0.6949, "step": 4365 }, { "epoch": 0.28030788967286724, "grad_norm": 1.150794578223368, "learning_rate": 0.00018083848122915468, "loss": 0.7515, "step": 4370 }, { "epoch": 0.28062860808210394, "grad_norm": 0.8083227750174746, "learning_rate": 0.0001807725296396118, "loss": 0.7616, "step": 4375 }, { "epoch": 0.2809493264913406, "grad_norm": 0.7534176713677331, "learning_rate": 0.000180706476816128, "loss": 0.7793, "step": 4380 }, { "epoch": 0.2812700449005773, "grad_norm": 0.8339195487244033, "learning_rate": 0.00018064032284148868, "loss": 0.6498, "step": 4385 }, { "epoch": 0.281590763309814, "grad_norm": 1.0737472499663367, "learning_rate": 0.00018057406779860603, "loss": 0.717, "step": 4390 }, { "epoch": 0.28191148171905067, "grad_norm": 0.9978477560799941, "learning_rate": 0.00018050771177051896, "loss": 0.5892, "step": 4395 }, { "epoch": 0.28223220012828737, "grad_norm": 1.3027101386742324, "learning_rate": 0.00018044125484039284, "loss": 0.7084, "step": 4400 }, { "epoch": 0.28255291853752407, "grad_norm": 0.930029771124351, "learning_rate": 0.0001803746970915196, "loss": 0.6916, "step": 4405 }, { "epoch": 0.28287363694676076, "grad_norm": 0.7778850969886842, "learning_rate": 0.00018030803860731744, "loss": 0.7685, "step": 4410 }, { "epoch": 0.2831943553559974, "grad_norm": 0.7650986542927773, "learning_rate": 0.00018024127947133096, "loss": 0.6537, "step": 4415 }, { "epoch": 0.2835150737652341, "grad_norm": 1.5408988991120984, "learning_rate": 0.00018017441976723073, "loss": 0.7775, "step": 4420 }, { "epoch": 0.2838357921744708, "grad_norm": 1.2912216339714508, "learning_rate": 0.0001801074595788135, "loss": 0.6968, "step": 4425 }, { "epoch": 0.2841565105837075, "grad_norm": 1.0528277674684878, "learning_rate": 0.00018004039899000186, "loss": 0.6352, "step": 4430 }, { "epoch": 0.2844772289929442, "grad_norm": 0.9968577641995723, "learning_rate": 0.00017997323808484434, "loss": 0.681, "step": 4435 }, { "epoch": 0.2847979474021809, "grad_norm": 0.7048566927661232, "learning_rate": 0.0001799059769475151, "loss": 0.589, "step": 4440 }, { "epoch": 0.2851186658114176, "grad_norm": 1.2752536855080614, "learning_rate": 0.00017983861566231397, "loss": 0.6021, "step": 4445 }, { "epoch": 0.2854393842206543, "grad_norm": 0.6838772733375945, "learning_rate": 0.0001797711543136663, "loss": 0.62, "step": 4450 }, { "epoch": 0.28576010262989093, "grad_norm": 1.0992940781905054, "learning_rate": 0.00017970359298612282, "loss": 0.7695, "step": 4455 }, { "epoch": 0.28608082103912763, "grad_norm": 0.9891320713998334, "learning_rate": 0.00017963593176435964, "loss": 0.7417, "step": 4460 }, { "epoch": 0.28640153944836433, "grad_norm": 1.0219509493165506, "learning_rate": 0.00017956817073317793, "loss": 0.8078, "step": 4465 }, { "epoch": 0.286722257857601, "grad_norm": 0.601838514745307, "learning_rate": 0.00017950030997750414, "loss": 0.6521, "step": 4470 }, { "epoch": 0.2870429762668377, "grad_norm": 0.6658616403524804, "learning_rate": 0.00017943234958238952, "loss": 0.4757, "step": 4475 }, { "epoch": 0.2873636946760744, "grad_norm": 1.007316511383742, "learning_rate": 0.00017936428963301036, "loss": 0.7311, "step": 4480 }, { "epoch": 0.2876844130853111, "grad_norm": 1.1189936485732135, "learning_rate": 0.00017929613021466765, "loss": 0.6303, "step": 4485 }, { "epoch": 0.28800513149454776, "grad_norm": 0.7720709103171642, "learning_rate": 0.000179227871412787, "loss": 0.5517, "step": 4490 }, { "epoch": 0.28832584990378446, "grad_norm": 0.840259961080622, "learning_rate": 0.00017915951331291864, "loss": 0.7003, "step": 4495 }, { "epoch": 0.28864656831302116, "grad_norm": 0.7950998217641071, "learning_rate": 0.00017909105600073726, "loss": 0.6693, "step": 4500 }, { "epoch": 0.28896728672225785, "grad_norm": 0.8828219239731676, "learning_rate": 0.00017902249956204183, "loss": 0.613, "step": 4505 }, { "epoch": 0.28928800513149455, "grad_norm": 0.8050366826668545, "learning_rate": 0.0001789538440827557, "loss": 0.5657, "step": 4510 }, { "epoch": 0.28960872354073125, "grad_norm": 1.0967164706749888, "learning_rate": 0.00017888508964892616, "loss": 0.8128, "step": 4515 }, { "epoch": 0.28992944194996795, "grad_norm": 0.9150715640614145, "learning_rate": 0.00017881623634672465, "loss": 0.7572, "step": 4520 }, { "epoch": 0.29025016035920465, "grad_norm": 1.2602671775870735, "learning_rate": 0.00017874728426244647, "loss": 0.6905, "step": 4525 }, { "epoch": 0.2905708787684413, "grad_norm": 0.9346668957570068, "learning_rate": 0.00017867823348251076, "loss": 0.7051, "step": 4530 }, { "epoch": 0.290891597177678, "grad_norm": 0.7910849436025686, "learning_rate": 0.00017860908409346034, "loss": 0.709, "step": 4535 }, { "epoch": 0.2912123155869147, "grad_norm": 0.8218374279342303, "learning_rate": 0.0001785398361819616, "loss": 0.5839, "step": 4540 }, { "epoch": 0.2915330339961514, "grad_norm": 0.8511332345341893, "learning_rate": 0.0001784704898348045, "loss": 0.7218, "step": 4545 }, { "epoch": 0.2918537524053881, "grad_norm": 1.2396495867604176, "learning_rate": 0.0001784010451389022, "loss": 0.5707, "step": 4550 }, { "epoch": 0.2921744708146248, "grad_norm": 0.5453795713818735, "learning_rate": 0.00017833150218129129, "loss": 0.7248, "step": 4555 }, { "epoch": 0.2924951892238615, "grad_norm": 0.8544441259057197, "learning_rate": 0.00017826186104913142, "loss": 0.6706, "step": 4560 }, { "epoch": 0.2928159076330981, "grad_norm": 0.7078874543955929, "learning_rate": 0.00017819212182970535, "loss": 0.6732, "step": 4565 }, { "epoch": 0.2931366260423348, "grad_norm": 1.1258864806353122, "learning_rate": 0.0001781222846104187, "loss": 0.696, "step": 4570 }, { "epoch": 0.2934573444515715, "grad_norm": 0.8952983146425741, "learning_rate": 0.00017805234947879993, "loss": 0.6778, "step": 4575 }, { "epoch": 0.2937780628608082, "grad_norm": 1.078013753440664, "learning_rate": 0.0001779823165225003, "loss": 0.6494, "step": 4580 }, { "epoch": 0.2940987812700449, "grad_norm": 1.2457998074637708, "learning_rate": 0.0001779121858292936, "loss": 0.6356, "step": 4585 }, { "epoch": 0.2944194996792816, "grad_norm": 0.9452414867290724, "learning_rate": 0.0001778419574870761, "loss": 0.7049, "step": 4590 }, { "epoch": 0.2947402180885183, "grad_norm": 1.0903318911783695, "learning_rate": 0.00017777163158386647, "loss": 0.653, "step": 4595 }, { "epoch": 0.29506093649775494, "grad_norm": 1.172298521370259, "learning_rate": 0.00017770120820780573, "loss": 0.7285, "step": 4600 }, { "epoch": 0.29538165490699164, "grad_norm": 0.6583420678299451, "learning_rate": 0.00017763068744715697, "loss": 0.6031, "step": 4605 }, { "epoch": 0.29570237331622834, "grad_norm": 0.8591774180151724, "learning_rate": 0.00017756006939030535, "loss": 0.7409, "step": 4610 }, { "epoch": 0.29602309172546504, "grad_norm": 0.6898541329818539, "learning_rate": 0.00017748935412575804, "loss": 0.589, "step": 4615 }, { "epoch": 0.29634381013470174, "grad_norm": 0.5395272492697519, "learning_rate": 0.000177418541742144, "loss": 0.708, "step": 4620 }, { "epoch": 0.29666452854393843, "grad_norm": 1.0169898045901036, "learning_rate": 0.0001773476323282138, "loss": 0.6948, "step": 4625 }, { "epoch": 0.29698524695317513, "grad_norm": 1.0000948614259928, "learning_rate": 0.00017727662597283986, "loss": 0.7215, "step": 4630 }, { "epoch": 0.29730596536241183, "grad_norm": 0.9689865733719959, "learning_rate": 0.00017720552276501592, "loss": 0.6701, "step": 4635 }, { "epoch": 0.29762668377164847, "grad_norm": 0.6557948134140331, "learning_rate": 0.00017713432279385712, "loss": 0.6235, "step": 4640 }, { "epoch": 0.29794740218088517, "grad_norm": 1.1877573091679572, "learning_rate": 0.00017706302614859992, "loss": 0.7863, "step": 4645 }, { "epoch": 0.29826812059012187, "grad_norm": 0.8462973100804213, "learning_rate": 0.00017699163291860198, "loss": 0.5724, "step": 4650 }, { "epoch": 0.29858883899935856, "grad_norm": 0.9236445624740109, "learning_rate": 0.0001769201431933419, "loss": 0.5787, "step": 4655 }, { "epoch": 0.29890955740859526, "grad_norm": 1.0716376234952218, "learning_rate": 0.00017684855706241934, "loss": 0.7401, "step": 4660 }, { "epoch": 0.29923027581783196, "grad_norm": 1.1600311786248418, "learning_rate": 0.00017677687461555467, "loss": 0.708, "step": 4665 }, { "epoch": 0.29955099422706866, "grad_norm": 0.7413385734559219, "learning_rate": 0.00017670509594258912, "loss": 0.5718, "step": 4670 }, { "epoch": 0.2998717126363053, "grad_norm": 0.9348593211146833, "learning_rate": 0.00017663322113348434, "loss": 0.7492, "step": 4675 }, { "epoch": 0.300192431045542, "grad_norm": 1.5696315279326167, "learning_rate": 0.0001765612502783226, "loss": 0.6552, "step": 4680 }, { "epoch": 0.3005131494547787, "grad_norm": 1.0990775256909542, "learning_rate": 0.00017648918346730653, "loss": 0.582, "step": 4685 }, { "epoch": 0.3008338678640154, "grad_norm": 0.7467674097224691, "learning_rate": 0.00017641702079075904, "loss": 0.6326, "step": 4690 }, { "epoch": 0.3011545862732521, "grad_norm": 0.7256436706311058, "learning_rate": 0.00017634476233912308, "loss": 0.7717, "step": 4695 }, { "epoch": 0.3014753046824888, "grad_norm": 0.754840650778496, "learning_rate": 0.00017627240820296177, "loss": 0.6896, "step": 4700 }, { "epoch": 0.3017960230917255, "grad_norm": 0.7072150395545665, "learning_rate": 0.0001761999584729581, "loss": 0.6332, "step": 4705 }, { "epoch": 0.3021167415009622, "grad_norm": 1.2009873604762311, "learning_rate": 0.00017612741323991488, "loss": 0.6393, "step": 4710 }, { "epoch": 0.3024374599101988, "grad_norm": 0.6086745243060716, "learning_rate": 0.0001760547725947545, "loss": 0.6681, "step": 4715 }, { "epoch": 0.3027581783194355, "grad_norm": 0.9853085984018423, "learning_rate": 0.0001759820366285192, "loss": 0.5961, "step": 4720 }, { "epoch": 0.3030788967286722, "grad_norm": 1.0109466174974706, "learning_rate": 0.00017590920543237036, "loss": 0.7225, "step": 4725 }, { "epoch": 0.3033996151379089, "grad_norm": 1.2139597067132748, "learning_rate": 0.00017583627909758902, "loss": 0.6542, "step": 4730 }, { "epoch": 0.3037203335471456, "grad_norm": 0.9478885183065455, "learning_rate": 0.00017576325771557518, "loss": 0.6881, "step": 4735 }, { "epoch": 0.3040410519563823, "grad_norm": 0.8539507613861936, "learning_rate": 0.00017569014137784822, "loss": 0.6331, "step": 4740 }, { "epoch": 0.304361770365619, "grad_norm": 0.9679885840401695, "learning_rate": 0.00017561693017604637, "loss": 0.7997, "step": 4745 }, { "epoch": 0.30468248877485565, "grad_norm": 0.9422216475894025, "learning_rate": 0.00017554362420192676, "loss": 0.6769, "step": 4750 }, { "epoch": 0.30500320718409235, "grad_norm": 1.0998446041770769, "learning_rate": 0.00017547022354736538, "loss": 0.6072, "step": 4755 }, { "epoch": 0.30532392559332905, "grad_norm": 1.0857238442878236, "learning_rate": 0.00017539672830435682, "loss": 0.7689, "step": 4760 }, { "epoch": 0.30564464400256575, "grad_norm": 0.7440444931879342, "learning_rate": 0.00017532313856501427, "loss": 0.5841, "step": 4765 }, { "epoch": 0.30596536241180244, "grad_norm": 0.7172978744287396, "learning_rate": 0.0001752494544215693, "loss": 0.6583, "step": 4770 }, { "epoch": 0.30628608082103914, "grad_norm": 1.2045039512423583, "learning_rate": 0.00017517567596637184, "loss": 0.6052, "step": 4775 }, { "epoch": 0.30660679923027584, "grad_norm": 0.6334336485782317, "learning_rate": 0.00017510180329189, "loss": 0.6194, "step": 4780 }, { "epoch": 0.3069275176395125, "grad_norm": 1.3899325242838065, "learning_rate": 0.00017502783649070994, "loss": 0.7102, "step": 4785 }, { "epoch": 0.3072482360487492, "grad_norm": 1.1877009077958471, "learning_rate": 0.00017495377565553594, "loss": 0.683, "step": 4790 }, { "epoch": 0.3075689544579859, "grad_norm": 1.1043105680832985, "learning_rate": 0.00017487962087918993, "loss": 0.6165, "step": 4795 }, { "epoch": 0.3078896728672226, "grad_norm": 0.9571802341999754, "learning_rate": 0.00017480537225461178, "loss": 0.499, "step": 4800 }, { "epoch": 0.3082103912764593, "grad_norm": 1.0846077393930171, "learning_rate": 0.00017473102987485876, "loss": 0.7685, "step": 4805 }, { "epoch": 0.30853110968569597, "grad_norm": 0.9095961738585777, "learning_rate": 0.00017465659383310587, "loss": 0.6373, "step": 4810 }, { "epoch": 0.30885182809493267, "grad_norm": 1.1872255037042634, "learning_rate": 0.00017458206422264533, "loss": 0.6564, "step": 4815 }, { "epoch": 0.30917254650416937, "grad_norm": 1.0600317447426089, "learning_rate": 0.00017450744113688672, "loss": 0.6103, "step": 4820 }, { "epoch": 0.309493264913406, "grad_norm": 0.89956531270657, "learning_rate": 0.00017443272466935675, "loss": 0.7056, "step": 4825 }, { "epoch": 0.3098139833226427, "grad_norm": 0.6138048573378617, "learning_rate": 0.00017435791491369917, "loss": 0.6437, "step": 4830 }, { "epoch": 0.3101347017318794, "grad_norm": 0.6479672204769544, "learning_rate": 0.00017428301196367464, "loss": 0.7149, "step": 4835 }, { "epoch": 0.3104554201411161, "grad_norm": 0.9059240016877552, "learning_rate": 0.00017420801591316062, "loss": 0.6641, "step": 4840 }, { "epoch": 0.3107761385503528, "grad_norm": 0.7000331742442105, "learning_rate": 0.00017413292685615134, "loss": 0.6227, "step": 4845 }, { "epoch": 0.3110968569595895, "grad_norm": 0.8706735159170973, "learning_rate": 0.00017405774488675742, "loss": 0.6191, "step": 4850 }, { "epoch": 0.3114175753688262, "grad_norm": 0.9657278531523165, "learning_rate": 0.0001739824700992061, "loss": 0.5956, "step": 4855 }, { "epoch": 0.31173829377806284, "grad_norm": 0.9553637466697323, "learning_rate": 0.0001739071025878409, "loss": 0.7627, "step": 4860 }, { "epoch": 0.31205901218729953, "grad_norm": 1.1595347795694808, "learning_rate": 0.00017383164244712146, "loss": 0.6432, "step": 4865 }, { "epoch": 0.31237973059653623, "grad_norm": 1.3557930665103466, "learning_rate": 0.0001737560897716236, "loss": 0.6965, "step": 4870 }, { "epoch": 0.31270044900577293, "grad_norm": 0.919377290874929, "learning_rate": 0.00017368044465603915, "loss": 0.6913, "step": 4875 }, { "epoch": 0.3130211674150096, "grad_norm": 0.9179711638304333, "learning_rate": 0.00017360470719517577, "loss": 0.5516, "step": 4880 }, { "epoch": 0.3133418858242463, "grad_norm": 0.8074363475177312, "learning_rate": 0.00017352887748395678, "loss": 0.6421, "step": 4885 }, { "epoch": 0.313662604233483, "grad_norm": 1.3217851235374773, "learning_rate": 0.00017345295561742123, "loss": 0.7387, "step": 4890 }, { "epoch": 0.31398332264271966, "grad_norm": 0.8100107368582629, "learning_rate": 0.0001733769416907236, "loss": 0.6104, "step": 4895 }, { "epoch": 0.31430404105195636, "grad_norm": 1.0974582938152775, "learning_rate": 0.0001733008357991338, "loss": 0.649, "step": 4900 }, { "epoch": 0.31462475946119306, "grad_norm": 1.233711986487123, "learning_rate": 0.00017322463803803688, "loss": 0.5448, "step": 4905 }, { "epoch": 0.31494547787042976, "grad_norm": 0.8777266459889339, "learning_rate": 0.00017314834850293325, "loss": 0.7512, "step": 4910 }, { "epoch": 0.31526619627966646, "grad_norm": 0.8794148401176598, "learning_rate": 0.00017307196728943812, "loss": 0.6314, "step": 4915 }, { "epoch": 0.31558691468890315, "grad_norm": 0.7021113325319495, "learning_rate": 0.00017299549449328175, "loss": 0.5404, "step": 4920 }, { "epoch": 0.31590763309813985, "grad_norm": 0.76819009517203, "learning_rate": 0.00017291893021030913, "loss": 0.7646, "step": 4925 }, { "epoch": 0.31622835150737655, "grad_norm": 1.3281150753972946, "learning_rate": 0.00017284227453647993, "loss": 0.6404, "step": 4930 }, { "epoch": 0.3165490699166132, "grad_norm": 0.8777792257027988, "learning_rate": 0.00017276552756786831, "loss": 0.7211, "step": 4935 }, { "epoch": 0.3168697883258499, "grad_norm": 0.9522765071117524, "learning_rate": 0.00017268868940066288, "loss": 0.7659, "step": 4940 }, { "epoch": 0.3171905067350866, "grad_norm": 0.7347381221386469, "learning_rate": 0.0001726117601311666, "loss": 0.7521, "step": 4945 }, { "epoch": 0.3175112251443233, "grad_norm": 0.947686463596072, "learning_rate": 0.00017253473985579657, "loss": 0.6981, "step": 4950 }, { "epoch": 0.31783194355356, "grad_norm": 0.9948270615790568, "learning_rate": 0.0001724576286710839, "loss": 0.5347, "step": 4955 }, { "epoch": 0.3181526619627967, "grad_norm": 0.7412951434019396, "learning_rate": 0.00017238042667367377, "loss": 0.6563, "step": 4960 }, { "epoch": 0.3184733803720334, "grad_norm": 0.9060455966464537, "learning_rate": 0.00017230313396032504, "loss": 0.8452, "step": 4965 }, { "epoch": 0.31879409878127, "grad_norm": 0.7926379737171755, "learning_rate": 0.00017222575062791033, "loss": 0.6834, "step": 4970 }, { "epoch": 0.3191148171905067, "grad_norm": 1.1978749811848812, "learning_rate": 0.00017214827677341582, "loss": 0.5959, "step": 4975 }, { "epoch": 0.3194355355997434, "grad_norm": 1.1382243993856835, "learning_rate": 0.00017207071249394118, "loss": 0.8144, "step": 4980 }, { "epoch": 0.3197562540089801, "grad_norm": 0.9207041310652729, "learning_rate": 0.00017199305788669937, "loss": 0.7515, "step": 4985 }, { "epoch": 0.3200769724182168, "grad_norm": 0.7762438521118743, "learning_rate": 0.00017191531304901653, "loss": 0.7128, "step": 4990 }, { "epoch": 0.3203976908274535, "grad_norm": 1.0657161158728048, "learning_rate": 0.000171837478078332, "loss": 0.7206, "step": 4995 }, { "epoch": 0.3207184092366902, "grad_norm": 0.8853471042976426, "learning_rate": 0.00017175955307219796, "loss": 0.6661, "step": 5000 }, { "epoch": 0.3210391276459269, "grad_norm": 0.730931049927295, "learning_rate": 0.00017168153812827957, "loss": 0.7177, "step": 5005 }, { "epoch": 0.32135984605516354, "grad_norm": 1.24238938271146, "learning_rate": 0.0001716034333443545, "loss": 0.7264, "step": 5010 }, { "epoch": 0.32168056446440024, "grad_norm": 1.0598509644567646, "learning_rate": 0.00017152523881831325, "loss": 0.5868, "step": 5015 }, { "epoch": 0.32200128287363694, "grad_norm": 1.142674205123222, "learning_rate": 0.00017144695464815866, "loss": 0.7652, "step": 5020 }, { "epoch": 0.32232200128287364, "grad_norm": 1.2248444413302872, "learning_rate": 0.00017136858093200593, "loss": 0.6078, "step": 5025 }, { "epoch": 0.32264271969211034, "grad_norm": 0.9090404485944782, "learning_rate": 0.00017129011776808258, "loss": 0.6921, "step": 5030 }, { "epoch": 0.32296343810134703, "grad_norm": 1.0978730524660503, "learning_rate": 0.00017121156525472814, "loss": 0.7593, "step": 5035 }, { "epoch": 0.32328415651058373, "grad_norm": 1.8023280272488704, "learning_rate": 0.00017113292349039413, "loss": 0.7583, "step": 5040 }, { "epoch": 0.3236048749198204, "grad_norm": 1.0487723489551213, "learning_rate": 0.000171054192573644, "loss": 0.7754, "step": 5045 }, { "epoch": 0.32392559332905707, "grad_norm": 0.7931120571928945, "learning_rate": 0.0001709753726031529, "loss": 0.7182, "step": 5050 }, { "epoch": 0.32424631173829377, "grad_norm": 1.3448284362405596, "learning_rate": 0.00017089646367770756, "loss": 0.6391, "step": 5055 }, { "epoch": 0.32456703014753047, "grad_norm": 0.9771883061194023, "learning_rate": 0.0001708174658962062, "loss": 0.632, "step": 5060 }, { "epoch": 0.32488774855676716, "grad_norm": 0.944625885099161, "learning_rate": 0.00017073837935765846, "loss": 0.6235, "step": 5065 }, { "epoch": 0.32520846696600386, "grad_norm": 0.9899695819556337, "learning_rate": 0.00017065920416118522, "loss": 0.7345, "step": 5070 }, { "epoch": 0.32552918537524056, "grad_norm": 0.5815153267452241, "learning_rate": 0.00017057994040601838, "loss": 0.5988, "step": 5075 }, { "epoch": 0.3258499037844772, "grad_norm": 0.7182304509869034, "learning_rate": 0.00017050058819150098, "loss": 0.5962, "step": 5080 }, { "epoch": 0.3261706221937139, "grad_norm": 0.7916342652857238, "learning_rate": 0.0001704211476170868, "loss": 0.5903, "step": 5085 }, { "epoch": 0.3264913406029506, "grad_norm": 1.186592480709318, "learning_rate": 0.00017034161878234043, "loss": 0.7071, "step": 5090 }, { "epoch": 0.3268120590121873, "grad_norm": 1.4501384859209354, "learning_rate": 0.00017026200178693704, "loss": 0.5699, "step": 5095 }, { "epoch": 0.327132777421424, "grad_norm": 0.4770414244602479, "learning_rate": 0.0001701822967306624, "loss": 0.6942, "step": 5100 }, { "epoch": 0.3274534958306607, "grad_norm": 1.2188679878291713, "learning_rate": 0.00017010250371341244, "loss": 0.6633, "step": 5105 }, { "epoch": 0.3277742142398974, "grad_norm": 1.0813857287425748, "learning_rate": 0.0001700226228351935, "loss": 0.6257, "step": 5110 }, { "epoch": 0.3280949326491341, "grad_norm": 0.8540165463861037, "learning_rate": 0.00016994265419612205, "loss": 0.5918, "step": 5115 }, { "epoch": 0.32841565105837073, "grad_norm": 1.1642007608342173, "learning_rate": 0.00016986259789642444, "loss": 0.6911, "step": 5120 }, { "epoch": 0.3287363694676074, "grad_norm": 0.8539433327300491, "learning_rate": 0.00016978245403643694, "loss": 0.7732, "step": 5125 }, { "epoch": 0.3290570878768441, "grad_norm": 1.0202618411725253, "learning_rate": 0.0001697022227166056, "loss": 0.7798, "step": 5130 }, { "epoch": 0.3293778062860808, "grad_norm": 0.8876324268732894, "learning_rate": 0.00016962190403748605, "loss": 0.714, "step": 5135 }, { "epoch": 0.3296985246953175, "grad_norm": 0.7783501191713772, "learning_rate": 0.0001695414980997434, "loss": 0.7987, "step": 5140 }, { "epoch": 0.3300192431045542, "grad_norm": 1.204240570280653, "learning_rate": 0.00016946100500415213, "loss": 0.6914, "step": 5145 }, { "epoch": 0.3303399615137909, "grad_norm": 0.7152048301163425, "learning_rate": 0.00016938042485159594, "loss": 0.6703, "step": 5150 }, { "epoch": 0.33066067992302756, "grad_norm": 1.191922058294469, "learning_rate": 0.0001692997577430677, "loss": 0.6539, "step": 5155 }, { "epoch": 0.33098139833226425, "grad_norm": 0.8187793173057333, "learning_rate": 0.00016921900377966923, "loss": 0.7468, "step": 5160 }, { "epoch": 0.33130211674150095, "grad_norm": 0.9381392106872509, "learning_rate": 0.00016913816306261112, "loss": 0.766, "step": 5165 }, { "epoch": 0.33162283515073765, "grad_norm": 0.7128118797176758, "learning_rate": 0.00016905723569321288, "loss": 0.6719, "step": 5170 }, { "epoch": 0.33194355355997435, "grad_norm": 1.500297575057347, "learning_rate": 0.00016897622177290244, "loss": 0.7072, "step": 5175 }, { "epoch": 0.33226427196921104, "grad_norm": 0.9800774031498481, "learning_rate": 0.0001688951214032163, "loss": 0.6549, "step": 5180 }, { "epoch": 0.33258499037844774, "grad_norm": 0.8808790723791357, "learning_rate": 0.00016881393468579932, "loss": 0.6955, "step": 5185 }, { "epoch": 0.33290570878768444, "grad_norm": 0.8920914860291771, "learning_rate": 0.00016873266172240452, "loss": 0.5649, "step": 5190 }, { "epoch": 0.3332264271969211, "grad_norm": 0.6851960157071083, "learning_rate": 0.00016865130261489305, "loss": 0.6897, "step": 5195 }, { "epoch": 0.3335471456061578, "grad_norm": 0.8407283098592762, "learning_rate": 0.00016856985746523405, "loss": 0.6559, "step": 5200 }, { "epoch": 0.3338678640153945, "grad_norm": 0.9215186470532375, "learning_rate": 0.00016848832637550437, "loss": 0.7664, "step": 5205 }, { "epoch": 0.3341885824246312, "grad_norm": 0.7299164606010856, "learning_rate": 0.00016840670944788882, "loss": 0.5981, "step": 5210 }, { "epoch": 0.3345093008338679, "grad_norm": 0.8732424966610127, "learning_rate": 0.00016832500678467952, "loss": 0.7035, "step": 5215 }, { "epoch": 0.33483001924310457, "grad_norm": 0.9750167638289885, "learning_rate": 0.00016824321848827624, "loss": 0.5995, "step": 5220 }, { "epoch": 0.33515073765234127, "grad_norm": 1.0976388995980935, "learning_rate": 0.00016816134466118596, "loss": 0.7107, "step": 5225 }, { "epoch": 0.3354714560615779, "grad_norm": 1.0135781126967063, "learning_rate": 0.00016807938540602292, "loss": 0.7174, "step": 5230 }, { "epoch": 0.3357921744708146, "grad_norm": 0.8189118457664761, "learning_rate": 0.00016799734082550844, "loss": 0.6645, "step": 5235 }, { "epoch": 0.3361128928800513, "grad_norm": 0.6996919391876488, "learning_rate": 0.0001679152110224707, "loss": 0.6629, "step": 5240 }, { "epoch": 0.336433611289288, "grad_norm": 0.7381428623848976, "learning_rate": 0.00016783299609984478, "loss": 0.6016, "step": 5245 }, { "epoch": 0.3367543296985247, "grad_norm": 0.9095764087290898, "learning_rate": 0.00016775069616067233, "loss": 0.8577, "step": 5250 }, { "epoch": 0.3370750481077614, "grad_norm": 0.7032412366347235, "learning_rate": 0.00016766831130810171, "loss": 0.7342, "step": 5255 }, { "epoch": 0.3373957665169981, "grad_norm": 0.9697869860649856, "learning_rate": 0.00016758584164538757, "loss": 0.6338, "step": 5260 }, { "epoch": 0.33771648492623474, "grad_norm": 0.7784503288752077, "learning_rate": 0.00016750328727589095, "loss": 0.6666, "step": 5265 }, { "epoch": 0.33803720333547144, "grad_norm": 0.5156266401874552, "learning_rate": 0.00016742064830307897, "loss": 0.7699, "step": 5270 }, { "epoch": 0.33835792174470813, "grad_norm": 1.0003590365934907, "learning_rate": 0.0001673379248305248, "loss": 0.6751, "step": 5275 }, { "epoch": 0.33867864015394483, "grad_norm": 0.8026066074245787, "learning_rate": 0.0001672551169619076, "loss": 0.7573, "step": 5280 }, { "epoch": 0.33899935856318153, "grad_norm": 1.0369937352211243, "learning_rate": 0.00016717222480101221, "loss": 0.667, "step": 5285 }, { "epoch": 0.3393200769724182, "grad_norm": 0.9644006720446381, "learning_rate": 0.0001670892484517292, "loss": 0.6383, "step": 5290 }, { "epoch": 0.3396407953816549, "grad_norm": 1.0076204289252497, "learning_rate": 0.00016700618801805453, "loss": 0.7178, "step": 5295 }, { "epoch": 0.3399615137908916, "grad_norm": 0.5579579624666732, "learning_rate": 0.00016692304360408966, "loss": 0.6665, "step": 5300 }, { "epoch": 0.34028223220012827, "grad_norm": 0.8064350566112853, "learning_rate": 0.00016683981531404125, "loss": 0.5122, "step": 5305 }, { "epoch": 0.34060295060936496, "grad_norm": 0.9816255727453933, "learning_rate": 0.0001667565032522211, "loss": 0.6926, "step": 5310 }, { "epoch": 0.34092366901860166, "grad_norm": 0.817929460216783, "learning_rate": 0.00016667310752304602, "loss": 0.5491, "step": 5315 }, { "epoch": 0.34124438742783836, "grad_norm": 0.9215347160545883, "learning_rate": 0.00016658962823103764, "loss": 0.6835, "step": 5320 }, { "epoch": 0.34156510583707506, "grad_norm": 1.1290419292904414, "learning_rate": 0.00016650606548082236, "loss": 0.735, "step": 5325 }, { "epoch": 0.34188582424631175, "grad_norm": 1.1930691902617288, "learning_rate": 0.0001664224193771312, "loss": 0.5138, "step": 5330 }, { "epoch": 0.34220654265554845, "grad_norm": 0.8088938421114102, "learning_rate": 0.0001663386900247995, "loss": 0.6654, "step": 5335 }, { "epoch": 0.3425272610647851, "grad_norm": 0.5514542526950761, "learning_rate": 0.0001662548775287672, "loss": 0.6456, "step": 5340 }, { "epoch": 0.3428479794740218, "grad_norm": 0.8205842308107273, "learning_rate": 0.00016617098199407814, "loss": 0.7144, "step": 5345 }, { "epoch": 0.3431686978832585, "grad_norm": 0.9295493105678805, "learning_rate": 0.00016608700352588053, "loss": 0.6876, "step": 5350 }, { "epoch": 0.3434894162924952, "grad_norm": 0.7296614219020304, "learning_rate": 0.00016600294222942626, "loss": 0.6785, "step": 5355 }, { "epoch": 0.3438101347017319, "grad_norm": 0.6002339895362847, "learning_rate": 0.00016591879821007126, "loss": 0.5796, "step": 5360 }, { "epoch": 0.3441308531109686, "grad_norm": 1.6160052086574104, "learning_rate": 0.00016583457157327497, "loss": 0.7118, "step": 5365 }, { "epoch": 0.3444515715202053, "grad_norm": 1.2282552121625845, "learning_rate": 0.00016575026242460046, "loss": 0.6564, "step": 5370 }, { "epoch": 0.344772289929442, "grad_norm": 0.9643175110463178, "learning_rate": 0.00016566587086971416, "loss": 0.669, "step": 5375 }, { "epoch": 0.3450930083386786, "grad_norm": 0.9607772443483632, "learning_rate": 0.00016558139701438584, "loss": 0.6276, "step": 5380 }, { "epoch": 0.3454137267479153, "grad_norm": 0.9147875672042459, "learning_rate": 0.0001654968409644884, "loss": 0.5905, "step": 5385 }, { "epoch": 0.345734445157152, "grad_norm": 0.7334238812099275, "learning_rate": 0.00016541220282599773, "loss": 0.6261, "step": 5390 }, { "epoch": 0.3460551635663887, "grad_norm": 1.1742953273617749, "learning_rate": 0.00016532748270499262, "loss": 0.7, "step": 5395 }, { "epoch": 0.3463758819756254, "grad_norm": 1.1387016781633938, "learning_rate": 0.00016524268070765465, "loss": 0.7061, "step": 5400 }, { "epoch": 0.3466966003848621, "grad_norm": 0.9794060869341327, "learning_rate": 0.0001651577969402679, "loss": 0.7031, "step": 5405 }, { "epoch": 0.3470173187940988, "grad_norm": 0.9732807122694793, "learning_rate": 0.0001650728315092191, "loss": 0.6588, "step": 5410 }, { "epoch": 0.34733803720333545, "grad_norm": 1.2045887990242425, "learning_rate": 0.0001649877845209972, "loss": 0.5635, "step": 5415 }, { "epoch": 0.34765875561257215, "grad_norm": 0.9098967972234847, "learning_rate": 0.0001649026560821934, "loss": 0.6877, "step": 5420 }, { "epoch": 0.34797947402180884, "grad_norm": 0.8919518792507914, "learning_rate": 0.000164817446299501, "loss": 0.852, "step": 5425 }, { "epoch": 0.34830019243104554, "grad_norm": 1.082286394388753, "learning_rate": 0.00016473215527971528, "loss": 0.6497, "step": 5430 }, { "epoch": 0.34862091084028224, "grad_norm": 0.7681820908697059, "learning_rate": 0.00016464678312973327, "loss": 0.7075, "step": 5435 }, { "epoch": 0.34894162924951894, "grad_norm": 0.8577629521944062, "learning_rate": 0.00016456132995655372, "loss": 0.6942, "step": 5440 }, { "epoch": 0.34926234765875563, "grad_norm": 0.7981749008936162, "learning_rate": 0.00016447579586727692, "loss": 0.6658, "step": 5445 }, { "epoch": 0.3495830660679923, "grad_norm": 0.6566080494812765, "learning_rate": 0.0001643901809691046, "loss": 0.6325, "step": 5450 }, { "epoch": 0.349903784477229, "grad_norm": 0.7729498372329889, "learning_rate": 0.00016430448536933965, "loss": 0.5609, "step": 5455 }, { "epoch": 0.35022450288646567, "grad_norm": 1.0464507162443157, "learning_rate": 0.00016421870917538635, "loss": 0.6353, "step": 5460 }, { "epoch": 0.35054522129570237, "grad_norm": 1.3013839685098925, "learning_rate": 0.00016413285249474975, "loss": 0.5724, "step": 5465 }, { "epoch": 0.35086593970493907, "grad_norm": 0.813558813259816, "learning_rate": 0.00016404691543503588, "loss": 0.7074, "step": 5470 }, { "epoch": 0.35118665811417576, "grad_norm": 1.001748370098994, "learning_rate": 0.0001639608981039515, "loss": 0.7945, "step": 5475 }, { "epoch": 0.35150737652341246, "grad_norm": 0.870149957049954, "learning_rate": 0.00016387480060930395, "loss": 0.689, "step": 5480 }, { "epoch": 0.35182809493264916, "grad_norm": 0.8680578535676656, "learning_rate": 0.00016378862305900112, "loss": 0.6239, "step": 5485 }, { "epoch": 0.3521488133418858, "grad_norm": 0.8274627515878666, "learning_rate": 0.0001637023655610511, "loss": 0.6437, "step": 5490 }, { "epoch": 0.3524695317511225, "grad_norm": 0.8836905220838523, "learning_rate": 0.00016361602822356232, "loss": 0.581, "step": 5495 }, { "epoch": 0.3527902501603592, "grad_norm": 0.645087928333498, "learning_rate": 0.0001635296111547432, "loss": 0.65, "step": 5500 }, { "epoch": 0.3531109685695959, "grad_norm": 0.9138176884852274, "learning_rate": 0.00016344311446290212, "loss": 0.6039, "step": 5505 }, { "epoch": 0.3534316869788326, "grad_norm": 0.8932196439321753, "learning_rate": 0.00016335653825644717, "loss": 0.6447, "step": 5510 }, { "epoch": 0.3537524053880693, "grad_norm": 0.700814257534255, "learning_rate": 0.00016326988264388624, "loss": 0.634, "step": 5515 }, { "epoch": 0.354073123797306, "grad_norm": 0.8079984489578869, "learning_rate": 0.0001631831477338266, "loss": 0.5378, "step": 5520 }, { "epoch": 0.35439384220654263, "grad_norm": 1.0368102707808613, "learning_rate": 0.00016309633363497503, "loss": 0.6121, "step": 5525 }, { "epoch": 0.35471456061577933, "grad_norm": 1.0720279870828384, "learning_rate": 0.00016300944045613745, "loss": 0.615, "step": 5530 }, { "epoch": 0.355035279025016, "grad_norm": 0.6936759908598535, "learning_rate": 0.00016292246830621897, "loss": 0.7186, "step": 5535 }, { "epoch": 0.3553559974342527, "grad_norm": 0.8578757956070833, "learning_rate": 0.00016283541729422368, "loss": 0.6859, "step": 5540 }, { "epoch": 0.3556767158434894, "grad_norm": 0.6299846194893505, "learning_rate": 0.0001627482875292544, "loss": 0.7011, "step": 5545 }, { "epoch": 0.3559974342527261, "grad_norm": 2.8465820906119697, "learning_rate": 0.00016266107912051275, "loss": 0.6824, "step": 5550 }, { "epoch": 0.3563181526619628, "grad_norm": 0.8212652492805361, "learning_rate": 0.00016257379217729897, "loss": 0.7353, "step": 5555 }, { "epoch": 0.35663887107119946, "grad_norm": 0.8592127708286107, "learning_rate": 0.00016248642680901157, "loss": 0.7493, "step": 5560 }, { "epoch": 0.35695958948043616, "grad_norm": 1.5401896960046906, "learning_rate": 0.00016239898312514747, "loss": 0.6233, "step": 5565 }, { "epoch": 0.35728030788967285, "grad_norm": 0.9880669672357292, "learning_rate": 0.00016231146123530169, "loss": 0.7483, "step": 5570 }, { "epoch": 0.35760102629890955, "grad_norm": 1.0054106975653296, "learning_rate": 0.00016222386124916733, "loss": 0.7477, "step": 5575 }, { "epoch": 0.35792174470814625, "grad_norm": 0.8851121102484797, "learning_rate": 0.0001621361832765353, "loss": 0.7338, "step": 5580 }, { "epoch": 0.35824246311738295, "grad_norm": 0.7868381457390292, "learning_rate": 0.0001620484274272943, "loss": 0.8315, "step": 5585 }, { "epoch": 0.35856318152661965, "grad_norm": 2.2302567668996907, "learning_rate": 0.00016196059381143056, "loss": 0.6057, "step": 5590 }, { "epoch": 0.35888389993585634, "grad_norm": 0.8632558537630518, "learning_rate": 0.0001618726825390279, "loss": 0.6017, "step": 5595 }, { "epoch": 0.359204618345093, "grad_norm": 0.9301897471057365, "learning_rate": 0.0001617846937202674, "loss": 0.7127, "step": 5600 }, { "epoch": 0.3595253367543297, "grad_norm": 1.0314386924705863, "learning_rate": 0.00016169662746542724, "loss": 0.6471, "step": 5605 }, { "epoch": 0.3598460551635664, "grad_norm": 0.7527220509268685, "learning_rate": 0.00016160848388488283, "loss": 0.5149, "step": 5610 }, { "epoch": 0.3601667735728031, "grad_norm": 0.9964259981347259, "learning_rate": 0.0001615202630891064, "loss": 0.7551, "step": 5615 }, { "epoch": 0.3604874919820398, "grad_norm": 0.9534877288363439, "learning_rate": 0.0001614319651886669, "loss": 0.7869, "step": 5620 }, { "epoch": 0.3608082103912765, "grad_norm": 0.6624325233415048, "learning_rate": 0.00016134359029423004, "loss": 0.6187, "step": 5625 }, { "epoch": 0.36112892880051317, "grad_norm": 1.1438885759745019, "learning_rate": 0.000161255138516558, "loss": 0.6818, "step": 5630 }, { "epoch": 0.3614496472097498, "grad_norm": 1.0060076302436596, "learning_rate": 0.00016116660996650918, "loss": 0.7134, "step": 5635 }, { "epoch": 0.3617703656189865, "grad_norm": 0.824054815580278, "learning_rate": 0.0001610780047550384, "loss": 0.6322, "step": 5640 }, { "epoch": 0.3620910840282232, "grad_norm": 1.1593592610393137, "learning_rate": 0.00016098932299319642, "loss": 0.6549, "step": 5645 }, { "epoch": 0.3624118024374599, "grad_norm": 1.3453462014445998, "learning_rate": 0.00016090056479213, "loss": 0.6626, "step": 5650 }, { "epoch": 0.3627325208466966, "grad_norm": 0.6303823430985745, "learning_rate": 0.00016081173026308168, "loss": 0.6129, "step": 5655 }, { "epoch": 0.3630532392559333, "grad_norm": 0.9682139214042652, "learning_rate": 0.00016072281951738974, "loss": 0.5327, "step": 5660 }, { "epoch": 0.36337395766517, "grad_norm": 0.6265113009833752, "learning_rate": 0.00016063383266648788, "loss": 0.7972, "step": 5665 }, { "epoch": 0.3636946760744067, "grad_norm": 1.0602611989591288, "learning_rate": 0.0001605447698219052, "loss": 0.7568, "step": 5670 }, { "epoch": 0.36401539448364334, "grad_norm": 0.8085898565934937, "learning_rate": 0.0001604556310952661, "loss": 0.7088, "step": 5675 }, { "epoch": 0.36433611289288004, "grad_norm": 0.9259612439090465, "learning_rate": 0.00016036641659829005, "loss": 0.6433, "step": 5680 }, { "epoch": 0.36465683130211674, "grad_norm": 1.0560925548902709, "learning_rate": 0.00016027712644279147, "loss": 0.6389, "step": 5685 }, { "epoch": 0.36497754971135343, "grad_norm": 0.9202003497456687, "learning_rate": 0.00016018776074067965, "loss": 0.6588, "step": 5690 }, { "epoch": 0.36529826812059013, "grad_norm": 0.7606894269431724, "learning_rate": 0.00016009831960395854, "loss": 0.6249, "step": 5695 }, { "epoch": 0.36561898652982683, "grad_norm": 1.0194051743569745, "learning_rate": 0.00016000880314472662, "loss": 0.7063, "step": 5700 }, { "epoch": 0.3659397049390635, "grad_norm": 0.8971345599358044, "learning_rate": 0.0001599192114751768, "loss": 0.7758, "step": 5705 }, { "epoch": 0.36626042334830017, "grad_norm": 0.8114509690004853, "learning_rate": 0.0001598295447075962, "loss": 0.687, "step": 5710 }, { "epoch": 0.36658114175753687, "grad_norm": 1.1086821486366683, "learning_rate": 0.00015973980295436613, "loss": 0.7663, "step": 5715 }, { "epoch": 0.36690186016677356, "grad_norm": 0.8305079494288046, "learning_rate": 0.00015964998632796187, "loss": 0.7841, "step": 5720 }, { "epoch": 0.36722257857601026, "grad_norm": 0.9332565471912556, "learning_rate": 0.00015956009494095245, "loss": 0.7629, "step": 5725 }, { "epoch": 0.36754329698524696, "grad_norm": 1.2026329331281138, "learning_rate": 0.00015947012890600072, "loss": 0.6034, "step": 5730 }, { "epoch": 0.36786401539448366, "grad_norm": 0.8890367793012931, "learning_rate": 0.00015938008833586307, "loss": 0.673, "step": 5735 }, { "epoch": 0.36818473380372035, "grad_norm": 1.1168519576569294, "learning_rate": 0.00015928997334338924, "loss": 0.7265, "step": 5740 }, { "epoch": 0.368505452212957, "grad_norm": 0.7323689106049717, "learning_rate": 0.00015919978404152225, "loss": 0.5286, "step": 5745 }, { "epoch": 0.3688261706221937, "grad_norm": 0.7491408637491445, "learning_rate": 0.00015910952054329832, "loss": 0.6603, "step": 5750 }, { "epoch": 0.3691468890314304, "grad_norm": 0.5720787370255552, "learning_rate": 0.00015901918296184664, "loss": 0.7637, "step": 5755 }, { "epoch": 0.3694676074406671, "grad_norm": 1.247050118094861, "learning_rate": 0.00015892877141038917, "loss": 0.6643, "step": 5760 }, { "epoch": 0.3697883258499038, "grad_norm": 0.8428619170851901, "learning_rate": 0.00015883828600224073, "loss": 0.603, "step": 5765 }, { "epoch": 0.3701090442591405, "grad_norm": 0.6414166600611392, "learning_rate": 0.00015874772685080853, "loss": 0.6775, "step": 5770 }, { "epoch": 0.3704297626683772, "grad_norm": 1.39629472630112, "learning_rate": 0.0001586570940695924, "loss": 0.7512, "step": 5775 }, { "epoch": 0.3707504810776139, "grad_norm": 1.0547557813661854, "learning_rate": 0.00015856638777218422, "loss": 0.7574, "step": 5780 }, { "epoch": 0.3710711994868505, "grad_norm": 0.8689805862522758, "learning_rate": 0.00015847560807226823, "loss": 0.6427, "step": 5785 }, { "epoch": 0.3713919178960872, "grad_norm": 1.068120678282078, "learning_rate": 0.00015838475508362051, "loss": 0.7343, "step": 5790 }, { "epoch": 0.3717126363053239, "grad_norm": 0.8164191154263224, "learning_rate": 0.00015829382892010912, "loss": 0.7685, "step": 5795 }, { "epoch": 0.3720333547145606, "grad_norm": 0.9769245060606544, "learning_rate": 0.00015820282969569374, "loss": 0.6804, "step": 5800 }, { "epoch": 0.3723540731237973, "grad_norm": 0.676619842133273, "learning_rate": 0.00015811175752442562, "loss": 0.7244, "step": 5805 }, { "epoch": 0.372674791533034, "grad_norm": 3.577185251797483, "learning_rate": 0.00015802061252044748, "loss": 0.7426, "step": 5810 }, { "epoch": 0.3729955099422707, "grad_norm": 0.5176738358349613, "learning_rate": 0.00015792939479799333, "loss": 0.6545, "step": 5815 }, { "epoch": 0.37331622835150735, "grad_norm": 0.9510093482774353, "learning_rate": 0.00015783810447138826, "loss": 0.6358, "step": 5820 }, { "epoch": 0.37363694676074405, "grad_norm": 0.8940071414235186, "learning_rate": 0.0001577467416550484, "loss": 0.7573, "step": 5825 }, { "epoch": 0.37395766516998075, "grad_norm": 0.8502887517010003, "learning_rate": 0.0001576553064634807, "loss": 0.6371, "step": 5830 }, { "epoch": 0.37427838357921744, "grad_norm": 0.7260357322627535, "learning_rate": 0.00015756379901128294, "loss": 0.6106, "step": 5835 }, { "epoch": 0.37459910198845414, "grad_norm": 0.5018237254264993, "learning_rate": 0.00015747221941314325, "loss": 0.6329, "step": 5840 }, { "epoch": 0.37491982039769084, "grad_norm": 0.9130075924966622, "learning_rate": 0.00015738056778384038, "loss": 0.6868, "step": 5845 }, { "epoch": 0.37524053880692754, "grad_norm": 0.803836499340597, "learning_rate": 0.00015728884423824323, "loss": 0.5845, "step": 5850 }, { "epoch": 0.37556125721616423, "grad_norm": 0.7604942646833414, "learning_rate": 0.0001571970488913109, "loss": 0.6911, "step": 5855 }, { "epoch": 0.3758819756254009, "grad_norm": 0.6458258737911328, "learning_rate": 0.00015710518185809246, "loss": 0.5681, "step": 5860 }, { "epoch": 0.3762026940346376, "grad_norm": 1.4247194077938075, "learning_rate": 0.00015701324325372688, "loss": 0.7889, "step": 5865 }, { "epoch": 0.3765234124438743, "grad_norm": 0.9972586435085499, "learning_rate": 0.00015692123319344272, "loss": 0.5962, "step": 5870 }, { "epoch": 0.37684413085311097, "grad_norm": 0.8022131053222762, "learning_rate": 0.0001568291517925582, "loss": 0.7065, "step": 5875 }, { "epoch": 0.37716484926234767, "grad_norm": 1.0767684802416355, "learning_rate": 0.00015673699916648085, "loss": 0.5781, "step": 5880 }, { "epoch": 0.37748556767158437, "grad_norm": 0.9496229847137114, "learning_rate": 0.00015664477543070757, "loss": 0.7056, "step": 5885 }, { "epoch": 0.37780628608082106, "grad_norm": 0.9633177655644503, "learning_rate": 0.00015655248070082438, "loss": 0.6939, "step": 5890 }, { "epoch": 0.3781270044900577, "grad_norm": 1.115479048562113, "learning_rate": 0.00015646011509250617, "loss": 0.7378, "step": 5895 }, { "epoch": 0.3784477228992944, "grad_norm": 0.7941062299537334, "learning_rate": 0.0001563676787215168, "loss": 0.5145, "step": 5900 }, { "epoch": 0.3787684413085311, "grad_norm": 0.7169731124858206, "learning_rate": 0.0001562751717037087, "loss": 0.5164, "step": 5905 }, { "epoch": 0.3790891597177678, "grad_norm": 0.844339179385339, "learning_rate": 0.00015618259415502291, "loss": 0.7001, "step": 5910 }, { "epoch": 0.3794098781270045, "grad_norm": 0.8954099088632127, "learning_rate": 0.00015608994619148886, "loss": 0.7601, "step": 5915 }, { "epoch": 0.3797305965362412, "grad_norm": 0.9177657222066289, "learning_rate": 0.00015599722792922425, "loss": 0.6568, "step": 5920 }, { "epoch": 0.3800513149454779, "grad_norm": 0.6243318997688123, "learning_rate": 0.00015590443948443482, "loss": 0.696, "step": 5925 }, { "epoch": 0.38037203335471453, "grad_norm": 1.4596235194468596, "learning_rate": 0.00015581158097341435, "loss": 0.5778, "step": 5930 }, { "epoch": 0.38069275176395123, "grad_norm": 1.2871148477384242, "learning_rate": 0.0001557186525125444, "loss": 0.6818, "step": 5935 }, { "epoch": 0.38101347017318793, "grad_norm": 0.999208910914117, "learning_rate": 0.00015562565421829415, "loss": 0.763, "step": 5940 }, { "epoch": 0.3813341885824246, "grad_norm": 0.7364320418930576, "learning_rate": 0.0001555325862072204, "loss": 0.5347, "step": 5945 }, { "epoch": 0.3816549069916613, "grad_norm": 1.2608428725949008, "learning_rate": 0.0001554394485959673, "loss": 0.7863, "step": 5950 }, { "epoch": 0.381975625400898, "grad_norm": 1.1072984033964586, "learning_rate": 0.00015534624150126617, "loss": 0.6498, "step": 5955 }, { "epoch": 0.3822963438101347, "grad_norm": 1.058590608018293, "learning_rate": 0.00015525296503993548, "loss": 0.5703, "step": 5960 }, { "epoch": 0.3826170622193714, "grad_norm": 1.0908171799744935, "learning_rate": 0.0001551596193288806, "loss": 0.7091, "step": 5965 }, { "epoch": 0.38293778062860806, "grad_norm": 0.947173201252904, "learning_rate": 0.0001550662044850937, "loss": 0.7283, "step": 5970 }, { "epoch": 0.38325849903784476, "grad_norm": 1.3607073347278296, "learning_rate": 0.00015497272062565362, "loss": 0.6388, "step": 5975 }, { "epoch": 0.38357921744708146, "grad_norm": 0.6419239829629664, "learning_rate": 0.0001548791678677257, "loss": 0.6622, "step": 5980 }, { "epoch": 0.38389993585631815, "grad_norm": 0.7871877023929343, "learning_rate": 0.0001547855463285616, "loss": 0.6371, "step": 5985 }, { "epoch": 0.38422065426555485, "grad_norm": 0.8230439216037291, "learning_rate": 0.00015469185612549917, "loss": 0.6582, "step": 5990 }, { "epoch": 0.38454137267479155, "grad_norm": 0.9173584864551694, "learning_rate": 0.00015459809737596237, "loss": 0.6135, "step": 5995 }, { "epoch": 0.38486209108402825, "grad_norm": 0.8582018157822806, "learning_rate": 0.0001545042701974611, "loss": 0.7084, "step": 6000 }, { "epoch": 0.3851828094932649, "grad_norm": 1.2042820074202123, "learning_rate": 0.0001544103747075909, "loss": 0.8395, "step": 6005 }, { "epoch": 0.3855035279025016, "grad_norm": 1.1522620963550567, "learning_rate": 0.00015431641102403302, "loss": 0.7, "step": 6010 }, { "epoch": 0.3858242463117383, "grad_norm": 1.2293665111038254, "learning_rate": 0.00015422237926455417, "loss": 0.8011, "step": 6015 }, { "epoch": 0.386144964720975, "grad_norm": 0.8550912373038257, "learning_rate": 0.00015412827954700632, "loss": 0.7712, "step": 6020 }, { "epoch": 0.3864656831302117, "grad_norm": 1.216484260092555, "learning_rate": 0.00015403411198932672, "loss": 0.5951, "step": 6025 }, { "epoch": 0.3867864015394484, "grad_norm": 1.0566678632706732, "learning_rate": 0.00015393987670953756, "loss": 0.6986, "step": 6030 }, { "epoch": 0.3871071199486851, "grad_norm": 0.9868957913863856, "learning_rate": 0.00015384557382574595, "loss": 0.583, "step": 6035 }, { "epoch": 0.38742783835792177, "grad_norm": 0.7263178308133398, "learning_rate": 0.0001537512034561437, "loss": 0.7377, "step": 6040 }, { "epoch": 0.3877485567671584, "grad_norm": 0.8464297973296825, "learning_rate": 0.00015365676571900725, "loss": 0.6738, "step": 6045 }, { "epoch": 0.3880692751763951, "grad_norm": 0.754402075351536, "learning_rate": 0.00015356226073269736, "loss": 0.8025, "step": 6050 }, { "epoch": 0.3883899935856318, "grad_norm": 1.0080496408230515, "learning_rate": 0.0001534676886156592, "loss": 0.6925, "step": 6055 }, { "epoch": 0.3887107119948685, "grad_norm": 0.859538871785963, "learning_rate": 0.000153373049486422, "loss": 0.6198, "step": 6060 }, { "epoch": 0.3890314304041052, "grad_norm": 0.617907781420839, "learning_rate": 0.0001532783434635991, "loss": 0.708, "step": 6065 }, { "epoch": 0.3893521488133419, "grad_norm": 0.9321179061358089, "learning_rate": 0.00015318357066588747, "loss": 0.8021, "step": 6070 }, { "epoch": 0.3896728672225786, "grad_norm": 1.0543925706918078, "learning_rate": 0.00015308873121206798, "loss": 0.6394, "step": 6075 }, { "epoch": 0.38999358563181524, "grad_norm": 0.7635204958993133, "learning_rate": 0.00015299382522100484, "loss": 0.7279, "step": 6080 }, { "epoch": 0.39031430404105194, "grad_norm": 0.4808058715738424, "learning_rate": 0.00015289885281164587, "loss": 0.6074, "step": 6085 }, { "epoch": 0.39063502245028864, "grad_norm": 0.8001578622671749, "learning_rate": 0.00015280381410302197, "loss": 0.7391, "step": 6090 }, { "epoch": 0.39095574085952534, "grad_norm": 0.6800874640636567, "learning_rate": 0.00015270870921424721, "loss": 0.6633, "step": 6095 }, { "epoch": 0.39127645926876203, "grad_norm": 1.6563190981003801, "learning_rate": 0.00015261353826451858, "loss": 0.5687, "step": 6100 }, { "epoch": 0.39159717767799873, "grad_norm": 1.541052532833452, "learning_rate": 0.00015251830137311587, "loss": 0.7656, "step": 6105 }, { "epoch": 0.39191789608723543, "grad_norm": 0.8841300600956734, "learning_rate": 0.00015242299865940147, "loss": 0.5984, "step": 6110 }, { "epoch": 0.39223861449647207, "grad_norm": 1.2069800426391173, "learning_rate": 0.00015232763024282034, "loss": 0.8064, "step": 6115 }, { "epoch": 0.39255933290570877, "grad_norm": 1.12582638671757, "learning_rate": 0.00015223219624289978, "loss": 0.7329, "step": 6120 }, { "epoch": 0.39288005131494547, "grad_norm": 0.8200206186838468, "learning_rate": 0.0001521366967792493, "loss": 0.5894, "step": 6125 }, { "epoch": 0.39320076972418216, "grad_norm": 0.8420632536848158, "learning_rate": 0.0001520411319715603, "loss": 0.7387, "step": 6130 }, { "epoch": 0.39352148813341886, "grad_norm": 0.8067132371420835, "learning_rate": 0.00015194550193960632, "loss": 0.682, "step": 6135 }, { "epoch": 0.39384220654265556, "grad_norm": 0.7708975305048692, "learning_rate": 0.00015184980680324248, "loss": 0.68, "step": 6140 }, { "epoch": 0.39416292495189226, "grad_norm": 1.0673984272805985, "learning_rate": 0.00015175404668240554, "loss": 0.765, "step": 6145 }, { "epoch": 0.39448364336112896, "grad_norm": 1.3041455682451786, "learning_rate": 0.00015165822169711373, "loss": 0.6576, "step": 6150 }, { "epoch": 0.3948043617703656, "grad_norm": 0.6831544367344609, "learning_rate": 0.00015156233196746653, "loss": 0.7366, "step": 6155 }, { "epoch": 0.3951250801796023, "grad_norm": 0.9906492347644728, "learning_rate": 0.00015146637761364457, "loss": 0.7104, "step": 6160 }, { "epoch": 0.395445798588839, "grad_norm": 0.8542271989350849, "learning_rate": 0.00015137035875590956, "loss": 0.6678, "step": 6165 }, { "epoch": 0.3957665169980757, "grad_norm": 1.542102394105923, "learning_rate": 0.00015127427551460396, "loss": 0.665, "step": 6170 }, { "epoch": 0.3960872354073124, "grad_norm": 0.8016623705576872, "learning_rate": 0.00015117812801015095, "loss": 0.5812, "step": 6175 }, { "epoch": 0.3964079538165491, "grad_norm": 1.2073109631978751, "learning_rate": 0.00015108191636305427, "loss": 0.7527, "step": 6180 }, { "epoch": 0.3967286722257858, "grad_norm": 0.8328169453200382, "learning_rate": 0.000150985640693898, "loss": 0.6733, "step": 6185 }, { "epoch": 0.3970493906350224, "grad_norm": 0.9951192780366616, "learning_rate": 0.00015088930112334653, "loss": 0.733, "step": 6190 }, { "epoch": 0.3973701090442591, "grad_norm": 0.7405889864532202, "learning_rate": 0.0001507928977721443, "loss": 0.5478, "step": 6195 }, { "epoch": 0.3976908274534958, "grad_norm": 1.080626962102723, "learning_rate": 0.0001506964307611157, "loss": 0.6115, "step": 6200 }, { "epoch": 0.3980115458627325, "grad_norm": 0.7995884570597525, "learning_rate": 0.0001505999002111649, "loss": 0.5829, "step": 6205 }, { "epoch": 0.3983322642719692, "grad_norm": 0.4992231946350308, "learning_rate": 0.0001505033062432757, "loss": 0.5649, "step": 6210 }, { "epoch": 0.3986529826812059, "grad_norm": 0.8489355096183382, "learning_rate": 0.00015040664897851138, "loss": 0.7291, "step": 6215 }, { "epoch": 0.3989737010904426, "grad_norm": 1.136002981763331, "learning_rate": 0.00015030992853801454, "loss": 0.7918, "step": 6220 }, { "epoch": 0.39929441949967925, "grad_norm": 0.895880595156802, "learning_rate": 0.00015021314504300704, "loss": 0.5635, "step": 6225 }, { "epoch": 0.39961513790891595, "grad_norm": 0.8226243605298355, "learning_rate": 0.0001501162986147897, "loss": 0.815, "step": 6230 }, { "epoch": 0.39993585631815265, "grad_norm": 0.9921294907910895, "learning_rate": 0.00015001938937474218, "loss": 0.7156, "step": 6235 }, { "epoch": 0.40025657472738935, "grad_norm": 0.9510451771447491, "learning_rate": 0.0001499224174443229, "loss": 0.681, "step": 6240 }, { "epoch": 0.40057729313662604, "grad_norm": 0.9952627757450367, "learning_rate": 0.0001498253829450689, "loss": 0.712, "step": 6245 }, { "epoch": 0.40089801154586274, "grad_norm": 0.6514927458391138, "learning_rate": 0.00014972828599859556, "loss": 0.633, "step": 6250 }, { "epoch": 0.40121872995509944, "grad_norm": 0.9621219480196492, "learning_rate": 0.0001496311267265966, "loss": 0.6988, "step": 6255 }, { "epoch": 0.40153944836433614, "grad_norm": 1.0155290688557055, "learning_rate": 0.00014953390525084377, "loss": 0.7093, "step": 6260 }, { "epoch": 0.4018601667735728, "grad_norm": 0.6507458129551235, "learning_rate": 0.00014943662169318686, "loss": 0.6781, "step": 6265 }, { "epoch": 0.4021808851828095, "grad_norm": 0.8206284722853324, "learning_rate": 0.00014933927617555342, "loss": 0.6472, "step": 6270 }, { "epoch": 0.4025016035920462, "grad_norm": 0.969513442832448, "learning_rate": 0.00014924186881994867, "loss": 0.6322, "step": 6275 }, { "epoch": 0.4028223220012829, "grad_norm": 1.0110426378326145, "learning_rate": 0.00014914439974845532, "loss": 0.6192, "step": 6280 }, { "epoch": 0.40314304041051957, "grad_norm": 0.9182180122329154, "learning_rate": 0.0001490468690832335, "loss": 0.7624, "step": 6285 }, { "epoch": 0.40346375881975627, "grad_norm": 1.0221754081762093, "learning_rate": 0.00014894927694652046, "loss": 0.5685, "step": 6290 }, { "epoch": 0.40378447722899297, "grad_norm": 0.7951566985169003, "learning_rate": 0.00014885162346063048, "loss": 0.6114, "step": 6295 }, { "epoch": 0.4041051956382296, "grad_norm": 0.9205666830852229, "learning_rate": 0.00014875390874795482, "loss": 0.6126, "step": 6300 }, { "epoch": 0.4044259140474663, "grad_norm": 0.8495232187331296, "learning_rate": 0.00014865613293096132, "loss": 0.6743, "step": 6305 }, { "epoch": 0.404746632456703, "grad_norm": 0.5863050150246784, "learning_rate": 0.0001485582961321946, "loss": 0.5965, "step": 6310 }, { "epoch": 0.4050673508659397, "grad_norm": 0.732145223215556, "learning_rate": 0.00014846039847427563, "loss": 0.6549, "step": 6315 }, { "epoch": 0.4053880692751764, "grad_norm": 0.7872248738108987, "learning_rate": 0.00014836244007990156, "loss": 0.675, "step": 6320 }, { "epoch": 0.4057087876844131, "grad_norm": 0.6983906622235366, "learning_rate": 0.0001482644210718458, "loss": 0.6684, "step": 6325 }, { "epoch": 0.4060295060936498, "grad_norm": 1.036082877660743, "learning_rate": 0.0001481663415729576, "loss": 0.6682, "step": 6330 }, { "epoch": 0.4063502245028865, "grad_norm": 0.8176112608665335, "learning_rate": 0.00014806820170616222, "loss": 0.8555, "step": 6335 }, { "epoch": 0.40667094291212313, "grad_norm": 0.7770154320072936, "learning_rate": 0.00014797000159446038, "loss": 0.557, "step": 6340 }, { "epoch": 0.40699166132135983, "grad_norm": 1.5604043527138882, "learning_rate": 0.00014787174136092837, "loss": 0.5678, "step": 6345 }, { "epoch": 0.40731237973059653, "grad_norm": 0.5000651713384456, "learning_rate": 0.00014777342112871786, "loss": 0.6323, "step": 6350 }, { "epoch": 0.4076330981398332, "grad_norm": 0.7129539414804645, "learning_rate": 0.0001476750410210557, "loss": 0.6531, "step": 6355 }, { "epoch": 0.4079538165490699, "grad_norm": 0.6838741535402209, "learning_rate": 0.0001475766011612438, "loss": 0.6734, "step": 6360 }, { "epoch": 0.4082745349583066, "grad_norm": 0.6003288340459018, "learning_rate": 0.00014747810167265894, "loss": 0.5793, "step": 6365 }, { "epoch": 0.4085952533675433, "grad_norm": 1.5754948140455838, "learning_rate": 0.00014737954267875263, "loss": 0.702, "step": 6370 }, { "epoch": 0.40891597177677996, "grad_norm": 1.0150345516766142, "learning_rate": 0.000147280924303051, "loss": 0.8569, "step": 6375 }, { "epoch": 0.40923669018601666, "grad_norm": 1.0034479899495579, "learning_rate": 0.0001471822466691545, "loss": 0.8446, "step": 6380 }, { "epoch": 0.40955740859525336, "grad_norm": 0.9184425443953635, "learning_rate": 0.00014708350990073798, "loss": 0.6602, "step": 6385 }, { "epoch": 0.40987812700449006, "grad_norm": 0.6284500041695303, "learning_rate": 0.0001469847141215503, "loss": 0.7291, "step": 6390 }, { "epoch": 0.41019884541372675, "grad_norm": 0.9039981636058719, "learning_rate": 0.0001468858594554144, "loss": 0.8008, "step": 6395 }, { "epoch": 0.41051956382296345, "grad_norm": 0.9662431864347534, "learning_rate": 0.0001467869460262269, "loss": 0.5989, "step": 6400 }, { "epoch": 0.41084028223220015, "grad_norm": 0.6824016883811361, "learning_rate": 0.00014668797395795812, "loss": 0.7651, "step": 6405 }, { "epoch": 0.4111610006414368, "grad_norm": 0.8325307304433841, "learning_rate": 0.00014658894337465187, "loss": 0.762, "step": 6410 }, { "epoch": 0.4114817190506735, "grad_norm": 0.6873637896445222, "learning_rate": 0.00014648985440042533, "loss": 0.6868, "step": 6415 }, { "epoch": 0.4118024374599102, "grad_norm": 0.8851369890257763, "learning_rate": 0.0001463907071594688, "loss": 0.719, "step": 6420 }, { "epoch": 0.4121231558691469, "grad_norm": 0.8755806997147045, "learning_rate": 0.00014629150177604565, "loss": 0.6161, "step": 6425 }, { "epoch": 0.4124438742783836, "grad_norm": 0.9956221559599793, "learning_rate": 0.00014619223837449211, "loss": 0.6246, "step": 6430 }, { "epoch": 0.4127645926876203, "grad_norm": 0.9146462627716199, "learning_rate": 0.00014609291707921713, "loss": 0.665, "step": 6435 }, { "epoch": 0.413085311096857, "grad_norm": 0.7096303973864491, "learning_rate": 0.0001459935380147022, "loss": 0.7379, "step": 6440 }, { "epoch": 0.4134060295060937, "grad_norm": 0.8414445373385668, "learning_rate": 0.00014589410130550124, "loss": 0.7533, "step": 6445 }, { "epoch": 0.4137267479153303, "grad_norm": 1.1009718984925583, "learning_rate": 0.0001457946070762404, "loss": 0.673, "step": 6450 }, { "epoch": 0.414047466324567, "grad_norm": 0.9982085240192685, "learning_rate": 0.000145695055451618, "loss": 0.6951, "step": 6455 }, { "epoch": 0.4143681847338037, "grad_norm": 0.7828125692520432, "learning_rate": 0.00014559544655640412, "loss": 0.7779, "step": 6460 }, { "epoch": 0.4146889031430404, "grad_norm": 1.0323696606312884, "learning_rate": 0.0001454957805154408, "loss": 0.666, "step": 6465 }, { "epoch": 0.4150096215522771, "grad_norm": 0.6618186643447491, "learning_rate": 0.00014539605745364156, "loss": 0.7354, "step": 6470 }, { "epoch": 0.4153303399615138, "grad_norm": 1.3747337158411725, "learning_rate": 0.00014529627749599146, "loss": 0.7191, "step": 6475 }, { "epoch": 0.4156510583707505, "grad_norm": 0.6208342823219867, "learning_rate": 0.0001451964407675469, "loss": 0.648, "step": 6480 }, { "epoch": 0.41597177677998715, "grad_norm": 1.0319199547152835, "learning_rate": 0.00014509654739343534, "loss": 0.7808, "step": 6485 }, { "epoch": 0.41629249518922384, "grad_norm": 1.1954322026767323, "learning_rate": 0.0001449965974988553, "loss": 0.7695, "step": 6490 }, { "epoch": 0.41661321359846054, "grad_norm": 1.2496250479975701, "learning_rate": 0.00014489659120907615, "loss": 0.6214, "step": 6495 }, { "epoch": 0.41693393200769724, "grad_norm": 0.6983577410015246, "learning_rate": 0.00014479652864943788, "loss": 0.6312, "step": 6500 }, { "epoch": 0.41725465041693394, "grad_norm": 0.8629680447857923, "learning_rate": 0.0001446964099453511, "loss": 0.7508, "step": 6505 }, { "epoch": 0.41757536882617063, "grad_norm": 1.0627571850045838, "learning_rate": 0.00014459623522229662, "loss": 0.7044, "step": 6510 }, { "epoch": 0.41789608723540733, "grad_norm": 0.900748883113857, "learning_rate": 0.00014449600460582563, "loss": 0.7454, "step": 6515 }, { "epoch": 0.41821680564464403, "grad_norm": 0.9690669274483888, "learning_rate": 0.00014439571822155934, "loss": 0.5726, "step": 6520 }, { "epoch": 0.41853752405388067, "grad_norm": 1.0487104357417287, "learning_rate": 0.00014429537619518873, "loss": 0.799, "step": 6525 }, { "epoch": 0.41885824246311737, "grad_norm": 1.2154258394073059, "learning_rate": 0.0001441949786524747, "loss": 0.5219, "step": 6530 }, { "epoch": 0.41917896087235407, "grad_norm": 0.7794877423395151, "learning_rate": 0.0001440945257192476, "loss": 0.5707, "step": 6535 }, { "epoch": 0.41949967928159076, "grad_norm": 0.5347206233594262, "learning_rate": 0.00014399401752140728, "loss": 0.55, "step": 6540 }, { "epoch": 0.41982039769082746, "grad_norm": 0.6862664210890991, "learning_rate": 0.00014389345418492272, "loss": 0.7803, "step": 6545 }, { "epoch": 0.42014111610006416, "grad_norm": 1.1652827614213763, "learning_rate": 0.0001437928358358322, "loss": 0.6907, "step": 6550 }, { "epoch": 0.42046183450930086, "grad_norm": 1.0771142450313498, "learning_rate": 0.00014369216260024282, "loss": 0.5868, "step": 6555 }, { "epoch": 0.4207825529185375, "grad_norm": 0.7384688516596317, "learning_rate": 0.00014359143460433046, "loss": 0.5754, "step": 6560 }, { "epoch": 0.4211032713277742, "grad_norm": 0.7961839635706309, "learning_rate": 0.00014349065197433977, "loss": 0.6247, "step": 6565 }, { "epoch": 0.4214239897370109, "grad_norm": 0.9321579239285109, "learning_rate": 0.0001433898148365837, "loss": 0.6856, "step": 6570 }, { "epoch": 0.4217447081462476, "grad_norm": 0.7081449574427117, "learning_rate": 0.00014328892331744362, "loss": 0.5893, "step": 6575 }, { "epoch": 0.4220654265554843, "grad_norm": 0.9200227580239932, "learning_rate": 0.000143187977543369, "loss": 0.661, "step": 6580 }, { "epoch": 0.422386144964721, "grad_norm": 1.1330174896855054, "learning_rate": 0.00014308697764087738, "loss": 0.8342, "step": 6585 }, { "epoch": 0.4227068633739577, "grad_norm": 0.851200673216541, "learning_rate": 0.00014298592373655414, "loss": 0.8357, "step": 6590 }, { "epoch": 0.42302758178319433, "grad_norm": 0.6342829663427049, "learning_rate": 0.00014288481595705217, "loss": 0.4643, "step": 6595 }, { "epoch": 0.423348300192431, "grad_norm": 0.5823246535632486, "learning_rate": 0.00014278365442909214, "loss": 0.6472, "step": 6600 }, { "epoch": 0.4236690186016677, "grad_norm": 1.0907798326084035, "learning_rate": 0.0001426824392794619, "loss": 0.5667, "step": 6605 }, { "epoch": 0.4239897370109044, "grad_norm": 0.6171485537285203, "learning_rate": 0.00014258117063501658, "loss": 0.7975, "step": 6610 }, { "epoch": 0.4243104554201411, "grad_norm": 0.920203490173087, "learning_rate": 0.00014247984862267833, "loss": 0.5432, "step": 6615 }, { "epoch": 0.4246311738293778, "grad_norm": 0.6838928556102262, "learning_rate": 0.0001423784733694362, "loss": 0.5982, "step": 6620 }, { "epoch": 0.4249518922386145, "grad_norm": 1.2229051146263923, "learning_rate": 0.00014227704500234599, "loss": 0.8164, "step": 6625 }, { "epoch": 0.4252726106478512, "grad_norm": 0.7990664572540562, "learning_rate": 0.00014217556364853006, "loss": 0.7974, "step": 6630 }, { "epoch": 0.42559332905708785, "grad_norm": 1.439913710180236, "learning_rate": 0.00014207402943517707, "loss": 0.6574, "step": 6635 }, { "epoch": 0.42591404746632455, "grad_norm": 1.5833188763841297, "learning_rate": 0.0001419724424895421, "loss": 0.6127, "step": 6640 }, { "epoch": 0.42623476587556125, "grad_norm": 1.0972694183324532, "learning_rate": 0.00014187080293894623, "loss": 0.6384, "step": 6645 }, { "epoch": 0.42655548428479795, "grad_norm": 0.7755437327444886, "learning_rate": 0.0001417691109107765, "loss": 0.6467, "step": 6650 }, { "epoch": 0.42687620269403465, "grad_norm": 0.7307053147732903, "learning_rate": 0.00014166736653248568, "loss": 0.6857, "step": 6655 }, { "epoch": 0.42719692110327134, "grad_norm": 1.1129466425839534, "learning_rate": 0.00014156556993159215, "loss": 0.6325, "step": 6660 }, { "epoch": 0.42751763951250804, "grad_norm": 1.0829046562773215, "learning_rate": 0.00014146372123567986, "loss": 0.4627, "step": 6665 }, { "epoch": 0.4278383579217447, "grad_norm": 0.7286117691400992, "learning_rate": 0.00014136182057239788, "loss": 0.7129, "step": 6670 }, { "epoch": 0.4281590763309814, "grad_norm": 0.9030468850448815, "learning_rate": 0.00014125986806946052, "loss": 0.6249, "step": 6675 }, { "epoch": 0.4284797947402181, "grad_norm": 1.0569038979072376, "learning_rate": 0.00014115786385464704, "loss": 0.5753, "step": 6680 }, { "epoch": 0.4288005131494548, "grad_norm": 1.9939364349504531, "learning_rate": 0.0001410558080558015, "loss": 0.6928, "step": 6685 }, { "epoch": 0.4291212315586915, "grad_norm": 0.7638304398434881, "learning_rate": 0.00014095370080083262, "loss": 0.7665, "step": 6690 }, { "epoch": 0.42944194996792817, "grad_norm": 1.0470546825430735, "learning_rate": 0.00014085154221771362, "loss": 0.5786, "step": 6695 }, { "epoch": 0.42976266837716487, "grad_norm": 1.122127513476166, "learning_rate": 0.00014074933243448203, "loss": 0.5162, "step": 6700 }, { "epoch": 0.43008338678640157, "grad_norm": 0.9808616961072774, "learning_rate": 0.00014064707157923956, "loss": 0.5722, "step": 6705 }, { "epoch": 0.4304041051956382, "grad_norm": 0.8653107924861354, "learning_rate": 0.00014054475978015192, "loss": 0.6378, "step": 6710 }, { "epoch": 0.4307248236048749, "grad_norm": 0.8962127595984706, "learning_rate": 0.00014044239716544868, "loss": 0.6408, "step": 6715 }, { "epoch": 0.4310455420141116, "grad_norm": 0.8365357084309853, "learning_rate": 0.00014033998386342312, "loss": 0.6256, "step": 6720 }, { "epoch": 0.4313662604233483, "grad_norm": 1.0863245013957081, "learning_rate": 0.000140237520002432, "loss": 0.7068, "step": 6725 }, { "epoch": 0.431686978832585, "grad_norm": 0.662268709969254, "learning_rate": 0.0001401350057108955, "loss": 0.7573, "step": 6730 }, { "epoch": 0.4320076972418217, "grad_norm": 1.1715222453521494, "learning_rate": 0.0001400324411172969, "loss": 0.7574, "step": 6735 }, { "epoch": 0.4323284156510584, "grad_norm": 0.9424425544184805, "learning_rate": 0.0001399298263501827, "loss": 0.8143, "step": 6740 }, { "epoch": 0.43264913406029504, "grad_norm": 0.7955537139368879, "learning_rate": 0.00013982716153816213, "loss": 0.5263, "step": 6745 }, { "epoch": 0.43296985246953174, "grad_norm": 0.9554382885880205, "learning_rate": 0.00013972444680990722, "loss": 0.6976, "step": 6750 }, { "epoch": 0.43329057087876843, "grad_norm": 1.5328515613064213, "learning_rate": 0.00013962168229415253, "loss": 0.627, "step": 6755 }, { "epoch": 0.43361128928800513, "grad_norm": 1.096222900496091, "learning_rate": 0.00013951886811969501, "loss": 0.8235, "step": 6760 }, { "epoch": 0.43393200769724183, "grad_norm": 1.3093624744883847, "learning_rate": 0.00013941600441539392, "loss": 0.5996, "step": 6765 }, { "epoch": 0.4342527261064785, "grad_norm": 0.9375701783813489, "learning_rate": 0.00013931309131017046, "loss": 0.8571, "step": 6770 }, { "epoch": 0.4345734445157152, "grad_norm": 0.8981486814038466, "learning_rate": 0.0001392101289330079, "loss": 0.7036, "step": 6775 }, { "epoch": 0.43489416292495187, "grad_norm": 1.030290589948871, "learning_rate": 0.00013910711741295113, "loss": 0.5523, "step": 6780 }, { "epoch": 0.43521488133418856, "grad_norm": 0.7783354813811616, "learning_rate": 0.00013900405687910676, "loss": 0.6957, "step": 6785 }, { "epoch": 0.43553559974342526, "grad_norm": 0.7762966668183622, "learning_rate": 0.00013890094746064273, "loss": 0.7249, "step": 6790 }, { "epoch": 0.43585631815266196, "grad_norm": 1.0757163547744426, "learning_rate": 0.0001387977892867883, "loss": 0.7033, "step": 6795 }, { "epoch": 0.43617703656189866, "grad_norm": 0.9414991837160046, "learning_rate": 0.00013869458248683377, "loss": 0.6503, "step": 6800 }, { "epoch": 0.43649775497113535, "grad_norm": 1.1367581767585646, "learning_rate": 0.0001385913271901305, "loss": 0.6653, "step": 6805 }, { "epoch": 0.43681847338037205, "grad_norm": 0.9718072928244804, "learning_rate": 0.0001384880235260905, "loss": 0.6126, "step": 6810 }, { "epoch": 0.43713919178960875, "grad_norm": 1.051631260475179, "learning_rate": 0.00013838467162418652, "loss": 0.7529, "step": 6815 }, { "epoch": 0.4374599101988454, "grad_norm": 1.1255123924704187, "learning_rate": 0.00013828127161395165, "loss": 0.7, "step": 6820 }, { "epoch": 0.4377806286080821, "grad_norm": 0.6159074752294377, "learning_rate": 0.00013817782362497938, "loss": 0.7815, "step": 6825 }, { "epoch": 0.4381013470173188, "grad_norm": 0.7651323158439101, "learning_rate": 0.00013807432778692333, "loss": 0.6508, "step": 6830 }, { "epoch": 0.4384220654265555, "grad_norm": 1.49661820735196, "learning_rate": 0.00013797078422949697, "loss": 0.6949, "step": 6835 }, { "epoch": 0.4387427838357922, "grad_norm": 0.9888853439915466, "learning_rate": 0.0001378671930824737, "loss": 0.6223, "step": 6840 }, { "epoch": 0.4390635022450289, "grad_norm": 1.248537199848208, "learning_rate": 0.00013776355447568648, "loss": 0.8024, "step": 6845 }, { "epoch": 0.4393842206542656, "grad_norm": 0.9575631631075234, "learning_rate": 0.00013765986853902783, "loss": 0.6739, "step": 6850 }, { "epoch": 0.4397049390635022, "grad_norm": 0.9680343975909423, "learning_rate": 0.00013755613540244958, "loss": 0.6917, "step": 6855 }, { "epoch": 0.4400256574727389, "grad_norm": 1.117269566951374, "learning_rate": 0.00013745235519596263, "loss": 0.7042, "step": 6860 }, { "epoch": 0.4403463758819756, "grad_norm": 0.8619372703069825, "learning_rate": 0.00013734852804963703, "loss": 0.609, "step": 6865 }, { "epoch": 0.4406670942912123, "grad_norm": 0.8117525588458958, "learning_rate": 0.00013724465409360148, "loss": 0.6981, "step": 6870 }, { "epoch": 0.440987812700449, "grad_norm": 1.01398403519154, "learning_rate": 0.0001371407334580434, "loss": 0.6151, "step": 6875 }, { "epoch": 0.4413085311096857, "grad_norm": 0.834092658374222, "learning_rate": 0.00013703676627320886, "loss": 0.7673, "step": 6880 }, { "epoch": 0.4416292495189224, "grad_norm": 1.5311945048848135, "learning_rate": 0.00013693275266940207, "loss": 0.7119, "step": 6885 }, { "epoch": 0.44194996792815905, "grad_norm": 1.527540376439275, "learning_rate": 0.00013682869277698557, "loss": 0.6265, "step": 6890 }, { "epoch": 0.44227068633739575, "grad_norm": 0.7951368893260018, "learning_rate": 0.00013672458672637984, "loss": 0.8016, "step": 6895 }, { "epoch": 0.44259140474663244, "grad_norm": 1.2763559389048758, "learning_rate": 0.0001366204346480632, "loss": 0.7206, "step": 6900 }, { "epoch": 0.44291212315586914, "grad_norm": 0.8023255338282319, "learning_rate": 0.00013651623667257164, "loss": 0.7554, "step": 6905 }, { "epoch": 0.44323284156510584, "grad_norm": 0.8695350841504818, "learning_rate": 0.00013641199293049877, "loss": 0.8358, "step": 6910 }, { "epoch": 0.44355355997434254, "grad_norm": 0.9044131348318595, "learning_rate": 0.0001363077035524955, "loss": 0.6412, "step": 6915 }, { "epoch": 0.44387427838357923, "grad_norm": 0.8127899752297872, "learning_rate": 0.00013620336866926997, "loss": 0.6957, "step": 6920 }, { "epoch": 0.44419499679281593, "grad_norm": 0.8688512997555105, "learning_rate": 0.00013609898841158725, "loss": 0.724, "step": 6925 }, { "epoch": 0.4445157152020526, "grad_norm": 0.8760877608220616, "learning_rate": 0.0001359945629102694, "loss": 0.5738, "step": 6930 }, { "epoch": 0.4448364336112893, "grad_norm": 1.0325674004426306, "learning_rate": 0.0001358900922961951, "loss": 0.5873, "step": 6935 }, { "epoch": 0.44515715202052597, "grad_norm": 0.8467908302129974, "learning_rate": 0.00013578557670029966, "loss": 0.7058, "step": 6940 }, { "epoch": 0.44547787042976267, "grad_norm": 0.8131400613232301, "learning_rate": 0.00013568101625357465, "loss": 0.7422, "step": 6945 }, { "epoch": 0.44579858883899937, "grad_norm": 0.724722516850653, "learning_rate": 0.000135576411087068, "loss": 0.6638, "step": 6950 }, { "epoch": 0.44611930724823606, "grad_norm": 0.8948898208956525, "learning_rate": 0.00013547176133188354, "loss": 0.7129, "step": 6955 }, { "epoch": 0.44644002565747276, "grad_norm": 1.0104789290655904, "learning_rate": 0.00013536706711918107, "loss": 0.7032, "step": 6960 }, { "epoch": 0.4467607440667094, "grad_norm": 0.8414717932992289, "learning_rate": 0.0001352623285801761, "loss": 0.6836, "step": 6965 }, { "epoch": 0.4470814624759461, "grad_norm": 1.1406826410807314, "learning_rate": 0.00013515754584613962, "loss": 0.6053, "step": 6970 }, { "epoch": 0.4474021808851828, "grad_norm": 0.8742591243812547, "learning_rate": 0.00013505271904839817, "loss": 0.7431, "step": 6975 }, { "epoch": 0.4477228992944195, "grad_norm": 0.6939509932441673, "learning_rate": 0.00013494784831833337, "loss": 0.6291, "step": 6980 }, { "epoch": 0.4480436177036562, "grad_norm": 1.1945030623029917, "learning_rate": 0.00013484293378738193, "loss": 0.6403, "step": 6985 }, { "epoch": 0.4483643361128929, "grad_norm": 1.2041604733537394, "learning_rate": 0.0001347379755870355, "loss": 0.7259, "step": 6990 }, { "epoch": 0.4486850545221296, "grad_norm": 1.2915007724773113, "learning_rate": 0.00013463297384884047, "loss": 0.659, "step": 6995 }, { "epoch": 0.4490057729313663, "grad_norm": 0.9604685032866782, "learning_rate": 0.00013452792870439774, "loss": 0.7607, "step": 7000 }, { "epoch": 0.44932649134060293, "grad_norm": 0.683575690655945, "learning_rate": 0.00013442284028536265, "loss": 0.6597, "step": 7005 }, { "epoch": 0.4496472097498396, "grad_norm": 0.8599337861042293, "learning_rate": 0.0001343177087234447, "loss": 0.6324, "step": 7010 }, { "epoch": 0.4499679281590763, "grad_norm": 1.0590394622444155, "learning_rate": 0.00013421253415040764, "loss": 0.7187, "step": 7015 }, { "epoch": 0.450288646568313, "grad_norm": 0.7304239044871675, "learning_rate": 0.00013410731669806893, "loss": 0.6951, "step": 7020 }, { "epoch": 0.4506093649775497, "grad_norm": 0.6027716061436601, "learning_rate": 0.00013400205649829986, "loss": 0.6254, "step": 7025 }, { "epoch": 0.4509300833867864, "grad_norm": 0.9290585913030099, "learning_rate": 0.00013389675368302538, "loss": 0.6395, "step": 7030 }, { "epoch": 0.4512508017960231, "grad_norm": 0.6100444770178587, "learning_rate": 0.00013379140838422368, "loss": 0.6956, "step": 7035 }, { "epoch": 0.45157152020525976, "grad_norm": 1.0560462270870308, "learning_rate": 0.00013368602073392626, "loss": 0.7217, "step": 7040 }, { "epoch": 0.45189223861449646, "grad_norm": 0.9506970796048375, "learning_rate": 0.00013358059086421777, "loss": 0.7538, "step": 7045 }, { "epoch": 0.45221295702373315, "grad_norm": 0.8472683366273123, "learning_rate": 0.0001334751189072357, "loss": 0.7699, "step": 7050 }, { "epoch": 0.45253367543296985, "grad_norm": 0.8123297983190807, "learning_rate": 0.00013336960499517035, "loss": 0.7617, "step": 7055 }, { "epoch": 0.45285439384220655, "grad_norm": 0.7432610908008688, "learning_rate": 0.00013326404926026453, "loss": 0.4966, "step": 7060 }, { "epoch": 0.45317511225144325, "grad_norm": 1.9038556869996193, "learning_rate": 0.00013315845183481352, "loss": 0.7716, "step": 7065 }, { "epoch": 0.45349583066067994, "grad_norm": 1.517420207283064, "learning_rate": 0.0001330528128511648, "loss": 0.7335, "step": 7070 }, { "epoch": 0.4538165490699166, "grad_norm": 0.8901376925504432, "learning_rate": 0.00013294713244171798, "loss": 0.6803, "step": 7075 }, { "epoch": 0.4541372674791533, "grad_norm": 0.9458291501306725, "learning_rate": 0.0001328414107389246, "loss": 0.8463, "step": 7080 }, { "epoch": 0.45445798588839, "grad_norm": 0.771925264607674, "learning_rate": 0.00013273564787528796, "loss": 0.6271, "step": 7085 }, { "epoch": 0.4547787042976267, "grad_norm": 0.9552006861914584, "learning_rate": 0.00013262984398336287, "loss": 0.6903, "step": 7090 }, { "epoch": 0.4550994227068634, "grad_norm": 0.7912142730312611, "learning_rate": 0.00013252399919575565, "loss": 0.7355, "step": 7095 }, { "epoch": 0.4554201411161001, "grad_norm": 0.8790500769675236, "learning_rate": 0.0001324181136451238, "loss": 0.6732, "step": 7100 }, { "epoch": 0.45574085952533677, "grad_norm": 1.2386079454717946, "learning_rate": 0.00013231218746417595, "loss": 0.7522, "step": 7105 }, { "epoch": 0.45606157793457347, "grad_norm": 0.7962051132713993, "learning_rate": 0.0001322062207856717, "loss": 0.8145, "step": 7110 }, { "epoch": 0.4563822963438101, "grad_norm": 1.0329953407444796, "learning_rate": 0.00013210021374242134, "loss": 0.7769, "step": 7115 }, { "epoch": 0.4567030147530468, "grad_norm": 0.9259650281367799, "learning_rate": 0.00013199416646728573, "loss": 0.6457, "step": 7120 }, { "epoch": 0.4570237331622835, "grad_norm": 0.9088503892075743, "learning_rate": 0.0001318880790931762, "loss": 0.6294, "step": 7125 }, { "epoch": 0.4573444515715202, "grad_norm": 0.8985892524046365, "learning_rate": 0.00013178195175305438, "loss": 0.6828, "step": 7130 }, { "epoch": 0.4576651699807569, "grad_norm": 0.912515537663532, "learning_rate": 0.00013167578457993188, "loss": 0.7064, "step": 7135 }, { "epoch": 0.4579858883899936, "grad_norm": 0.9729614181574077, "learning_rate": 0.0001315695777068703, "loss": 0.7272, "step": 7140 }, { "epoch": 0.4583066067992303, "grad_norm": 0.6424734919666812, "learning_rate": 0.00013146333126698103, "loss": 0.6299, "step": 7145 }, { "epoch": 0.45862732520846694, "grad_norm": 0.9359545383993509, "learning_rate": 0.00013135704539342494, "loss": 0.6424, "step": 7150 }, { "epoch": 0.45894804361770364, "grad_norm": 0.7928212174336042, "learning_rate": 0.00013125072021941248, "loss": 0.6982, "step": 7155 }, { "epoch": 0.45926876202694034, "grad_norm": 0.5352504172374731, "learning_rate": 0.00013114435587820316, "loss": 0.5291, "step": 7160 }, { "epoch": 0.45958948043617703, "grad_norm": 0.7128732592198029, "learning_rate": 0.00013103795250310577, "loss": 0.7029, "step": 7165 }, { "epoch": 0.45991019884541373, "grad_norm": 1.0850764381783637, "learning_rate": 0.00013093151022747793, "loss": 0.7707, "step": 7170 }, { "epoch": 0.46023091725465043, "grad_norm": 1.0237223555264552, "learning_rate": 0.000130825029184726, "loss": 0.6769, "step": 7175 }, { "epoch": 0.4605516356638871, "grad_norm": 1.1136242211182483, "learning_rate": 0.00013071850950830492, "loss": 0.5703, "step": 7180 }, { "epoch": 0.4608723540731238, "grad_norm": 0.8143443059526504, "learning_rate": 0.00013061195133171814, "loss": 0.6334, "step": 7185 }, { "epoch": 0.46119307248236047, "grad_norm": 0.9509973045912795, "learning_rate": 0.00013050535478851728, "loss": 0.6757, "step": 7190 }, { "epoch": 0.46151379089159716, "grad_norm": 0.6191444236173257, "learning_rate": 0.00013039872001230208, "loss": 0.6217, "step": 7195 }, { "epoch": 0.46183450930083386, "grad_norm": 0.7788953363838352, "learning_rate": 0.00013029204713672015, "loss": 0.7384, "step": 7200 }, { "epoch": 0.46215522771007056, "grad_norm": 0.8450930304171778, "learning_rate": 0.00013018533629546695, "loss": 0.7298, "step": 7205 }, { "epoch": 0.46247594611930726, "grad_norm": 1.0385186485500146, "learning_rate": 0.0001300785876222854, "loss": 0.6529, "step": 7210 }, { "epoch": 0.46279666452854396, "grad_norm": 0.9152190048763487, "learning_rate": 0.00012997180125096596, "loss": 0.4276, "step": 7215 }, { "epoch": 0.46311738293778065, "grad_norm": 0.9787836443016305, "learning_rate": 0.00012986497731534618, "loss": 0.63, "step": 7220 }, { "epoch": 0.4634381013470173, "grad_norm": 0.9734043537474775, "learning_rate": 0.00012975811594931094, "loss": 0.7634, "step": 7225 }, { "epoch": 0.463758819756254, "grad_norm": 0.9713910942202003, "learning_rate": 0.00012965121728679175, "loss": 0.757, "step": 7230 }, { "epoch": 0.4640795381654907, "grad_norm": 0.9081157831943877, "learning_rate": 0.00012954428146176703, "loss": 0.7426, "step": 7235 }, { "epoch": 0.4644002565747274, "grad_norm": 0.7116758820381245, "learning_rate": 0.00012943730860826174, "loss": 0.8052, "step": 7240 }, { "epoch": 0.4647209749839641, "grad_norm": 0.8501864866133851, "learning_rate": 0.00012933029886034723, "loss": 0.7407, "step": 7245 }, { "epoch": 0.4650416933932008, "grad_norm": 0.9701598818030126, "learning_rate": 0.00012922325235214114, "loss": 0.672, "step": 7250 }, { "epoch": 0.4653624118024375, "grad_norm": 0.7147413441513334, "learning_rate": 0.00012911616921780717, "loss": 0.572, "step": 7255 }, { "epoch": 0.4656831302116741, "grad_norm": 1.1031756310087157, "learning_rate": 0.00012900904959155482, "loss": 0.502, "step": 7260 }, { "epoch": 0.4660038486209108, "grad_norm": 0.9549539883250536, "learning_rate": 0.0001289018936076395, "loss": 0.7697, "step": 7265 }, { "epoch": 0.4663245670301475, "grad_norm": 0.7061368474604979, "learning_rate": 0.00012879470140036205, "loss": 0.77, "step": 7270 }, { "epoch": 0.4666452854393842, "grad_norm": 0.8174054654625066, "learning_rate": 0.00012868747310406875, "loss": 0.644, "step": 7275 }, { "epoch": 0.4669660038486209, "grad_norm": 1.0847763653058102, "learning_rate": 0.00012858020885315118, "loss": 0.6265, "step": 7280 }, { "epoch": 0.4672867222578576, "grad_norm": 0.7498493863919715, "learning_rate": 0.00012847290878204584, "loss": 0.6246, "step": 7285 }, { "epoch": 0.4676074406670943, "grad_norm": 0.981941482754815, "learning_rate": 0.0001283655730252343, "loss": 0.6622, "step": 7290 }, { "epoch": 0.467928159076331, "grad_norm": 0.9518018861299121, "learning_rate": 0.00012825820171724267, "loss": 0.6284, "step": 7295 }, { "epoch": 0.46824887748556765, "grad_norm": 0.8663834243061985, "learning_rate": 0.00012815079499264178, "loss": 0.5667, "step": 7300 }, { "epoch": 0.46856959589480435, "grad_norm": 0.7672027770311252, "learning_rate": 0.00012804335298604672, "loss": 0.7221, "step": 7305 }, { "epoch": 0.46889031430404104, "grad_norm": 0.8035416637587046, "learning_rate": 0.00012793587583211693, "loss": 0.5737, "step": 7310 }, { "epoch": 0.46921103271327774, "grad_norm": 0.7309561664000054, "learning_rate": 0.00012782836366555578, "loss": 0.6313, "step": 7315 }, { "epoch": 0.46953175112251444, "grad_norm": 0.6252749910832299, "learning_rate": 0.00012772081662111053, "loss": 0.6736, "step": 7320 }, { "epoch": 0.46985246953175114, "grad_norm": 1.025835083057594, "learning_rate": 0.00012761323483357227, "loss": 0.5665, "step": 7325 }, { "epoch": 0.47017318794098784, "grad_norm": 0.6525095712503345, "learning_rate": 0.00012750561843777552, "loss": 0.6443, "step": 7330 }, { "epoch": 0.4704939063502245, "grad_norm": 0.7418969128305869, "learning_rate": 0.00012739796756859825, "loss": 0.8236, "step": 7335 }, { "epoch": 0.4708146247594612, "grad_norm": 1.0413884397203683, "learning_rate": 0.00012729028236096155, "loss": 0.6624, "step": 7340 }, { "epoch": 0.4711353431686979, "grad_norm": 0.9159067009468284, "learning_rate": 0.0001271825629498296, "loss": 0.6376, "step": 7345 }, { "epoch": 0.47145606157793457, "grad_norm": 0.5992387879000995, "learning_rate": 0.0001270748094702095, "loss": 0.5685, "step": 7350 }, { "epoch": 0.47177677998717127, "grad_norm": 1.7163402868588182, "learning_rate": 0.00012696702205715088, "loss": 0.5311, "step": 7355 }, { "epoch": 0.47209749839640797, "grad_norm": 0.7926851445802399, "learning_rate": 0.00012685920084574618, "loss": 0.7548, "step": 7360 }, { "epoch": 0.47241821680564466, "grad_norm": 0.9751658539863987, "learning_rate": 0.0001267513459711299, "loss": 0.6665, "step": 7365 }, { "epoch": 0.47273893521488136, "grad_norm": 1.0752483823874541, "learning_rate": 0.00012664345756847892, "loss": 0.583, "step": 7370 }, { "epoch": 0.473059653624118, "grad_norm": 1.0127918776763205, "learning_rate": 0.00012653553577301202, "loss": 0.749, "step": 7375 }, { "epoch": 0.4733803720333547, "grad_norm": 0.9059323990908674, "learning_rate": 0.00012642758071999, "loss": 0.7049, "step": 7380 }, { "epoch": 0.4737010904425914, "grad_norm": 0.8259800182390388, "learning_rate": 0.00012631959254471515, "loss": 0.6771, "step": 7385 }, { "epoch": 0.4740218088518281, "grad_norm": 1.47432552983105, "learning_rate": 0.00012621157138253142, "loss": 0.5965, "step": 7390 }, { "epoch": 0.4743425272610648, "grad_norm": 0.9830245238116091, "learning_rate": 0.00012610351736882402, "loss": 0.7302, "step": 7395 }, { "epoch": 0.4746632456703015, "grad_norm": 0.9860227904680734, "learning_rate": 0.00012599543063901935, "loss": 0.6942, "step": 7400 }, { "epoch": 0.4749839640795382, "grad_norm": 0.9011424798066042, "learning_rate": 0.00012588731132858486, "loss": 0.6456, "step": 7405 }, { "epoch": 0.47530468248877483, "grad_norm": 0.9091580384346607, "learning_rate": 0.00012577915957302872, "loss": 0.6091, "step": 7410 }, { "epoch": 0.47562540089801153, "grad_norm": 0.9741008974793179, "learning_rate": 0.00012567097550789997, "loss": 0.6012, "step": 7415 }, { "epoch": 0.4759461193072482, "grad_norm": 0.9602884477063278, "learning_rate": 0.00012556275926878789, "loss": 0.6792, "step": 7420 }, { "epoch": 0.4762668377164849, "grad_norm": 0.6210052131474215, "learning_rate": 0.00012545451099132225, "loss": 0.6193, "step": 7425 }, { "epoch": 0.4765875561257216, "grad_norm": 0.8832670583789428, "learning_rate": 0.000125346230811173, "loss": 0.6106, "step": 7430 }, { "epoch": 0.4769082745349583, "grad_norm": 0.851189577398919, "learning_rate": 0.00012523791886404986, "loss": 0.8305, "step": 7435 }, { "epoch": 0.477228992944195, "grad_norm": 1.2879732211506167, "learning_rate": 0.00012512957528570265, "loss": 0.5887, "step": 7440 }, { "epoch": 0.47754971135343166, "grad_norm": 0.5699068076911031, "learning_rate": 0.0001250212002119207, "loss": 0.5558, "step": 7445 }, { "epoch": 0.47787042976266836, "grad_norm": 1.1918583269997756, "learning_rate": 0.00012491279377853268, "loss": 0.6408, "step": 7450 }, { "epoch": 0.47819114817190506, "grad_norm": 1.4317720523654553, "learning_rate": 0.0001248043561214068, "loss": 0.6172, "step": 7455 }, { "epoch": 0.47851186658114175, "grad_norm": 1.0666113380037154, "learning_rate": 0.0001246958873764503, "loss": 0.7485, "step": 7460 }, { "epoch": 0.47883258499037845, "grad_norm": 1.2123844625766853, "learning_rate": 0.00012458738767960937, "loss": 0.7277, "step": 7465 }, { "epoch": 0.47915330339961515, "grad_norm": 0.6850700187680755, "learning_rate": 0.00012447885716686892, "loss": 0.6412, "step": 7470 }, { "epoch": 0.47947402180885185, "grad_norm": 0.7818905955159324, "learning_rate": 0.00012437029597425268, "loss": 0.6845, "step": 7475 }, { "epoch": 0.47979474021808854, "grad_norm": 0.7985800895037933, "learning_rate": 0.00012426170423782265, "loss": 0.7376, "step": 7480 }, { "epoch": 0.4801154586273252, "grad_norm": 1.4988959271026578, "learning_rate": 0.0001241530820936792, "loss": 0.6025, "step": 7485 }, { "epoch": 0.4804361770365619, "grad_norm": 0.7532644364170019, "learning_rate": 0.00012404442967796077, "loss": 0.7597, "step": 7490 }, { "epoch": 0.4807568954457986, "grad_norm": 0.9781127180520404, "learning_rate": 0.0001239357471268438, "loss": 0.7113, "step": 7495 }, { "epoch": 0.4810776138550353, "grad_norm": 1.2808191157193494, "learning_rate": 0.00012382703457654247, "loss": 0.7197, "step": 7500 }, { "epoch": 0.481398332264272, "grad_norm": 0.9577008167614253, "learning_rate": 0.00012371829216330842, "loss": 0.6633, "step": 7505 }, { "epoch": 0.4817190506735087, "grad_norm": 0.9163574634981259, "learning_rate": 0.000123609520023431, "loss": 0.6577, "step": 7510 }, { "epoch": 0.4820397690827454, "grad_norm": 0.9436379402563304, "learning_rate": 0.00012350071829323657, "loss": 0.665, "step": 7515 }, { "epoch": 0.482360487491982, "grad_norm": 0.8955893724229462, "learning_rate": 0.0001233918871090887, "loss": 0.65, "step": 7520 }, { "epoch": 0.4826812059012187, "grad_norm": 1.1039069837177617, "learning_rate": 0.0001232830266073879, "loss": 0.6262, "step": 7525 }, { "epoch": 0.4830019243104554, "grad_norm": 0.8240710234420133, "learning_rate": 0.00012317413692457125, "loss": 0.7796, "step": 7530 }, { "epoch": 0.4833226427196921, "grad_norm": 0.5672101461672577, "learning_rate": 0.0001230652181971126, "loss": 0.6606, "step": 7535 }, { "epoch": 0.4836433611289288, "grad_norm": 0.6312799174708051, "learning_rate": 0.00012295627056152205, "loss": 0.6847, "step": 7540 }, { "epoch": 0.4839640795381655, "grad_norm": 0.9279904903302523, "learning_rate": 0.0001228472941543461, "loss": 0.7298, "step": 7545 }, { "epoch": 0.4842847979474022, "grad_norm": 1.0061624072103414, "learning_rate": 0.00012273828911216715, "loss": 0.688, "step": 7550 }, { "epoch": 0.48460551635663884, "grad_norm": 0.9531338313200752, "learning_rate": 0.00012262925557160362, "loss": 0.7381, "step": 7555 }, { "epoch": 0.48492623476587554, "grad_norm": 0.9084381778100004, "learning_rate": 0.0001225201936693095, "loss": 0.5676, "step": 7560 }, { "epoch": 0.48524695317511224, "grad_norm": 1.0203436397332364, "learning_rate": 0.00012241110354197448, "loss": 0.571, "step": 7565 }, { "epoch": 0.48556767158434894, "grad_norm": 0.9169062207127215, "learning_rate": 0.00012230198532632347, "loss": 0.6456, "step": 7570 }, { "epoch": 0.48588838999358563, "grad_norm": 0.6002350728637655, "learning_rate": 0.0001221928391591167, "loss": 0.6998, "step": 7575 }, { "epoch": 0.48620910840282233, "grad_norm": 0.5575094896397851, "learning_rate": 0.00012208366517714946, "loss": 0.6751, "step": 7580 }, { "epoch": 0.48652982681205903, "grad_norm": 0.7309868460212633, "learning_rate": 0.00012197446351725174, "loss": 0.6152, "step": 7585 }, { "epoch": 0.4868505452212957, "grad_norm": 0.9692168543018325, "learning_rate": 0.0001218652343162884, "loss": 0.6374, "step": 7590 }, { "epoch": 0.48717126363053237, "grad_norm": 0.7189150002506619, "learning_rate": 0.00012175597771115871, "loss": 0.7784, "step": 7595 }, { "epoch": 0.48749198203976907, "grad_norm": 0.8123916784425887, "learning_rate": 0.0001216466938387963, "loss": 0.5559, "step": 7600 }, { "epoch": 0.48781270044900576, "grad_norm": 0.903323959073406, "learning_rate": 0.00012153738283616897, "loss": 0.6245, "step": 7605 }, { "epoch": 0.48813341885824246, "grad_norm": 1.1841897784251287, "learning_rate": 0.00012142804484027862, "loss": 0.7076, "step": 7610 }, { "epoch": 0.48845413726747916, "grad_norm": 0.96970852663879, "learning_rate": 0.0001213186799881608, "loss": 0.6394, "step": 7615 }, { "epoch": 0.48877485567671586, "grad_norm": 0.9366182177279975, "learning_rate": 0.00012120928841688486, "loss": 0.6738, "step": 7620 }, { "epoch": 0.48909557408595256, "grad_norm": 0.6547998596688648, "learning_rate": 0.0001210998702635536, "loss": 0.5484, "step": 7625 }, { "epoch": 0.4894162924951892, "grad_norm": 0.61835825910844, "learning_rate": 0.00012099042566530318, "loss": 0.7106, "step": 7630 }, { "epoch": 0.4897370109044259, "grad_norm": 0.9889648893113016, "learning_rate": 0.00012088095475930281, "loss": 0.6665, "step": 7635 }, { "epoch": 0.4900577293136626, "grad_norm": 1.0009313158645148, "learning_rate": 0.00012077145768275473, "loss": 0.7342, "step": 7640 }, { "epoch": 0.4903784477228993, "grad_norm": 1.207980433506984, "learning_rate": 0.00012066193457289397, "loss": 0.797, "step": 7645 }, { "epoch": 0.490699166132136, "grad_norm": 0.7854979595695312, "learning_rate": 0.00012055238556698816, "loss": 0.6988, "step": 7650 }, { "epoch": 0.4910198845413727, "grad_norm": 0.7188797039130606, "learning_rate": 0.00012044281080233746, "loss": 0.7325, "step": 7655 }, { "epoch": 0.4913406029506094, "grad_norm": 0.9561317362271494, "learning_rate": 0.00012033321041627425, "loss": 0.6506, "step": 7660 }, { "epoch": 0.4916613213598461, "grad_norm": 0.7528076899928123, "learning_rate": 0.00012022358454616306, "loss": 0.5609, "step": 7665 }, { "epoch": 0.4919820397690827, "grad_norm": 0.8596601027470778, "learning_rate": 0.0001201139333294003, "loss": 0.6597, "step": 7670 }, { "epoch": 0.4923027581783194, "grad_norm": 0.6508137207715219, "learning_rate": 0.00012000425690341422, "loss": 0.4953, "step": 7675 }, { "epoch": 0.4926234765875561, "grad_norm": 0.8505276898684504, "learning_rate": 0.00011989455540566462, "loss": 0.6649, "step": 7680 }, { "epoch": 0.4929441949967928, "grad_norm": 0.758748378012195, "learning_rate": 0.00011978482897364273, "loss": 0.7204, "step": 7685 }, { "epoch": 0.4932649134060295, "grad_norm": 0.8242651845310669, "learning_rate": 0.00011967507774487108, "loss": 0.6598, "step": 7690 }, { "epoch": 0.4935856318152662, "grad_norm": 0.8816627197677691, "learning_rate": 0.0001195653018569032, "loss": 0.8369, "step": 7695 }, { "epoch": 0.4939063502245029, "grad_norm": 0.781020774879966, "learning_rate": 0.00011945550144732354, "loss": 0.7912, "step": 7700 }, { "epoch": 0.49422706863373955, "grad_norm": 0.5912028419510443, "learning_rate": 0.00011934567665374732, "loss": 0.673, "step": 7705 }, { "epoch": 0.49454778704297625, "grad_norm": 0.7852150600454825, "learning_rate": 0.00011923582761382031, "loss": 0.6989, "step": 7710 }, { "epoch": 0.49486850545221295, "grad_norm": 0.8345934386959575, "learning_rate": 0.00011912595446521868, "loss": 0.6319, "step": 7715 }, { "epoch": 0.49518922386144965, "grad_norm": 1.2815263854782484, "learning_rate": 0.0001190160573456488, "loss": 0.6247, "step": 7720 }, { "epoch": 0.49550994227068634, "grad_norm": 1.1234841964502218, "learning_rate": 0.00011890613639284704, "loss": 0.653, "step": 7725 }, { "epoch": 0.49583066067992304, "grad_norm": 0.9428012694473118, "learning_rate": 0.00011879619174457976, "loss": 0.9064, "step": 7730 }, { "epoch": 0.49615137908915974, "grad_norm": 0.7822481283735353, "learning_rate": 0.00011868622353864285, "loss": 0.5887, "step": 7735 }, { "epoch": 0.4964720974983964, "grad_norm": 0.6197300598147442, "learning_rate": 0.00011857623191286186, "loss": 0.5871, "step": 7740 }, { "epoch": 0.4967928159076331, "grad_norm": 0.6742268900193886, "learning_rate": 0.00011846621700509171, "loss": 0.6153, "step": 7745 }, { "epoch": 0.4971135343168698, "grad_norm": 1.0097074349573119, "learning_rate": 0.00011835617895321633, "loss": 0.726, "step": 7750 }, { "epoch": 0.4974342527261065, "grad_norm": 0.7938742619155006, "learning_rate": 0.00011824611789514881, "loss": 0.7576, "step": 7755 }, { "epoch": 0.49775497113534317, "grad_norm": 0.7594193522785816, "learning_rate": 0.00011813603396883108, "loss": 0.631, "step": 7760 }, { "epoch": 0.49807568954457987, "grad_norm": 1.1449681048330884, "learning_rate": 0.0001180259273122336, "loss": 0.8346, "step": 7765 }, { "epoch": 0.49839640795381657, "grad_norm": 0.6106704277152839, "learning_rate": 0.00011791579806335547, "loss": 0.7094, "step": 7770 }, { "epoch": 0.49871712636305326, "grad_norm": 0.9764152562715487, "learning_rate": 0.000117805646360224, "loss": 0.7922, "step": 7775 }, { "epoch": 0.4990378447722899, "grad_norm": 1.4581971435959649, "learning_rate": 0.00011769547234089469, "loss": 0.7598, "step": 7780 }, { "epoch": 0.4993585631815266, "grad_norm": 1.1726593622900077, "learning_rate": 0.00011758527614345097, "loss": 0.6934, "step": 7785 }, { "epoch": 0.4996792815907633, "grad_norm": 1.382229173196648, "learning_rate": 0.00011747505790600412, "loss": 0.6793, "step": 7790 }, { "epoch": 0.5, "grad_norm": 0.7583044707535523, "learning_rate": 0.00011736481776669306, "loss": 0.7244, "step": 7795 }, { "epoch": 0.5003207184092366, "grad_norm": 1.0327502481504163, "learning_rate": 0.000117254555863684, "loss": 0.7023, "step": 7800 }, { "epoch": 0.5006414368184734, "grad_norm": 0.6928521319692996, "learning_rate": 0.00011714427233517069, "loss": 0.5508, "step": 7805 }, { "epoch": 0.50096215522771, "grad_norm": 0.6645980452165248, "learning_rate": 0.0001170339673193737, "loss": 0.7463, "step": 7810 }, { "epoch": 0.5012828736369468, "grad_norm": 0.6668044106727686, "learning_rate": 0.00011692364095454076, "loss": 0.6357, "step": 7815 }, { "epoch": 0.5016035920461834, "grad_norm": 0.9287710383565055, "learning_rate": 0.00011681329337894623, "loss": 0.6308, "step": 7820 }, { "epoch": 0.5019243104554202, "grad_norm": 1.3104043465513664, "learning_rate": 0.0001167029247308911, "loss": 0.5399, "step": 7825 }, { "epoch": 0.5022450288646568, "grad_norm": 1.428373507944948, "learning_rate": 0.00011659253514870276, "loss": 0.7011, "step": 7830 }, { "epoch": 0.5025657472738935, "grad_norm": 0.833100109623975, "learning_rate": 0.00011648212477073484, "loss": 0.7404, "step": 7835 }, { "epoch": 0.5028864656831302, "grad_norm": 1.0751700158927022, "learning_rate": 0.00011637169373536698, "loss": 0.6389, "step": 7840 }, { "epoch": 0.5032071840923669, "grad_norm": 0.9610389244865, "learning_rate": 0.00011626124218100483, "loss": 0.732, "step": 7845 }, { "epoch": 0.5035279025016036, "grad_norm": 1.4064338381179782, "learning_rate": 0.00011615077024607965, "loss": 0.7248, "step": 7850 }, { "epoch": 0.5038486209108403, "grad_norm": 1.0089167449788845, "learning_rate": 0.00011604027806904833, "loss": 0.6808, "step": 7855 }, { "epoch": 0.504169339320077, "grad_norm": 0.8297282225570892, "learning_rate": 0.00011592976578839303, "loss": 0.7505, "step": 7860 }, { "epoch": 0.5044900577293137, "grad_norm": 0.8562597418732677, "learning_rate": 0.00011581923354262117, "loss": 0.7069, "step": 7865 }, { "epoch": 0.5048107761385503, "grad_norm": 1.1555443138727173, "learning_rate": 0.00011570868147026517, "loss": 0.6213, "step": 7870 }, { "epoch": 0.505131494547787, "grad_norm": 1.4259877059174733, "learning_rate": 0.00011559810970988232, "loss": 0.6105, "step": 7875 }, { "epoch": 0.5054522129570237, "grad_norm": 0.6183735071336424, "learning_rate": 0.00011548751840005459, "loss": 0.4662, "step": 7880 }, { "epoch": 0.5057729313662604, "grad_norm": 0.9453435423443054, "learning_rate": 0.00011537690767938843, "loss": 0.6083, "step": 7885 }, { "epoch": 0.5060936497754971, "grad_norm": 0.6729282582317203, "learning_rate": 0.00011526627768651459, "loss": 0.7553, "step": 7890 }, { "epoch": 0.5064143681847338, "grad_norm": 0.8579324957843062, "learning_rate": 0.00011515562856008808, "loss": 0.7014, "step": 7895 }, { "epoch": 0.5067350865939705, "grad_norm": 0.9652710068101304, "learning_rate": 0.00011504496043878776, "loss": 0.7203, "step": 7900 }, { "epoch": 0.5070558050032072, "grad_norm": 1.3328325121052935, "learning_rate": 0.00011493427346131636, "loss": 0.7462, "step": 7905 }, { "epoch": 0.5073765234124439, "grad_norm": 0.7750774157499563, "learning_rate": 0.00011482356776640028, "loss": 0.7554, "step": 7910 }, { "epoch": 0.5076972418216805, "grad_norm": 0.7771858604565626, "learning_rate": 0.00011471284349278928, "loss": 0.7032, "step": 7915 }, { "epoch": 0.5080179602309173, "grad_norm": 0.9990707053591126, "learning_rate": 0.0001146021007792565, "loss": 0.5966, "step": 7920 }, { "epoch": 0.5083386786401539, "grad_norm": 0.9864579497103747, "learning_rate": 0.00011449133976459816, "loss": 0.701, "step": 7925 }, { "epoch": 0.5086593970493907, "grad_norm": 0.9752505086126679, "learning_rate": 0.0001143805605876334, "loss": 0.6502, "step": 7930 }, { "epoch": 0.5089801154586273, "grad_norm": 1.3306389404931571, "learning_rate": 0.00011426976338720412, "loss": 0.6592, "step": 7935 }, { "epoch": 0.5093008338678641, "grad_norm": 0.6705402480174242, "learning_rate": 0.00011415894830217486, "loss": 0.6531, "step": 7940 }, { "epoch": 0.5096215522771007, "grad_norm": 0.8130683741487627, "learning_rate": 0.00011404811547143251, "loss": 0.7333, "step": 7945 }, { "epoch": 0.5099422706863374, "grad_norm": 1.1664159763922086, "learning_rate": 0.0001139372650338862, "loss": 0.8146, "step": 7950 }, { "epoch": 0.5102629890955741, "grad_norm": 0.5999515830143689, "learning_rate": 0.00011382639712846721, "loss": 0.5825, "step": 7955 }, { "epoch": 0.5105837075048107, "grad_norm": 1.1054727651684402, "learning_rate": 0.00011371551189412868, "loss": 0.7374, "step": 7960 }, { "epoch": 0.5109044259140475, "grad_norm": 1.0319949146313503, "learning_rate": 0.00011360460946984537, "loss": 0.7562, "step": 7965 }, { "epoch": 0.5112251443232841, "grad_norm": 0.6047170156572763, "learning_rate": 0.00011349368999461374, "loss": 0.7588, "step": 7970 }, { "epoch": 0.5115458627325209, "grad_norm": 0.8725079332758466, "learning_rate": 0.00011338275360745147, "loss": 0.7421, "step": 7975 }, { "epoch": 0.5118665811417575, "grad_norm": 0.784376771151006, "learning_rate": 0.00011327180044739755, "loss": 0.5837, "step": 7980 }, { "epoch": 0.5121872995509942, "grad_norm": 0.8977359490481988, "learning_rate": 0.00011316083065351195, "loss": 0.7392, "step": 7985 }, { "epoch": 0.5125080179602309, "grad_norm": 0.653772242009018, "learning_rate": 0.00011304984436487551, "loss": 0.6166, "step": 7990 }, { "epoch": 0.5128287363694676, "grad_norm": 1.2310492343797879, "learning_rate": 0.00011293884172058971, "loss": 0.5507, "step": 7995 }, { "epoch": 0.5131494547787043, "grad_norm": 1.0077531207139014, "learning_rate": 0.00011282782285977649, "loss": 0.6358, "step": 8000 }, { "epoch": 0.513470173187941, "grad_norm": 1.19554249733326, "learning_rate": 0.00011271678792157823, "loss": 0.6614, "step": 8005 }, { "epoch": 0.5137908915971777, "grad_norm": 0.8654028252618859, "learning_rate": 0.00011260573704515734, "loss": 0.6444, "step": 8010 }, { "epoch": 0.5141116100064144, "grad_norm": 0.9637998906695273, "learning_rate": 0.00011249467036969632, "loss": 0.6859, "step": 8015 }, { "epoch": 0.514432328415651, "grad_norm": 1.2621981138132725, "learning_rate": 0.00011238358803439739, "loss": 0.7247, "step": 8020 }, { "epoch": 0.5147530468248878, "grad_norm": 0.6255230049474781, "learning_rate": 0.0001122724901784824, "loss": 0.7025, "step": 8025 }, { "epoch": 0.5150737652341244, "grad_norm": 0.8124027597004405, "learning_rate": 0.00011216137694119271, "loss": 0.6465, "step": 8030 }, { "epoch": 0.5153944836433612, "grad_norm": 0.7060753692578354, "learning_rate": 0.00011205024846178886, "loss": 0.5977, "step": 8035 }, { "epoch": 0.5157152020525978, "grad_norm": 0.9066775542047206, "learning_rate": 0.00011193910487955059, "loss": 0.6407, "step": 8040 }, { "epoch": 0.5160359204618346, "grad_norm": 0.6903326908804434, "learning_rate": 0.00011182794633377653, "loss": 0.6925, "step": 8045 }, { "epoch": 0.5163566388710712, "grad_norm": 0.9472934152436594, "learning_rate": 0.00011171677296378411, "loss": 0.7609, "step": 8050 }, { "epoch": 0.5166773572803078, "grad_norm": 1.0828907895794335, "learning_rate": 0.0001116055849089092, "loss": 0.7855, "step": 8055 }, { "epoch": 0.5169980756895446, "grad_norm": 1.3155495321215651, "learning_rate": 0.00011149438230850626, "loss": 0.6561, "step": 8060 }, { "epoch": 0.5173187940987812, "grad_norm": 0.7751536928800652, "learning_rate": 0.00011138316530194782, "loss": 0.6302, "step": 8065 }, { "epoch": 0.517639512508018, "grad_norm": 1.278374102598091, "learning_rate": 0.00011127193402862457, "loss": 0.6741, "step": 8070 }, { "epoch": 0.5179602309172546, "grad_norm": 0.7961067269873462, "learning_rate": 0.00011116068862794506, "loss": 0.7248, "step": 8075 }, { "epoch": 0.5182809493264914, "grad_norm": 0.9325619210714818, "learning_rate": 0.0001110494292393355, "loss": 0.6036, "step": 8080 }, { "epoch": 0.518601667735728, "grad_norm": 0.9427970552237784, "learning_rate": 0.00011093815600223966, "loss": 0.6906, "step": 8085 }, { "epoch": 0.5189223861449648, "grad_norm": 0.9820235565256558, "learning_rate": 0.00011082686905611872, "loss": 0.6996, "step": 8090 }, { "epoch": 0.5192431045542014, "grad_norm": 0.7847448260775505, "learning_rate": 0.00011071556854045098, "loss": 0.67, "step": 8095 }, { "epoch": 0.5195638229634381, "grad_norm": 0.7114519312016215, "learning_rate": 0.00011060425459473169, "loss": 0.6844, "step": 8100 }, { "epoch": 0.5198845413726748, "grad_norm": 0.6238373643554763, "learning_rate": 0.00011049292735847312, "loss": 0.5971, "step": 8105 }, { "epoch": 0.5202052597819115, "grad_norm": 0.9399929160198239, "learning_rate": 0.00011038158697120395, "loss": 0.6189, "step": 8110 }, { "epoch": 0.5205259781911482, "grad_norm": 1.1129758526237858, "learning_rate": 0.00011027023357246955, "loss": 0.7023, "step": 8115 }, { "epoch": 0.5208466966003849, "grad_norm": 1.049212324811729, "learning_rate": 0.00011015886730183152, "loss": 0.7014, "step": 8120 }, { "epoch": 0.5211674150096216, "grad_norm": 0.8599253114644705, "learning_rate": 0.00011004748829886755, "loss": 0.6835, "step": 8125 }, { "epoch": 0.5214881334188582, "grad_norm": 0.6066610732008468, "learning_rate": 0.0001099360967031714, "loss": 0.5214, "step": 8130 }, { "epoch": 0.5218088518280949, "grad_norm": 0.8343848602348406, "learning_rate": 0.00010982469265435249, "loss": 0.6169, "step": 8135 }, { "epoch": 0.5221295702373316, "grad_norm": 0.4237175002588996, "learning_rate": 0.00010971327629203587, "loss": 0.5628, "step": 8140 }, { "epoch": 0.5224502886465683, "grad_norm": 0.7612853893387608, "learning_rate": 0.00010960184775586209, "loss": 0.6496, "step": 8145 }, { "epoch": 0.522771007055805, "grad_norm": 0.7090497030288603, "learning_rate": 0.00010949040718548693, "loss": 0.6699, "step": 8150 }, { "epoch": 0.5230917254650417, "grad_norm": 0.8137233187040953, "learning_rate": 0.00010937895472058126, "loss": 0.7825, "step": 8155 }, { "epoch": 0.5234124438742784, "grad_norm": 1.106458178679526, "learning_rate": 0.0001092674905008308, "loss": 0.5917, "step": 8160 }, { "epoch": 0.5237331622835151, "grad_norm": 1.1023421333903827, "learning_rate": 0.00010915601466593604, "loss": 0.652, "step": 8165 }, { "epoch": 0.5240538806927517, "grad_norm": 1.2339053368878727, "learning_rate": 0.00010904452735561204, "loss": 0.7531, "step": 8170 }, { "epoch": 0.5243745991019885, "grad_norm": 0.8536672713520308, "learning_rate": 0.00010893302870958824, "loss": 0.6808, "step": 8175 }, { "epoch": 0.5246953175112251, "grad_norm": 0.9072452347961674, "learning_rate": 0.00010882151886760827, "loss": 0.7883, "step": 8180 }, { "epoch": 0.5250160359204619, "grad_norm": 0.705408047927468, "learning_rate": 0.00010870999796942986, "loss": 0.7448, "step": 8185 }, { "epoch": 0.5253367543296985, "grad_norm": 0.84842819642806, "learning_rate": 0.00010859846615482448, "loss": 0.7873, "step": 8190 }, { "epoch": 0.5256574727389353, "grad_norm": 0.9668127437981949, "learning_rate": 0.00010848692356357735, "loss": 0.6553, "step": 8195 }, { "epoch": 0.5259781911481719, "grad_norm": 1.3910270737631052, "learning_rate": 0.00010837537033548718, "loss": 0.551, "step": 8200 }, { "epoch": 0.5262989095574085, "grad_norm": 0.8934045053705592, "learning_rate": 0.00010826380661036601, "loss": 0.755, "step": 8205 }, { "epoch": 0.5266196279666453, "grad_norm": 0.7580165266865208, "learning_rate": 0.0001081522325280391, "loss": 0.6785, "step": 8210 }, { "epoch": 0.5269403463758819, "grad_norm": 0.895270436973056, "learning_rate": 0.00010804064822834461, "loss": 0.6188, "step": 8215 }, { "epoch": 0.5272610647851187, "grad_norm": 0.8349917473129711, "learning_rate": 0.0001079290538511335, "loss": 0.5295, "step": 8220 }, { "epoch": 0.5275817831943553, "grad_norm": 1.0937712586985149, "learning_rate": 0.00010781744953626944, "loss": 0.718, "step": 8225 }, { "epoch": 0.5279025016035921, "grad_norm": 0.9776711832493594, "learning_rate": 0.00010770583542362848, "loss": 0.7394, "step": 8230 }, { "epoch": 0.5282232200128287, "grad_norm": 0.9916244110681041, "learning_rate": 0.00010759421165309898, "loss": 0.6302, "step": 8235 }, { "epoch": 0.5285439384220654, "grad_norm": 0.7709724576720045, "learning_rate": 0.00010748257836458142, "loss": 0.4377, "step": 8240 }, { "epoch": 0.5288646568313021, "grad_norm": 0.9553016321868766, "learning_rate": 0.00010737093569798815, "loss": 0.5929, "step": 8245 }, { "epoch": 0.5291853752405388, "grad_norm": 0.5921375135170813, "learning_rate": 0.00010725928379324335, "loss": 0.6308, "step": 8250 }, { "epoch": 0.5295060936497755, "grad_norm": 0.9409908884682822, "learning_rate": 0.00010714762279028275, "loss": 0.6488, "step": 8255 }, { "epoch": 0.5298268120590122, "grad_norm": 0.9164401991956044, "learning_rate": 0.00010703595282905343, "loss": 0.7185, "step": 8260 }, { "epoch": 0.5301475304682489, "grad_norm": 0.7915811080548818, "learning_rate": 0.00010692427404951379, "loss": 0.7002, "step": 8265 }, { "epoch": 0.5304682488774856, "grad_norm": 1.1633281858494344, "learning_rate": 0.00010681258659163322, "loss": 0.7142, "step": 8270 }, { "epoch": 0.5307889672867223, "grad_norm": 1.1360488426032926, "learning_rate": 0.00010670089059539201, "loss": 0.6164, "step": 8275 }, { "epoch": 0.531109685695959, "grad_norm": 0.9950081272171089, "learning_rate": 0.0001065891862007811, "loss": 0.5403, "step": 8280 }, { "epoch": 0.5314304041051956, "grad_norm": 1.0499402732473173, "learning_rate": 0.00010647747354780206, "loss": 0.6409, "step": 8285 }, { "epoch": 0.5317511225144324, "grad_norm": 0.9441134224109928, "learning_rate": 0.00010636575277646672, "loss": 0.5947, "step": 8290 }, { "epoch": 0.532071840923669, "grad_norm": 1.3058395760608197, "learning_rate": 0.00010625402402679712, "loss": 0.6901, "step": 8295 }, { "epoch": 0.5323925593329057, "grad_norm": 0.8650565306977751, "learning_rate": 0.0001061422874388253, "loss": 0.6536, "step": 8300 }, { "epoch": 0.5327132777421424, "grad_norm": 1.1023501837328433, "learning_rate": 0.0001060305431525931, "loss": 0.7735, "step": 8305 }, { "epoch": 0.5330339961513791, "grad_norm": 0.7402707462941108, "learning_rate": 0.00010591879130815206, "loss": 0.7746, "step": 8310 }, { "epoch": 0.5333547145606158, "grad_norm": 1.0334014975634367, "learning_rate": 0.0001058070320455631, "loss": 0.6197, "step": 8315 }, { "epoch": 0.5336754329698524, "grad_norm": 0.8973174424463937, "learning_rate": 0.00010569526550489656, "loss": 0.6662, "step": 8320 }, { "epoch": 0.5339961513790892, "grad_norm": 1.1260137879030736, "learning_rate": 0.00010558349182623182, "loss": 0.7384, "step": 8325 }, { "epoch": 0.5343168697883258, "grad_norm": 1.0775603650728314, "learning_rate": 0.00010547171114965721, "loss": 0.53, "step": 8330 }, { "epoch": 0.5346375881975626, "grad_norm": 0.8657241626493881, "learning_rate": 0.00010535992361526986, "loss": 0.6597, "step": 8335 }, { "epoch": 0.5349583066067992, "grad_norm": 0.7754986474145258, "learning_rate": 0.00010524812936317545, "loss": 0.7155, "step": 8340 }, { "epoch": 0.535279025016036, "grad_norm": 0.7235913108295569, "learning_rate": 0.00010513632853348817, "loss": 0.63, "step": 8345 }, { "epoch": 0.5355997434252726, "grad_norm": 1.0376021153773205, "learning_rate": 0.00010502452126633033, "loss": 0.7389, "step": 8350 }, { "epoch": 0.5359204618345093, "grad_norm": 1.0736867388991156, "learning_rate": 0.00010491270770183241, "loss": 0.7524, "step": 8355 }, { "epoch": 0.536241180243746, "grad_norm": 1.2875466262160882, "learning_rate": 0.00010480088798013274, "loss": 0.7637, "step": 8360 }, { "epoch": 0.5365618986529826, "grad_norm": 1.0698179015991502, "learning_rate": 0.00010468906224137736, "loss": 0.7777, "step": 8365 }, { "epoch": 0.5368826170622194, "grad_norm": 0.715308845951178, "learning_rate": 0.00010457723062571984, "loss": 0.581, "step": 8370 }, { "epoch": 0.537203335471456, "grad_norm": 1.9992463200156003, "learning_rate": 0.00010446539327332121, "loss": 0.6813, "step": 8375 }, { "epoch": 0.5375240538806928, "grad_norm": 0.9082670120549011, "learning_rate": 0.00010435355032434958, "loss": 0.8172, "step": 8380 }, { "epoch": 0.5378447722899294, "grad_norm": 0.5039137526581597, "learning_rate": 0.00010424170191898006, "loss": 0.6443, "step": 8385 }, { "epoch": 0.5381654906991661, "grad_norm": 0.8357611125226391, "learning_rate": 0.00010412984819739473, "loss": 0.6672, "step": 8390 }, { "epoch": 0.5384862091084028, "grad_norm": 0.9107912987485977, "learning_rate": 0.00010401798929978224, "loss": 0.6107, "step": 8395 }, { "epoch": 0.5388069275176395, "grad_norm": 0.8281442376194428, "learning_rate": 0.0001039061253663377, "loss": 0.6075, "step": 8400 }, { "epoch": 0.5391276459268762, "grad_norm": 0.7249862380029812, "learning_rate": 0.00010379425653726263, "loss": 0.7265, "step": 8405 }, { "epoch": 0.5394483643361129, "grad_norm": 0.9092092180370709, "learning_rate": 0.00010368238295276455, "loss": 0.6893, "step": 8410 }, { "epoch": 0.5397690827453496, "grad_norm": 0.6540167568734936, "learning_rate": 0.0001035705047530571, "loss": 0.7305, "step": 8415 }, { "epoch": 0.5400898011545863, "grad_norm": 0.7981383776198956, "learning_rate": 0.00010345862207835957, "loss": 0.6453, "step": 8420 }, { "epoch": 0.5404105195638229, "grad_norm": 0.945104000015912, "learning_rate": 0.00010334673506889696, "loss": 0.7016, "step": 8425 }, { "epoch": 0.5407312379730597, "grad_norm": 1.0547131113611765, "learning_rate": 0.00010323484386489961, "loss": 0.7347, "step": 8430 }, { "epoch": 0.5410519563822963, "grad_norm": 0.8025281891388182, "learning_rate": 0.00010312294860660319, "loss": 0.5264, "step": 8435 }, { "epoch": 0.5413726747915331, "grad_norm": 0.9019250163215435, "learning_rate": 0.0001030110494342484, "loss": 0.5963, "step": 8440 }, { "epoch": 0.5416933932007697, "grad_norm": 0.6368675777184184, "learning_rate": 0.00010289914648808088, "loss": 0.5399, "step": 8445 }, { "epoch": 0.5420141116100065, "grad_norm": 0.8008826667949324, "learning_rate": 0.00010278723990835097, "loss": 0.7476, "step": 8450 }, { "epoch": 0.5423348300192431, "grad_norm": 0.7219125921723233, "learning_rate": 0.0001026753298353136, "loss": 0.5883, "step": 8455 }, { "epoch": 0.5426555484284797, "grad_norm": 0.6992313736984004, "learning_rate": 0.0001025634164092281, "loss": 0.5797, "step": 8460 }, { "epoch": 0.5429762668377165, "grad_norm": 0.44695714450265767, "learning_rate": 0.00010245149977035792, "loss": 0.6473, "step": 8465 }, { "epoch": 0.5432969852469531, "grad_norm": 1.248682759415961, "learning_rate": 0.00010233958005897058, "loss": 0.5812, "step": 8470 }, { "epoch": 0.5436177036561899, "grad_norm": 1.0568826134330056, "learning_rate": 0.00010222765741533744, "loss": 0.7862, "step": 8475 }, { "epoch": 0.5439384220654265, "grad_norm": 0.8116820280676993, "learning_rate": 0.00010211573197973356, "loss": 0.6353, "step": 8480 }, { "epoch": 0.5442591404746633, "grad_norm": 0.9997535811765578, "learning_rate": 0.00010200380389243753, "loss": 0.7229, "step": 8485 }, { "epoch": 0.5445798588838999, "grad_norm": 0.8261136419022004, "learning_rate": 0.00010189187329373113, "loss": 0.6919, "step": 8490 }, { "epoch": 0.5449005772931367, "grad_norm": 0.7977851457213406, "learning_rate": 0.00010177994032389946, "loss": 0.5777, "step": 8495 }, { "epoch": 0.5452212957023733, "grad_norm": 1.211421213402399, "learning_rate": 0.00010166800512323043, "loss": 0.6434, "step": 8500 }, { "epoch": 0.54554201411161, "grad_norm": 2.0722177427022244, "learning_rate": 0.00010155606783201488, "loss": 0.5933, "step": 8505 }, { "epoch": 0.5458627325208467, "grad_norm": 0.7874345109274467, "learning_rate": 0.00010144412859054617, "loss": 0.8209, "step": 8510 }, { "epoch": 0.5461834509300834, "grad_norm": 0.5164159774237933, "learning_rate": 0.00010133218753912023, "loss": 0.6337, "step": 8515 }, { "epoch": 0.5465041693393201, "grad_norm": 0.9997324723951748, "learning_rate": 0.00010122024481803509, "loss": 0.7799, "step": 8520 }, { "epoch": 0.5468248877485568, "grad_norm": 0.868379009704931, "learning_rate": 0.000101108300567591, "loss": 0.6205, "step": 8525 }, { "epoch": 0.5471456061577935, "grad_norm": 0.7487726179830052, "learning_rate": 0.00010099635492809007, "loss": 0.7024, "step": 8530 }, { "epoch": 0.5474663245670301, "grad_norm": 0.784320611343729, "learning_rate": 0.00010088440803983616, "loss": 0.765, "step": 8535 }, { "epoch": 0.5477870429762668, "grad_norm": 0.7657678123947386, "learning_rate": 0.00010077246004313472, "loss": 0.6496, "step": 8540 }, { "epoch": 0.5481077613855035, "grad_norm": 0.7225029829590283, "learning_rate": 0.00010066051107829259, "loss": 0.6885, "step": 8545 }, { "epoch": 0.5484284797947402, "grad_norm": 0.8979772778090884, "learning_rate": 0.00010054856128561778, "loss": 0.7111, "step": 8550 }, { "epoch": 0.5487491982039769, "grad_norm": 1.322201085524258, "learning_rate": 0.00010043661080541936, "loss": 0.6252, "step": 8555 }, { "epoch": 0.5490699166132136, "grad_norm": 0.6743113052462498, "learning_rate": 0.00010032465977800726, "loss": 0.5282, "step": 8560 }, { "epoch": 0.5493906350224503, "grad_norm": 0.8693068518513947, "learning_rate": 0.00010021270834369211, "loss": 0.6029, "step": 8565 }, { "epoch": 0.549711353431687, "grad_norm": 1.1870868813911406, "learning_rate": 0.00010010075664278507, "loss": 0.6264, "step": 8570 }, { "epoch": 0.5500320718409236, "grad_norm": 1.0567858782770287, "learning_rate": 9.998880481559755e-05, "loss": 0.8018, "step": 8575 }, { "epoch": 0.5503527902501604, "grad_norm": 0.8137731229847819, "learning_rate": 9.987685300244117e-05, "loss": 0.614, "step": 8580 }, { "epoch": 0.550673508659397, "grad_norm": 0.9599816781819811, "learning_rate": 9.976490134362759e-05, "loss": 0.687, "step": 8585 }, { "epoch": 0.5509942270686338, "grad_norm": 0.6181246421982609, "learning_rate": 9.965294997946815e-05, "loss": 0.6866, "step": 8590 }, { "epoch": 0.5513149454778704, "grad_norm": 1.1348648251746718, "learning_rate": 9.954099905027396e-05, "loss": 0.6416, "step": 8595 }, { "epoch": 0.5516356638871072, "grad_norm": 1.6639502602729528, "learning_rate": 9.94290486963555e-05, "loss": 0.6715, "step": 8600 }, { "epoch": 0.5519563822963438, "grad_norm": 0.7678034571145345, "learning_rate": 9.931709905802252e-05, "loss": 0.6886, "step": 8605 }, { "epoch": 0.5522771007055804, "grad_norm": 1.4578465770643851, "learning_rate": 9.92051502755839e-05, "loss": 0.7689, "step": 8610 }, { "epoch": 0.5525978191148172, "grad_norm": 0.7434972557340698, "learning_rate": 9.909320248934747e-05, "loss": 0.6374, "step": 8615 }, { "epoch": 0.5529185375240538, "grad_norm": 0.8031136082718469, "learning_rate": 9.898125583961977e-05, "loss": 0.7055, "step": 8620 }, { "epoch": 0.5532392559332906, "grad_norm": 1.000878821455057, "learning_rate": 9.886931046670598e-05, "loss": 0.6157, "step": 8625 }, { "epoch": 0.5535599743425272, "grad_norm": 0.6524291495733984, "learning_rate": 9.875736651090956e-05, "loss": 0.561, "step": 8630 }, { "epoch": 0.553880692751764, "grad_norm": 1.3537142167105929, "learning_rate": 9.864542411253229e-05, "loss": 0.6718, "step": 8635 }, { "epoch": 0.5542014111610006, "grad_norm": 1.2775573591627376, "learning_rate": 9.853348341187398e-05, "loss": 0.6645, "step": 8640 }, { "epoch": 0.5545221295702373, "grad_norm": 0.982975595575632, "learning_rate": 9.842154454923236e-05, "loss": 0.5919, "step": 8645 }, { "epoch": 0.554842847979474, "grad_norm": 0.960094691754927, "learning_rate": 9.830960766490274e-05, "loss": 0.8113, "step": 8650 }, { "epoch": 0.5551635663887107, "grad_norm": 0.7965375300164668, "learning_rate": 9.819767289917802e-05, "loss": 0.5782, "step": 8655 }, { "epoch": 0.5554842847979474, "grad_norm": 1.1381902966011452, "learning_rate": 9.808574039234843e-05, "loss": 0.6242, "step": 8660 }, { "epoch": 0.5558050032071841, "grad_norm": 0.8670424286605721, "learning_rate": 9.79738102847014e-05, "loss": 0.7355, "step": 8665 }, { "epoch": 0.5561257216164208, "grad_norm": 0.8366621626207873, "learning_rate": 9.786188271652133e-05, "loss": 0.5744, "step": 8670 }, { "epoch": 0.5564464400256575, "grad_norm": 0.8273685386138488, "learning_rate": 9.774995782808943e-05, "loss": 0.6414, "step": 8675 }, { "epoch": 0.5567671584348942, "grad_norm": 0.9522831235441542, "learning_rate": 9.763803575968357e-05, "loss": 0.7632, "step": 8680 }, { "epoch": 0.5570878768441309, "grad_norm": 0.75372169303836, "learning_rate": 9.752611665157807e-05, "loss": 0.6433, "step": 8685 }, { "epoch": 0.5574085952533675, "grad_norm": 1.2109886710417286, "learning_rate": 9.741420064404353e-05, "loss": 0.63, "step": 8690 }, { "epoch": 0.5577293136626043, "grad_norm": 0.5400874445069787, "learning_rate": 9.730228787734669e-05, "loss": 0.6789, "step": 8695 }, { "epoch": 0.5580500320718409, "grad_norm": 0.7989657543785353, "learning_rate": 9.719037849175023e-05, "loss": 0.7407, "step": 8700 }, { "epoch": 0.5583707504810776, "grad_norm": 0.7239899818926174, "learning_rate": 9.707847262751257e-05, "loss": 0.6029, "step": 8705 }, { "epoch": 0.5586914688903143, "grad_norm": 1.1080694844841645, "learning_rate": 9.696657042488774e-05, "loss": 0.6841, "step": 8710 }, { "epoch": 0.559012187299551, "grad_norm": 0.8668620206006121, "learning_rate": 9.685467202412514e-05, "loss": 0.8091, "step": 8715 }, { "epoch": 0.5593329057087877, "grad_norm": 0.8263012333520392, "learning_rate": 9.674277756546941e-05, "loss": 0.5612, "step": 8720 }, { "epoch": 0.5596536241180243, "grad_norm": 1.2272663628925047, "learning_rate": 9.663088718916031e-05, "loss": 0.6214, "step": 8725 }, { "epoch": 0.5599743425272611, "grad_norm": 0.9766333412497376, "learning_rate": 9.651900103543244e-05, "loss": 0.7342, "step": 8730 }, { "epoch": 0.5602950609364977, "grad_norm": 0.830624516454487, "learning_rate": 9.640711924451514e-05, "loss": 0.6718, "step": 8735 }, { "epoch": 0.5606157793457345, "grad_norm": 0.4675831817637492, "learning_rate": 9.629524195663219e-05, "loss": 0.6039, "step": 8740 }, { "epoch": 0.5609364977549711, "grad_norm": 0.6634840466913374, "learning_rate": 9.618336931200182e-05, "loss": 0.5964, "step": 8745 }, { "epoch": 0.5612572161642079, "grad_norm": 0.9976406641974719, "learning_rate": 9.607150145083642e-05, "loss": 0.7166, "step": 8750 }, { "epoch": 0.5615779345734445, "grad_norm": 0.9545013096296738, "learning_rate": 9.595963851334237e-05, "loss": 0.689, "step": 8755 }, { "epoch": 0.5618986529826812, "grad_norm": 0.9634333696652287, "learning_rate": 9.58477806397199e-05, "loss": 0.8048, "step": 8760 }, { "epoch": 0.5622193713919179, "grad_norm": 0.8057551483876174, "learning_rate": 9.573592797016285e-05, "loss": 0.672, "step": 8765 }, { "epoch": 0.5625400898011546, "grad_norm": 1.0000169919459303, "learning_rate": 9.562408064485858e-05, "loss": 0.656, "step": 8770 }, { "epoch": 0.5628608082103913, "grad_norm": 1.0059598561012926, "learning_rate": 9.551223880398778e-05, "loss": 0.6689, "step": 8775 }, { "epoch": 0.563181526619628, "grad_norm": 0.7089352756337184, "learning_rate": 9.540040258772413e-05, "loss": 0.6104, "step": 8780 }, { "epoch": 0.5635022450288647, "grad_norm": 0.9673260454868421, "learning_rate": 9.528857213623441e-05, "loss": 0.625, "step": 8785 }, { "epoch": 0.5638229634381013, "grad_norm": 0.8425769011906392, "learning_rate": 9.517674758967812e-05, "loss": 0.6385, "step": 8790 }, { "epoch": 0.564143681847338, "grad_norm": 0.8483079594314462, "learning_rate": 9.506492908820737e-05, "loss": 0.7091, "step": 8795 }, { "epoch": 0.5644644002565747, "grad_norm": 1.1949041204777606, "learning_rate": 9.495311677196663e-05, "loss": 0.5583, "step": 8800 }, { "epoch": 0.5647851186658114, "grad_norm": 1.1203988658358368, "learning_rate": 9.484131078109272e-05, "loss": 0.6491, "step": 8805 }, { "epoch": 0.5651058370750481, "grad_norm": 0.7171168814679133, "learning_rate": 9.472951125571447e-05, "loss": 0.5704, "step": 8810 }, { "epoch": 0.5654265554842848, "grad_norm": 0.43705154049643696, "learning_rate": 9.461771833595263e-05, "loss": 0.6235, "step": 8815 }, { "epoch": 0.5657472738935215, "grad_norm": 0.5972509611997564, "learning_rate": 9.450593216191962e-05, "loss": 0.6011, "step": 8820 }, { "epoch": 0.5660679923027582, "grad_norm": 0.6585353171844711, "learning_rate": 9.439415287371949e-05, "loss": 0.6338, "step": 8825 }, { "epoch": 0.5663887107119948, "grad_norm": 1.182861072860639, "learning_rate": 9.42823806114476e-05, "loss": 0.6286, "step": 8830 }, { "epoch": 0.5667094291212316, "grad_norm": 0.774985192783614, "learning_rate": 9.417061551519051e-05, "loss": 0.6362, "step": 8835 }, { "epoch": 0.5670301475304682, "grad_norm": 1.6279736397998856, "learning_rate": 9.405885772502582e-05, "loss": 0.5434, "step": 8840 }, { "epoch": 0.567350865939705, "grad_norm": 0.8603999240784707, "learning_rate": 9.394710738102198e-05, "loss": 0.7135, "step": 8845 }, { "epoch": 0.5676715843489416, "grad_norm": 0.8326631481896093, "learning_rate": 9.383536462323807e-05, "loss": 0.6316, "step": 8850 }, { "epoch": 0.5679923027581784, "grad_norm": 1.1396992210320314, "learning_rate": 9.372362959172364e-05, "loss": 0.6325, "step": 8855 }, { "epoch": 0.568313021167415, "grad_norm": 0.6117345152175109, "learning_rate": 9.361190242651864e-05, "loss": 0.6159, "step": 8860 }, { "epoch": 0.5686337395766518, "grad_norm": 0.9306563316596532, "learning_rate": 9.350018326765311e-05, "loss": 0.6533, "step": 8865 }, { "epoch": 0.5689544579858884, "grad_norm": 0.8930767778362739, "learning_rate": 9.338847225514708e-05, "loss": 0.6675, "step": 8870 }, { "epoch": 0.569275176395125, "grad_norm": 0.4141144493955828, "learning_rate": 9.327676952901034e-05, "loss": 0.5957, "step": 8875 }, { "epoch": 0.5695958948043618, "grad_norm": 0.8888417335481001, "learning_rate": 9.31650752292423e-05, "loss": 0.5665, "step": 8880 }, { "epoch": 0.5699166132135984, "grad_norm": 0.7603252238964692, "learning_rate": 9.305338949583183e-05, "loss": 0.6428, "step": 8885 }, { "epoch": 0.5702373316228352, "grad_norm": 1.271342150118716, "learning_rate": 9.294171246875705e-05, "loss": 0.7219, "step": 8890 }, { "epoch": 0.5705580500320718, "grad_norm": 0.9447555346689784, "learning_rate": 9.283004428798519e-05, "loss": 0.6965, "step": 8895 }, { "epoch": 0.5708787684413086, "grad_norm": 0.8678646764049435, "learning_rate": 9.271838509347233e-05, "loss": 0.7673, "step": 8900 }, { "epoch": 0.5711994868505452, "grad_norm": 0.7416908587434721, "learning_rate": 9.260673502516333e-05, "loss": 0.6081, "step": 8905 }, { "epoch": 0.5715202052597819, "grad_norm": 0.939422337464896, "learning_rate": 9.24950942229917e-05, "loss": 0.6721, "step": 8910 }, { "epoch": 0.5718409236690186, "grad_norm": 0.8506289909429936, "learning_rate": 9.238346282687912e-05, "loss": 0.7379, "step": 8915 }, { "epoch": 0.5721616420782553, "grad_norm": 1.3927657753594376, "learning_rate": 9.227184097673566e-05, "loss": 0.7231, "step": 8920 }, { "epoch": 0.572482360487492, "grad_norm": 0.6002814159409026, "learning_rate": 9.21602288124594e-05, "loss": 0.8172, "step": 8925 }, { "epoch": 0.5728030788967287, "grad_norm": 0.7935777728563393, "learning_rate": 9.204862647393625e-05, "loss": 0.8086, "step": 8930 }, { "epoch": 0.5731237973059654, "grad_norm": 1.0397353291637284, "learning_rate": 9.193703410103978e-05, "loss": 0.6631, "step": 8935 }, { "epoch": 0.573444515715202, "grad_norm": 0.8367031156015087, "learning_rate": 9.182545183363112e-05, "loss": 0.5788, "step": 8940 }, { "epoch": 0.5737652341244387, "grad_norm": 1.2325263908639137, "learning_rate": 9.17138798115587e-05, "loss": 0.7789, "step": 8945 }, { "epoch": 0.5740859525336754, "grad_norm": 0.9464147249819552, "learning_rate": 9.160231817465815e-05, "loss": 0.5279, "step": 8950 }, { "epoch": 0.5744066709429121, "grad_norm": 0.8158486660018726, "learning_rate": 9.149076706275207e-05, "loss": 0.7098, "step": 8955 }, { "epoch": 0.5747273893521488, "grad_norm": 0.7825563949372556, "learning_rate": 9.137922661564981e-05, "loss": 0.6993, "step": 8960 }, { "epoch": 0.5750481077613855, "grad_norm": 0.9955286924734048, "learning_rate": 9.126769697314741e-05, "loss": 0.6668, "step": 8965 }, { "epoch": 0.5753688261706222, "grad_norm": 0.987888018064567, "learning_rate": 9.11561782750274e-05, "loss": 0.7683, "step": 8970 }, { "epoch": 0.5756895445798589, "grad_norm": 0.9029264976754006, "learning_rate": 9.104467066105855e-05, "loss": 0.5976, "step": 8975 }, { "epoch": 0.5760102629890955, "grad_norm": 1.2083151109064707, "learning_rate": 9.093317427099567e-05, "loss": 0.7444, "step": 8980 }, { "epoch": 0.5763309813983323, "grad_norm": 0.627708721729255, "learning_rate": 9.082168924457963e-05, "loss": 0.5052, "step": 8985 }, { "epoch": 0.5766516998075689, "grad_norm": 0.818341174384118, "learning_rate": 9.071021572153699e-05, "loss": 0.6956, "step": 8990 }, { "epoch": 0.5769724182168057, "grad_norm": 0.7174427987431503, "learning_rate": 9.05987538415799e-05, "loss": 0.6537, "step": 8995 }, { "epoch": 0.5772931366260423, "grad_norm": 1.0123101523225277, "learning_rate": 9.048730374440593e-05, "loss": 0.6298, "step": 9000 }, { "epoch": 0.5776138550352791, "grad_norm": 1.4927380842347644, "learning_rate": 9.037586556969785e-05, "loss": 0.7866, "step": 9005 }, { "epoch": 0.5779345734445157, "grad_norm": 1.1107550009988214, "learning_rate": 9.026443945712355e-05, "loss": 0.5272, "step": 9010 }, { "epoch": 0.5782552918537524, "grad_norm": 1.042711051305287, "learning_rate": 9.015302554633572e-05, "loss": 0.6862, "step": 9015 }, { "epoch": 0.5785760102629891, "grad_norm": 1.097565575641477, "learning_rate": 9.004162397697183e-05, "loss": 0.6653, "step": 9020 }, { "epoch": 0.5788967286722257, "grad_norm": 0.7962187563904711, "learning_rate": 8.993023488865384e-05, "loss": 0.7807, "step": 9025 }, { "epoch": 0.5792174470814625, "grad_norm": 0.8018799159927662, "learning_rate": 8.981885842098807e-05, "loss": 0.6755, "step": 9030 }, { "epoch": 0.5795381654906991, "grad_norm": 1.0103385936451423, "learning_rate": 8.970749471356508e-05, "loss": 0.7498, "step": 9035 }, { "epoch": 0.5798588838999359, "grad_norm": 0.8540199269462798, "learning_rate": 8.959614390595933e-05, "loss": 0.7041, "step": 9040 }, { "epoch": 0.5801796023091725, "grad_norm": 1.1040345444470279, "learning_rate": 8.948480613772923e-05, "loss": 0.5949, "step": 9045 }, { "epoch": 0.5805003207184093, "grad_norm": 1.0463417093934197, "learning_rate": 8.93734815484167e-05, "loss": 0.6716, "step": 9050 }, { "epoch": 0.5808210391276459, "grad_norm": 0.9338670777982941, "learning_rate": 8.92621702775473e-05, "loss": 0.652, "step": 9055 }, { "epoch": 0.5811417575368826, "grad_norm": 0.8605449857576016, "learning_rate": 8.915087246462981e-05, "loss": 0.6335, "step": 9060 }, { "epoch": 0.5814624759461193, "grad_norm": 0.9482034036580209, "learning_rate": 8.903958824915616e-05, "loss": 0.7407, "step": 9065 }, { "epoch": 0.581783194355356, "grad_norm": 0.9120660938985135, "learning_rate": 8.892831777060128e-05, "loss": 0.714, "step": 9070 }, { "epoch": 0.5821039127645927, "grad_norm": 0.7546853050581628, "learning_rate": 8.881706116842277e-05, "loss": 0.6643, "step": 9075 }, { "epoch": 0.5824246311738294, "grad_norm": 0.7217266514190624, "learning_rate": 8.870581858206097e-05, "loss": 0.6232, "step": 9080 }, { "epoch": 0.5827453495830661, "grad_norm": 0.8122719551725256, "learning_rate": 8.859459015093856e-05, "loss": 0.753, "step": 9085 }, { "epoch": 0.5830660679923028, "grad_norm": 0.6978194557670415, "learning_rate": 8.848337601446056e-05, "loss": 0.592, "step": 9090 }, { "epoch": 0.5833867864015394, "grad_norm": 0.7490982355447477, "learning_rate": 8.8372176312014e-05, "loss": 0.6739, "step": 9095 }, { "epoch": 0.5837075048107762, "grad_norm": 1.074058776492988, "learning_rate": 8.826099118296781e-05, "loss": 0.6831, "step": 9100 }, { "epoch": 0.5840282232200128, "grad_norm": 0.7986527171477741, "learning_rate": 8.814982076667274e-05, "loss": 0.6572, "step": 9105 }, { "epoch": 0.5843489416292496, "grad_norm": 0.9594556597631692, "learning_rate": 8.803866520246111e-05, "loss": 0.6968, "step": 9110 }, { "epoch": 0.5846696600384862, "grad_norm": 0.8185832555992929, "learning_rate": 8.792752462964643e-05, "loss": 0.6396, "step": 9115 }, { "epoch": 0.584990378447723, "grad_norm": 0.830230327348044, "learning_rate": 8.781639918752364e-05, "loss": 0.6288, "step": 9120 }, { "epoch": 0.5853110968569596, "grad_norm": 1.260466190111766, "learning_rate": 8.770528901536866e-05, "loss": 0.6248, "step": 9125 }, { "epoch": 0.5856318152661962, "grad_norm": 0.7805742440541377, "learning_rate": 8.75941942524382e-05, "loss": 0.726, "step": 9130 }, { "epoch": 0.585952533675433, "grad_norm": 1.0612454515173708, "learning_rate": 8.748311503796971e-05, "loss": 0.6807, "step": 9135 }, { "epoch": 0.5862732520846696, "grad_norm": 0.8808610696974422, "learning_rate": 8.737205151118115e-05, "loss": 0.7349, "step": 9140 }, { "epoch": 0.5865939704939064, "grad_norm": 0.8397400084374878, "learning_rate": 8.726100381127084e-05, "loss": 0.677, "step": 9145 }, { "epoch": 0.586914688903143, "grad_norm": 1.3081126728734789, "learning_rate": 8.714997207741725e-05, "loss": 0.7485, "step": 9150 }, { "epoch": 0.5872354073123798, "grad_norm": 0.23647447615753048, "learning_rate": 8.703895644877877e-05, "loss": 0.5389, "step": 9155 }, { "epoch": 0.5875561257216164, "grad_norm": 1.0035423360368345, "learning_rate": 8.692795706449371e-05, "loss": 0.6547, "step": 9160 }, { "epoch": 0.5878768441308531, "grad_norm": 0.7176089252240778, "learning_rate": 8.681697406367997e-05, "loss": 0.6607, "step": 9165 }, { "epoch": 0.5881975625400898, "grad_norm": 0.8342266954014463, "learning_rate": 8.670600758543492e-05, "loss": 0.6957, "step": 9170 }, { "epoch": 0.5885182809493265, "grad_norm": 0.9577059909314858, "learning_rate": 8.659505776883523e-05, "loss": 0.7079, "step": 9175 }, { "epoch": 0.5888389993585632, "grad_norm": 0.5591665135253571, "learning_rate": 8.648412475293667e-05, "loss": 0.4696, "step": 9180 }, { "epoch": 0.5891597177677999, "grad_norm": 0.6612061534246185, "learning_rate": 8.637320867677395e-05, "loss": 0.8161, "step": 9185 }, { "epoch": 0.5894804361770366, "grad_norm": 0.7364614135023326, "learning_rate": 8.626230967936056e-05, "loss": 0.584, "step": 9190 }, { "epoch": 0.5898011545862732, "grad_norm": 1.1805347583614008, "learning_rate": 8.615142789968862e-05, "loss": 0.6749, "step": 9195 }, { "epoch": 0.5901218729955099, "grad_norm": 0.8670374427365669, "learning_rate": 8.604056347672862e-05, "loss": 0.6273, "step": 9200 }, { "epoch": 0.5904425914047466, "grad_norm": 0.9304848686764007, "learning_rate": 8.592971654942934e-05, "loss": 0.7438, "step": 9205 }, { "epoch": 0.5907633098139833, "grad_norm": 0.9747134027393929, "learning_rate": 8.581888725671756e-05, "loss": 0.6131, "step": 9210 }, { "epoch": 0.59108402822322, "grad_norm": 1.0129060114876993, "learning_rate": 8.570807573749803e-05, "loss": 0.7444, "step": 9215 }, { "epoch": 0.5914047466324567, "grad_norm": 0.860206331729887, "learning_rate": 8.559728213065322e-05, "loss": 0.71, "step": 9220 }, { "epoch": 0.5917254650416934, "grad_norm": 0.9817359438145173, "learning_rate": 8.548650657504312e-05, "loss": 0.6491, "step": 9225 }, { "epoch": 0.5920461834509301, "grad_norm": 0.7544658228792815, "learning_rate": 8.537574920950509e-05, "loss": 0.6348, "step": 9230 }, { "epoch": 0.5923669018601668, "grad_norm": 0.7630242666798073, "learning_rate": 8.526501017285371e-05, "loss": 0.6261, "step": 9235 }, { "epoch": 0.5926876202694035, "grad_norm": 0.9267179536684838, "learning_rate": 8.515428960388064e-05, "loss": 0.8258, "step": 9240 }, { "epoch": 0.5930083386786401, "grad_norm": 0.6784696630153367, "learning_rate": 8.504358764135423e-05, "loss": 0.707, "step": 9245 }, { "epoch": 0.5933290570878769, "grad_norm": 0.6689426887073786, "learning_rate": 8.49329044240197e-05, "loss": 0.751, "step": 9250 }, { "epoch": 0.5936497754971135, "grad_norm": 1.0074921827758931, "learning_rate": 8.482224009059867e-05, "loss": 0.7213, "step": 9255 }, { "epoch": 0.5939704939063503, "grad_norm": 0.6037825152713899, "learning_rate": 8.471159477978915e-05, "loss": 0.621, "step": 9260 }, { "epoch": 0.5942912123155869, "grad_norm": 0.6325399857778463, "learning_rate": 8.460096863026523e-05, "loss": 0.6925, "step": 9265 }, { "epoch": 0.5946119307248237, "grad_norm": 0.9785164961672185, "learning_rate": 8.449036178067706e-05, "loss": 0.7721, "step": 9270 }, { "epoch": 0.5949326491340603, "grad_norm": 0.8071126693831758, "learning_rate": 8.437977436965057e-05, "loss": 0.5628, "step": 9275 }, { "epoch": 0.5952533675432969, "grad_norm": 1.093008483996882, "learning_rate": 8.426920653578731e-05, "loss": 0.5135, "step": 9280 }, { "epoch": 0.5955740859525337, "grad_norm": 0.7334552943764545, "learning_rate": 8.415865841766437e-05, "loss": 0.6418, "step": 9285 }, { "epoch": 0.5958948043617703, "grad_norm": 0.9720157753455849, "learning_rate": 8.404813015383402e-05, "loss": 0.6855, "step": 9290 }, { "epoch": 0.5962155227710071, "grad_norm": 0.7988660585883463, "learning_rate": 8.39376218828237e-05, "loss": 0.5753, "step": 9295 }, { "epoch": 0.5965362411802437, "grad_norm": 1.1413457984041735, "learning_rate": 8.382713374313582e-05, "loss": 0.6003, "step": 9300 }, { "epoch": 0.5968569595894805, "grad_norm": 1.1011093623211472, "learning_rate": 8.371666587324753e-05, "loss": 0.7294, "step": 9305 }, { "epoch": 0.5971776779987171, "grad_norm": 0.9285733358885891, "learning_rate": 8.360621841161059e-05, "loss": 0.5484, "step": 9310 }, { "epoch": 0.5974983964079538, "grad_norm": 0.6748939404643401, "learning_rate": 8.349579149665111e-05, "loss": 0.6096, "step": 9315 }, { "epoch": 0.5978191148171905, "grad_norm": 0.9020042133223751, "learning_rate": 8.338538526676955e-05, "loss": 0.6025, "step": 9320 }, { "epoch": 0.5981398332264272, "grad_norm": 0.9270397135681554, "learning_rate": 8.32749998603404e-05, "loss": 0.7169, "step": 9325 }, { "epoch": 0.5984605516356639, "grad_norm": 0.9890377973574781, "learning_rate": 8.316463541571202e-05, "loss": 0.6308, "step": 9330 }, { "epoch": 0.5987812700449006, "grad_norm": 0.9865556224427305, "learning_rate": 8.305429207120657e-05, "loss": 0.6582, "step": 9335 }, { "epoch": 0.5991019884541373, "grad_norm": 0.7178728991086797, "learning_rate": 8.294396996511973e-05, "loss": 0.6433, "step": 9340 }, { "epoch": 0.599422706863374, "grad_norm": 0.9285152964545721, "learning_rate": 8.283366923572054e-05, "loss": 0.548, "step": 9345 }, { "epoch": 0.5997434252726106, "grad_norm": 1.0943546547273215, "learning_rate": 8.272339002125126e-05, "loss": 0.5401, "step": 9350 }, { "epoch": 0.6000641436818474, "grad_norm": 1.0722476752693422, "learning_rate": 8.261313245992719e-05, "loss": 0.7496, "step": 9355 }, { "epoch": 0.600384862091084, "grad_norm": 0.7239338874930329, "learning_rate": 8.250289668993651e-05, "loss": 0.6294, "step": 9360 }, { "epoch": 0.6007055805003207, "grad_norm": 0.8162856731878313, "learning_rate": 8.239268284944008e-05, "loss": 0.784, "step": 9365 }, { "epoch": 0.6010262989095574, "grad_norm": 0.8529031580797097, "learning_rate": 8.228249107657125e-05, "loss": 0.7338, "step": 9370 }, { "epoch": 0.6013470173187941, "grad_norm": 0.914197482847494, "learning_rate": 8.217232150943575e-05, "loss": 0.6738, "step": 9375 }, { "epoch": 0.6016677357280308, "grad_norm": 0.561817894827455, "learning_rate": 8.20621742861114e-05, "loss": 0.4924, "step": 9380 }, { "epoch": 0.6019884541372674, "grad_norm": 0.8679917658001024, "learning_rate": 8.19520495446481e-05, "loss": 0.8074, "step": 9385 }, { "epoch": 0.6023091725465042, "grad_norm": 1.0120069230072926, "learning_rate": 8.184194742306756e-05, "loss": 0.7112, "step": 9390 }, { "epoch": 0.6026298909557408, "grad_norm": 0.7356825859409829, "learning_rate": 8.173186805936313e-05, "loss": 0.6514, "step": 9395 }, { "epoch": 0.6029506093649776, "grad_norm": 0.7794340302339006, "learning_rate": 8.162181159149964e-05, "loss": 0.7748, "step": 9400 }, { "epoch": 0.6032713277742142, "grad_norm": 0.9190740265202144, "learning_rate": 8.151177815741318e-05, "loss": 0.6399, "step": 9405 }, { "epoch": 0.603592046183451, "grad_norm": 1.1526131658530894, "learning_rate": 8.140176789501102e-05, "loss": 0.7519, "step": 9410 }, { "epoch": 0.6039127645926876, "grad_norm": 0.8970675006265497, "learning_rate": 8.129178094217141e-05, "loss": 0.7025, "step": 9415 }, { "epoch": 0.6042334830019244, "grad_norm": 1.16563982635486, "learning_rate": 8.118181743674334e-05, "loss": 0.6515, "step": 9420 }, { "epoch": 0.604554201411161, "grad_norm": 1.009328430894082, "learning_rate": 8.107187751654642e-05, "loss": 0.8061, "step": 9425 }, { "epoch": 0.6048749198203976, "grad_norm": 0.6431656020123224, "learning_rate": 8.096196131937068e-05, "loss": 0.7703, "step": 9430 }, { "epoch": 0.6051956382296344, "grad_norm": 0.8022392814347792, "learning_rate": 8.085206898297648e-05, "loss": 0.4945, "step": 9435 }, { "epoch": 0.605516356638871, "grad_norm": 0.8590402951031166, "learning_rate": 8.074220064509428e-05, "loss": 0.577, "step": 9440 }, { "epoch": 0.6058370750481078, "grad_norm": 0.6529036302559359, "learning_rate": 8.06323564434243e-05, "loss": 0.6972, "step": 9445 }, { "epoch": 0.6061577934573444, "grad_norm": 0.9053770255851836, "learning_rate": 8.052253651563671e-05, "loss": 0.6241, "step": 9450 }, { "epoch": 0.6064785118665812, "grad_norm": 0.6968143227671041, "learning_rate": 8.04127409993712e-05, "loss": 0.7196, "step": 9455 }, { "epoch": 0.6067992302758178, "grad_norm": 0.7907742358273027, "learning_rate": 8.030297003223676e-05, "loss": 0.6535, "step": 9460 }, { "epoch": 0.6071199486850545, "grad_norm": 0.9043816519851674, "learning_rate": 8.019322375181175e-05, "loss": 0.7183, "step": 9465 }, { "epoch": 0.6074406670942912, "grad_norm": 0.8583282541776323, "learning_rate": 8.008350229564351e-05, "loss": 0.7373, "step": 9470 }, { "epoch": 0.6077613855035279, "grad_norm": 1.1639398571753123, "learning_rate": 7.997380580124832e-05, "loss": 0.6619, "step": 9475 }, { "epoch": 0.6080821039127646, "grad_norm": 0.7363838290393571, "learning_rate": 7.986413440611115e-05, "loss": 0.5238, "step": 9480 }, { "epoch": 0.6084028223220013, "grad_norm": 0.7361031316329811, "learning_rate": 7.975448824768546e-05, "loss": 0.7093, "step": 9485 }, { "epoch": 0.608723540731238, "grad_norm": 0.8655976177215603, "learning_rate": 7.964486746339315e-05, "loss": 0.6699, "step": 9490 }, { "epoch": 0.6090442591404747, "grad_norm": 0.7757949116609816, "learning_rate": 7.95352721906243e-05, "loss": 0.6457, "step": 9495 }, { "epoch": 0.6093649775497113, "grad_norm": 1.0532442121286478, "learning_rate": 7.942570256673704e-05, "loss": 0.8266, "step": 9500 }, { "epoch": 0.6096856959589481, "grad_norm": 0.8097807634079536, "learning_rate": 7.931615872905727e-05, "loss": 0.6542, "step": 9505 }, { "epoch": 0.6100064143681847, "grad_norm": 1.170352424739306, "learning_rate": 7.92066408148787e-05, "loss": 0.6511, "step": 9510 }, { "epoch": 0.6103271327774215, "grad_norm": 0.6465117473629731, "learning_rate": 7.909714896146239e-05, "loss": 0.6102, "step": 9515 }, { "epoch": 0.6106478511866581, "grad_norm": 0.9562444288916828, "learning_rate": 7.898768330603687e-05, "loss": 0.7281, "step": 9520 }, { "epoch": 0.6109685695958949, "grad_norm": 0.48629635257867143, "learning_rate": 7.887824398579778e-05, "loss": 0.5576, "step": 9525 }, { "epoch": 0.6112892880051315, "grad_norm": 0.6187174821618042, "learning_rate": 7.876883113790777e-05, "loss": 0.4536, "step": 9530 }, { "epoch": 0.6116100064143681, "grad_norm": 0.8491363897597337, "learning_rate": 7.865944489949632e-05, "loss": 0.5082, "step": 9535 }, { "epoch": 0.6119307248236049, "grad_norm": 0.9489825766872471, "learning_rate": 7.855008540765954e-05, "loss": 0.8288, "step": 9540 }, { "epoch": 0.6122514432328415, "grad_norm": 0.8247180962617905, "learning_rate": 7.844075279945998e-05, "loss": 0.7947, "step": 9545 }, { "epoch": 0.6125721616420783, "grad_norm": 0.8487499152582451, "learning_rate": 7.833144721192658e-05, "loss": 0.4836, "step": 9550 }, { "epoch": 0.6128928800513149, "grad_norm": 1.4749421151082263, "learning_rate": 7.822216878205437e-05, "loss": 0.6604, "step": 9555 }, { "epoch": 0.6132135984605517, "grad_norm": 0.6439839118081867, "learning_rate": 7.811291764680436e-05, "loss": 0.5311, "step": 9560 }, { "epoch": 0.6135343168697883, "grad_norm": 0.6948565188236483, "learning_rate": 7.800369394310329e-05, "loss": 0.7818, "step": 9565 }, { "epoch": 0.613855035279025, "grad_norm": 0.5432098551962209, "learning_rate": 7.789449780784361e-05, "loss": 0.4817, "step": 9570 }, { "epoch": 0.6141757536882617, "grad_norm": 0.8116998264643036, "learning_rate": 7.778532937788319e-05, "loss": 0.6809, "step": 9575 }, { "epoch": 0.6144964720974984, "grad_norm": 0.927156766210116, "learning_rate": 7.767618879004509e-05, "loss": 0.6117, "step": 9580 }, { "epoch": 0.6148171905067351, "grad_norm": 0.5580255415813408, "learning_rate": 7.756707618111758e-05, "loss": 0.5121, "step": 9585 }, { "epoch": 0.6151379089159718, "grad_norm": 0.7697324881673694, "learning_rate": 7.745799168785387e-05, "loss": 0.7019, "step": 9590 }, { "epoch": 0.6154586273252085, "grad_norm": 1.2533080746391783, "learning_rate": 7.734893544697182e-05, "loss": 0.6921, "step": 9595 }, { "epoch": 0.6157793457344451, "grad_norm": 0.8591968885866408, "learning_rate": 7.723990759515399e-05, "loss": 0.6234, "step": 9600 }, { "epoch": 0.6161000641436819, "grad_norm": 0.8144982447654572, "learning_rate": 7.713090826904732e-05, "loss": 0.6175, "step": 9605 }, { "epoch": 0.6164207825529185, "grad_norm": 0.7852604055969639, "learning_rate": 7.702193760526301e-05, "loss": 0.538, "step": 9610 }, { "epoch": 0.6167415009621552, "grad_norm": 0.82507022800839, "learning_rate": 7.691299574037633e-05, "loss": 0.5858, "step": 9615 }, { "epoch": 0.6170622193713919, "grad_norm": 0.8977703001606776, "learning_rate": 7.68040828109264e-05, "loss": 0.6686, "step": 9620 }, { "epoch": 0.6173829377806286, "grad_norm": 0.7575641120784353, "learning_rate": 7.669519895341618e-05, "loss": 0.6733, "step": 9625 }, { "epoch": 0.6177036561898653, "grad_norm": 0.7782783108716851, "learning_rate": 7.658634430431211e-05, "loss": 0.6113, "step": 9630 }, { "epoch": 0.618024374599102, "grad_norm": 0.8737688527317737, "learning_rate": 7.647751900004408e-05, "loss": 0.7703, "step": 9635 }, { "epoch": 0.6183450930083387, "grad_norm": 0.7163537021531532, "learning_rate": 7.63687231770052e-05, "loss": 0.6687, "step": 9640 }, { "epoch": 0.6186658114175754, "grad_norm": 0.7383194119362961, "learning_rate": 7.625995697155153e-05, "loss": 0.7192, "step": 9645 }, { "epoch": 0.618986529826812, "grad_norm": 0.7818780084969111, "learning_rate": 7.615122052000212e-05, "loss": 0.4781, "step": 9650 }, { "epoch": 0.6193072482360488, "grad_norm": 0.9549919791876611, "learning_rate": 7.604251395863868e-05, "loss": 0.5972, "step": 9655 }, { "epoch": 0.6196279666452854, "grad_norm": 0.9266947067171263, "learning_rate": 7.593383742370547e-05, "loss": 0.7661, "step": 9660 }, { "epoch": 0.6199486850545222, "grad_norm": 0.7815262374564014, "learning_rate": 7.582519105140915e-05, "loss": 0.844, "step": 9665 }, { "epoch": 0.6202694034637588, "grad_norm": 0.9851958882202488, "learning_rate": 7.571657497791855e-05, "loss": 0.6573, "step": 9670 }, { "epoch": 0.6205901218729956, "grad_norm": 0.863915136317819, "learning_rate": 7.560798933936446e-05, "loss": 0.6965, "step": 9675 }, { "epoch": 0.6209108402822322, "grad_norm": 0.8169772635721835, "learning_rate": 7.549943427183963e-05, "loss": 0.6739, "step": 9680 }, { "epoch": 0.6212315586914688, "grad_norm": 0.9621597430987586, "learning_rate": 7.539090991139843e-05, "loss": 0.7107, "step": 9685 }, { "epoch": 0.6215522771007056, "grad_norm": 1.1682951488621962, "learning_rate": 7.52824163940568e-05, "loss": 0.7016, "step": 9690 }, { "epoch": 0.6218729955099422, "grad_norm": 0.5988705115634277, "learning_rate": 7.517395385579198e-05, "loss": 0.5883, "step": 9695 }, { "epoch": 0.622193713919179, "grad_norm": 0.6405875029114282, "learning_rate": 7.506552243254235e-05, "loss": 0.5632, "step": 9700 }, { "epoch": 0.6225144323284156, "grad_norm": 0.9039124102611747, "learning_rate": 7.49571222602074e-05, "loss": 0.5569, "step": 9705 }, { "epoch": 0.6228351507376524, "grad_norm": 1.1918655890149419, "learning_rate": 7.484875347464731e-05, "loss": 0.755, "step": 9710 }, { "epoch": 0.623155869146889, "grad_norm": 2.014073968409583, "learning_rate": 7.474041621168304e-05, "loss": 0.6472, "step": 9715 }, { "epoch": 0.6234765875561257, "grad_norm": 0.8921505648356219, "learning_rate": 7.4632110607096e-05, "loss": 0.8289, "step": 9720 }, { "epoch": 0.6237973059653624, "grad_norm": 1.1073242240733232, "learning_rate": 7.452383679662794e-05, "loss": 0.6634, "step": 9725 }, { "epoch": 0.6241180243745991, "grad_norm": 1.1492204881968546, "learning_rate": 7.441559491598072e-05, "loss": 0.6672, "step": 9730 }, { "epoch": 0.6244387427838358, "grad_norm": 1.2072073594662214, "learning_rate": 7.43073851008162e-05, "loss": 0.6821, "step": 9735 }, { "epoch": 0.6247594611930725, "grad_norm": 0.7796944953436583, "learning_rate": 7.41992074867561e-05, "loss": 0.5997, "step": 9740 }, { "epoch": 0.6250801796023092, "grad_norm": 0.8744950902348806, "learning_rate": 7.40910622093817e-05, "loss": 0.8027, "step": 9745 }, { "epoch": 0.6254008980115459, "grad_norm": 0.5663128313006088, "learning_rate": 7.398294940423382e-05, "loss": 0.6558, "step": 9750 }, { "epoch": 0.6257216164207825, "grad_norm": 1.03786462429062, "learning_rate": 7.387486920681251e-05, "loss": 0.7204, "step": 9755 }, { "epoch": 0.6260423348300193, "grad_norm": 1.0086514423501614, "learning_rate": 7.376682175257703e-05, "loss": 0.5726, "step": 9760 }, { "epoch": 0.6263630532392559, "grad_norm": 0.7340138238860899, "learning_rate": 7.365880717694558e-05, "loss": 0.6003, "step": 9765 }, { "epoch": 0.6266837716484926, "grad_norm": 1.0154279037896083, "learning_rate": 7.355082561529511e-05, "loss": 0.6518, "step": 9770 }, { "epoch": 0.6270044900577293, "grad_norm": 1.1008265637631556, "learning_rate": 7.344287720296128e-05, "loss": 0.6493, "step": 9775 }, { "epoch": 0.627325208466966, "grad_norm": 0.8136002565232989, "learning_rate": 7.333496207523805e-05, "loss": 0.7117, "step": 9780 }, { "epoch": 0.6276459268762027, "grad_norm": 0.5762089560179455, "learning_rate": 7.322708036737784e-05, "loss": 0.4664, "step": 9785 }, { "epoch": 0.6279666452854393, "grad_norm": 0.8389502685505456, "learning_rate": 7.311923221459108e-05, "loss": 0.6836, "step": 9790 }, { "epoch": 0.6282873636946761, "grad_norm": 0.7980523725918469, "learning_rate": 7.301141775204614e-05, "loss": 0.6824, "step": 9795 }, { "epoch": 0.6286080821039127, "grad_norm": 1.1727596107618312, "learning_rate": 7.290363711486923e-05, "loss": 0.6435, "step": 9800 }, { "epoch": 0.6289288005131495, "grad_norm": 0.4755883693546517, "learning_rate": 7.279589043814413e-05, "loss": 0.7567, "step": 9805 }, { "epoch": 0.6292495189223861, "grad_norm": 0.59249663501007, "learning_rate": 7.268817785691204e-05, "loss": 0.6907, "step": 9810 }, { "epoch": 0.6295702373316229, "grad_norm": 0.848542013217018, "learning_rate": 7.258049950617146e-05, "loss": 0.6471, "step": 9815 }, { "epoch": 0.6298909557408595, "grad_norm": 1.047981392744028, "learning_rate": 7.247285552087797e-05, "loss": 0.5712, "step": 9820 }, { "epoch": 0.6302116741500963, "grad_norm": 0.8916612499406957, "learning_rate": 7.236524603594406e-05, "loss": 0.6496, "step": 9825 }, { "epoch": 0.6305323925593329, "grad_norm": 0.810154490032121, "learning_rate": 7.225767118623906e-05, "loss": 0.5871, "step": 9830 }, { "epoch": 0.6308531109685696, "grad_norm": 0.8722001341085496, "learning_rate": 7.215013110658875e-05, "loss": 0.643, "step": 9835 }, { "epoch": 0.6311738293778063, "grad_norm": 0.6036268039451337, "learning_rate": 7.204262593177551e-05, "loss": 0.6787, "step": 9840 }, { "epoch": 0.631494547787043, "grad_norm": 1.1616717351436967, "learning_rate": 7.193515579653777e-05, "loss": 0.5542, "step": 9845 }, { "epoch": 0.6318152661962797, "grad_norm": 0.8131100593226482, "learning_rate": 7.182772083557022e-05, "loss": 0.7859, "step": 9850 }, { "epoch": 0.6321359846055163, "grad_norm": 0.876808117538372, "learning_rate": 7.172032118352338e-05, "loss": 0.6484, "step": 9855 }, { "epoch": 0.6324567030147531, "grad_norm": 0.8713054808471165, "learning_rate": 7.161295697500353e-05, "loss": 0.6265, "step": 9860 }, { "epoch": 0.6327774214239897, "grad_norm": 1.023366348564304, "learning_rate": 7.150562834457257e-05, "loss": 0.5939, "step": 9865 }, { "epoch": 0.6330981398332264, "grad_norm": 0.7588376669281691, "learning_rate": 7.13983354267477e-05, "loss": 0.7873, "step": 9870 }, { "epoch": 0.6334188582424631, "grad_norm": 1.028561424510279, "learning_rate": 7.129107835600149e-05, "loss": 0.6212, "step": 9875 }, { "epoch": 0.6337395766516998, "grad_norm": 0.5002948721851668, "learning_rate": 7.118385726676148e-05, "loss": 0.6269, "step": 9880 }, { "epoch": 0.6340602950609365, "grad_norm": 0.6840341058593294, "learning_rate": 7.10766722934102e-05, "loss": 0.6232, "step": 9885 }, { "epoch": 0.6343810134701732, "grad_norm": 1.1628940715108431, "learning_rate": 7.096952357028486e-05, "loss": 0.7978, "step": 9890 }, { "epoch": 0.6347017318794099, "grad_norm": 0.8853939814346806, "learning_rate": 7.086241123167722e-05, "loss": 0.6057, "step": 9895 }, { "epoch": 0.6350224502886466, "grad_norm": 0.7451557600335174, "learning_rate": 7.07553354118335e-05, "loss": 0.7038, "step": 9900 }, { "epoch": 0.6353431686978832, "grad_norm": 1.40409713973294, "learning_rate": 7.064829624495415e-05, "loss": 0.6721, "step": 9905 }, { "epoch": 0.63566388710712, "grad_norm": 0.8791535681920543, "learning_rate": 7.054129386519356e-05, "loss": 0.7629, "step": 9910 }, { "epoch": 0.6359846055163566, "grad_norm": 0.6562938490531729, "learning_rate": 7.043432840666015e-05, "loss": 0.6885, "step": 9915 }, { "epoch": 0.6363053239255934, "grad_norm": 0.8475306109482822, "learning_rate": 7.032740000341604e-05, "loss": 0.6528, "step": 9920 }, { "epoch": 0.63662604233483, "grad_norm": 1.0340930274606936, "learning_rate": 7.022050878947683e-05, "loss": 0.5579, "step": 9925 }, { "epoch": 0.6369467607440668, "grad_norm": 0.892410748846026, "learning_rate": 7.011365489881164e-05, "loss": 0.622, "step": 9930 }, { "epoch": 0.6372674791533034, "grad_norm": 1.026899828920046, "learning_rate": 7.000683846534268e-05, "loss": 0.7173, "step": 9935 }, { "epoch": 0.63758819756254, "grad_norm": 0.7906424850106287, "learning_rate": 6.99000596229453e-05, "loss": 0.6518, "step": 9940 }, { "epoch": 0.6379089159717768, "grad_norm": 0.885516437560555, "learning_rate": 6.979331850544772e-05, "loss": 0.7629, "step": 9945 }, { "epoch": 0.6382296343810134, "grad_norm": 1.2585108576804727, "learning_rate": 6.968661524663085e-05, "loss": 0.5346, "step": 9950 }, { "epoch": 0.6385503527902502, "grad_norm": 0.6378216033005294, "learning_rate": 6.957994998022817e-05, "loss": 0.5599, "step": 9955 }, { "epoch": 0.6388710711994868, "grad_norm": 1.0857649237283717, "learning_rate": 6.947332283992553e-05, "loss": 0.5546, "step": 9960 }, { "epoch": 0.6391917896087236, "grad_norm": 0.7485103608812504, "learning_rate": 6.936673395936103e-05, "loss": 0.7607, "step": 9965 }, { "epoch": 0.6395125080179602, "grad_norm": 0.6831137045570516, "learning_rate": 6.926018347212482e-05, "loss": 0.7246, "step": 9970 }, { "epoch": 0.6398332264271969, "grad_norm": 0.8371300993555119, "learning_rate": 6.915367151175887e-05, "loss": 0.7647, "step": 9975 }, { "epoch": 0.6401539448364336, "grad_norm": 0.6790794293309601, "learning_rate": 6.904719821175691e-05, "loss": 0.709, "step": 9980 }, { "epoch": 0.6404746632456703, "grad_norm": 1.2809292980337206, "learning_rate": 6.894076370556419e-05, "loss": 0.7072, "step": 9985 }, { "epoch": 0.640795381654907, "grad_norm": 0.6309070049475263, "learning_rate": 6.883436812657736e-05, "loss": 0.7517, "step": 9990 }, { "epoch": 0.6411161000641437, "grad_norm": 0.7057857328226916, "learning_rate": 6.872801160814429e-05, "loss": 0.5892, "step": 9995 }, { "epoch": 0.6414368184733804, "grad_norm": 0.6684609047663461, "learning_rate": 6.862169428356391e-05, "loss": 0.7041, "step": 10000 }, { "epoch": 0.641757536882617, "grad_norm": 0.9825781560923286, "learning_rate": 6.851541628608593e-05, "loss": 0.5732, "step": 10005 }, { "epoch": 0.6420782552918538, "grad_norm": 0.6656401212815036, "learning_rate": 6.840917774891089e-05, "loss": 0.6996, "step": 10010 }, { "epoch": 0.6423989737010904, "grad_norm": 1.0284673996842317, "learning_rate": 6.830297880518982e-05, "loss": 0.6385, "step": 10015 }, { "epoch": 0.6427196921103271, "grad_norm": 1.3813453443085013, "learning_rate": 6.819681958802411e-05, "loss": 0.8024, "step": 10020 }, { "epoch": 0.6430404105195638, "grad_norm": 1.0439998261378045, "learning_rate": 6.809070023046542e-05, "loss": 0.7246, "step": 10025 }, { "epoch": 0.6433611289288005, "grad_norm": 1.3726132291968678, "learning_rate": 6.798462086551536e-05, "loss": 0.7607, "step": 10030 }, { "epoch": 0.6436818473380372, "grad_norm": 0.696112632783953, "learning_rate": 6.78785816261255e-05, "loss": 0.6657, "step": 10035 }, { "epoch": 0.6440025657472739, "grad_norm": 0.9271308758677715, "learning_rate": 6.777258264519712e-05, "loss": 0.7089, "step": 10040 }, { "epoch": 0.6443232841565106, "grad_norm": 0.971107223858267, "learning_rate": 6.766662405558095e-05, "loss": 0.7127, "step": 10045 }, { "epoch": 0.6446440025657473, "grad_norm": 1.1077553805147324, "learning_rate": 6.756070599007717e-05, "loss": 0.6674, "step": 10050 }, { "epoch": 0.6449647209749839, "grad_norm": 1.1241145720577337, "learning_rate": 6.745482858143519e-05, "loss": 0.6908, "step": 10055 }, { "epoch": 0.6452854393842207, "grad_norm": 1.0311402231942566, "learning_rate": 6.734899196235342e-05, "loss": 0.5903, "step": 10060 }, { "epoch": 0.6456061577934573, "grad_norm": 1.1164020984789884, "learning_rate": 6.724319626547916e-05, "loss": 0.7299, "step": 10065 }, { "epoch": 0.6459268762026941, "grad_norm": 0.862577581408513, "learning_rate": 6.71374416234084e-05, "loss": 0.6447, "step": 10070 }, { "epoch": 0.6462475946119307, "grad_norm": 0.6813994701366789, "learning_rate": 6.703172816868575e-05, "loss": 0.6327, "step": 10075 }, { "epoch": 0.6465683130211675, "grad_norm": 0.8916563918460675, "learning_rate": 6.69260560338041e-05, "loss": 0.5921, "step": 10080 }, { "epoch": 0.6468890314304041, "grad_norm": 0.9332137514439207, "learning_rate": 6.682042535120463e-05, "loss": 0.6558, "step": 10085 }, { "epoch": 0.6472097498396407, "grad_norm": 0.83477107809383, "learning_rate": 6.67148362532765e-05, "loss": 0.6404, "step": 10090 }, { "epoch": 0.6475304682488775, "grad_norm": 1.2218962185380584, "learning_rate": 6.66092888723568e-05, "loss": 0.6856, "step": 10095 }, { "epoch": 0.6478511866581141, "grad_norm": 0.5613953193652488, "learning_rate": 6.650378334073036e-05, "loss": 0.5747, "step": 10100 }, { "epoch": 0.6481719050673509, "grad_norm": 1.161315529719475, "learning_rate": 6.639831979062952e-05, "loss": 0.7714, "step": 10105 }, { "epoch": 0.6484926234765875, "grad_norm": 1.2013466455307917, "learning_rate": 6.629289835423393e-05, "loss": 0.7067, "step": 10110 }, { "epoch": 0.6488133418858243, "grad_norm": 0.8985970817080027, "learning_rate": 6.618751916367061e-05, "loss": 0.8022, "step": 10115 }, { "epoch": 0.6491340602950609, "grad_norm": 1.2136972519623022, "learning_rate": 6.608218235101352e-05, "loss": 0.6141, "step": 10120 }, { "epoch": 0.6494547787042976, "grad_norm": 0.9718583450791072, "learning_rate": 6.597688804828353e-05, "loss": 0.5938, "step": 10125 }, { "epoch": 0.6497754971135343, "grad_norm": 0.9547734637829278, "learning_rate": 6.587163638744827e-05, "loss": 0.6992, "step": 10130 }, { "epoch": 0.650096215522771, "grad_norm": 0.9151909021410464, "learning_rate": 6.57664275004219e-05, "loss": 0.7343, "step": 10135 }, { "epoch": 0.6504169339320077, "grad_norm": 1.5971760196514397, "learning_rate": 6.566126151906498e-05, "loss": 0.7017, "step": 10140 }, { "epoch": 0.6507376523412444, "grad_norm": 0.8126791037548418, "learning_rate": 6.555613857518425e-05, "loss": 0.6567, "step": 10145 }, { "epoch": 0.6510583707504811, "grad_norm": 0.7571219128173635, "learning_rate": 6.545105880053258e-05, "loss": 0.6871, "step": 10150 }, { "epoch": 0.6513790891597178, "grad_norm": 0.688497347517119, "learning_rate": 6.534602232680869e-05, "loss": 0.7347, "step": 10155 }, { "epoch": 0.6516998075689544, "grad_norm": 0.8955793200079804, "learning_rate": 6.524102928565706e-05, "loss": 0.5972, "step": 10160 }, { "epoch": 0.6520205259781912, "grad_norm": 0.9443767111598063, "learning_rate": 6.513607980866768e-05, "loss": 0.723, "step": 10165 }, { "epoch": 0.6523412443874278, "grad_norm": 0.8214020012837946, "learning_rate": 6.5031174027376e-05, "loss": 0.7531, "step": 10170 }, { "epoch": 0.6526619627966646, "grad_norm": 0.9405554364877039, "learning_rate": 6.492631207326271e-05, "loss": 0.6579, "step": 10175 }, { "epoch": 0.6529826812059012, "grad_norm": 0.8528480386187783, "learning_rate": 6.482149407775348e-05, "loss": 0.6639, "step": 10180 }, { "epoch": 0.653303399615138, "grad_norm": 1.0215536554217552, "learning_rate": 6.471672017221897e-05, "loss": 0.6788, "step": 10185 }, { "epoch": 0.6536241180243746, "grad_norm": 1.0458906526223661, "learning_rate": 6.461199048797457e-05, "loss": 0.7466, "step": 10190 }, { "epoch": 0.6539448364336113, "grad_norm": 0.7250104664732925, "learning_rate": 6.450730515628025e-05, "loss": 0.4862, "step": 10195 }, { "epoch": 0.654265554842848, "grad_norm": 1.1562228223771571, "learning_rate": 6.440266430834035e-05, "loss": 0.7554, "step": 10200 }, { "epoch": 0.6545862732520846, "grad_norm": 0.7656674676905709, "learning_rate": 6.429806807530348e-05, "loss": 0.6668, "step": 10205 }, { "epoch": 0.6549069916613214, "grad_norm": 1.1136322722942007, "learning_rate": 6.419351658826236e-05, "loss": 0.7241, "step": 10210 }, { "epoch": 0.655227710070558, "grad_norm": 1.0761146316049985, "learning_rate": 6.40890099782536e-05, "loss": 0.6501, "step": 10215 }, { "epoch": 0.6555484284797948, "grad_norm": 0.9079430022905365, "learning_rate": 6.398454837625761e-05, "loss": 0.8384, "step": 10220 }, { "epoch": 0.6558691468890314, "grad_norm": 0.8488475441393789, "learning_rate": 6.388013191319829e-05, "loss": 0.697, "step": 10225 }, { "epoch": 0.6561898652982682, "grad_norm": 1.8731573144161795, "learning_rate": 6.377576071994306e-05, "loss": 0.5274, "step": 10230 }, { "epoch": 0.6565105837075048, "grad_norm": 0.9597668865369915, "learning_rate": 6.367143492730257e-05, "loss": 0.5793, "step": 10235 }, { "epoch": 0.6568313021167415, "grad_norm": 0.9184805187055093, "learning_rate": 6.356715466603058e-05, "loss": 0.7204, "step": 10240 }, { "epoch": 0.6571520205259782, "grad_norm": 1.010481078501907, "learning_rate": 6.346292006682375e-05, "loss": 0.6568, "step": 10245 }, { "epoch": 0.6574727389352149, "grad_norm": 1.2893595780329616, "learning_rate": 6.335873126032155e-05, "loss": 0.7476, "step": 10250 }, { "epoch": 0.6577934573444516, "grad_norm": 0.7919851978335327, "learning_rate": 6.325458837710603e-05, "loss": 0.6681, "step": 10255 }, { "epoch": 0.6581141757536882, "grad_norm": 0.7133876917502856, "learning_rate": 6.31504915477017e-05, "loss": 0.7879, "step": 10260 }, { "epoch": 0.658434894162925, "grad_norm": 0.8067826322951818, "learning_rate": 6.304644090257536e-05, "loss": 0.64, "step": 10265 }, { "epoch": 0.6587556125721616, "grad_norm": 0.7174409241967863, "learning_rate": 6.294243657213587e-05, "loss": 0.5671, "step": 10270 }, { "epoch": 0.6590763309813983, "grad_norm": 0.7812465401233117, "learning_rate": 6.283847868673417e-05, "loss": 0.628, "step": 10275 }, { "epoch": 0.659397049390635, "grad_norm": 0.565828308616574, "learning_rate": 6.273456737666281e-05, "loss": 0.621, "step": 10280 }, { "epoch": 0.6597177677998717, "grad_norm": 1.0913219783317336, "learning_rate": 6.26307027721561e-05, "loss": 0.6341, "step": 10285 }, { "epoch": 0.6600384862091084, "grad_norm": 0.812647700581263, "learning_rate": 6.252688500338979e-05, "loss": 0.6266, "step": 10290 }, { "epoch": 0.6603592046183451, "grad_norm": 1.3344320513324446, "learning_rate": 6.242311420048087e-05, "loss": 0.697, "step": 10295 }, { "epoch": 0.6606799230275818, "grad_norm": 0.8037339071262586, "learning_rate": 6.231939049348756e-05, "loss": 0.662, "step": 10300 }, { "epoch": 0.6610006414368185, "grad_norm": 0.8348124914063436, "learning_rate": 6.221571401240898e-05, "loss": 0.5953, "step": 10305 }, { "epoch": 0.6613213598460551, "grad_norm": 0.8007698372402566, "learning_rate": 6.211208488718508e-05, "loss": 0.7067, "step": 10310 }, { "epoch": 0.6616420782552919, "grad_norm": 1.0240691382811138, "learning_rate": 6.200850324769645e-05, "loss": 0.6563, "step": 10315 }, { "epoch": 0.6619627966645285, "grad_norm": 0.6245391951301155, "learning_rate": 6.190496922376419e-05, "loss": 0.566, "step": 10320 }, { "epoch": 0.6622835150737653, "grad_norm": 0.9667633410108524, "learning_rate": 6.180148294514969e-05, "loss": 0.6114, "step": 10325 }, { "epoch": 0.6626042334830019, "grad_norm": 0.7507271356005688, "learning_rate": 6.169804454155457e-05, "loss": 0.5604, "step": 10330 }, { "epoch": 0.6629249518922387, "grad_norm": 1.3185339543060972, "learning_rate": 6.159465414262034e-05, "loss": 0.6832, "step": 10335 }, { "epoch": 0.6632456703014753, "grad_norm": 1.1847306027291458, "learning_rate": 6.14913118779284e-05, "loss": 0.8276, "step": 10340 }, { "epoch": 0.6635663887107119, "grad_norm": 0.645482702109424, "learning_rate": 6.138801787699988e-05, "loss": 0.7251, "step": 10345 }, { "epoch": 0.6638871071199487, "grad_norm": 0.9170687001642995, "learning_rate": 6.128477226929532e-05, "loss": 0.5489, "step": 10350 }, { "epoch": 0.6642078255291853, "grad_norm": 1.000806725934412, "learning_rate": 6.118157518421468e-05, "loss": 0.7246, "step": 10355 }, { "epoch": 0.6645285439384221, "grad_norm": 0.8379511672470946, "learning_rate": 6.107842675109703e-05, "loss": 0.7874, "step": 10360 }, { "epoch": 0.6648492623476587, "grad_norm": 0.7371509556636497, "learning_rate": 6.097532709922054e-05, "loss": 0.6244, "step": 10365 }, { "epoch": 0.6651699807568955, "grad_norm": 0.9539665664045133, "learning_rate": 6.087227635780225e-05, "loss": 0.6107, "step": 10370 }, { "epoch": 0.6654906991661321, "grad_norm": 0.7979555148132079, "learning_rate": 6.0769274655997775e-05, "loss": 0.5344, "step": 10375 }, { "epoch": 0.6658114175753689, "grad_norm": 0.909657054573839, "learning_rate": 6.0666322122901396e-05, "loss": 0.6275, "step": 10380 }, { "epoch": 0.6661321359846055, "grad_norm": 1.0313940290067696, "learning_rate": 6.056341888754573e-05, "loss": 0.6082, "step": 10385 }, { "epoch": 0.6664528543938422, "grad_norm": 0.7489838245596225, "learning_rate": 6.0460565078901633e-05, "loss": 0.5819, "step": 10390 }, { "epoch": 0.6667735728030789, "grad_norm": 1.1118413959198947, "learning_rate": 6.035776082587794e-05, "loss": 0.5196, "step": 10395 }, { "epoch": 0.6670942912123156, "grad_norm": 0.8125706280287548, "learning_rate": 6.025500625732142e-05, "loss": 0.5352, "step": 10400 }, { "epoch": 0.6674150096215523, "grad_norm": 0.9492211031254315, "learning_rate": 6.015230150201661e-05, "loss": 0.5139, "step": 10405 }, { "epoch": 0.667735728030789, "grad_norm": 0.7268694268672965, "learning_rate": 6.0049646688685567e-05, "loss": 0.6442, "step": 10410 }, { "epoch": 0.6680564464400257, "grad_norm": 0.7538411268384596, "learning_rate": 5.994704194598775e-05, "loss": 0.7771, "step": 10415 }, { "epoch": 0.6683771648492624, "grad_norm": 0.732055273874663, "learning_rate": 5.9844487402519886e-05, "loss": 0.4246, "step": 10420 }, { "epoch": 0.668697883258499, "grad_norm": 0.9282996799361855, "learning_rate": 5.97419831868158e-05, "loss": 0.6212, "step": 10425 }, { "epoch": 0.6690186016677357, "grad_norm": 0.8160584484135337, "learning_rate": 5.96395294273462e-05, "loss": 0.5947, "step": 10430 }, { "epoch": 0.6693393200769724, "grad_norm": 0.563899508227464, "learning_rate": 5.9537126252518595e-05, "loss": 0.6085, "step": 10435 }, { "epoch": 0.6696600384862091, "grad_norm": 0.7096696600311123, "learning_rate": 5.9434773790677076e-05, "loss": 0.6623, "step": 10440 }, { "epoch": 0.6699807568954458, "grad_norm": 1.0083725702632502, "learning_rate": 5.933247217010216e-05, "loss": 0.7533, "step": 10445 }, { "epoch": 0.6703014753046825, "grad_norm": 0.8583730314996155, "learning_rate": 5.9230221519010634e-05, "loss": 0.6899, "step": 10450 }, { "epoch": 0.6706221937139192, "grad_norm": 0.9948242533172998, "learning_rate": 5.912802196555547e-05, "loss": 0.6441, "step": 10455 }, { "epoch": 0.6709429121231558, "grad_norm": 0.8416659287585814, "learning_rate": 5.902587363782553e-05, "loss": 0.52, "step": 10460 }, { "epoch": 0.6712636305323926, "grad_norm": 0.7875617753719326, "learning_rate": 5.892377666384552e-05, "loss": 0.8289, "step": 10465 }, { "epoch": 0.6715843489416292, "grad_norm": 1.3665322708300398, "learning_rate": 5.882173117157579e-05, "loss": 0.6931, "step": 10470 }, { "epoch": 0.671905067350866, "grad_norm": 1.484703583509698, "learning_rate": 5.871973728891207e-05, "loss": 0.6282, "step": 10475 }, { "epoch": 0.6722257857601026, "grad_norm": 0.6277171001704246, "learning_rate": 5.861779514368552e-05, "loss": 0.5476, "step": 10480 }, { "epoch": 0.6725465041693394, "grad_norm": 0.893359208561377, "learning_rate": 5.851590486366241e-05, "loss": 0.5851, "step": 10485 }, { "epoch": 0.672867222578576, "grad_norm": 0.7320275300041723, "learning_rate": 5.841406657654402e-05, "loss": 0.7706, "step": 10490 }, { "epoch": 0.6731879409878126, "grad_norm": 0.8287094016340315, "learning_rate": 5.831228040996643e-05, "loss": 0.6782, "step": 10495 }, { "epoch": 0.6735086593970494, "grad_norm": 0.668748966976369, "learning_rate": 5.8210546491500416e-05, "loss": 0.4843, "step": 10500 }, { "epoch": 0.673829377806286, "grad_norm": 0.7774193196749479, "learning_rate": 5.8108864948651385e-05, "loss": 0.6915, "step": 10505 }, { "epoch": 0.6741500962155228, "grad_norm": 0.7361276836480435, "learning_rate": 5.8007235908858815e-05, "loss": 0.6037, "step": 10510 }, { "epoch": 0.6744708146247594, "grad_norm": 0.9273797610571103, "learning_rate": 5.790565949949669e-05, "loss": 0.6447, "step": 10515 }, { "epoch": 0.6747915330339962, "grad_norm": 0.7357377379625472, "learning_rate": 5.780413584787285e-05, "loss": 0.6123, "step": 10520 }, { "epoch": 0.6751122514432328, "grad_norm": 0.7349196129011529, "learning_rate": 5.770266508122903e-05, "loss": 0.6148, "step": 10525 }, { "epoch": 0.6754329698524695, "grad_norm": 0.7228184809432814, "learning_rate": 5.760124732674079e-05, "loss": 0.7375, "step": 10530 }, { "epoch": 0.6757536882617062, "grad_norm": 0.7245846277368149, "learning_rate": 5.749988271151714e-05, "loss": 0.8622, "step": 10535 }, { "epoch": 0.6760744066709429, "grad_norm": 0.7864676224072312, "learning_rate": 5.739857136260046e-05, "loss": 0.712, "step": 10540 }, { "epoch": 0.6763951250801796, "grad_norm": 1.645141716455399, "learning_rate": 5.7297313406966534e-05, "loss": 0.6939, "step": 10545 }, { "epoch": 0.6767158434894163, "grad_norm": 0.5062488079743617, "learning_rate": 5.719610897152405e-05, "loss": 0.5611, "step": 10550 }, { "epoch": 0.677036561898653, "grad_norm": 0.7048718325836721, "learning_rate": 5.709495818311477e-05, "loss": 0.7464, "step": 10555 }, { "epoch": 0.6773572803078897, "grad_norm": 1.1659307946452016, "learning_rate": 5.699386116851309e-05, "loss": 0.7177, "step": 10560 }, { "epoch": 0.6776779987171264, "grad_norm": 0.9170897775066968, "learning_rate": 5.6892818054426035e-05, "loss": 0.669, "step": 10565 }, { "epoch": 0.6779987171263631, "grad_norm": 1.0508889718757837, "learning_rate": 5.679182896749322e-05, "loss": 0.6744, "step": 10570 }, { "epoch": 0.6783194355355997, "grad_norm": 0.8259858656059345, "learning_rate": 5.669089403428627e-05, "loss": 0.6801, "step": 10575 }, { "epoch": 0.6786401539448365, "grad_norm": 0.6629893516596802, "learning_rate": 5.659001338130923e-05, "loss": 0.6013, "step": 10580 }, { "epoch": 0.6789608723540731, "grad_norm": 0.968488221191984, "learning_rate": 5.648918713499787e-05, "loss": 0.7905, "step": 10585 }, { "epoch": 0.6792815907633099, "grad_norm": 0.7585559410962367, "learning_rate": 5.6388415421719996e-05, "loss": 0.5525, "step": 10590 }, { "epoch": 0.6796023091725465, "grad_norm": 1.2745141606185377, "learning_rate": 5.6287698367774897e-05, "loss": 0.7167, "step": 10595 }, { "epoch": 0.6799230275817832, "grad_norm": 0.6728914302123802, "learning_rate": 5.6187036099393375e-05, "loss": 0.6937, "step": 10600 }, { "epoch": 0.6802437459910199, "grad_norm": 0.600819149081247, "learning_rate": 5.608642874273771e-05, "loss": 0.6316, "step": 10605 }, { "epoch": 0.6805644644002565, "grad_norm": 0.6959088365991615, "learning_rate": 5.598587642390114e-05, "loss": 0.7457, "step": 10610 }, { "epoch": 0.6808851828094933, "grad_norm": 0.7266824723699652, "learning_rate": 5.5885379268908134e-05, "loss": 0.6045, "step": 10615 }, { "epoch": 0.6812059012187299, "grad_norm": 0.6681555688621381, "learning_rate": 5.578493740371389e-05, "loss": 0.6286, "step": 10620 }, { "epoch": 0.6815266196279667, "grad_norm": 0.7610528413953269, "learning_rate": 5.568455095420431e-05, "loss": 0.5733, "step": 10625 }, { "epoch": 0.6818473380372033, "grad_norm": 1.3214312132482846, "learning_rate": 5.558422004619597e-05, "loss": 0.6319, "step": 10630 }, { "epoch": 0.6821680564464401, "grad_norm": 0.6966982078568826, "learning_rate": 5.548394480543564e-05, "loss": 0.4698, "step": 10635 }, { "epoch": 0.6824887748556767, "grad_norm": 0.6367878363111128, "learning_rate": 5.538372535760057e-05, "loss": 0.662, "step": 10640 }, { "epoch": 0.6828094932649134, "grad_norm": 0.5466987109462808, "learning_rate": 5.528356182829777e-05, "loss": 0.5193, "step": 10645 }, { "epoch": 0.6831302116741501, "grad_norm": 0.8091665259225381, "learning_rate": 5.518345434306444e-05, "loss": 0.5853, "step": 10650 }, { "epoch": 0.6834509300833868, "grad_norm": 0.5989345577351957, "learning_rate": 5.508340302736743e-05, "loss": 0.5997, "step": 10655 }, { "epoch": 0.6837716484926235, "grad_norm": 0.8246700551716405, "learning_rate": 5.498340800660313e-05, "loss": 0.715, "step": 10660 }, { "epoch": 0.6840923669018601, "grad_norm": 0.7999016646795889, "learning_rate": 5.488346940609753e-05, "loss": 0.7212, "step": 10665 }, { "epoch": 0.6844130853110969, "grad_norm": 0.5763703153217136, "learning_rate": 5.4783587351105734e-05, "loss": 0.6361, "step": 10670 }, { "epoch": 0.6847338037203335, "grad_norm": 1.3911645606934129, "learning_rate": 5.4683761966812154e-05, "loss": 0.7494, "step": 10675 }, { "epoch": 0.6850545221295702, "grad_norm": 1.1526450545139104, "learning_rate": 5.458399337833002e-05, "loss": 0.5274, "step": 10680 }, { "epoch": 0.6853752405388069, "grad_norm": 1.0168267129176949, "learning_rate": 5.448428171070141e-05, "loss": 0.8071, "step": 10685 }, { "epoch": 0.6856959589480436, "grad_norm": 0.7598086971815275, "learning_rate": 5.438462708889718e-05, "loss": 0.676, "step": 10690 }, { "epoch": 0.6860166773572803, "grad_norm": 1.056491176869749, "learning_rate": 5.428502963781654e-05, "loss": 0.591, "step": 10695 }, { "epoch": 0.686337395766517, "grad_norm": 0.8433612740283131, "learning_rate": 5.418548948228709e-05, "loss": 0.6323, "step": 10700 }, { "epoch": 0.6866581141757537, "grad_norm": 1.1399615640431888, "learning_rate": 5.408600674706474e-05, "loss": 0.6943, "step": 10705 }, { "epoch": 0.6869788325849904, "grad_norm": 1.1427576567421822, "learning_rate": 5.39865815568332e-05, "loss": 0.6542, "step": 10710 }, { "epoch": 0.687299550994227, "grad_norm": 0.8398449025370285, "learning_rate": 5.3887214036204295e-05, "loss": 0.6775, "step": 10715 }, { "epoch": 0.6876202694034638, "grad_norm": 0.6183753226440165, "learning_rate": 5.3787904309717365e-05, "loss": 0.5856, "step": 10720 }, { "epoch": 0.6879409878127004, "grad_norm": 0.7303097761926962, "learning_rate": 5.368865250183952e-05, "loss": 0.5393, "step": 10725 }, { "epoch": 0.6882617062219372, "grad_norm": 1.042159531292707, "learning_rate": 5.358945873696514e-05, "loss": 0.598, "step": 10730 }, { "epoch": 0.6885824246311738, "grad_norm": 0.8726534481321939, "learning_rate": 5.3490323139415844e-05, "loss": 0.6874, "step": 10735 }, { "epoch": 0.6889031430404106, "grad_norm": 0.8279765934645724, "learning_rate": 5.339124583344046e-05, "loss": 0.7282, "step": 10740 }, { "epoch": 0.6892238614496472, "grad_norm": 1.1033370234326692, "learning_rate": 5.3292226943214666e-05, "loss": 0.6647, "step": 10745 }, { "epoch": 0.689544579858884, "grad_norm": 0.6731635406372563, "learning_rate": 5.3193266592840994e-05, "loss": 0.642, "step": 10750 }, { "epoch": 0.6898652982681206, "grad_norm": 0.682406135632238, "learning_rate": 5.309436490634855e-05, "loss": 0.6876, "step": 10755 }, { "epoch": 0.6901860166773572, "grad_norm": 0.6884304464201593, "learning_rate": 5.299552200769289e-05, "loss": 0.6405, "step": 10760 }, { "epoch": 0.690506735086594, "grad_norm": 0.9303606786373573, "learning_rate": 5.289673802075601e-05, "loss": 0.5867, "step": 10765 }, { "epoch": 0.6908274534958306, "grad_norm": 0.8966481917540933, "learning_rate": 5.279801306934598e-05, "loss": 0.7328, "step": 10770 }, { "epoch": 0.6911481719050674, "grad_norm": 0.8301326693368314, "learning_rate": 5.269934727719685e-05, "loss": 0.673, "step": 10775 }, { "epoch": 0.691468890314304, "grad_norm": 0.9231136482226949, "learning_rate": 5.260074076796859e-05, "loss": 0.8013, "step": 10780 }, { "epoch": 0.6917896087235408, "grad_norm": 0.6344332487623263, "learning_rate": 5.250219366524687e-05, "loss": 0.6477, "step": 10785 }, { "epoch": 0.6921103271327774, "grad_norm": 0.6184925377516596, "learning_rate": 5.240370609254288e-05, "loss": 0.5484, "step": 10790 }, { "epoch": 0.6924310455420141, "grad_norm": 0.7946249563385892, "learning_rate": 5.230527817329316e-05, "loss": 0.7455, "step": 10795 }, { "epoch": 0.6927517639512508, "grad_norm": 0.5532448902772473, "learning_rate": 5.22069100308596e-05, "loss": 0.5486, "step": 10800 }, { "epoch": 0.6930724823604875, "grad_norm": 0.6171304782365078, "learning_rate": 5.210860178852903e-05, "loss": 0.681, "step": 10805 }, { "epoch": 0.6933932007697242, "grad_norm": 1.2635876971136728, "learning_rate": 5.201035356951334e-05, "loss": 0.6736, "step": 10810 }, { "epoch": 0.6937139191789609, "grad_norm": 0.5205480150437042, "learning_rate": 5.191216549694909e-05, "loss": 0.5153, "step": 10815 }, { "epoch": 0.6940346375881976, "grad_norm": 0.9442523324184217, "learning_rate": 5.1814037693897464e-05, "loss": 0.6185, "step": 10820 }, { "epoch": 0.6943553559974343, "grad_norm": 1.1934267268940544, "learning_rate": 5.1715970283344205e-05, "loss": 0.6677, "step": 10825 }, { "epoch": 0.6946760744066709, "grad_norm": 0.7652562771619698, "learning_rate": 5.161796338819924e-05, "loss": 0.7638, "step": 10830 }, { "epoch": 0.6949967928159076, "grad_norm": 0.8994137424891815, "learning_rate": 5.152001713129677e-05, "loss": 0.5898, "step": 10835 }, { "epoch": 0.6953175112251443, "grad_norm": 1.1569578317709166, "learning_rate": 5.142213163539491e-05, "loss": 0.5728, "step": 10840 }, { "epoch": 0.695638229634381, "grad_norm": 0.9567492023568471, "learning_rate": 5.132430702317562e-05, "loss": 0.6646, "step": 10845 }, { "epoch": 0.6959589480436177, "grad_norm": 0.9942541719053858, "learning_rate": 5.122654341724462e-05, "loss": 0.7398, "step": 10850 }, { "epoch": 0.6962796664528544, "grad_norm": 0.69345380130255, "learning_rate": 5.1128840940131064e-05, "loss": 0.5888, "step": 10855 }, { "epoch": 0.6966003848620911, "grad_norm": 0.8276215026435204, "learning_rate": 5.103119971428765e-05, "loss": 0.6781, "step": 10860 }, { "epoch": 0.6969211032713277, "grad_norm": 0.7245991079345528, "learning_rate": 5.093361986209015e-05, "loss": 0.7442, "step": 10865 }, { "epoch": 0.6972418216805645, "grad_norm": 0.7885551527874833, "learning_rate": 5.0836101505837494e-05, "loss": 0.6788, "step": 10870 }, { "epoch": 0.6975625400898011, "grad_norm": 0.857297702149309, "learning_rate": 5.073864476775157e-05, "loss": 0.6013, "step": 10875 }, { "epoch": 0.6978832584990379, "grad_norm": 0.6348649341355659, "learning_rate": 5.064124976997693e-05, "loss": 0.6045, "step": 10880 }, { "epoch": 0.6982039769082745, "grad_norm": 0.6585605551969316, "learning_rate": 5.054391663458087e-05, "loss": 0.6171, "step": 10885 }, { "epoch": 0.6985246953175113, "grad_norm": 0.986468962885202, "learning_rate": 5.044664548355307e-05, "loss": 0.7186, "step": 10890 }, { "epoch": 0.6988454137267479, "grad_norm": 0.9785918246000489, "learning_rate": 5.0349436438805494e-05, "loss": 0.7877, "step": 10895 }, { "epoch": 0.6991661321359846, "grad_norm": 1.5065392603292607, "learning_rate": 5.025228962217241e-05, "loss": 0.6156, "step": 10900 }, { "epoch": 0.6994868505452213, "grad_norm": 0.9224408618353005, "learning_rate": 5.015520515540996e-05, "loss": 0.5855, "step": 10905 }, { "epoch": 0.699807568954458, "grad_norm": 0.8828715863784493, "learning_rate": 5.005818316019618e-05, "loss": 0.6038, "step": 10910 }, { "epoch": 0.7001282873636947, "grad_norm": 0.9568291721616811, "learning_rate": 4.996122375813079e-05, "loss": 0.6317, "step": 10915 }, { "epoch": 0.7004490057729313, "grad_norm": 1.4247569725340374, "learning_rate": 4.986432707073515e-05, "loss": 0.7097, "step": 10920 }, { "epoch": 0.7007697241821681, "grad_norm": 0.5257863778727976, "learning_rate": 4.976749321945191e-05, "loss": 0.5316, "step": 10925 }, { "epoch": 0.7010904425914047, "grad_norm": 0.7116948483921095, "learning_rate": 4.9670722325644993e-05, "loss": 0.6438, "step": 10930 }, { "epoch": 0.7014111610006415, "grad_norm": 0.8934801180351521, "learning_rate": 4.957401451059948e-05, "loss": 0.6628, "step": 10935 }, { "epoch": 0.7017318794098781, "grad_norm": 0.5554525116078812, "learning_rate": 4.9477369895521284e-05, "loss": 0.6803, "step": 10940 }, { "epoch": 0.7020525978191148, "grad_norm": 1.115600134036066, "learning_rate": 4.938078860153725e-05, "loss": 0.582, "step": 10945 }, { "epoch": 0.7023733162283515, "grad_norm": 1.04204980372642, "learning_rate": 4.928427074969475e-05, "loss": 0.6396, "step": 10950 }, { "epoch": 0.7026940346375882, "grad_norm": 0.6952203258967746, "learning_rate": 4.918781646096161e-05, "loss": 0.609, "step": 10955 }, { "epoch": 0.7030147530468249, "grad_norm": 0.8455941974814938, "learning_rate": 4.909142585622616e-05, "loss": 0.7442, "step": 10960 }, { "epoch": 0.7033354714560616, "grad_norm": 0.9358056805840572, "learning_rate": 4.899509905629671e-05, "loss": 0.6163, "step": 10965 }, { "epoch": 0.7036561898652983, "grad_norm": 0.8368567909279319, "learning_rate": 4.889883618190184e-05, "loss": 0.6729, "step": 10970 }, { "epoch": 0.703976908274535, "grad_norm": 0.9626200217934863, "learning_rate": 4.8802637353689694e-05, "loss": 0.6208, "step": 10975 }, { "epoch": 0.7042976266837716, "grad_norm": 1.423525816978348, "learning_rate": 4.870650269222845e-05, "loss": 0.6301, "step": 10980 }, { "epoch": 0.7046183450930084, "grad_norm": 0.8943539539791406, "learning_rate": 4.8610432318005705e-05, "loss": 0.8259, "step": 10985 }, { "epoch": 0.704939063502245, "grad_norm": 1.0047328070171035, "learning_rate": 4.851442635142846e-05, "loss": 0.6759, "step": 10990 }, { "epoch": 0.7052597819114818, "grad_norm": 0.864965532206175, "learning_rate": 4.841848491282315e-05, "loss": 0.6722, "step": 10995 }, { "epoch": 0.7055805003207184, "grad_norm": 0.7890255740216144, "learning_rate": 4.832260812243513e-05, "loss": 0.6922, "step": 11000 }, { "epoch": 0.7059012187299551, "grad_norm": 1.2389180866062235, "learning_rate": 4.822679610042894e-05, "loss": 0.6051, "step": 11005 }, { "epoch": 0.7062219371391918, "grad_norm": 0.6998283128694094, "learning_rate": 4.813104896688777e-05, "loss": 0.6615, "step": 11010 }, { "epoch": 0.7065426555484284, "grad_norm": 0.8090143409111475, "learning_rate": 4.803536684181354e-05, "loss": 0.7387, "step": 11015 }, { "epoch": 0.7068633739576652, "grad_norm": 1.0370968663682347, "learning_rate": 4.793974984512677e-05, "loss": 0.7072, "step": 11020 }, { "epoch": 0.7071840923669018, "grad_norm": 0.7853945975713512, "learning_rate": 4.7844198096666246e-05, "loss": 0.686, "step": 11025 }, { "epoch": 0.7075048107761386, "grad_norm": 0.702386626377002, "learning_rate": 4.774871171618901e-05, "loss": 0.7127, "step": 11030 }, { "epoch": 0.7078255291853752, "grad_norm": 1.0108215460660506, "learning_rate": 4.765329082337027e-05, "loss": 0.6434, "step": 11035 }, { "epoch": 0.708146247594612, "grad_norm": 0.9899048924342988, "learning_rate": 4.755793553780292e-05, "loss": 0.7323, "step": 11040 }, { "epoch": 0.7084669660038486, "grad_norm": 0.9147032893585562, "learning_rate": 4.746264597899792e-05, "loss": 0.6739, "step": 11045 }, { "epoch": 0.7087876844130853, "grad_norm": 1.0330004401132, "learning_rate": 4.736742226638363e-05, "loss": 0.8609, "step": 11050 }, { "epoch": 0.709108402822322, "grad_norm": 0.6548738796277453, "learning_rate": 4.727226451930604e-05, "loss": 0.6734, "step": 11055 }, { "epoch": 0.7094291212315587, "grad_norm": 0.81714120996019, "learning_rate": 4.717717285702835e-05, "loss": 0.7523, "step": 11060 }, { "epoch": 0.7097498396407954, "grad_norm": 0.885017113426685, "learning_rate": 4.708214739873096e-05, "loss": 0.5943, "step": 11065 }, { "epoch": 0.710070558050032, "grad_norm": 0.8620179894720568, "learning_rate": 4.698718826351135e-05, "loss": 0.593, "step": 11070 }, { "epoch": 0.7103912764592688, "grad_norm": 0.7663377237340008, "learning_rate": 4.689229557038379e-05, "loss": 0.7649, "step": 11075 }, { "epoch": 0.7107119948685054, "grad_norm": 0.779291905786263, "learning_rate": 4.679746943827939e-05, "loss": 0.6231, "step": 11080 }, { "epoch": 0.7110327132777421, "grad_norm": 0.8488045821194506, "learning_rate": 4.6702709986045745e-05, "loss": 0.5658, "step": 11085 }, { "epoch": 0.7113534316869788, "grad_norm": 0.7591544492497508, "learning_rate": 4.660801733244685e-05, "loss": 0.5434, "step": 11090 }, { "epoch": 0.7116741500962155, "grad_norm": 0.9324567178402989, "learning_rate": 4.651339159616312e-05, "loss": 0.7694, "step": 11095 }, { "epoch": 0.7119948685054522, "grad_norm": 0.614241285241644, "learning_rate": 4.641883289579095e-05, "loss": 0.573, "step": 11100 }, { "epoch": 0.7123155869146889, "grad_norm": 0.7297521213628075, "learning_rate": 4.632434134984288e-05, "loss": 0.7862, "step": 11105 }, { "epoch": 0.7126363053239256, "grad_norm": 0.8547500506968054, "learning_rate": 4.6229917076747056e-05, "loss": 0.6224, "step": 11110 }, { "epoch": 0.7129570237331623, "grad_norm": 1.1207952262364815, "learning_rate": 4.613556019484754e-05, "loss": 0.7452, "step": 11115 }, { "epoch": 0.7132777421423989, "grad_norm": 0.5122245150734959, "learning_rate": 4.604127082240379e-05, "loss": 0.6216, "step": 11120 }, { "epoch": 0.7135984605516357, "grad_norm": 0.6841888313664231, "learning_rate": 4.5947049077590664e-05, "loss": 0.6031, "step": 11125 }, { "epoch": 0.7139191789608723, "grad_norm": 0.8085851937507493, "learning_rate": 4.585289507849838e-05, "loss": 0.5983, "step": 11130 }, { "epoch": 0.7142398973701091, "grad_norm": 0.8748340585570812, "learning_rate": 4.575880894313207e-05, "loss": 0.6462, "step": 11135 }, { "epoch": 0.7145606157793457, "grad_norm": 0.5741182108460992, "learning_rate": 4.566479078941198e-05, "loss": 0.6313, "step": 11140 }, { "epoch": 0.7148813341885825, "grad_norm": 1.3368271859986067, "learning_rate": 4.557084073517305e-05, "loss": 0.5434, "step": 11145 }, { "epoch": 0.7152020525978191, "grad_norm": 0.7497857375686727, "learning_rate": 4.547695889816485e-05, "loss": 0.557, "step": 11150 }, { "epoch": 0.7155227710070559, "grad_norm": 0.8178864612038674, "learning_rate": 4.538314539605155e-05, "loss": 0.6979, "step": 11155 }, { "epoch": 0.7158434894162925, "grad_norm": 0.8969560105198988, "learning_rate": 4.528940034641158e-05, "loss": 0.765, "step": 11160 }, { "epoch": 0.7161642078255291, "grad_norm": 1.2265503200288288, "learning_rate": 4.519572386673768e-05, "loss": 0.5296, "step": 11165 }, { "epoch": 0.7164849262347659, "grad_norm": 0.611571817659739, "learning_rate": 4.510211607443654e-05, "loss": 0.6223, "step": 11170 }, { "epoch": 0.7168056446440025, "grad_norm": 0.8641143822600184, "learning_rate": 4.500857708682883e-05, "loss": 0.7204, "step": 11175 }, { "epoch": 0.7171263630532393, "grad_norm": 0.9563759174291445, "learning_rate": 4.491510702114894e-05, "loss": 0.6728, "step": 11180 }, { "epoch": 0.7174470814624759, "grad_norm": 0.5814502110654781, "learning_rate": 4.482170599454489e-05, "loss": 0.6652, "step": 11185 }, { "epoch": 0.7177677998717127, "grad_norm": 1.0858563785495055, "learning_rate": 4.472837412407825e-05, "loss": 0.5543, "step": 11190 }, { "epoch": 0.7180885182809493, "grad_norm": 0.6644009179012256, "learning_rate": 4.4635111526723826e-05, "loss": 0.8072, "step": 11195 }, { "epoch": 0.718409236690186, "grad_norm": 0.9031430293191645, "learning_rate": 4.454191831936958e-05, "loss": 0.7006, "step": 11200 }, { "epoch": 0.7187299550994227, "grad_norm": 0.6707442290616978, "learning_rate": 4.4448794618816634e-05, "loss": 0.6081, "step": 11205 }, { "epoch": 0.7190506735086594, "grad_norm": 0.4567339031728235, "learning_rate": 4.4355740541778837e-05, "loss": 0.5996, "step": 11210 }, { "epoch": 0.7193713919178961, "grad_norm": 0.8456434286308311, "learning_rate": 4.426275620488293e-05, "loss": 0.5902, "step": 11215 }, { "epoch": 0.7196921103271328, "grad_norm": 0.7375984313670896, "learning_rate": 4.416984172466814e-05, "loss": 0.5592, "step": 11220 }, { "epoch": 0.7200128287363695, "grad_norm": 1.001285278455043, "learning_rate": 4.407699721758614e-05, "loss": 0.4883, "step": 11225 }, { "epoch": 0.7203335471456062, "grad_norm": 1.2917508534051378, "learning_rate": 4.398422280000101e-05, "loss": 0.6768, "step": 11230 }, { "epoch": 0.7206542655548428, "grad_norm": 0.9685204099266428, "learning_rate": 4.3891518588188875e-05, "loss": 0.5883, "step": 11235 }, { "epoch": 0.7209749839640796, "grad_norm": 0.5295383592814902, "learning_rate": 4.379888469833791e-05, "loss": 0.6229, "step": 11240 }, { "epoch": 0.7212957023733162, "grad_norm": 0.9573436890552846, "learning_rate": 4.370632124654811e-05, "loss": 0.7156, "step": 11245 }, { "epoch": 0.721616420782553, "grad_norm": 0.741578858748363, "learning_rate": 4.361382834883131e-05, "loss": 0.6556, "step": 11250 }, { "epoch": 0.7219371391917896, "grad_norm": 0.916633580201409, "learning_rate": 4.3521406121110807e-05, "loss": 0.676, "step": 11255 }, { "epoch": 0.7222578576010263, "grad_norm": 0.3992983111166088, "learning_rate": 4.342905467922133e-05, "loss": 0.4788, "step": 11260 }, { "epoch": 0.722578576010263, "grad_norm": 1.4519640203571154, "learning_rate": 4.333677413890896e-05, "loss": 0.7693, "step": 11265 }, { "epoch": 0.7228992944194996, "grad_norm": 1.014341854127021, "learning_rate": 4.324456461583084e-05, "loss": 0.7161, "step": 11270 }, { "epoch": 0.7232200128287364, "grad_norm": 0.5798440252008737, "learning_rate": 4.315242622555518e-05, "loss": 0.5319, "step": 11275 }, { "epoch": 0.723540731237973, "grad_norm": 1.3961411697107977, "learning_rate": 4.306035908356097e-05, "loss": 0.7755, "step": 11280 }, { "epoch": 0.7238614496472098, "grad_norm": 0.7989332199967835, "learning_rate": 4.296836330523791e-05, "loss": 0.6761, "step": 11285 }, { "epoch": 0.7241821680564464, "grad_norm": 0.5432452037456782, "learning_rate": 4.287643900588634e-05, "loss": 0.5398, "step": 11290 }, { "epoch": 0.7245028864656832, "grad_norm": 1.1422963762576541, "learning_rate": 4.278458630071687e-05, "loss": 0.5321, "step": 11295 }, { "epoch": 0.7248236048749198, "grad_norm": 0.6668170639427147, "learning_rate": 4.2692805304850545e-05, "loss": 0.5796, "step": 11300 }, { "epoch": 0.7251443232841565, "grad_norm": 0.8515640505208902, "learning_rate": 4.260109613331842e-05, "loss": 0.6569, "step": 11305 }, { "epoch": 0.7254650416933932, "grad_norm": 0.7014693919060985, "learning_rate": 4.250945890106156e-05, "loss": 0.6856, "step": 11310 }, { "epoch": 0.7257857601026299, "grad_norm": 1.067030988068662, "learning_rate": 4.241789372293087e-05, "loss": 0.7749, "step": 11315 }, { "epoch": 0.7261064785118666, "grad_norm": 0.7479024679363765, "learning_rate": 4.232640071368691e-05, "loss": 0.5478, "step": 11320 }, { "epoch": 0.7264271969211032, "grad_norm": 1.0084686752935972, "learning_rate": 4.22349799879999e-05, "loss": 0.7788, "step": 11325 }, { "epoch": 0.72674791533034, "grad_norm": 0.6585878195188157, "learning_rate": 4.214363166044932e-05, "loss": 0.6133, "step": 11330 }, { "epoch": 0.7270686337395766, "grad_norm": 0.6784141958893567, "learning_rate": 4.205235584552407e-05, "loss": 0.6019, "step": 11335 }, { "epoch": 0.7273893521488134, "grad_norm": 0.993300088957976, "learning_rate": 4.1961152657622024e-05, "loss": 0.7166, "step": 11340 }, { "epoch": 0.72771007055805, "grad_norm": 0.8874942343310022, "learning_rate": 4.1870022211050074e-05, "loss": 0.6981, "step": 11345 }, { "epoch": 0.7280307889672867, "grad_norm": 1.4921657931640064, "learning_rate": 4.177896462002402e-05, "loss": 0.5832, "step": 11350 }, { "epoch": 0.7283515073765234, "grad_norm": 0.7853192040977804, "learning_rate": 4.168797999866827e-05, "loss": 0.7185, "step": 11355 }, { "epoch": 0.7286722257857601, "grad_norm": 0.7775032508697538, "learning_rate": 4.159706846101574e-05, "loss": 0.5868, "step": 11360 }, { "epoch": 0.7289929441949968, "grad_norm": 0.8328166231193795, "learning_rate": 4.1506230121007894e-05, "loss": 0.6707, "step": 11365 }, { "epoch": 0.7293136626042335, "grad_norm": 1.1556231103657886, "learning_rate": 4.141546509249433e-05, "loss": 0.602, "step": 11370 }, { "epoch": 0.7296343810134702, "grad_norm": 0.6535692635433068, "learning_rate": 4.1324773489232794e-05, "loss": 0.7015, "step": 11375 }, { "epoch": 0.7299550994227069, "grad_norm": 1.0308989718059964, "learning_rate": 4.1234155424889e-05, "loss": 0.6524, "step": 11380 }, { "epoch": 0.7302758178319435, "grad_norm": 0.9042723107486375, "learning_rate": 4.1143611013036556e-05, "loss": 0.6932, "step": 11385 }, { "epoch": 0.7305965362411803, "grad_norm": 1.045581159518661, "learning_rate": 4.105314036715668e-05, "loss": 0.598, "step": 11390 }, { "epoch": 0.7309172546504169, "grad_norm": 0.720438985489428, "learning_rate": 4.096274360063814e-05, "loss": 0.6927, "step": 11395 }, { "epoch": 0.7312379730596537, "grad_norm": 0.7837057060205996, "learning_rate": 4.087242082677721e-05, "loss": 0.6271, "step": 11400 }, { "epoch": 0.7315586914688903, "grad_norm": 0.9277273501073059, "learning_rate": 4.0782172158777296e-05, "loss": 0.7232, "step": 11405 }, { "epoch": 0.731879409878127, "grad_norm": 0.7663141809384151, "learning_rate": 4.069199770974904e-05, "loss": 0.5593, "step": 11410 }, { "epoch": 0.7322001282873637, "grad_norm": 0.7732548069785231, "learning_rate": 4.0601897592709984e-05, "loss": 0.6973, "step": 11415 }, { "epoch": 0.7325208466966003, "grad_norm": 1.0148083244026747, "learning_rate": 4.0511871920584486e-05, "loss": 0.8616, "step": 11420 }, { "epoch": 0.7328415651058371, "grad_norm": 0.7789337008538708, "learning_rate": 4.042192080620374e-05, "loss": 0.7399, "step": 11425 }, { "epoch": 0.7331622835150737, "grad_norm": 0.7411707815027391, "learning_rate": 4.033204436230532e-05, "loss": 0.7219, "step": 11430 }, { "epoch": 0.7334830019243105, "grad_norm": 0.9973447184162525, "learning_rate": 4.0242242701533396e-05, "loss": 0.6579, "step": 11435 }, { "epoch": 0.7338037203335471, "grad_norm": 0.5830094144343125, "learning_rate": 4.015251593643818e-05, "loss": 0.7666, "step": 11440 }, { "epoch": 0.7341244387427839, "grad_norm": 0.9049494653802453, "learning_rate": 4.006286417947627e-05, "loss": 0.7362, "step": 11445 }, { "epoch": 0.7344451571520205, "grad_norm": 1.1555455068409544, "learning_rate": 3.9973287543010064e-05, "loss": 0.7706, "step": 11450 }, { "epoch": 0.7347658755612572, "grad_norm": 0.8236939327253207, "learning_rate": 3.9883786139307864e-05, "loss": 0.4883, "step": 11455 }, { "epoch": 0.7350865939704939, "grad_norm": 0.7242616375495603, "learning_rate": 3.979436008054377e-05, "loss": 0.6765, "step": 11460 }, { "epoch": 0.7354073123797306, "grad_norm": 0.8282782204794581, "learning_rate": 3.97050094787973e-05, "loss": 0.6393, "step": 11465 }, { "epoch": 0.7357280307889673, "grad_norm": 0.5484580528486228, "learning_rate": 3.9615734446053534e-05, "loss": 0.6273, "step": 11470 }, { "epoch": 0.736048749198204, "grad_norm": 0.8342001080027434, "learning_rate": 3.952653509420277e-05, "loss": 0.6517, "step": 11475 }, { "epoch": 0.7363694676074407, "grad_norm": 0.8544406097793438, "learning_rate": 3.9437411535040416e-05, "loss": 0.5679, "step": 11480 }, { "epoch": 0.7366901860166774, "grad_norm": 0.8001118287868482, "learning_rate": 3.9348363880267006e-05, "loss": 0.7448, "step": 11485 }, { "epoch": 0.737010904425914, "grad_norm": 1.0049068620138881, "learning_rate": 3.92593922414878e-05, "loss": 0.5381, "step": 11490 }, { "epoch": 0.7373316228351507, "grad_norm": 1.0836198813580136, "learning_rate": 3.9170496730212944e-05, "loss": 0.6346, "step": 11495 }, { "epoch": 0.7376523412443874, "grad_norm": 0.4690219622238173, "learning_rate": 3.9081677457857045e-05, "loss": 0.5469, "step": 11500 }, { "epoch": 0.7379730596536241, "grad_norm": 0.7653256546259366, "learning_rate": 3.899293453573919e-05, "loss": 0.6005, "step": 11505 }, { "epoch": 0.7382937780628608, "grad_norm": 0.8939110106983141, "learning_rate": 3.890426807508278e-05, "loss": 0.6783, "step": 11510 }, { "epoch": 0.7386144964720975, "grad_norm": 0.775603525768831, "learning_rate": 3.881567818701538e-05, "loss": 0.6916, "step": 11515 }, { "epoch": 0.7389352148813342, "grad_norm": 1.3430493149234304, "learning_rate": 3.872716498256863e-05, "loss": 0.5578, "step": 11520 }, { "epoch": 0.7392559332905709, "grad_norm": 0.715829315420304, "learning_rate": 3.863872857267802e-05, "loss": 0.7686, "step": 11525 }, { "epoch": 0.7395766516998076, "grad_norm": 0.6732314863048653, "learning_rate": 3.8550369068182735e-05, "loss": 0.4974, "step": 11530 }, { "epoch": 0.7398973701090442, "grad_norm": 0.5624440967305854, "learning_rate": 3.846208657982572e-05, "loss": 0.5765, "step": 11535 }, { "epoch": 0.740218088518281, "grad_norm": 0.9351668361698933, "learning_rate": 3.837388121825323e-05, "loss": 0.6699, "step": 11540 }, { "epoch": 0.7405388069275176, "grad_norm": 1.0442410475484458, "learning_rate": 3.828575309401501e-05, "loss": 0.5723, "step": 11545 }, { "epoch": 0.7408595253367544, "grad_norm": 0.897573742077218, "learning_rate": 3.819770231756389e-05, "loss": 0.7723, "step": 11550 }, { "epoch": 0.741180243745991, "grad_norm": 0.6333361868228848, "learning_rate": 3.810972899925575e-05, "loss": 0.5929, "step": 11555 }, { "epoch": 0.7415009621552278, "grad_norm": 1.2414234428777005, "learning_rate": 3.802183324934952e-05, "loss": 0.6754, "step": 11560 }, { "epoch": 0.7418216805644644, "grad_norm": 0.8678280206604037, "learning_rate": 3.793401517800672e-05, "loss": 0.434, "step": 11565 }, { "epoch": 0.742142398973701, "grad_norm": 0.8589814705072975, "learning_rate": 3.784627489529177e-05, "loss": 0.7005, "step": 11570 }, { "epoch": 0.7424631173829378, "grad_norm": 1.096069158153898, "learning_rate": 3.775861251117128e-05, "loss": 0.6066, "step": 11575 }, { "epoch": 0.7427838357921744, "grad_norm": 0.8956575121848285, "learning_rate": 3.76710281355145e-05, "loss": 0.5453, "step": 11580 }, { "epoch": 0.7431045542014112, "grad_norm": 0.9901238623869012, "learning_rate": 3.7583521878092766e-05, "loss": 0.6829, "step": 11585 }, { "epoch": 0.7434252726106478, "grad_norm": 1.1556330315855146, "learning_rate": 3.749609384857952e-05, "loss": 0.6617, "step": 11590 }, { "epoch": 0.7437459910198846, "grad_norm": 0.8946200380979793, "learning_rate": 3.7408744156550235e-05, "loss": 0.6454, "step": 11595 }, { "epoch": 0.7440667094291212, "grad_norm": 0.6811470722359575, "learning_rate": 3.73214729114821e-05, "loss": 0.558, "step": 11600 }, { "epoch": 0.7443874278383579, "grad_norm": 1.2129672803037883, "learning_rate": 3.72342802227541e-05, "loss": 0.6829, "step": 11605 }, { "epoch": 0.7447081462475946, "grad_norm": 0.7287815359687029, "learning_rate": 3.7147166199646665e-05, "loss": 0.7291, "step": 11610 }, { "epoch": 0.7450288646568313, "grad_norm": 0.7381906467511818, "learning_rate": 3.706013095134162e-05, "loss": 0.673, "step": 11615 }, { "epoch": 0.745349583066068, "grad_norm": 1.2592430310132843, "learning_rate": 3.697317458692219e-05, "loss": 0.6236, "step": 11620 }, { "epoch": 0.7456703014753047, "grad_norm": 0.6359130442368803, "learning_rate": 3.688629721537256e-05, "loss": 0.6774, "step": 11625 }, { "epoch": 0.7459910198845414, "grad_norm": 0.9163313019367859, "learning_rate": 3.679949894557808e-05, "loss": 0.6353, "step": 11630 }, { "epoch": 0.7463117382937781, "grad_norm": 0.66124758919148, "learning_rate": 3.671277988632484e-05, "loss": 0.6667, "step": 11635 }, { "epoch": 0.7466324567030147, "grad_norm": 1.093053112833277, "learning_rate": 3.6626140146299715e-05, "loss": 0.6706, "step": 11640 }, { "epoch": 0.7469531751122515, "grad_norm": 0.585918591610346, "learning_rate": 3.653957983409012e-05, "loss": 0.596, "step": 11645 }, { "epoch": 0.7472738935214881, "grad_norm": 0.8785492282676739, "learning_rate": 3.6453099058183936e-05, "loss": 0.8345, "step": 11650 }, { "epoch": 0.7475946119307249, "grad_norm": 1.0886821917358311, "learning_rate": 3.6366697926969415e-05, "loss": 0.7223, "step": 11655 }, { "epoch": 0.7479153303399615, "grad_norm": 0.8352362172770396, "learning_rate": 3.628037654873489e-05, "loss": 0.7974, "step": 11660 }, { "epoch": 0.7482360487491982, "grad_norm": 0.6846055972157917, "learning_rate": 3.619413503166888e-05, "loss": 0.7061, "step": 11665 }, { "epoch": 0.7485567671584349, "grad_norm": 1.1651393765637517, "learning_rate": 3.610797348385965e-05, "loss": 0.6326, "step": 11670 }, { "epoch": 0.7488774855676715, "grad_norm": 0.8887525600265255, "learning_rate": 3.60218920132953e-05, "loss": 0.6543, "step": 11675 }, { "epoch": 0.7491982039769083, "grad_norm": 0.47701205334570973, "learning_rate": 3.5935890727863653e-05, "loss": 0.5758, "step": 11680 }, { "epoch": 0.7495189223861449, "grad_norm": 1.0003500503360518, "learning_rate": 3.5849969735351917e-05, "loss": 0.7507, "step": 11685 }, { "epoch": 0.7498396407953817, "grad_norm": 0.9203454434610632, "learning_rate": 3.57641291434467e-05, "loss": 0.7704, "step": 11690 }, { "epoch": 0.7501603592046183, "grad_norm": 1.035485843783069, "learning_rate": 3.5678369059733884e-05, "loss": 0.7227, "step": 11695 }, { "epoch": 0.7504810776138551, "grad_norm": 0.8574293258900955, "learning_rate": 3.559268959169842e-05, "loss": 0.5932, "step": 11700 }, { "epoch": 0.7508017960230917, "grad_norm": 1.0713424994868566, "learning_rate": 3.55070908467242e-05, "loss": 0.7351, "step": 11705 }, { "epoch": 0.7511225144323285, "grad_norm": 0.7637351663255856, "learning_rate": 3.542157293209394e-05, "loss": 0.5982, "step": 11710 }, { "epoch": 0.7514432328415651, "grad_norm": 0.7283758639132564, "learning_rate": 3.533613595498914e-05, "loss": 0.6919, "step": 11715 }, { "epoch": 0.7517639512508018, "grad_norm": 0.9199615101682994, "learning_rate": 3.525078002248974e-05, "loss": 0.834, "step": 11720 }, { "epoch": 0.7520846696600385, "grad_norm": 0.685052311744196, "learning_rate": 3.516550524157415e-05, "loss": 0.7766, "step": 11725 }, { "epoch": 0.7524053880692751, "grad_norm": 0.9557933778705214, "learning_rate": 3.508031171911913e-05, "loss": 0.7334, "step": 11730 }, { "epoch": 0.7527261064785119, "grad_norm": 0.8217799938196116, "learning_rate": 3.4995199561899496e-05, "loss": 0.6719, "step": 11735 }, { "epoch": 0.7530468248877485, "grad_norm": 0.8490165290571312, "learning_rate": 3.491016887658819e-05, "loss": 0.6352, "step": 11740 }, { "epoch": 0.7533675432969853, "grad_norm": 1.0096737759482532, "learning_rate": 3.4825219769755955e-05, "loss": 0.6278, "step": 11745 }, { "epoch": 0.7536882617062219, "grad_norm": 0.8116824311381272, "learning_rate": 3.4740352347871294e-05, "loss": 0.5794, "step": 11750 }, { "epoch": 0.7540089801154586, "grad_norm": 1.0567664205528664, "learning_rate": 3.4655566717300433e-05, "loss": 0.5817, "step": 11755 }, { "epoch": 0.7543296985246953, "grad_norm": 0.8458879335378663, "learning_rate": 3.457086298430696e-05, "loss": 0.5779, "step": 11760 }, { "epoch": 0.754650416933932, "grad_norm": 0.8982863213171639, "learning_rate": 3.448624125505194e-05, "loss": 0.6697, "step": 11765 }, { "epoch": 0.7549711353431687, "grad_norm": 0.8975989314029491, "learning_rate": 3.440170163559355e-05, "loss": 0.7032, "step": 11770 }, { "epoch": 0.7552918537524054, "grad_norm": 0.8729443546989577, "learning_rate": 3.4317244231887125e-05, "loss": 0.8033, "step": 11775 }, { "epoch": 0.7556125721616421, "grad_norm": 1.0239920545191055, "learning_rate": 3.423286914978493e-05, "loss": 0.672, "step": 11780 }, { "epoch": 0.7559332905708788, "grad_norm": 0.7010189828092076, "learning_rate": 3.414857649503602e-05, "loss": 0.6409, "step": 11785 }, { "epoch": 0.7562540089801154, "grad_norm": 0.8719062018189001, "learning_rate": 3.4064366373286274e-05, "loss": 0.7164, "step": 11790 }, { "epoch": 0.7565747273893522, "grad_norm": 0.7198915627914316, "learning_rate": 3.398023889007794e-05, "loss": 0.6249, "step": 11795 }, { "epoch": 0.7568954457985888, "grad_norm": 0.8718719431875859, "learning_rate": 3.389619415084989e-05, "loss": 0.6064, "step": 11800 }, { "epoch": 0.7572161642078256, "grad_norm": 0.8120042747717762, "learning_rate": 3.381223226093715e-05, "loss": 0.5433, "step": 11805 }, { "epoch": 0.7575368826170622, "grad_norm": 0.9647874073108456, "learning_rate": 3.3728353325570915e-05, "loss": 0.7064, "step": 11810 }, { "epoch": 0.757857601026299, "grad_norm": 1.2538875949194586, "learning_rate": 3.364455744987853e-05, "loss": 0.5527, "step": 11815 }, { "epoch": 0.7581783194355356, "grad_norm": 1.178257170426357, "learning_rate": 3.35608447388831e-05, "loss": 0.6565, "step": 11820 }, { "epoch": 0.7584990378447722, "grad_norm": 0.8864713208910722, "learning_rate": 3.3477215297503605e-05, "loss": 0.5459, "step": 11825 }, { "epoch": 0.758819756254009, "grad_norm": 0.81482691903865, "learning_rate": 3.339366923055458e-05, "loss": 0.6798, "step": 11830 }, { "epoch": 0.7591404746632456, "grad_norm": 0.7808704507490104, "learning_rate": 3.3310206642746125e-05, "loss": 0.6767, "step": 11835 }, { "epoch": 0.7594611930724824, "grad_norm": 0.6063874143510388, "learning_rate": 3.3226827638683665e-05, "loss": 0.7335, "step": 11840 }, { "epoch": 0.759781911481719, "grad_norm": 0.9081154038511268, "learning_rate": 3.3143532322867865e-05, "loss": 0.7284, "step": 11845 }, { "epoch": 0.7601026298909558, "grad_norm": 0.845045773951182, "learning_rate": 3.306032079969459e-05, "loss": 0.7782, "step": 11850 }, { "epoch": 0.7604233483001924, "grad_norm": 0.8991436429034236, "learning_rate": 3.29771931734546e-05, "loss": 0.7148, "step": 11855 }, { "epoch": 0.7607440667094291, "grad_norm": 0.9742693305593477, "learning_rate": 3.2894149548333495e-05, "loss": 0.6244, "step": 11860 }, { "epoch": 0.7610647851186658, "grad_norm": 0.6773700996601912, "learning_rate": 3.281119002841169e-05, "loss": 0.5872, "step": 11865 }, { "epoch": 0.7613855035279025, "grad_norm": 0.8384804126775537, "learning_rate": 3.2728314717664055e-05, "loss": 0.7845, "step": 11870 }, { "epoch": 0.7617062219371392, "grad_norm": 1.1357544575552236, "learning_rate": 3.264552371996008e-05, "loss": 0.6953, "step": 11875 }, { "epoch": 0.7620269403463759, "grad_norm": 0.8516566580601438, "learning_rate": 3.256281713906343e-05, "loss": 0.7256, "step": 11880 }, { "epoch": 0.7623476587556126, "grad_norm": 1.2370541167396898, "learning_rate": 3.248019507863203e-05, "loss": 0.7604, "step": 11885 }, { "epoch": 0.7626683771648493, "grad_norm": 0.9542563866917992, "learning_rate": 3.2397657642217926e-05, "loss": 0.5988, "step": 11890 }, { "epoch": 0.762989095574086, "grad_norm": 1.0432964488893417, "learning_rate": 3.2315204933266996e-05, "loss": 0.6991, "step": 11895 }, { "epoch": 0.7633098139833226, "grad_norm": 1.0011228778914865, "learning_rate": 3.223283705511908e-05, "loss": 0.7298, "step": 11900 }, { "epoch": 0.7636305323925593, "grad_norm": 1.5274397488438434, "learning_rate": 3.215055411100748e-05, "loss": 0.6428, "step": 11905 }, { "epoch": 0.763951250801796, "grad_norm": 0.876587920734237, "learning_rate": 3.2068356204059255e-05, "loss": 0.7244, "step": 11910 }, { "epoch": 0.7642719692110327, "grad_norm": 0.6121339451327354, "learning_rate": 3.198624343729479e-05, "loss": 0.7324, "step": 11915 }, { "epoch": 0.7645926876202694, "grad_norm": 0.8464048080490233, "learning_rate": 3.190421591362772e-05, "loss": 0.7464, "step": 11920 }, { "epoch": 0.7649134060295061, "grad_norm": 0.9880557475834854, "learning_rate": 3.1822273735864984e-05, "loss": 0.71, "step": 11925 }, { "epoch": 0.7652341244387428, "grad_norm": 1.0295342644337049, "learning_rate": 3.174041700670638e-05, "loss": 0.4895, "step": 11930 }, { "epoch": 0.7655548428479795, "grad_norm": 0.7076312841936536, "learning_rate": 3.165864582874477e-05, "loss": 0.691, "step": 11935 }, { "epoch": 0.7658755612572161, "grad_norm": 1.0135591193887252, "learning_rate": 3.1576960304465705e-05, "loss": 0.6266, "step": 11940 }, { "epoch": 0.7661962796664529, "grad_norm": 1.0323761526191306, "learning_rate": 3.149536053624735e-05, "loss": 0.7654, "step": 11945 }, { "epoch": 0.7665169980756895, "grad_norm": 1.55635605359068, "learning_rate": 3.1413846626360536e-05, "loss": 0.7714, "step": 11950 }, { "epoch": 0.7668377164849263, "grad_norm": 0.9497662276751877, "learning_rate": 3.133241867696829e-05, "loss": 0.6683, "step": 11955 }, { "epoch": 0.7671584348941629, "grad_norm": 0.8979757336357795, "learning_rate": 3.1251076790126086e-05, "loss": 0.7516, "step": 11960 }, { "epoch": 0.7674791533033997, "grad_norm": 0.764820887022675, "learning_rate": 3.1169821067781425e-05, "loss": 0.5679, "step": 11965 }, { "epoch": 0.7677998717126363, "grad_norm": 0.5942733392588654, "learning_rate": 3.1088651611773834e-05, "loss": 0.5194, "step": 11970 }, { "epoch": 0.768120590121873, "grad_norm": 0.9490603016131256, "learning_rate": 3.100756852383473e-05, "loss": 0.5963, "step": 11975 }, { "epoch": 0.7684413085311097, "grad_norm": 0.7616783689998372, "learning_rate": 3.092657190558727e-05, "loss": 0.6785, "step": 11980 }, { "epoch": 0.7687620269403463, "grad_norm": 0.830417639785896, "learning_rate": 3.084566185854628e-05, "loss": 0.5892, "step": 11985 }, { "epoch": 0.7690827453495831, "grad_norm": 1.0515557973724121, "learning_rate": 3.076483848411803e-05, "loss": 0.6846, "step": 11990 }, { "epoch": 0.7694034637588197, "grad_norm": 0.9480637021643955, "learning_rate": 3.068410188360022e-05, "loss": 0.741, "step": 11995 }, { "epoch": 0.7697241821680565, "grad_norm": 0.9435811108298884, "learning_rate": 3.0603452158181744e-05, "loss": 0.7019, "step": 12000 }, { "epoch": 0.7700449005772931, "grad_norm": 0.7019989507064325, "learning_rate": 3.052288940894259e-05, "loss": 0.5835, "step": 12005 }, { "epoch": 0.7703656189865298, "grad_norm": 0.6770008543875123, "learning_rate": 3.0442413736853846e-05, "loss": 0.6826, "step": 12010 }, { "epoch": 0.7706863373957665, "grad_norm": 0.7178710129095005, "learning_rate": 3.036202524277735e-05, "loss": 0.7033, "step": 12015 }, { "epoch": 0.7710070558050032, "grad_norm": 0.7298827842977621, "learning_rate": 3.0281724027465708e-05, "loss": 0.6847, "step": 12020 }, { "epoch": 0.7713277742142399, "grad_norm": 1.2518124809303286, "learning_rate": 3.020151019156221e-05, "loss": 0.5659, "step": 12025 }, { "epoch": 0.7716484926234766, "grad_norm": 0.7542697248961158, "learning_rate": 3.0121383835600513e-05, "loss": 0.7575, "step": 12030 }, { "epoch": 0.7719692110327133, "grad_norm": 0.779461786694263, "learning_rate": 3.0041345060004776e-05, "loss": 0.7238, "step": 12035 }, { "epoch": 0.77228992944195, "grad_norm": 1.0655675292269764, "learning_rate": 2.9961393965089203e-05, "loss": 0.7475, "step": 12040 }, { "epoch": 0.7726106478511866, "grad_norm": 1.1044101389504177, "learning_rate": 2.98815306510583e-05, "loss": 0.6353, "step": 12045 }, { "epoch": 0.7729313662604234, "grad_norm": 0.8533414942650657, "learning_rate": 2.9801755218006433e-05, "loss": 0.5867, "step": 12050 }, { "epoch": 0.77325208466966, "grad_norm": 1.0958682723686255, "learning_rate": 2.9722067765917838e-05, "loss": 0.5739, "step": 12055 }, { "epoch": 0.7735728030788968, "grad_norm": 0.7152332630816656, "learning_rate": 2.9642468394666557e-05, "loss": 0.6729, "step": 12060 }, { "epoch": 0.7738935214881334, "grad_norm": 0.9986989562442445, "learning_rate": 2.956295720401612e-05, "loss": 0.6726, "step": 12065 }, { "epoch": 0.7742142398973701, "grad_norm": 0.9811723796412208, "learning_rate": 2.9483534293619685e-05, "loss": 0.5619, "step": 12070 }, { "epoch": 0.7745349583066068, "grad_norm": 0.9118000616924434, "learning_rate": 2.9404199763019645e-05, "loss": 0.6516, "step": 12075 }, { "epoch": 0.7748556767158435, "grad_norm": 0.8942392291019036, "learning_rate": 2.932495371164764e-05, "loss": 0.7949, "step": 12080 }, { "epoch": 0.7751763951250802, "grad_norm": 0.9745393445698103, "learning_rate": 2.9245796238824496e-05, "loss": 0.6836, "step": 12085 }, { "epoch": 0.7754971135343168, "grad_norm": 0.624918898789372, "learning_rate": 2.916672744375991e-05, "loss": 0.5384, "step": 12090 }, { "epoch": 0.7758178319435536, "grad_norm": 0.7577038101937041, "learning_rate": 2.908774742555257e-05, "loss": 0.7673, "step": 12095 }, { "epoch": 0.7761385503527902, "grad_norm": 1.0261935822819983, "learning_rate": 2.9008856283189778e-05, "loss": 0.5503, "step": 12100 }, { "epoch": 0.776459268762027, "grad_norm": 0.8962534874969645, "learning_rate": 2.8930054115547488e-05, "loss": 0.6463, "step": 12105 }, { "epoch": 0.7767799871712636, "grad_norm": 0.70250181904508, "learning_rate": 2.8851341021390155e-05, "loss": 0.5889, "step": 12110 }, { "epoch": 0.7771007055805004, "grad_norm": 0.6163717028953168, "learning_rate": 2.877271709937056e-05, "loss": 0.6057, "step": 12115 }, { "epoch": 0.777421423989737, "grad_norm": 1.139236879333557, "learning_rate": 2.8694182448029795e-05, "loss": 0.6143, "step": 12120 }, { "epoch": 0.7777421423989737, "grad_norm": 0.8597109154676085, "learning_rate": 2.8615737165796974e-05, "loss": 0.6156, "step": 12125 }, { "epoch": 0.7780628608082104, "grad_norm": 1.0377068227971646, "learning_rate": 2.8537381350989288e-05, "loss": 0.7131, "step": 12130 }, { "epoch": 0.778383579217447, "grad_norm": 0.9278713523838525, "learning_rate": 2.8459115101811752e-05, "loss": 0.5643, "step": 12135 }, { "epoch": 0.7787042976266838, "grad_norm": 0.9111079193714665, "learning_rate": 2.838093851635708e-05, "loss": 0.7114, "step": 12140 }, { "epoch": 0.7790250160359204, "grad_norm": 0.636013231630343, "learning_rate": 2.8302851692605748e-05, "loss": 0.5425, "step": 12145 }, { "epoch": 0.7793457344451572, "grad_norm": 0.9437606048473691, "learning_rate": 2.8224854728425555e-05, "loss": 0.7358, "step": 12150 }, { "epoch": 0.7796664528543938, "grad_norm": 0.9877250051200861, "learning_rate": 2.814694772157184e-05, "loss": 0.7881, "step": 12155 }, { "epoch": 0.7799871712636305, "grad_norm": 0.6355892070558739, "learning_rate": 2.806913076968709e-05, "loss": 0.5765, "step": 12160 }, { "epoch": 0.7803078896728672, "grad_norm": 0.8553618089212107, "learning_rate": 2.7991403970300923e-05, "loss": 0.6339, "step": 12165 }, { "epoch": 0.7806286080821039, "grad_norm": 0.7956244875523378, "learning_rate": 2.7913767420830105e-05, "loss": 0.6316, "step": 12170 }, { "epoch": 0.7809493264913406, "grad_norm": 0.74745099568378, "learning_rate": 2.7836221218578052e-05, "loss": 0.5178, "step": 12175 }, { "epoch": 0.7812700449005773, "grad_norm": 2.797197105902477, "learning_rate": 2.775876546073518e-05, "loss": 0.7453, "step": 12180 }, { "epoch": 0.781590763309814, "grad_norm": 0.8203117179056878, "learning_rate": 2.768140024437842e-05, "loss": 0.7123, "step": 12185 }, { "epoch": 0.7819114817190507, "grad_norm": 0.8491800107534502, "learning_rate": 2.7604125666471202e-05, "loss": 0.6031, "step": 12190 }, { "epoch": 0.7822322001282873, "grad_norm": 0.7920825834762689, "learning_rate": 2.7526941823863494e-05, "loss": 0.6918, "step": 12195 }, { "epoch": 0.7825529185375241, "grad_norm": 0.8070095630772426, "learning_rate": 2.744984881329139e-05, "loss": 0.5921, "step": 12200 }, { "epoch": 0.7828736369467607, "grad_norm": 0.6455255637368961, "learning_rate": 2.7372846731377265e-05, "loss": 0.6382, "step": 12205 }, { "epoch": 0.7831943553559975, "grad_norm": 0.92556283214074, "learning_rate": 2.7295935674629457e-05, "loss": 0.5116, "step": 12210 }, { "epoch": 0.7835150737652341, "grad_norm": 1.1170799846804207, "learning_rate": 2.7219115739442215e-05, "loss": 0.6566, "step": 12215 }, { "epoch": 0.7838357921744709, "grad_norm": 0.5890009042735036, "learning_rate": 2.7142387022095638e-05, "loss": 0.6128, "step": 12220 }, { "epoch": 0.7841565105837075, "grad_norm": 0.6327668177080631, "learning_rate": 2.7065749618755455e-05, "loss": 0.6366, "step": 12225 }, { "epoch": 0.7844772289929441, "grad_norm": 0.8664538277798131, "learning_rate": 2.698920362547299e-05, "loss": 0.6013, "step": 12230 }, { "epoch": 0.7847979474021809, "grad_norm": 0.7003044665428215, "learning_rate": 2.6912749138184956e-05, "loss": 0.7929, "step": 12235 }, { "epoch": 0.7851186658114175, "grad_norm": 0.7853265661064053, "learning_rate": 2.6836386252713396e-05, "loss": 0.7137, "step": 12240 }, { "epoch": 0.7854393842206543, "grad_norm": 0.909806347924112, "learning_rate": 2.6760115064765568e-05, "loss": 0.6994, "step": 12245 }, { "epoch": 0.7857601026298909, "grad_norm": 0.8351806612159146, "learning_rate": 2.6683935669933736e-05, "loss": 0.6935, "step": 12250 }, { "epoch": 0.7860808210391277, "grad_norm": 0.7611491943408887, "learning_rate": 2.6607848163695227e-05, "loss": 0.7319, "step": 12255 }, { "epoch": 0.7864015394483643, "grad_norm": 1.122080599336026, "learning_rate": 2.6531852641412082e-05, "loss": 0.6022, "step": 12260 }, { "epoch": 0.7867222578576011, "grad_norm": 1.1817121943287525, "learning_rate": 2.645594919833119e-05, "loss": 0.7494, "step": 12265 }, { "epoch": 0.7870429762668377, "grad_norm": 0.7929071478719117, "learning_rate": 2.6380137929583914e-05, "loss": 0.7783, "step": 12270 }, { "epoch": 0.7873636946760744, "grad_norm": 0.820309764452619, "learning_rate": 2.6304418930186115e-05, "loss": 0.6332, "step": 12275 }, { "epoch": 0.7876844130853111, "grad_norm": 0.707291602928582, "learning_rate": 2.6228792295038106e-05, "loss": 0.537, "step": 12280 }, { "epoch": 0.7880051314945478, "grad_norm": 0.8141400312776754, "learning_rate": 2.6153258118924308e-05, "loss": 0.6322, "step": 12285 }, { "epoch": 0.7883258499037845, "grad_norm": 0.7187432563518902, "learning_rate": 2.6077816496513363e-05, "loss": 0.5032, "step": 12290 }, { "epoch": 0.7886465683130212, "grad_norm": 0.921998673200194, "learning_rate": 2.6002467522357867e-05, "loss": 0.6134, "step": 12295 }, { "epoch": 0.7889672867222579, "grad_norm": 1.4739251939697386, "learning_rate": 2.592721129089427e-05, "loss": 0.6579, "step": 12300 }, { "epoch": 0.7892880051314946, "grad_norm": 0.7698494785751436, "learning_rate": 2.5852047896442853e-05, "loss": 0.6832, "step": 12305 }, { "epoch": 0.7896087235407312, "grad_norm": 0.9676144058038108, "learning_rate": 2.577697743320746e-05, "loss": 0.6789, "step": 12310 }, { "epoch": 0.789929441949968, "grad_norm": 0.7989952533967423, "learning_rate": 2.570199999527557e-05, "loss": 0.683, "step": 12315 }, { "epoch": 0.7902501603592046, "grad_norm": 0.7540668642091226, "learning_rate": 2.5627115676617953e-05, "loss": 0.6137, "step": 12320 }, { "epoch": 0.7905708787684413, "grad_norm": 1.2363573852579546, "learning_rate": 2.555232457108879e-05, "loss": 0.6497, "step": 12325 }, { "epoch": 0.790891597177678, "grad_norm": 0.5683854501183521, "learning_rate": 2.5477626772425356e-05, "loss": 0.6996, "step": 12330 }, { "epoch": 0.7912123155869147, "grad_norm": 0.5533412352742278, "learning_rate": 2.5403022374247953e-05, "loss": 0.7001, "step": 12335 }, { "epoch": 0.7915330339961514, "grad_norm": 0.675236986686075, "learning_rate": 2.5328511470059935e-05, "loss": 0.5805, "step": 12340 }, { "epoch": 0.791853752405388, "grad_norm": 0.7285390988297157, "learning_rate": 2.5254094153247355e-05, "loss": 0.6149, "step": 12345 }, { "epoch": 0.7921744708146248, "grad_norm": 0.80400571870766, "learning_rate": 2.5179770517079093e-05, "loss": 0.6948, "step": 12350 }, { "epoch": 0.7924951892238614, "grad_norm": 0.9377676574780994, "learning_rate": 2.510554065470653e-05, "loss": 0.7308, "step": 12355 }, { "epoch": 0.7928159076330982, "grad_norm": 0.6446906934234106, "learning_rate": 2.5031404659163492e-05, "loss": 0.7255, "step": 12360 }, { "epoch": 0.7931366260423348, "grad_norm": 0.8158537224973699, "learning_rate": 2.495736262336632e-05, "loss": 0.7016, "step": 12365 }, { "epoch": 0.7934573444515716, "grad_norm": 0.9172314841106095, "learning_rate": 2.4883414640113357e-05, "loss": 0.6117, "step": 12370 }, { "epoch": 0.7937780628608082, "grad_norm": 0.7437504326268314, "learning_rate": 2.4809560802085274e-05, "loss": 0.6409, "step": 12375 }, { "epoch": 0.7940987812700449, "grad_norm": 0.6879611505056618, "learning_rate": 2.4735801201844645e-05, "loss": 0.6397, "step": 12380 }, { "epoch": 0.7944194996792816, "grad_norm": 0.9926575009144855, "learning_rate": 2.466213593183593e-05, "loss": 0.6966, "step": 12385 }, { "epoch": 0.7947402180885182, "grad_norm": 0.8127945292903275, "learning_rate": 2.458856508438544e-05, "loss": 0.7704, "step": 12390 }, { "epoch": 0.795060936497755, "grad_norm": 0.8871371492144181, "learning_rate": 2.451508875170104e-05, "loss": 0.5606, "step": 12395 }, { "epoch": 0.7953816549069916, "grad_norm": 0.8206919204372869, "learning_rate": 2.444170702587226e-05, "loss": 0.6932, "step": 12400 }, { "epoch": 0.7957023733162284, "grad_norm": 0.6603633676196071, "learning_rate": 2.436841999886994e-05, "loss": 0.6109, "step": 12405 }, { "epoch": 0.796023091725465, "grad_norm": 0.9151323413512733, "learning_rate": 2.4295227762546267e-05, "loss": 0.6631, "step": 12410 }, { "epoch": 0.7963438101347017, "grad_norm": 0.9827343805814039, "learning_rate": 2.422213040863468e-05, "loss": 0.6563, "step": 12415 }, { "epoch": 0.7966645285439384, "grad_norm": 0.9469619065977057, "learning_rate": 2.414912802874961e-05, "loss": 0.7412, "step": 12420 }, { "epoch": 0.7969852469531751, "grad_norm": 1.3131843532103706, "learning_rate": 2.4076220714386568e-05, "loss": 0.6886, "step": 12425 }, { "epoch": 0.7973059653624118, "grad_norm": 1.2148517258592102, "learning_rate": 2.40034085569218e-05, "loss": 0.6898, "step": 12430 }, { "epoch": 0.7976266837716485, "grad_norm": 0.8095565024509138, "learning_rate": 2.393069164761237e-05, "loss": 0.6122, "step": 12435 }, { "epoch": 0.7979474021808852, "grad_norm": 0.9467420200870824, "learning_rate": 2.3858070077595908e-05, "loss": 0.7174, "step": 12440 }, { "epoch": 0.7982681205901219, "grad_norm": 0.6202794025655268, "learning_rate": 2.3785543937890586e-05, "loss": 0.66, "step": 12445 }, { "epoch": 0.7985888389993585, "grad_norm": 1.0791006971385633, "learning_rate": 2.3713113319394997e-05, "loss": 0.5363, "step": 12450 }, { "epoch": 0.7989095574085953, "grad_norm": 1.026500892588481, "learning_rate": 2.3640778312887945e-05, "loss": 0.7948, "step": 12455 }, { "epoch": 0.7992302758178319, "grad_norm": 0.7967893717258743, "learning_rate": 2.35685390090285e-05, "loss": 0.6343, "step": 12460 }, { "epoch": 0.7995509942270687, "grad_norm": 1.1948126480397625, "learning_rate": 2.3496395498355694e-05, "loss": 0.7174, "step": 12465 }, { "epoch": 0.7998717126363053, "grad_norm": 0.8650772892603197, "learning_rate": 2.34243478712885e-05, "loss": 0.7018, "step": 12470 }, { "epoch": 0.800192431045542, "grad_norm": 0.49196395624702055, "learning_rate": 2.3352396218125827e-05, "loss": 0.5881, "step": 12475 }, { "epoch": 0.8005131494547787, "grad_norm": 0.7575733059076403, "learning_rate": 2.3280540629046143e-05, "loss": 0.7292, "step": 12480 }, { "epoch": 0.8008338678640154, "grad_norm": 0.8513796572354395, "learning_rate": 2.3208781194107664e-05, "loss": 0.6286, "step": 12485 }, { "epoch": 0.8011545862732521, "grad_norm": 0.734121779464679, "learning_rate": 2.3137118003248004e-05, "loss": 0.6818, "step": 12490 }, { "epoch": 0.8014753046824887, "grad_norm": 0.5881243074608535, "learning_rate": 2.306555114628415e-05, "loss": 0.6553, "step": 12495 }, { "epoch": 0.8017960230917255, "grad_norm": 0.6452008879569514, "learning_rate": 2.2994080712912435e-05, "loss": 0.705, "step": 12500 }, { "epoch": 0.8021167415009621, "grad_norm": 1.409626103322556, "learning_rate": 2.2922706792708194e-05, "loss": 0.5859, "step": 12505 }, { "epoch": 0.8024374599101989, "grad_norm": 0.7556485492806266, "learning_rate": 2.2851429475125963e-05, "loss": 0.6137, "step": 12510 }, { "epoch": 0.8027581783194355, "grad_norm": 0.9809427245901448, "learning_rate": 2.2780248849499088e-05, "loss": 0.7344, "step": 12515 }, { "epoch": 0.8030788967286723, "grad_norm": 0.38473648876347516, "learning_rate": 2.2709165005039802e-05, "loss": 0.4635, "step": 12520 }, { "epoch": 0.8033996151379089, "grad_norm": 0.7409973296233345, "learning_rate": 2.263817803083901e-05, "loss": 0.6076, "step": 12525 }, { "epoch": 0.8037203335471456, "grad_norm": 0.7165871670251992, "learning_rate": 2.256728801586616e-05, "loss": 0.6541, "step": 12530 }, { "epoch": 0.8040410519563823, "grad_norm": 0.8518968659931285, "learning_rate": 2.249649504896929e-05, "loss": 0.7555, "step": 12535 }, { "epoch": 0.804361770365619, "grad_norm": 0.9159683373230153, "learning_rate": 2.242579921887471e-05, "loss": 0.6843, "step": 12540 }, { "epoch": 0.8046824887748557, "grad_norm": 0.6228826380501181, "learning_rate": 2.2355200614186987e-05, "loss": 0.5394, "step": 12545 }, { "epoch": 0.8050032071840924, "grad_norm": 0.8002539057082869, "learning_rate": 2.2284699323388923e-05, "loss": 0.7345, "step": 12550 }, { "epoch": 0.8053239255933291, "grad_norm": 0.9766455426961175, "learning_rate": 2.2214295434841248e-05, "loss": 0.7367, "step": 12555 }, { "epoch": 0.8056446440025657, "grad_norm": 0.7046361659107024, "learning_rate": 2.2143989036782707e-05, "loss": 0.5187, "step": 12560 }, { "epoch": 0.8059653624118024, "grad_norm": 0.8108273818757799, "learning_rate": 2.2073780217329786e-05, "loss": 0.6532, "step": 12565 }, { "epoch": 0.8062860808210391, "grad_norm": 0.818379710541348, "learning_rate": 2.2003669064476706e-05, "loss": 0.6059, "step": 12570 }, { "epoch": 0.8066067992302758, "grad_norm": 0.984654681269158, "learning_rate": 2.1933655666095275e-05, "loss": 0.6525, "step": 12575 }, { "epoch": 0.8069275176395125, "grad_norm": 0.9567899833609597, "learning_rate": 2.186374010993476e-05, "loss": 0.7311, "step": 12580 }, { "epoch": 0.8072482360487492, "grad_norm": 0.7463705769882709, "learning_rate": 2.1793922483621876e-05, "loss": 0.6196, "step": 12585 }, { "epoch": 0.8075689544579859, "grad_norm": 0.9733520585461265, "learning_rate": 2.1724202874660492e-05, "loss": 0.7193, "step": 12590 }, { "epoch": 0.8078896728672226, "grad_norm": 0.7681175464199929, "learning_rate": 2.165458137043175e-05, "loss": 0.6522, "step": 12595 }, { "epoch": 0.8082103912764592, "grad_norm": 0.6886221085607587, "learning_rate": 2.158505805819374e-05, "loss": 0.6666, "step": 12600 }, { "epoch": 0.808531109685696, "grad_norm": 0.603328263564938, "learning_rate": 2.1515633025081484e-05, "loss": 0.667, "step": 12605 }, { "epoch": 0.8088518280949326, "grad_norm": 0.8470975793567042, "learning_rate": 2.1446306358106927e-05, "loss": 0.6453, "step": 12610 }, { "epoch": 0.8091725465041694, "grad_norm": 1.0220077328521942, "learning_rate": 2.1377078144158603e-05, "loss": 0.6582, "step": 12615 }, { "epoch": 0.809493264913406, "grad_norm": 0.7129620704949545, "learning_rate": 2.1307948470001782e-05, "loss": 0.5496, "step": 12620 }, { "epoch": 0.8098139833226428, "grad_norm": 0.6343852911809139, "learning_rate": 2.1238917422278116e-05, "loss": 0.5455, "step": 12625 }, { "epoch": 0.8101347017318794, "grad_norm": 0.36707540294038493, "learning_rate": 2.1169985087505694e-05, "loss": 0.6399, "step": 12630 }, { "epoch": 0.810455420141116, "grad_norm": 0.813228299713834, "learning_rate": 2.1101151552078944e-05, "loss": 0.6842, "step": 12635 }, { "epoch": 0.8107761385503528, "grad_norm": 0.6267132658473076, "learning_rate": 2.1032416902268314e-05, "loss": 0.5479, "step": 12640 }, { "epoch": 0.8110968569595894, "grad_norm": 1.275645304461915, "learning_rate": 2.0963781224220503e-05, "loss": 0.6785, "step": 12645 }, { "epoch": 0.8114175753688262, "grad_norm": 0.8576850457893269, "learning_rate": 2.0895244603957998e-05, "loss": 0.7868, "step": 12650 }, { "epoch": 0.8117382937780628, "grad_norm": 0.5639578214670323, "learning_rate": 2.082680712737929e-05, "loss": 0.5559, "step": 12655 }, { "epoch": 0.8120590121872996, "grad_norm": 1.1440696942831554, "learning_rate": 2.0758468880258486e-05, "loss": 0.7089, "step": 12660 }, { "epoch": 0.8123797305965362, "grad_norm": 0.8070604839659317, "learning_rate": 2.0690229948245365e-05, "loss": 0.6695, "step": 12665 }, { "epoch": 0.812700449005773, "grad_norm": 0.6244747169984161, "learning_rate": 2.0622090416865293e-05, "loss": 0.5854, "step": 12670 }, { "epoch": 0.8130211674150096, "grad_norm": 0.506375535891638, "learning_rate": 2.055405037151894e-05, "loss": 0.6383, "step": 12675 }, { "epoch": 0.8133418858242463, "grad_norm": 1.183001348716755, "learning_rate": 2.0486109897482407e-05, "loss": 0.6203, "step": 12680 }, { "epoch": 0.813662604233483, "grad_norm": 0.6143509135493088, "learning_rate": 2.0418269079906936e-05, "loss": 0.5593, "step": 12685 }, { "epoch": 0.8139833226427197, "grad_norm": 0.6234718472183463, "learning_rate": 2.0350528003818825e-05, "loss": 0.6459, "step": 12690 }, { "epoch": 0.8143040410519564, "grad_norm": 1.8693845624658407, "learning_rate": 2.0282886754119478e-05, "loss": 0.7211, "step": 12695 }, { "epoch": 0.8146247594611931, "grad_norm": 0.8258541488205007, "learning_rate": 2.0215345415585107e-05, "loss": 0.5976, "step": 12700 }, { "epoch": 0.8149454778704298, "grad_norm": 0.914739265249098, "learning_rate": 2.0147904072866695e-05, "loss": 0.6308, "step": 12705 }, { "epoch": 0.8152661962796665, "grad_norm": 0.7090505847389847, "learning_rate": 2.0080562810489935e-05, "loss": 0.727, "step": 12710 }, { "epoch": 0.8155869146889031, "grad_norm": 0.9339182937300688, "learning_rate": 2.001332171285505e-05, "loss": 0.6809, "step": 12715 }, { "epoch": 0.8159076330981399, "grad_norm": 0.925613865395883, "learning_rate": 1.9946180864236797e-05, "loss": 0.7004, "step": 12720 }, { "epoch": 0.8162283515073765, "grad_norm": 0.874166373614285, "learning_rate": 1.9879140348784177e-05, "loss": 0.6623, "step": 12725 }, { "epoch": 0.8165490699166132, "grad_norm": 0.8313132986404351, "learning_rate": 1.981220025052056e-05, "loss": 0.6177, "step": 12730 }, { "epoch": 0.8168697883258499, "grad_norm": 0.6383078710564455, "learning_rate": 1.9745360653343393e-05, "loss": 0.6089, "step": 12735 }, { "epoch": 0.8171905067350866, "grad_norm": 0.5929159065490891, "learning_rate": 1.9678621641024132e-05, "loss": 0.5833, "step": 12740 }, { "epoch": 0.8175112251443233, "grad_norm": 0.6839908339425101, "learning_rate": 1.961198329720827e-05, "loss": 0.6513, "step": 12745 }, { "epoch": 0.8178319435535599, "grad_norm": 0.43381578975254104, "learning_rate": 1.9545445705415012e-05, "loss": 0.655, "step": 12750 }, { "epoch": 0.8181526619627967, "grad_norm": 0.666728316560307, "learning_rate": 1.947900894903739e-05, "loss": 0.5284, "step": 12755 }, { "epoch": 0.8184733803720333, "grad_norm": 1.0911535549941562, "learning_rate": 1.9412673111342018e-05, "loss": 0.6534, "step": 12760 }, { "epoch": 0.8187940987812701, "grad_norm": 0.8721963911370444, "learning_rate": 1.934643827546899e-05, "loss": 0.7718, "step": 12765 }, { "epoch": 0.8191148171905067, "grad_norm": 0.9043104390757369, "learning_rate": 1.928030452443187e-05, "loss": 0.7249, "step": 12770 }, { "epoch": 0.8194355355997435, "grad_norm": 0.6520308339900129, "learning_rate": 1.9214271941117458e-05, "loss": 0.569, "step": 12775 }, { "epoch": 0.8197562540089801, "grad_norm": 1.0081351400932888, "learning_rate": 1.9148340608285863e-05, "loss": 0.6623, "step": 12780 }, { "epoch": 0.8200769724182168, "grad_norm": 0.6541686083293314, "learning_rate": 1.908251060857019e-05, "loss": 0.6006, "step": 12785 }, { "epoch": 0.8203976908274535, "grad_norm": 0.6996268349045872, "learning_rate": 1.901678202447663e-05, "loss": 0.6209, "step": 12790 }, { "epoch": 0.8207184092366901, "grad_norm": 0.6137399071233165, "learning_rate": 1.8951154938384207e-05, "loss": 0.7341, "step": 12795 }, { "epoch": 0.8210391276459269, "grad_norm": 0.6979894249139232, "learning_rate": 1.8885629432544717e-05, "loss": 0.6331, "step": 12800 }, { "epoch": 0.8213598460551635, "grad_norm": 1.4876520614972237, "learning_rate": 1.882020558908274e-05, "loss": 0.5262, "step": 12805 }, { "epoch": 0.8216805644644003, "grad_norm": 1.1310428300822517, "learning_rate": 1.8754883489995335e-05, "loss": 0.6548, "step": 12810 }, { "epoch": 0.8220012828736369, "grad_norm": 0.819858534428383, "learning_rate": 1.868966321715212e-05, "loss": 0.6514, "step": 12815 }, { "epoch": 0.8223220012828736, "grad_norm": 0.9699270159513138, "learning_rate": 1.8624544852295046e-05, "loss": 0.6668, "step": 12820 }, { "epoch": 0.8226427196921103, "grad_norm": 1.1171340784169779, "learning_rate": 1.8559528477038325e-05, "loss": 0.7466, "step": 12825 }, { "epoch": 0.822963438101347, "grad_norm": 0.9010920277558152, "learning_rate": 1.849461417286843e-05, "loss": 0.5722, "step": 12830 }, { "epoch": 0.8232841565105837, "grad_norm": 0.8446632185572971, "learning_rate": 1.8429802021143816e-05, "loss": 0.7673, "step": 12835 }, { "epoch": 0.8236048749198204, "grad_norm": 0.8445623736137308, "learning_rate": 1.8365092103094938e-05, "loss": 0.6343, "step": 12840 }, { "epoch": 0.8239255933290571, "grad_norm": 1.3224256501204117, "learning_rate": 1.83004844998241e-05, "loss": 0.6446, "step": 12845 }, { "epoch": 0.8242463117382938, "grad_norm": 1.2509505443818558, "learning_rate": 1.8235979292305448e-05, "loss": 0.5908, "step": 12850 }, { "epoch": 0.8245670301475305, "grad_norm": 1.045236864985607, "learning_rate": 1.8171576561384718e-05, "loss": 0.6833, "step": 12855 }, { "epoch": 0.8248877485567672, "grad_norm": 0.8131230488754208, "learning_rate": 1.8107276387779194e-05, "loss": 0.6713, "step": 12860 }, { "epoch": 0.8252084669660038, "grad_norm": 0.9987203815522278, "learning_rate": 1.8043078852077723e-05, "loss": 0.6382, "step": 12865 }, { "epoch": 0.8255291853752406, "grad_norm": 0.8378880198765352, "learning_rate": 1.797898403474041e-05, "loss": 0.651, "step": 12870 }, { "epoch": 0.8258499037844772, "grad_norm": 0.69860101125052, "learning_rate": 1.7914992016098652e-05, "loss": 0.6678, "step": 12875 }, { "epoch": 0.826170622193714, "grad_norm": 0.7906981356515638, "learning_rate": 1.7851102876355064e-05, "loss": 0.7724, "step": 12880 }, { "epoch": 0.8264913406029506, "grad_norm": 0.7220660188316776, "learning_rate": 1.778731669558322e-05, "loss": 0.7528, "step": 12885 }, { "epoch": 0.8268120590121874, "grad_norm": 0.8602114436332251, "learning_rate": 1.772363355372776e-05, "loss": 0.7355, "step": 12890 }, { "epoch": 0.827132777421424, "grad_norm": 0.7936909578079667, "learning_rate": 1.7660053530604103e-05, "loss": 0.5939, "step": 12895 }, { "epoch": 0.8274534958306606, "grad_norm": 0.7386556230325233, "learning_rate": 1.759657670589844e-05, "loss": 0.7065, "step": 12900 }, { "epoch": 0.8277742142398974, "grad_norm": 0.7508393958424202, "learning_rate": 1.7533203159167653e-05, "loss": 0.7995, "step": 12905 }, { "epoch": 0.828094932649134, "grad_norm": 1.484996895062748, "learning_rate": 1.7469932969839133e-05, "loss": 0.5822, "step": 12910 }, { "epoch": 0.8284156510583708, "grad_norm": 0.7889368806667416, "learning_rate": 1.7406766217210813e-05, "loss": 0.6915, "step": 12915 }, { "epoch": 0.8287363694676074, "grad_norm": 1.043078354293378, "learning_rate": 1.7343702980450882e-05, "loss": 0.6678, "step": 12920 }, { "epoch": 0.8290570878768442, "grad_norm": 0.5235441869984315, "learning_rate": 1.7280743338597903e-05, "loss": 0.6732, "step": 12925 }, { "epoch": 0.8293778062860808, "grad_norm": 0.9827303368182867, "learning_rate": 1.7217887370560527e-05, "loss": 0.5817, "step": 12930 }, { "epoch": 0.8296985246953175, "grad_norm": 0.8919025135393817, "learning_rate": 1.715513515511743e-05, "loss": 0.5394, "step": 12935 }, { "epoch": 0.8300192431045542, "grad_norm": 0.8422357074138689, "learning_rate": 1.7092486770917382e-05, "loss": 0.7755, "step": 12940 }, { "epoch": 0.8303399615137909, "grad_norm": 0.9473245373995116, "learning_rate": 1.7029942296478885e-05, "loss": 0.6846, "step": 12945 }, { "epoch": 0.8306606799230276, "grad_norm": 0.6373840068433619, "learning_rate": 1.6967501810190323e-05, "loss": 0.6543, "step": 12950 }, { "epoch": 0.8309813983322643, "grad_norm": 0.7843610971634594, "learning_rate": 1.6905165390309665e-05, "loss": 0.6431, "step": 12955 }, { "epoch": 0.831302116741501, "grad_norm": 1.1652096610055944, "learning_rate": 1.6842933114964466e-05, "loss": 0.8221, "step": 12960 }, { "epoch": 0.8316228351507376, "grad_norm": 0.8194937278113069, "learning_rate": 1.6780805062151816e-05, "loss": 0.5232, "step": 12965 }, { "epoch": 0.8319435535599743, "grad_norm": 1.188666287581691, "learning_rate": 1.6718781309738073e-05, "loss": 0.6604, "step": 12970 }, { "epoch": 0.832264271969211, "grad_norm": 0.8641382912001553, "learning_rate": 1.665686193545898e-05, "loss": 0.5844, "step": 12975 }, { "epoch": 0.8325849903784477, "grad_norm": 0.7062740744596516, "learning_rate": 1.6595047016919373e-05, "loss": 0.6843, "step": 12980 }, { "epoch": 0.8329057087876844, "grad_norm": 1.7666107387397485, "learning_rate": 1.6533336631593276e-05, "loss": 0.5533, "step": 12985 }, { "epoch": 0.8332264271969211, "grad_norm": 0.6713809127329562, "learning_rate": 1.6471730856823587e-05, "loss": 0.5803, "step": 12990 }, { "epoch": 0.8335471456061578, "grad_norm": 0.789870715650865, "learning_rate": 1.6410229769822137e-05, "loss": 0.5722, "step": 12995 }, { "epoch": 0.8338678640153945, "grad_norm": 0.694543681011162, "learning_rate": 1.6348833447669596e-05, "loss": 0.7518, "step": 13000 }, { "epoch": 0.8341885824246311, "grad_norm": 0.9060155570486944, "learning_rate": 1.6287541967315246e-05, "loss": 0.6968, "step": 13005 }, { "epoch": 0.8345093008338679, "grad_norm": 0.7521276185282114, "learning_rate": 1.6226355405577052e-05, "loss": 0.7398, "step": 13010 }, { "epoch": 0.8348300192431045, "grad_norm": 0.6239824879078599, "learning_rate": 1.6165273839141425e-05, "loss": 0.5993, "step": 13015 }, { "epoch": 0.8351507376523413, "grad_norm": 0.8788280197433859, "learning_rate": 1.610429734456317e-05, "loss": 0.5281, "step": 13020 }, { "epoch": 0.8354714560615779, "grad_norm": 0.5708218830810341, "learning_rate": 1.604342599826548e-05, "loss": 0.6636, "step": 13025 }, { "epoch": 0.8357921744708147, "grad_norm": 0.9995506015609548, "learning_rate": 1.5982659876539706e-05, "loss": 0.6224, "step": 13030 }, { "epoch": 0.8361128928800513, "grad_norm": 0.6985670528256153, "learning_rate": 1.5921999055545322e-05, "loss": 0.7875, "step": 13035 }, { "epoch": 0.8364336112892881, "grad_norm": 1.1017729058211603, "learning_rate": 1.5861443611309836e-05, "loss": 0.5689, "step": 13040 }, { "epoch": 0.8367543296985247, "grad_norm": 0.6102105059220153, "learning_rate": 1.5800993619728645e-05, "loss": 0.6071, "step": 13045 }, { "epoch": 0.8370750481077613, "grad_norm": 1.0918121069567406, "learning_rate": 1.574064915656508e-05, "loss": 0.6389, "step": 13050 }, { "epoch": 0.8373957665169981, "grad_norm": 0.8119509757109902, "learning_rate": 1.5680410297450097e-05, "loss": 0.6904, "step": 13055 }, { "epoch": 0.8377164849262347, "grad_norm": 1.0654010067070523, "learning_rate": 1.56202771178824e-05, "loss": 0.6806, "step": 13060 }, { "epoch": 0.8380372033354715, "grad_norm": 1.003140917229182, "learning_rate": 1.5560249693228167e-05, "loss": 0.7506, "step": 13065 }, { "epoch": 0.8383579217447081, "grad_norm": 0.8104009198927022, "learning_rate": 1.5500328098721017e-05, "loss": 0.6771, "step": 13070 }, { "epoch": 0.8386786401539449, "grad_norm": 0.6505916854083006, "learning_rate": 1.5440512409462027e-05, "loss": 0.4606, "step": 13075 }, { "epoch": 0.8389993585631815, "grad_norm": 0.8172274238106711, "learning_rate": 1.5380802700419437e-05, "loss": 0.6273, "step": 13080 }, { "epoch": 0.8393200769724182, "grad_norm": 0.8412486560565198, "learning_rate": 1.5321199046428748e-05, "loss": 0.6232, "step": 13085 }, { "epoch": 0.8396407953816549, "grad_norm": 1.2677355193498017, "learning_rate": 1.526170152219246e-05, "loss": 0.6965, "step": 13090 }, { "epoch": 0.8399615137908916, "grad_norm": 1.1729148810404941, "learning_rate": 1.520231020228008e-05, "loss": 0.6742, "step": 13095 }, { "epoch": 0.8402822322001283, "grad_norm": 0.9492910072998716, "learning_rate": 1.51430251611281e-05, "loss": 0.6427, "step": 13100 }, { "epoch": 0.840602950609365, "grad_norm": 0.9485664054113067, "learning_rate": 1.508384647303962e-05, "loss": 0.7599, "step": 13105 }, { "epoch": 0.8409236690186017, "grad_norm": 0.7710450909617227, "learning_rate": 1.5024774212184644e-05, "loss": 0.7211, "step": 13110 }, { "epoch": 0.8412443874278384, "grad_norm": 1.4732302257890362, "learning_rate": 1.496580845259965e-05, "loss": 0.5757, "step": 13115 }, { "epoch": 0.841565105837075, "grad_norm": 0.815748738677427, "learning_rate": 1.4906949268187731e-05, "loss": 0.7202, "step": 13120 }, { "epoch": 0.8418858242463118, "grad_norm": 0.7569265134733956, "learning_rate": 1.4848196732718333e-05, "loss": 0.5067, "step": 13125 }, { "epoch": 0.8422065426555484, "grad_norm": 0.7019350874014, "learning_rate": 1.4789550919827255e-05, "loss": 0.6555, "step": 13130 }, { "epoch": 0.8425272610647851, "grad_norm": 1.070502908495116, "learning_rate": 1.4731011903016589e-05, "loss": 0.5612, "step": 13135 }, { "epoch": 0.8428479794740218, "grad_norm": 0.8746378057344433, "learning_rate": 1.4672579755654492e-05, "loss": 0.6644, "step": 13140 }, { "epoch": 0.8431686978832585, "grad_norm": 0.9069204901759049, "learning_rate": 1.4614254550975282e-05, "loss": 0.6041, "step": 13145 }, { "epoch": 0.8434894162924952, "grad_norm": 0.6286045045253976, "learning_rate": 1.455603636207915e-05, "loss": 0.573, "step": 13150 }, { "epoch": 0.8438101347017318, "grad_norm": 0.8046184953958996, "learning_rate": 1.4497925261932188e-05, "loss": 0.7031, "step": 13155 }, { "epoch": 0.8441308531109686, "grad_norm": 0.9289022471342262, "learning_rate": 1.4439921323366323e-05, "loss": 0.6532, "step": 13160 }, { "epoch": 0.8444515715202052, "grad_norm": 0.8155940315800527, "learning_rate": 1.4382024619079105e-05, "loss": 0.6537, "step": 13165 }, { "epoch": 0.844772289929442, "grad_norm": 1.064740365786613, "learning_rate": 1.432423522163372e-05, "loss": 0.598, "step": 13170 }, { "epoch": 0.8450930083386786, "grad_norm": 0.7962110283298796, "learning_rate": 1.4266553203458831e-05, "loss": 0.7714, "step": 13175 }, { "epoch": 0.8454137267479154, "grad_norm": 0.9891349725088471, "learning_rate": 1.4208978636848591e-05, "loss": 0.666, "step": 13180 }, { "epoch": 0.845734445157152, "grad_norm": 0.4894444101288945, "learning_rate": 1.4151511593962418e-05, "loss": 0.5697, "step": 13185 }, { "epoch": 0.8460551635663887, "grad_norm": 0.9733970578229911, "learning_rate": 1.4094152146824969e-05, "loss": 0.639, "step": 13190 }, { "epoch": 0.8463758819756254, "grad_norm": 0.7185939555951706, "learning_rate": 1.40369003673261e-05, "loss": 0.6608, "step": 13195 }, { "epoch": 0.846696600384862, "grad_norm": 0.9770382341303654, "learning_rate": 1.3979756327220683e-05, "loss": 0.5714, "step": 13200 }, { "epoch": 0.8470173187940988, "grad_norm": 0.9521286519290345, "learning_rate": 1.3922720098128527e-05, "loss": 0.7672, "step": 13205 }, { "epoch": 0.8473380372033354, "grad_norm": 1.0026426650083589, "learning_rate": 1.3865791751534418e-05, "loss": 0.589, "step": 13210 }, { "epoch": 0.8476587556125722, "grad_norm": 0.7945648077908503, "learning_rate": 1.3808971358787837e-05, "loss": 0.5791, "step": 13215 }, { "epoch": 0.8479794740218088, "grad_norm": 0.6890626619071494, "learning_rate": 1.3752258991103018e-05, "loss": 0.7313, "step": 13220 }, { "epoch": 0.8483001924310456, "grad_norm": 0.8523591274592248, "learning_rate": 1.369565471955878e-05, "loss": 0.7, "step": 13225 }, { "epoch": 0.8486209108402822, "grad_norm": 0.6661468510777631, "learning_rate": 1.3639158615098457e-05, "loss": 0.681, "step": 13230 }, { "epoch": 0.8489416292495189, "grad_norm": 0.498183926121059, "learning_rate": 1.3582770748529839e-05, "loss": 0.6238, "step": 13235 }, { "epoch": 0.8492623476587556, "grad_norm": 0.9855613055277577, "learning_rate": 1.3526491190525025e-05, "loss": 0.7218, "step": 13240 }, { "epoch": 0.8495830660679923, "grad_norm": 0.42718056670086024, "learning_rate": 1.3470320011620418e-05, "loss": 0.5768, "step": 13245 }, { "epoch": 0.849903784477229, "grad_norm": 1.0362257024186183, "learning_rate": 1.3414257282216535e-05, "loss": 0.6332, "step": 13250 }, { "epoch": 0.8502245028864657, "grad_norm": 0.8990446366365678, "learning_rate": 1.3358303072578027e-05, "loss": 0.6709, "step": 13255 }, { "epoch": 0.8505452212957024, "grad_norm": 0.7211479323078617, "learning_rate": 1.3302457452833484e-05, "loss": 0.5878, "step": 13260 }, { "epoch": 0.8508659397049391, "grad_norm": 1.1924552884788637, "learning_rate": 1.3246720492975396e-05, "loss": 0.7302, "step": 13265 }, { "epoch": 0.8511866581141757, "grad_norm": 0.6589451039855936, "learning_rate": 1.3191092262860127e-05, "loss": 0.6891, "step": 13270 }, { "epoch": 0.8515073765234125, "grad_norm": 0.6379938202383435, "learning_rate": 1.3135572832207699e-05, "loss": 0.5751, "step": 13275 }, { "epoch": 0.8518280949326491, "grad_norm": 0.6107227064835382, "learning_rate": 1.3080162270601826e-05, "loss": 0.6705, "step": 13280 }, { "epoch": 0.8521488133418859, "grad_norm": 0.7796857101023206, "learning_rate": 1.3024860647489756e-05, "loss": 0.595, "step": 13285 }, { "epoch": 0.8524695317511225, "grad_norm": 1.067556813441523, "learning_rate": 1.2969668032182147e-05, "loss": 0.6906, "step": 13290 }, { "epoch": 0.8527902501603593, "grad_norm": 0.7705240841097785, "learning_rate": 1.2914584493853144e-05, "loss": 0.6176, "step": 13295 }, { "epoch": 0.8531109685695959, "grad_norm": 0.9227266859657003, "learning_rate": 1.285961010154011e-05, "loss": 0.6479, "step": 13300 }, { "epoch": 0.8534316869788325, "grad_norm": 1.0189541311376396, "learning_rate": 1.2804744924143608e-05, "loss": 0.728, "step": 13305 }, { "epoch": 0.8537524053880693, "grad_norm": 0.8401997316168908, "learning_rate": 1.2749989030427344e-05, "loss": 0.7617, "step": 13310 }, { "epoch": 0.8540731237973059, "grad_norm": 0.9093797714776795, "learning_rate": 1.269534248901807e-05, "loss": 0.5851, "step": 13315 }, { "epoch": 0.8543938422065427, "grad_norm": 1.1037034088263697, "learning_rate": 1.2640805368405462e-05, "loss": 0.6118, "step": 13320 }, { "epoch": 0.8547145606157793, "grad_norm": 0.7177427685245759, "learning_rate": 1.2586377736942034e-05, "loss": 0.7042, "step": 13325 }, { "epoch": 0.8550352790250161, "grad_norm": 0.9633359403241921, "learning_rate": 1.2532059662843144e-05, "loss": 0.7182, "step": 13330 }, { "epoch": 0.8553559974342527, "grad_norm": 0.8564133887667676, "learning_rate": 1.2477851214186754e-05, "loss": 0.5807, "step": 13335 }, { "epoch": 0.8556767158434894, "grad_norm": 1.0067512789243385, "learning_rate": 1.2423752458913518e-05, "loss": 0.6689, "step": 13340 }, { "epoch": 0.8559974342527261, "grad_norm": 0.6740456644820353, "learning_rate": 1.2369763464826533e-05, "loss": 0.5505, "step": 13345 }, { "epoch": 0.8563181526619628, "grad_norm": 0.7485205146558563, "learning_rate": 1.2315884299591362e-05, "loss": 0.7485, "step": 13350 }, { "epoch": 0.8566388710711995, "grad_norm": 0.9943455478406926, "learning_rate": 1.2262115030735944e-05, "loss": 0.7464, "step": 13355 }, { "epoch": 0.8569595894804362, "grad_norm": 0.7832997459113116, "learning_rate": 1.2208455725650436e-05, "loss": 0.6956, "step": 13360 }, { "epoch": 0.8572803078896729, "grad_norm": 0.9569726126068407, "learning_rate": 1.2154906451587189e-05, "loss": 0.7132, "step": 13365 }, { "epoch": 0.8576010262989096, "grad_norm": 0.7447552856015294, "learning_rate": 1.2101467275660661e-05, "loss": 0.4959, "step": 13370 }, { "epoch": 0.8579217447081462, "grad_norm": 0.8033856598382162, "learning_rate": 1.2048138264847297e-05, "loss": 0.8208, "step": 13375 }, { "epoch": 0.858242463117383, "grad_norm": 1.2548309542667209, "learning_rate": 1.1994919485985522e-05, "loss": 0.5933, "step": 13380 }, { "epoch": 0.8585631815266196, "grad_norm": 0.8849084463562876, "learning_rate": 1.1941811005775538e-05, "loss": 0.7345, "step": 13385 }, { "epoch": 0.8588838999358563, "grad_norm": 1.0662756941569218, "learning_rate": 1.1888812890779377e-05, "loss": 0.672, "step": 13390 }, { "epoch": 0.859204618345093, "grad_norm": 1.4484403343446357, "learning_rate": 1.1835925207420694e-05, "loss": 0.606, "step": 13395 }, { "epoch": 0.8595253367543297, "grad_norm": 0.8332555994611591, "learning_rate": 1.1783148021984725e-05, "loss": 0.692, "step": 13400 }, { "epoch": 0.8598460551635664, "grad_norm": 0.7857634142558743, "learning_rate": 1.1730481400618299e-05, "loss": 0.8791, "step": 13405 }, { "epoch": 0.8601667735728031, "grad_norm": 1.0726454797623632, "learning_rate": 1.167792540932957e-05, "loss": 0.6978, "step": 13410 }, { "epoch": 0.8604874919820398, "grad_norm": 0.662627507867472, "learning_rate": 1.162548011398814e-05, "loss": 0.6655, "step": 13415 }, { "epoch": 0.8608082103912764, "grad_norm": 0.7427411411925819, "learning_rate": 1.1573145580324785e-05, "loss": 0.7019, "step": 13420 }, { "epoch": 0.8611289288005132, "grad_norm": 0.8465518983483786, "learning_rate": 1.1520921873931489e-05, "loss": 0.7452, "step": 13425 }, { "epoch": 0.8614496472097498, "grad_norm": 0.5455286801662246, "learning_rate": 1.1468809060261399e-05, "loss": 0.652, "step": 13430 }, { "epoch": 0.8617703656189866, "grad_norm": 0.8972113556345591, "learning_rate": 1.1416807204628533e-05, "loss": 0.5988, "step": 13435 }, { "epoch": 0.8620910840282232, "grad_norm": 0.6854697322056322, "learning_rate": 1.1364916372208e-05, "loss": 0.696, "step": 13440 }, { "epoch": 0.86241180243746, "grad_norm": 0.678150343614853, "learning_rate": 1.1313136628035647e-05, "loss": 0.5252, "step": 13445 }, { "epoch": 0.8627325208466966, "grad_norm": 0.6285060401132421, "learning_rate": 1.1261468037008172e-05, "loss": 0.4725, "step": 13450 }, { "epoch": 0.8630532392559332, "grad_norm": 0.6510845504498061, "learning_rate": 1.1209910663882916e-05, "loss": 0.5565, "step": 13455 }, { "epoch": 0.86337395766517, "grad_norm": 1.2698183413935256, "learning_rate": 1.1158464573277816e-05, "loss": 0.7544, "step": 13460 }, { "epoch": 0.8636946760744066, "grad_norm": 1.048484623181104, "learning_rate": 1.1107129829671393e-05, "loss": 0.6762, "step": 13465 }, { "epoch": 0.8640153944836434, "grad_norm": 0.8197138470113798, "learning_rate": 1.1055906497402534e-05, "loss": 0.7671, "step": 13470 }, { "epoch": 0.86433611289288, "grad_norm": 0.8060735013585868, "learning_rate": 1.1004794640670602e-05, "loss": 0.7412, "step": 13475 }, { "epoch": 0.8646568313021168, "grad_norm": 0.5202202198681646, "learning_rate": 1.0953794323535138e-05, "loss": 0.617, "step": 13480 }, { "epoch": 0.8649775497113534, "grad_norm": 0.9060221838859691, "learning_rate": 1.0902905609915925e-05, "loss": 0.6724, "step": 13485 }, { "epoch": 0.8652982681205901, "grad_norm": 0.9948896143875089, "learning_rate": 1.0852128563592911e-05, "loss": 0.6916, "step": 13490 }, { "epoch": 0.8656189865298268, "grad_norm": 0.6185205159442889, "learning_rate": 1.0801463248206012e-05, "loss": 0.6155, "step": 13495 }, { "epoch": 0.8659397049390635, "grad_norm": 0.8621415617622489, "learning_rate": 1.0750909727255231e-05, "loss": 0.5641, "step": 13500 }, { "epoch": 0.8662604233483002, "grad_norm": 1.0099987644568347, "learning_rate": 1.0700468064100278e-05, "loss": 0.5874, "step": 13505 }, { "epoch": 0.8665811417575369, "grad_norm": 0.7860625683994522, "learning_rate": 1.0650138321960834e-05, "loss": 0.6447, "step": 13510 }, { "epoch": 0.8669018601667736, "grad_norm": 1.0075130412273372, "learning_rate": 1.0599920563916233e-05, "loss": 0.6428, "step": 13515 }, { "epoch": 0.8672225785760103, "grad_norm": 0.8433746537048423, "learning_rate": 1.0549814852905427e-05, "loss": 0.6156, "step": 13520 }, { "epoch": 0.8675432969852469, "grad_norm": 0.6911458595910109, "learning_rate": 1.0499821251727038e-05, "loss": 0.7697, "step": 13525 }, { "epoch": 0.8678640153944837, "grad_norm": 0.7261479775249019, "learning_rate": 1.044993982303909e-05, "loss": 0.7353, "step": 13530 }, { "epoch": 0.8681847338037203, "grad_norm": 0.5256687873474478, "learning_rate": 1.040017062935902e-05, "loss": 0.5737, "step": 13535 }, { "epoch": 0.868505452212957, "grad_norm": 1.0493206252194889, "learning_rate": 1.035051373306366e-05, "loss": 0.6215, "step": 13540 }, { "epoch": 0.8688261706221937, "grad_norm": 0.665208544741004, "learning_rate": 1.0300969196389033e-05, "loss": 0.6073, "step": 13545 }, { "epoch": 0.8691468890314304, "grad_norm": 0.6978534685649864, "learning_rate": 1.0251537081430406e-05, "loss": 0.5837, "step": 13550 }, { "epoch": 0.8694676074406671, "grad_norm": 0.7579892159049441, "learning_rate": 1.0202217450142082e-05, "loss": 0.5604, "step": 13555 }, { "epoch": 0.8697883258499037, "grad_norm": 0.6514517952782195, "learning_rate": 1.015301036433739e-05, "loss": 0.6971, "step": 13560 }, { "epoch": 0.8701090442591405, "grad_norm": 0.4398371785948417, "learning_rate": 1.0103915885688686e-05, "loss": 0.5459, "step": 13565 }, { "epoch": 0.8704297626683771, "grad_norm": 0.6924160948174624, "learning_rate": 1.0054934075727062e-05, "loss": 0.5386, "step": 13570 }, { "epoch": 0.8707504810776139, "grad_norm": 0.9269090072648052, "learning_rate": 1.0006064995842513e-05, "loss": 0.7547, "step": 13575 }, { "epoch": 0.8710711994868505, "grad_norm": 0.788185049843599, "learning_rate": 9.957308707283675e-06, "loss": 0.6128, "step": 13580 }, { "epoch": 0.8713919178960873, "grad_norm": 0.7308595928706564, "learning_rate": 9.90866527115788e-06, "loss": 0.6036, "step": 13585 }, { "epoch": 0.8717126363053239, "grad_norm": 0.7092354057653707, "learning_rate": 9.860134748430972e-06, "loss": 0.7038, "step": 13590 }, { "epoch": 0.8720333547145607, "grad_norm": 1.0470346737728682, "learning_rate": 9.811717199927273e-06, "loss": 0.73, "step": 13595 }, { "epoch": 0.8723540731237973, "grad_norm": 1.2863495939351028, "learning_rate": 9.763412686329575e-06, "loss": 0.7084, "step": 13600 }, { "epoch": 0.872674791533034, "grad_norm": 0.768903275631644, "learning_rate": 9.71522126817892e-06, "loss": 0.7444, "step": 13605 }, { "epoch": 0.8729955099422707, "grad_norm": 0.8561382895899066, "learning_rate": 9.667143005874679e-06, "loss": 0.6743, "step": 13610 }, { "epoch": 0.8733162283515074, "grad_norm": 0.6255033102428371, "learning_rate": 9.619177959674353e-06, "loss": 0.6357, "step": 13615 }, { "epoch": 0.8736369467607441, "grad_norm": 0.8367879131361138, "learning_rate": 9.57132618969354e-06, "loss": 0.7229, "step": 13620 }, { "epoch": 0.8739576651699807, "grad_norm": 0.6292130728042913, "learning_rate": 9.523587755905938e-06, "loss": 0.6561, "step": 13625 }, { "epoch": 0.8742783835792175, "grad_norm": 0.9860204738083063, "learning_rate": 9.475962718143106e-06, "loss": 0.6323, "step": 13630 }, { "epoch": 0.8745991019884541, "grad_norm": 0.841887275726057, "learning_rate": 9.428451136094541e-06, "loss": 0.6762, "step": 13635 }, { "epoch": 0.8749198203976908, "grad_norm": 0.7100122528682058, "learning_rate": 9.381053069307499e-06, "loss": 0.5494, "step": 13640 }, { "epoch": 0.8752405388069275, "grad_norm": 1.1202599763010757, "learning_rate": 9.33376857718703e-06, "loss": 0.5936, "step": 13645 }, { "epoch": 0.8755612572161642, "grad_norm": 1.0773135254923245, "learning_rate": 9.286597718995783e-06, "loss": 0.5523, "step": 13650 }, { "epoch": 0.8758819756254009, "grad_norm": 0.7262011668633317, "learning_rate": 9.239540553853987e-06, "loss": 0.7559, "step": 13655 }, { "epoch": 0.8762026940346376, "grad_norm": 1.1845562776611291, "learning_rate": 9.192597140739445e-06, "loss": 0.6214, "step": 13660 }, { "epoch": 0.8765234124438743, "grad_norm": 1.0832215867500623, "learning_rate": 9.145767538487282e-06, "loss": 0.6363, "step": 13665 }, { "epoch": 0.876844130853111, "grad_norm": 0.8384508766840872, "learning_rate": 9.099051805790081e-06, "loss": 0.7162, "step": 13670 }, { "epoch": 0.8771648492623476, "grad_norm": 0.7886740113805487, "learning_rate": 9.052450001197666e-06, "loss": 0.5292, "step": 13675 }, { "epoch": 0.8774855676715844, "grad_norm": 0.724073412445175, "learning_rate": 9.005962183117055e-06, "loss": 0.7159, "step": 13680 }, { "epoch": 0.877806286080821, "grad_norm": 0.5059344342927663, "learning_rate": 8.959588409812458e-06, "loss": 0.6316, "step": 13685 }, { "epoch": 0.8781270044900578, "grad_norm": 1.2097294273874917, "learning_rate": 8.913328739405092e-06, "loss": 0.7006, "step": 13690 }, { "epoch": 0.8784477228992944, "grad_norm": 0.615032496760421, "learning_rate": 8.867183229873211e-06, "loss": 0.738, "step": 13695 }, { "epoch": 0.8787684413085312, "grad_norm": 0.6135358966273193, "learning_rate": 8.821151939051953e-06, "loss": 0.6287, "step": 13700 }, { "epoch": 0.8790891597177678, "grad_norm": 0.8910522096004475, "learning_rate": 8.775234924633301e-06, "loss": 0.7301, "step": 13705 }, { "epoch": 0.8794098781270044, "grad_norm": 0.7815093977889225, "learning_rate": 8.72943224416609e-06, "loss": 0.6499, "step": 13710 }, { "epoch": 0.8797305965362412, "grad_norm": 0.5607257491266542, "learning_rate": 8.683743955055746e-06, "loss": 0.6083, "step": 13715 }, { "epoch": 0.8800513149454778, "grad_norm": 0.884214002379739, "learning_rate": 8.638170114564414e-06, "loss": 0.611, "step": 13720 }, { "epoch": 0.8803720333547146, "grad_norm": 0.7528314170250561, "learning_rate": 8.592710779810765e-06, "loss": 0.6921, "step": 13725 }, { "epoch": 0.8806927517639512, "grad_norm": 0.9161588988308113, "learning_rate": 8.547366007769919e-06, "loss": 0.652, "step": 13730 }, { "epoch": 0.881013470173188, "grad_norm": 1.4044011636843894, "learning_rate": 8.502135855273497e-06, "loss": 0.6532, "step": 13735 }, { "epoch": 0.8813341885824246, "grad_norm": 0.5311315649019397, "learning_rate": 8.457020379009373e-06, "loss": 0.5949, "step": 13740 }, { "epoch": 0.8816549069916613, "grad_norm": 0.6747473256173435, "learning_rate": 8.412019635521784e-06, "loss": 0.5982, "step": 13745 }, { "epoch": 0.881975625400898, "grad_norm": 0.6539295071967237, "learning_rate": 8.367133681211103e-06, "loss": 0.4702, "step": 13750 }, { "epoch": 0.8822963438101347, "grad_norm": 0.403206890252452, "learning_rate": 8.322362572333841e-06, "loss": 0.5464, "step": 13755 }, { "epoch": 0.8826170622193714, "grad_norm": 0.7780767642995721, "learning_rate": 8.277706365002625e-06, "loss": 0.6976, "step": 13760 }, { "epoch": 0.8829377806286081, "grad_norm": 0.6272304201483566, "learning_rate": 8.233165115186003e-06, "loss": 0.6613, "step": 13765 }, { "epoch": 0.8832584990378448, "grad_norm": 0.8343537172020628, "learning_rate": 8.188738878708502e-06, "loss": 0.7469, "step": 13770 }, { "epoch": 0.8835792174470815, "grad_norm": 0.9345794017556924, "learning_rate": 8.144427711250447e-06, "loss": 0.7586, "step": 13775 }, { "epoch": 0.8838999358563181, "grad_norm": 1.162828611729811, "learning_rate": 8.100231668348002e-06, "loss": 0.5382, "step": 13780 }, { "epoch": 0.8842206542655549, "grad_norm": 1.1205395105885234, "learning_rate": 8.056150805392993e-06, "loss": 0.6138, "step": 13785 }, { "epoch": 0.8845413726747915, "grad_norm": 0.5630057786543724, "learning_rate": 8.012185177632914e-06, "loss": 0.4977, "step": 13790 }, { "epoch": 0.8848620910840282, "grad_norm": 0.8477848139037634, "learning_rate": 7.968334840170843e-06, "loss": 0.7394, "step": 13795 }, { "epoch": 0.8851828094932649, "grad_norm": 0.7207695540829029, "learning_rate": 7.92459984796532e-06, "loss": 0.7108, "step": 13800 }, { "epoch": 0.8855035279025016, "grad_norm": 0.9355747131091594, "learning_rate": 7.880980255830372e-06, "loss": 0.6971, "step": 13805 }, { "epoch": 0.8858242463117383, "grad_norm": 0.8391670611046308, "learning_rate": 7.83747611843536e-06, "loss": 0.6618, "step": 13810 }, { "epoch": 0.886144964720975, "grad_norm": 0.6940296556964382, "learning_rate": 7.794087490304935e-06, "loss": 0.7303, "step": 13815 }, { "epoch": 0.8864656831302117, "grad_norm": 0.5463085826484815, "learning_rate": 7.75081442581902e-06, "loss": 0.7128, "step": 13820 }, { "epoch": 0.8867864015394483, "grad_norm": 0.9553016730601827, "learning_rate": 7.707656979212653e-06, "loss": 0.5325, "step": 13825 }, { "epoch": 0.8871071199486851, "grad_norm": 0.9151217967040441, "learning_rate": 7.66461520457602e-06, "loss": 0.7276, "step": 13830 }, { "epoch": 0.8874278383579217, "grad_norm": 0.7869236135130984, "learning_rate": 7.6216891558542395e-06, "loss": 0.744, "step": 13835 }, { "epoch": 0.8877485567671585, "grad_norm": 0.748585172606016, "learning_rate": 7.578878886847507e-06, "loss": 0.5891, "step": 13840 }, { "epoch": 0.8880692751763951, "grad_norm": 0.7205402378107477, "learning_rate": 7.536184451210815e-06, "loss": 0.6715, "step": 13845 }, { "epoch": 0.8883899935856319, "grad_norm": 0.6198613140638497, "learning_rate": 7.493605902454004e-06, "loss": 0.7581, "step": 13850 }, { "epoch": 0.8887107119948685, "grad_norm": 0.84149727085621, "learning_rate": 7.451143293941709e-06, "loss": 0.746, "step": 13855 }, { "epoch": 0.8890314304041051, "grad_norm": 0.8368846152026573, "learning_rate": 7.408796678893226e-06, "loss": 0.6687, "step": 13860 }, { "epoch": 0.8893521488133419, "grad_norm": 0.7827898269521945, "learning_rate": 7.366566110382445e-06, "loss": 0.5832, "step": 13865 }, { "epoch": 0.8896728672225785, "grad_norm": 0.7300699318830831, "learning_rate": 7.324451641337882e-06, "loss": 0.6294, "step": 13870 }, { "epoch": 0.8899935856318153, "grad_norm": 0.9238241719407477, "learning_rate": 7.28245332454246e-06, "loss": 0.7083, "step": 13875 }, { "epoch": 0.8903143040410519, "grad_norm": 0.7709293686153301, "learning_rate": 7.240571212633618e-06, "loss": 0.5686, "step": 13880 }, { "epoch": 0.8906350224502887, "grad_norm": 1.1869349060713659, "learning_rate": 7.198805358103067e-06, "loss": 0.728, "step": 13885 }, { "epoch": 0.8909557408595253, "grad_norm": 1.0851258551108929, "learning_rate": 7.157155813296834e-06, "loss": 0.7379, "step": 13890 }, { "epoch": 0.891276459268762, "grad_norm": 0.7394708926504447, "learning_rate": 7.115622630415253e-06, "loss": 0.7321, "step": 13895 }, { "epoch": 0.8915971776779987, "grad_norm": 0.816039779235774, "learning_rate": 7.0742058615126726e-06, "loss": 0.601, "step": 13900 }, { "epoch": 0.8919178960872354, "grad_norm": 0.41244323070119415, "learning_rate": 7.03290555849766e-06, "loss": 0.5809, "step": 13905 }, { "epoch": 0.8922386144964721, "grad_norm": 0.8918418533925353, "learning_rate": 6.991721773132742e-06, "loss": 0.7142, "step": 13910 }, { "epoch": 0.8925593329057088, "grad_norm": 0.8732825568065812, "learning_rate": 6.950654557034475e-06, "loss": 0.6635, "step": 13915 }, { "epoch": 0.8928800513149455, "grad_norm": 0.7358325355065991, "learning_rate": 6.909703961673253e-06, "loss": 0.6412, "step": 13920 }, { "epoch": 0.8932007697241822, "grad_norm": 0.7111667197818642, "learning_rate": 6.868870038373332e-06, "loss": 0.6767, "step": 13925 }, { "epoch": 0.8935214881334188, "grad_norm": 1.0721200112803682, "learning_rate": 6.828152838312773e-06, "loss": 0.5066, "step": 13930 }, { "epoch": 0.8938422065426556, "grad_norm": 0.8310238983860934, "learning_rate": 6.787552412523279e-06, "loss": 0.6764, "step": 13935 }, { "epoch": 0.8941629249518922, "grad_norm": 0.6872676077028719, "learning_rate": 6.747068811890256e-06, "loss": 0.6671, "step": 13940 }, { "epoch": 0.894483643361129, "grad_norm": 0.9702633803545438, "learning_rate": 6.706702087152661e-06, "loss": 0.4624, "step": 13945 }, { "epoch": 0.8948043617703656, "grad_norm": 1.375052365512822, "learning_rate": 6.666452288902958e-06, "loss": 0.7522, "step": 13950 }, { "epoch": 0.8951250801796024, "grad_norm": 0.908667367564301, "learning_rate": 6.626319467587106e-06, "loss": 0.6602, "step": 13955 }, { "epoch": 0.895445798588839, "grad_norm": 1.0327055092345554, "learning_rate": 6.586303673504412e-06, "loss": 0.6192, "step": 13960 }, { "epoch": 0.8957665169980756, "grad_norm": 0.7913439515419154, "learning_rate": 6.5464049568075615e-06, "loss": 0.6883, "step": 13965 }, { "epoch": 0.8960872354073124, "grad_norm": 0.9249759944838365, "learning_rate": 6.506623367502418e-06, "loss": 0.7207, "step": 13970 }, { "epoch": 0.896407953816549, "grad_norm": 0.6185623923439777, "learning_rate": 6.4669589554481325e-06, "loss": 0.7935, "step": 13975 }, { "epoch": 0.8967286722257858, "grad_norm": 0.9047502038967159, "learning_rate": 6.4274117703569615e-06, "loss": 0.523, "step": 13980 }, { "epoch": 0.8970493906350224, "grad_norm": 0.5862791588591175, "learning_rate": 6.387981861794212e-06, "loss": 0.5767, "step": 13985 }, { "epoch": 0.8973701090442592, "grad_norm": 0.9286416832372187, "learning_rate": 6.348669279178277e-06, "loss": 0.5952, "step": 13990 }, { "epoch": 0.8976908274534958, "grad_norm": 0.9632286005822661, "learning_rate": 6.309474071780408e-06, "loss": 0.7512, "step": 13995 }, { "epoch": 0.8980115458627326, "grad_norm": 0.6713818773459586, "learning_rate": 6.2703962887248444e-06, "loss": 0.8033, "step": 14000 }, { "epoch": 0.8983322642719692, "grad_norm": 0.43098921146350616, "learning_rate": 6.2314359789885756e-06, "loss": 0.5506, "step": 14005 }, { "epoch": 0.8986529826812059, "grad_norm": 1.029998963102262, "learning_rate": 6.192593191401396e-06, "loss": 0.6528, "step": 14010 }, { "epoch": 0.8989737010904426, "grad_norm": 0.9428065435910548, "learning_rate": 6.153867974645833e-06, "loss": 0.6822, "step": 14015 }, { "epoch": 0.8992944194996793, "grad_norm": 0.6275896637114994, "learning_rate": 6.115260377257004e-06, "loss": 0.556, "step": 14020 }, { "epoch": 0.899615137908916, "grad_norm": 0.6094084941175278, "learning_rate": 6.076770447622615e-06, "loss": 0.5094, "step": 14025 }, { "epoch": 0.8999358563181526, "grad_norm": 0.7526848860794296, "learning_rate": 6.038398233982989e-06, "loss": 0.678, "step": 14030 }, { "epoch": 0.9002565747273894, "grad_norm": 0.5704944797751071, "learning_rate": 6.000143784430756e-06, "loss": 0.6822, "step": 14035 }, { "epoch": 0.900577293136626, "grad_norm": 0.7525424440388754, "learning_rate": 5.962007146911109e-06, "loss": 0.7008, "step": 14040 }, { "epoch": 0.9008980115458627, "grad_norm": 0.961888964093016, "learning_rate": 5.923988369221456e-06, "loss": 0.6805, "step": 14045 }, { "epoch": 0.9012187299550994, "grad_norm": 0.8861288123930613, "learning_rate": 5.886087499011594e-06, "loss": 0.758, "step": 14050 }, { "epoch": 0.9015394483643361, "grad_norm": 0.8032927310909407, "learning_rate": 5.8483045837834705e-06, "loss": 0.6607, "step": 14055 }, { "epoch": 0.9018601667735728, "grad_norm": 0.8087075039644414, "learning_rate": 5.810639670891216e-06, "loss": 0.7027, "step": 14060 }, { "epoch": 0.9021808851828095, "grad_norm": 0.8539578913251452, "learning_rate": 5.773092807541092e-06, "loss": 0.5801, "step": 14065 }, { "epoch": 0.9025016035920462, "grad_norm": 0.7756452243315396, "learning_rate": 5.735664040791367e-06, "loss": 0.7103, "step": 14070 }, { "epoch": 0.9028223220012829, "grad_norm": 1.222999060061691, "learning_rate": 5.698353417552327e-06, "loss": 0.6017, "step": 14075 }, { "epoch": 0.9031430404105195, "grad_norm": 0.7983696291416744, "learning_rate": 5.661160984586178e-06, "loss": 0.6049, "step": 14080 }, { "epoch": 0.9034637588197563, "grad_norm": 0.6490907871037943, "learning_rate": 5.624086788506977e-06, "loss": 0.5526, "step": 14085 }, { "epoch": 0.9037844772289929, "grad_norm": 0.8508537947980717, "learning_rate": 5.587130875780633e-06, "loss": 0.7109, "step": 14090 }, { "epoch": 0.9041051956382297, "grad_norm": 1.2192033565455072, "learning_rate": 5.550293292724762e-06, "loss": 0.7051, "step": 14095 }, { "epoch": 0.9044259140474663, "grad_norm": 0.6587693265105345, "learning_rate": 5.51357408550871e-06, "loss": 0.6174, "step": 14100 }, { "epoch": 0.9047466324567031, "grad_norm": 0.49748231304384327, "learning_rate": 5.47697330015341e-06, "loss": 0.5172, "step": 14105 }, { "epoch": 0.9050673508659397, "grad_norm": 0.7884068967557873, "learning_rate": 5.440490982531465e-06, "loss": 0.6816, "step": 14110 }, { "epoch": 0.9053880692751763, "grad_norm": 1.1069363022735697, "learning_rate": 5.404127178366902e-06, "loss": 0.6431, "step": 14115 }, { "epoch": 0.9057087876844131, "grad_norm": 0.6826869882965035, "learning_rate": 5.367881933235275e-06, "loss": 0.5101, "step": 14120 }, { "epoch": 0.9060295060936497, "grad_norm": 0.7273219199634979, "learning_rate": 5.331755292563523e-06, "loss": 0.637, "step": 14125 }, { "epoch": 0.9063502245028865, "grad_norm": 0.8472794210673035, "learning_rate": 5.295747301629917e-06, "loss": 0.6022, "step": 14130 }, { "epoch": 0.9066709429121231, "grad_norm": 0.8444956562341863, "learning_rate": 5.259858005564089e-06, "loss": 0.5334, "step": 14135 }, { "epoch": 0.9069916613213599, "grad_norm": 0.6732860335353007, "learning_rate": 5.224087449346826e-06, "loss": 0.7202, "step": 14140 }, { "epoch": 0.9073123797305965, "grad_norm": 0.9666322387828169, "learning_rate": 5.188435677810133e-06, "loss": 0.7559, "step": 14145 }, { "epoch": 0.9076330981398332, "grad_norm": 0.9869737805273263, "learning_rate": 5.152902735637166e-06, "loss": 0.678, "step": 14150 }, { "epoch": 0.9079538165490699, "grad_norm": 0.5732371579819191, "learning_rate": 5.1174886673620805e-06, "loss": 0.5993, "step": 14155 }, { "epoch": 0.9082745349583066, "grad_norm": 0.8654988560178682, "learning_rate": 5.082193517370127e-06, "loss": 0.6813, "step": 14160 }, { "epoch": 0.9085952533675433, "grad_norm": 1.0265921567687237, "learning_rate": 5.047017329897463e-06, "loss": 0.6737, "step": 14165 }, { "epoch": 0.90891597177678, "grad_norm": 0.8248152748170539, "learning_rate": 5.011960149031137e-06, "loss": 0.5857, "step": 14170 }, { "epoch": 0.9092366901860167, "grad_norm": 0.7956729093404309, "learning_rate": 4.977022018709088e-06, "loss": 0.6643, "step": 14175 }, { "epoch": 0.9095574085952534, "grad_norm": 1.7578923486790687, "learning_rate": 4.94220298271999e-06, "loss": 0.7325, "step": 14180 }, { "epoch": 0.9098781270044901, "grad_norm": 0.8908535862934428, "learning_rate": 4.907503084703335e-06, "loss": 0.7003, "step": 14185 }, { "epoch": 0.9101988454137268, "grad_norm": 0.5989152273082363, "learning_rate": 4.872922368149213e-06, "loss": 0.6494, "step": 14190 }, { "epoch": 0.9105195638229634, "grad_norm": 1.1947032610011639, "learning_rate": 4.838460876398365e-06, "loss": 0.712, "step": 14195 }, { "epoch": 0.9108402822322001, "grad_norm": 0.8008113658697428, "learning_rate": 4.804118652642164e-06, "loss": 0.6607, "step": 14200 }, { "epoch": 0.9111610006414368, "grad_norm": 0.9092451384048743, "learning_rate": 4.769895739922403e-06, "loss": 0.532, "step": 14205 }, { "epoch": 0.9114817190506735, "grad_norm": 0.9642837868126427, "learning_rate": 4.7357921811314374e-06, "loss": 0.5875, "step": 14210 }, { "epoch": 0.9118024374599102, "grad_norm": 1.2120942953279068, "learning_rate": 4.701808019011966e-06, "loss": 0.644, "step": 14215 }, { "epoch": 0.9121231558691469, "grad_norm": 0.7731779356318255, "learning_rate": 4.66794329615704e-06, "loss": 0.7528, "step": 14220 }, { "epoch": 0.9124438742783836, "grad_norm": 0.8452499221199778, "learning_rate": 4.634198055010097e-06, "loss": 0.7321, "step": 14225 }, { "epoch": 0.9127645926876202, "grad_norm": 0.7660682093886364, "learning_rate": 4.600572337864739e-06, "loss": 0.58, "step": 14230 }, { "epoch": 0.913085311096857, "grad_norm": 0.919577008788518, "learning_rate": 4.567066186864799e-06, "loss": 0.5792, "step": 14235 }, { "epoch": 0.9134060295060936, "grad_norm": 0.7240560589023852, "learning_rate": 4.53367964400423e-06, "loss": 0.6382, "step": 14240 }, { "epoch": 0.9137267479153304, "grad_norm": 0.9404018211860803, "learning_rate": 4.500412751127148e-06, "loss": 0.6983, "step": 14245 }, { "epoch": 0.914047466324567, "grad_norm": 0.9226737613175637, "learning_rate": 4.467265549927646e-06, "loss": 0.7371, "step": 14250 }, { "epoch": 0.9143681847338038, "grad_norm": 0.8674349211052579, "learning_rate": 4.434238081949793e-06, "loss": 0.715, "step": 14255 }, { "epoch": 0.9146889031430404, "grad_norm": 1.0086095744064745, "learning_rate": 4.401330388587655e-06, "loss": 0.6359, "step": 14260 }, { "epoch": 0.915009621552277, "grad_norm": 0.7399699212191572, "learning_rate": 4.368542511085127e-06, "loss": 0.6856, "step": 14265 }, { "epoch": 0.9153303399615138, "grad_norm": 0.7837381511015072, "learning_rate": 4.3358744905359845e-06, "loss": 0.5355, "step": 14270 }, { "epoch": 0.9156510583707504, "grad_norm": 0.7456554819958952, "learning_rate": 4.303326367883742e-06, "loss": 0.6506, "step": 14275 }, { "epoch": 0.9159717767799872, "grad_norm": 0.7504015595604561, "learning_rate": 4.2708981839216344e-06, "loss": 0.7347, "step": 14280 }, { "epoch": 0.9162924951892238, "grad_norm": 0.7872333950088334, "learning_rate": 4.238589979292651e-06, "loss": 0.7448, "step": 14285 }, { "epoch": 0.9166132135984606, "grad_norm": 0.848658406503067, "learning_rate": 4.206401794489301e-06, "loss": 0.755, "step": 14290 }, { "epoch": 0.9169339320076972, "grad_norm": 0.7157699993484576, "learning_rate": 4.1743336698537805e-06, "loss": 0.6877, "step": 14295 }, { "epoch": 0.9172546504169339, "grad_norm": 0.920746793540226, "learning_rate": 4.142385645577707e-06, "loss": 0.6888, "step": 14300 }, { "epoch": 0.9175753688261706, "grad_norm": 0.6845975702530432, "learning_rate": 4.110557761702249e-06, "loss": 0.754, "step": 14305 }, { "epoch": 0.9178960872354073, "grad_norm": 1.1511196348448594, "learning_rate": 4.078850058117978e-06, "loss": 0.616, "step": 14310 }, { "epoch": 0.918216805644644, "grad_norm": 0.6109287776036132, "learning_rate": 4.0472625745648144e-06, "loss": 0.5921, "step": 14315 }, { "epoch": 0.9185375240538807, "grad_norm": 0.5799180489438701, "learning_rate": 4.015795350632068e-06, "loss": 0.6258, "step": 14320 }, { "epoch": 0.9188582424631174, "grad_norm": 1.0588410053870487, "learning_rate": 3.984448425758236e-06, "loss": 0.6294, "step": 14325 }, { "epoch": 0.9191789608723541, "grad_norm": 0.9656078510689677, "learning_rate": 3.953221839231125e-06, "loss": 0.7232, "step": 14330 }, { "epoch": 0.9194996792815907, "grad_norm": 0.7627108781290338, "learning_rate": 3.922115630187684e-06, "loss": 0.7192, "step": 14335 }, { "epoch": 0.9198203976908275, "grad_norm": 0.9118690797348065, "learning_rate": 3.8911298376139604e-06, "loss": 0.7131, "step": 14340 }, { "epoch": 0.9201411161000641, "grad_norm": 0.6032629064325823, "learning_rate": 3.860264500345145e-06, "loss": 0.701, "step": 14345 }, { "epoch": 0.9204618345093009, "grad_norm": 0.7887702725778526, "learning_rate": 3.829519657065417e-06, "loss": 0.4822, "step": 14350 }, { "epoch": 0.9207825529185375, "grad_norm": 0.7138715411195988, "learning_rate": 3.798895346307929e-06, "loss": 0.6301, "step": 14355 }, { "epoch": 0.9211032713277743, "grad_norm": 0.9024603895099268, "learning_rate": 3.768391606454824e-06, "loss": 0.7522, "step": 14360 }, { "epoch": 0.9214239897370109, "grad_norm": 1.0280776294268867, "learning_rate": 3.7380084757370427e-06, "loss": 0.5146, "step": 14365 }, { "epoch": 0.9217447081462476, "grad_norm": 1.2746584097883105, "learning_rate": 3.707745992234446e-06, "loss": 0.6437, "step": 14370 }, { "epoch": 0.9220654265554843, "grad_norm": 0.7420480886663697, "learning_rate": 3.677604193875639e-06, "loss": 0.7434, "step": 14375 }, { "epoch": 0.9223861449647209, "grad_norm": 0.7760260552269074, "learning_rate": 3.647583118438003e-06, "loss": 0.7314, "step": 14380 }, { "epoch": 0.9227068633739577, "grad_norm": 0.5526340026602907, "learning_rate": 3.617682803547573e-06, "loss": 0.6684, "step": 14385 }, { "epoch": 0.9230275817831943, "grad_norm": 0.8601770168248275, "learning_rate": 3.587903286679051e-06, "loss": 0.7048, "step": 14390 }, { "epoch": 0.9233483001924311, "grad_norm": 0.768831329847095, "learning_rate": 3.5582446051557694e-06, "loss": 0.7109, "step": 14395 }, { "epoch": 0.9236690186016677, "grad_norm": 0.7061972963645736, "learning_rate": 3.5287067961495613e-06, "loss": 0.7226, "step": 14400 }, { "epoch": 0.9239897370109045, "grad_norm": 0.9718492483949128, "learning_rate": 3.4992898966808128e-06, "loss": 0.6096, "step": 14405 }, { "epoch": 0.9243104554201411, "grad_norm": 0.6613307717148478, "learning_rate": 3.4699939436183548e-06, "loss": 0.6359, "step": 14410 }, { "epoch": 0.9246311738293778, "grad_norm": 0.48853477777273874, "learning_rate": 3.440818973679416e-06, "loss": 0.5916, "step": 14415 }, { "epoch": 0.9249518922386145, "grad_norm": 2.8872548788201846, "learning_rate": 3.411765023429625e-06, "loss": 0.6681, "step": 14420 }, { "epoch": 0.9252726106478512, "grad_norm": 0.8605678505533776, "learning_rate": 3.382832129282909e-06, "loss": 0.7061, "step": 14425 }, { "epoch": 0.9255933290570879, "grad_norm": 0.8152777611420922, "learning_rate": 3.354020327501506e-06, "loss": 0.7016, "step": 14430 }, { "epoch": 0.9259140474663246, "grad_norm": 0.5720911855352934, "learning_rate": 3.32532965419583e-06, "loss": 0.6065, "step": 14435 }, { "epoch": 0.9262347658755613, "grad_norm": 0.5729769215244488, "learning_rate": 3.29676014532454e-06, "loss": 0.6385, "step": 14440 }, { "epoch": 0.926555484284798, "grad_norm": 0.7971168307254297, "learning_rate": 3.2683118366944153e-06, "loss": 0.7482, "step": 14445 }, { "epoch": 0.9268762026940346, "grad_norm": 0.8082127626355636, "learning_rate": 3.2399847639603132e-06, "loss": 0.5749, "step": 14450 }, { "epoch": 0.9271969211032713, "grad_norm": 0.986366425048449, "learning_rate": 3.211778962625178e-06, "loss": 0.814, "step": 14455 }, { "epoch": 0.927517639512508, "grad_norm": 0.7974470102591675, "learning_rate": 3.1836944680399215e-06, "loss": 0.6845, "step": 14460 }, { "epoch": 0.9278383579217447, "grad_norm": 0.9030012061093406, "learning_rate": 3.155731315403465e-06, "loss": 0.7462, "step": 14465 }, { "epoch": 0.9281590763309814, "grad_norm": 0.8114451125831404, "learning_rate": 3.1278895397626295e-06, "loss": 0.7289, "step": 14470 }, { "epoch": 0.9284797947402181, "grad_norm": 0.7580184369514217, "learning_rate": 3.10016917601208e-06, "loss": 0.8204, "step": 14475 }, { "epoch": 0.9288005131494548, "grad_norm": 0.9028047332034969, "learning_rate": 3.0725702588943693e-06, "loss": 0.6502, "step": 14480 }, { "epoch": 0.9291212315586914, "grad_norm": 0.5328705285389578, "learning_rate": 3.0450928229997956e-06, "loss": 0.6282, "step": 14485 }, { "epoch": 0.9294419499679282, "grad_norm": 0.676301284723922, "learning_rate": 3.0177369027664324e-06, "loss": 0.6152, "step": 14490 }, { "epoch": 0.9297626683771648, "grad_norm": 0.6911219963447808, "learning_rate": 2.990502532480033e-06, "loss": 0.7075, "step": 14495 }, { "epoch": 0.9300833867864016, "grad_norm": 0.8158597361321028, "learning_rate": 2.9633897462740035e-06, "loss": 0.5278, "step": 14500 }, { "epoch": 0.9304041051956382, "grad_norm": 0.8885816510360459, "learning_rate": 2.936398578129407e-06, "loss": 0.7842, "step": 14505 }, { "epoch": 0.930724823604875, "grad_norm": 0.9090481734964072, "learning_rate": 2.909529061874816e-06, "loss": 0.6346, "step": 14510 }, { "epoch": 0.9310455420141116, "grad_norm": 0.6271937382541385, "learning_rate": 2.8827812311864044e-06, "loss": 0.4965, "step": 14515 }, { "epoch": 0.9313662604233482, "grad_norm": 0.8626519977341744, "learning_rate": 2.856155119587789e-06, "loss": 0.6916, "step": 14520 }, { "epoch": 0.931686978832585, "grad_norm": 1.2378284751762905, "learning_rate": 2.829650760450031e-06, "loss": 0.6573, "step": 14525 }, { "epoch": 0.9320076972418216, "grad_norm": 1.2677367998396853, "learning_rate": 2.8032681869916366e-06, "loss": 0.5755, "step": 14530 }, { "epoch": 0.9323284156510584, "grad_norm": 0.5109336107393835, "learning_rate": 2.7770074322784334e-06, "loss": 0.5688, "step": 14535 }, { "epoch": 0.932649134060295, "grad_norm": 0.7042004857736548, "learning_rate": 2.7508685292235937e-06, "loss": 0.7213, "step": 14540 }, { "epoch": 0.9329698524695318, "grad_norm": 0.7309101698002372, "learning_rate": 2.7248515105875673e-06, "loss": 0.6667, "step": 14545 }, { "epoch": 0.9332905708787684, "grad_norm": 0.6908743464424493, "learning_rate": 2.6989564089780263e-06, "loss": 0.6156, "step": 14550 }, { "epoch": 0.9336112892880052, "grad_norm": 0.9549405672325, "learning_rate": 2.673183256849876e-06, "loss": 0.5705, "step": 14555 }, { "epoch": 0.9339320076972418, "grad_norm": 0.8108069141144446, "learning_rate": 2.6475320865051444e-06, "loss": 0.6301, "step": 14560 }, { "epoch": 0.9342527261064785, "grad_norm": 0.7542934406058188, "learning_rate": 2.6220029300930037e-06, "loss": 0.6081, "step": 14565 }, { "epoch": 0.9345734445157152, "grad_norm": 0.8121008842739622, "learning_rate": 2.5965958196096706e-06, "loss": 0.7333, "step": 14570 }, { "epoch": 0.9348941629249519, "grad_norm": 0.7044098978011041, "learning_rate": 2.571310786898451e-06, "loss": 0.6786, "step": 14575 }, { "epoch": 0.9352148813341886, "grad_norm": 0.669296953567193, "learning_rate": 2.5461478636496062e-06, "loss": 0.6451, "step": 14580 }, { "epoch": 0.9355355997434253, "grad_norm": 1.0134964970782947, "learning_rate": 2.5211070814003536e-06, "loss": 0.7071, "step": 14585 }, { "epoch": 0.935856318152662, "grad_norm": 0.8079966960225432, "learning_rate": 2.496188471534866e-06, "loss": 0.6494, "step": 14590 }, { "epoch": 0.9361770365618987, "grad_norm": 0.7980284916096867, "learning_rate": 2.4713920652841394e-06, "loss": 0.6966, "step": 14595 }, { "epoch": 0.9364977549711353, "grad_norm": 1.4182606806536633, "learning_rate": 2.4467178937260692e-06, "loss": 0.5106, "step": 14600 }, { "epoch": 0.936818473380372, "grad_norm": 1.1450293247030983, "learning_rate": 2.4221659877853074e-06, "loss": 0.6734, "step": 14605 }, { "epoch": 0.9371391917896087, "grad_norm": 0.7955638461295016, "learning_rate": 2.397736378233284e-06, "loss": 0.725, "step": 14610 }, { "epoch": 0.9374599101988454, "grad_norm": 0.7397520509486079, "learning_rate": 2.3734290956881734e-06, "loss": 0.6244, "step": 14615 }, { "epoch": 0.9377806286080821, "grad_norm": 0.9732579754101209, "learning_rate": 2.349244170614773e-06, "loss": 0.6057, "step": 14620 }, { "epoch": 0.9381013470173188, "grad_norm": 1.406456086581141, "learning_rate": 2.3251816333246025e-06, "loss": 0.7182, "step": 14625 }, { "epoch": 0.9384220654265555, "grad_norm": 0.8952424347381697, "learning_rate": 2.301241513975749e-06, "loss": 0.7598, "step": 14630 }, { "epoch": 0.9387427838357921, "grad_norm": 1.0218439096331748, "learning_rate": 2.2774238425728677e-06, "loss": 0.7246, "step": 14635 }, { "epoch": 0.9390635022450289, "grad_norm": 0.7685781373474748, "learning_rate": 2.2537286489671573e-06, "loss": 0.5579, "step": 14640 }, { "epoch": 0.9393842206542655, "grad_norm": 0.7182539188714678, "learning_rate": 2.2301559628563062e-06, "loss": 0.4816, "step": 14645 }, { "epoch": 0.9397049390635023, "grad_norm": 0.7271338524133633, "learning_rate": 2.206705813784471e-06, "loss": 0.7117, "step": 14650 }, { "epoch": 0.9400256574727389, "grad_norm": 0.9142892488291297, "learning_rate": 2.18337823114223e-06, "loss": 0.5035, "step": 14655 }, { "epoch": 0.9403463758819757, "grad_norm": 1.1230106908678623, "learning_rate": 2.160173244166541e-06, "loss": 0.5692, "step": 14660 }, { "epoch": 0.9406670942912123, "grad_norm": 0.40796226780607736, "learning_rate": 2.1370908819407174e-06, "loss": 0.5771, "step": 14665 }, { "epoch": 0.940987812700449, "grad_norm": 0.9481608724103522, "learning_rate": 2.1141311733943626e-06, "loss": 0.5029, "step": 14670 }, { "epoch": 0.9413085311096857, "grad_norm": 1.0000026556770782, "learning_rate": 2.09129414730338e-06, "loss": 0.6156, "step": 14675 }, { "epoch": 0.9416292495189224, "grad_norm": 0.521971426032197, "learning_rate": 2.0685798322899073e-06, "loss": 0.6233, "step": 14680 }, { "epoch": 0.9419499679281591, "grad_norm": 0.555113548672577, "learning_rate": 2.045988256822273e-06, "loss": 0.6226, "step": 14685 }, { "epoch": 0.9422706863373957, "grad_norm": 1.0940970203612415, "learning_rate": 2.0235194492149832e-06, "loss": 0.6603, "step": 14690 }, { "epoch": 0.9425914047466325, "grad_norm": 1.0787803604629624, "learning_rate": 2.0011734376286896e-06, "loss": 0.6915, "step": 14695 }, { "epoch": 0.9429121231558691, "grad_norm": 0.603441598329727, "learning_rate": 1.978950250070111e-06, "loss": 0.7826, "step": 14700 }, { "epoch": 0.9432328415651058, "grad_norm": 1.1933790532010597, "learning_rate": 1.9568499143920336e-06, "loss": 0.6277, "step": 14705 }, { "epoch": 0.9435535599743425, "grad_norm": 0.5764914897220961, "learning_rate": 1.9348724582933133e-06, "loss": 0.6875, "step": 14710 }, { "epoch": 0.9438742783835792, "grad_norm": 0.9696889870454197, "learning_rate": 1.9130179093187484e-06, "loss": 0.8159, "step": 14715 }, { "epoch": 0.9441949967928159, "grad_norm": 1.174884517440042, "learning_rate": 1.891286294859107e-06, "loss": 0.7811, "step": 14720 }, { "epoch": 0.9445157152020526, "grad_norm": 0.7432254800663841, "learning_rate": 1.869677642151102e-06, "loss": 0.8169, "step": 14725 }, { "epoch": 0.9448364336112893, "grad_norm": 1.3451481683596176, "learning_rate": 1.8481919782773138e-06, "loss": 0.6386, "step": 14730 }, { "epoch": 0.945157152020526, "grad_norm": 0.8999549303642768, "learning_rate": 1.82682933016618e-06, "loss": 0.6578, "step": 14735 }, { "epoch": 0.9454778704297627, "grad_norm": 0.7535938047620351, "learning_rate": 1.8055897245919718e-06, "loss": 0.6345, "step": 14740 }, { "epoch": 0.9457985888389994, "grad_norm": 0.9031933438522918, "learning_rate": 1.78447318817474e-06, "loss": 0.6979, "step": 14745 }, { "epoch": 0.946119307248236, "grad_norm": 0.5909234139284275, "learning_rate": 1.7634797473802922e-06, "loss": 0.5283, "step": 14750 }, { "epoch": 0.9464400256574728, "grad_norm": 0.7478929356403822, "learning_rate": 1.7426094285201478e-06, "loss": 0.7548, "step": 14755 }, { "epoch": 0.9467607440667094, "grad_norm": 0.7939890902510196, "learning_rate": 1.7218622577515496e-06, "loss": 0.7005, "step": 14760 }, { "epoch": 0.9470814624759462, "grad_norm": 0.6058878555015041, "learning_rate": 1.7012382610773315e-06, "loss": 0.6766, "step": 14765 }, { "epoch": 0.9474021808851828, "grad_norm": 0.848486027790844, "learning_rate": 1.6807374643460272e-06, "loss": 0.7677, "step": 14770 }, { "epoch": 0.9477228992944196, "grad_norm": 0.7595303087988711, "learning_rate": 1.6603598932517061e-06, "loss": 0.7407, "step": 14775 }, { "epoch": 0.9480436177036562, "grad_norm": 0.7579789167134414, "learning_rate": 1.6401055733340164e-06, "loss": 0.669, "step": 14780 }, { "epoch": 0.9483643361128928, "grad_norm": 1.2648466067379471, "learning_rate": 1.61997452997813e-06, "loss": 0.6469, "step": 14785 }, { "epoch": 0.9486850545221296, "grad_norm": 0.797026657881511, "learning_rate": 1.5999667884147196e-06, "loss": 0.588, "step": 14790 }, { "epoch": 0.9490057729313662, "grad_norm": 0.915174796254417, "learning_rate": 1.5800823737199156e-06, "loss": 0.7036, "step": 14795 }, { "epoch": 0.949326491340603, "grad_norm": 0.7014564001359544, "learning_rate": 1.5603213108152715e-06, "loss": 0.604, "step": 14800 }, { "epoch": 0.9496472097498396, "grad_norm": 1.0673933698941918, "learning_rate": 1.5406836244677646e-06, "loss": 0.6767, "step": 14805 }, { "epoch": 0.9499679281590764, "grad_norm": 0.5974581758846627, "learning_rate": 1.5211693392897185e-06, "loss": 0.6277, "step": 14810 }, { "epoch": 0.950288646568313, "grad_norm": 0.76752354413579, "learning_rate": 1.5017784797388024e-06, "loss": 0.6575, "step": 14815 }, { "epoch": 0.9506093649775497, "grad_norm": 0.6302709486833972, "learning_rate": 1.482511070118009e-06, "loss": 0.5797, "step": 14820 }, { "epoch": 0.9509300833867864, "grad_norm": 0.6408626471147529, "learning_rate": 1.4633671345755884e-06, "loss": 0.6938, "step": 14825 }, { "epoch": 0.9512508017960231, "grad_norm": 1.147885938640683, "learning_rate": 1.4443466971050367e-06, "loss": 0.6631, "step": 14830 }, { "epoch": 0.9515715202052598, "grad_norm": 1.2090975514637632, "learning_rate": 1.4254497815450852e-06, "loss": 0.5987, "step": 14835 }, { "epoch": 0.9518922386144965, "grad_norm": 1.4462854589201612, "learning_rate": 1.4066764115796328e-06, "loss": 0.5496, "step": 14840 }, { "epoch": 0.9522129570237332, "grad_norm": 2.2267736323891603, "learning_rate": 1.3880266107377581e-06, "loss": 0.6236, "step": 14845 }, { "epoch": 0.9525336754329699, "grad_norm": 0.9767897268690148, "learning_rate": 1.369500402393653e-06, "loss": 0.6737, "step": 14850 }, { "epoch": 0.9528543938422065, "grad_norm": 0.6597022287518994, "learning_rate": 1.3510978097665994e-06, "loss": 0.6009, "step": 14855 }, { "epoch": 0.9531751122514432, "grad_norm": 0.8352297747099178, "learning_rate": 1.332818855920981e-06, "loss": 0.6206, "step": 14860 }, { "epoch": 0.9534958306606799, "grad_norm": 0.3398468741414835, "learning_rate": 1.314663563766172e-06, "loss": 0.745, "step": 14865 }, { "epoch": 0.9538165490699166, "grad_norm": 0.6650997138673455, "learning_rate": 1.2966319560566264e-06, "loss": 0.5189, "step": 14870 }, { "epoch": 0.9541372674791533, "grad_norm": 0.8495035997423334, "learning_rate": 1.2787240553917223e-06, "loss": 0.5352, "step": 14875 }, { "epoch": 0.95445798588839, "grad_norm": 0.6804679950864659, "learning_rate": 1.2609398842158171e-06, "loss": 0.5298, "step": 14880 }, { "epoch": 0.9547787042976267, "grad_norm": 0.9011394842975389, "learning_rate": 1.2432794648181922e-06, "loss": 0.6416, "step": 14885 }, { "epoch": 0.9550994227068633, "grad_norm": 0.8017624405517991, "learning_rate": 1.225742819333031e-06, "loss": 0.7683, "step": 14890 }, { "epoch": 0.9554201411161001, "grad_norm": 1.0189493989237226, "learning_rate": 1.2083299697393968e-06, "loss": 0.6712, "step": 14895 }, { "epoch": 0.9557408595253367, "grad_norm": 0.8632861800860692, "learning_rate": 1.1910409378611653e-06, "loss": 0.6677, "step": 14900 }, { "epoch": 0.9560615779345735, "grad_norm": 0.8271377018484679, "learning_rate": 1.17387574536707e-06, "loss": 0.8435, "step": 14905 }, { "epoch": 0.9563822963438101, "grad_norm": 1.090763241775662, "learning_rate": 1.1568344137706133e-06, "loss": 0.751, "step": 14910 }, { "epoch": 0.9567030147530469, "grad_norm": 0.8533558406500173, "learning_rate": 1.1399169644300323e-06, "loss": 0.7627, "step": 14915 }, { "epoch": 0.9570237331622835, "grad_norm": 0.7969691903367916, "learning_rate": 1.1231234185483663e-06, "loss": 0.6599, "step": 14920 }, { "epoch": 0.9573444515715203, "grad_norm": 0.6892919393359965, "learning_rate": 1.1064537971733124e-06, "loss": 0.6862, "step": 14925 }, { "epoch": 0.9576651699807569, "grad_norm": 0.8464857234158932, "learning_rate": 1.0899081211972584e-06, "loss": 0.8058, "step": 14930 }, { "epoch": 0.9579858883899935, "grad_norm": 0.5019234017303561, "learning_rate": 1.0734864113572606e-06, "loss": 0.684, "step": 14935 }, { "epoch": 0.9583066067992303, "grad_norm": 0.7995354303661617, "learning_rate": 1.057188688234989e-06, "loss": 0.577, "step": 14940 }, { "epoch": 0.9586273252084669, "grad_norm": 1.053084388323032, "learning_rate": 1.0410149722567376e-06, "loss": 0.6179, "step": 14945 }, { "epoch": 0.9589480436177037, "grad_norm": 0.9473025528524849, "learning_rate": 1.0249652836933688e-06, "loss": 0.6448, "step": 14950 }, { "epoch": 0.9592687620269403, "grad_norm": 0.8867828551638389, "learning_rate": 1.0090396426603143e-06, "loss": 0.7081, "step": 14955 }, { "epoch": 0.9595894804361771, "grad_norm": 0.579392165704179, "learning_rate": 9.93238069117508e-07, "loss": 0.6266, "step": 14960 }, { "epoch": 0.9599101988454137, "grad_norm": 1.3419589121931794, "learning_rate": 9.775605828693969e-07, "loss": 0.6619, "step": 14965 }, { "epoch": 0.9602309172546504, "grad_norm": 0.9125359836127329, "learning_rate": 9.620072035649075e-07, "loss": 0.6073, "step": 14970 }, { "epoch": 0.9605516356638871, "grad_norm": 1.0860000796878035, "learning_rate": 9.465779506974359e-07, "loss": 0.5401, "step": 14975 }, { "epoch": 0.9608723540731238, "grad_norm": 1.171824681775004, "learning_rate": 9.312728436047913e-07, "loss": 0.5753, "step": 14980 }, { "epoch": 0.9611930724823605, "grad_norm": 0.5643018528812354, "learning_rate": 9.160919014691848e-07, "loss": 0.5638, "step": 14985 }, { "epoch": 0.9615137908915972, "grad_norm": 0.9034235555165777, "learning_rate": 9.010351433172304e-07, "loss": 0.6334, "step": 14990 }, { "epoch": 0.9618345093008339, "grad_norm": 1.1839905897068703, "learning_rate": 8.86102588019877e-07, "loss": 0.7153, "step": 14995 }, { "epoch": 0.9621552277100706, "grad_norm": 0.8180578726272846, "learning_rate": 8.712942542923986e-07, "loss": 0.5817, "step": 15000 }, { "epoch": 0.9624759461193072, "grad_norm": 1.0696335688074747, "learning_rate": 8.566101606944266e-07, "loss": 0.6736, "step": 15005 }, { "epoch": 0.962796664528544, "grad_norm": 0.7303824338994761, "learning_rate": 8.420503256298396e-07, "loss": 0.6429, "step": 15010 }, { "epoch": 0.9631173829377806, "grad_norm": 1.0294755318998579, "learning_rate": 8.276147673467849e-07, "loss": 0.7188, "step": 15015 }, { "epoch": 0.9634381013470174, "grad_norm": 0.9556262852737702, "learning_rate": 8.133035039376679e-07, "loss": 0.5951, "step": 15020 }, { "epoch": 0.963758819756254, "grad_norm": 0.9324693251087647, "learning_rate": 7.991165533390854e-07, "loss": 0.7127, "step": 15025 }, { "epoch": 0.9640795381654907, "grad_norm": 0.9591152159542692, "learning_rate": 7.850539333318585e-07, "loss": 0.6322, "step": 15030 }, { "epoch": 0.9644002565747274, "grad_norm": 0.6946002197246557, "learning_rate": 7.711156615409665e-07, "loss": 0.5755, "step": 15035 }, { "epoch": 0.964720974983964, "grad_norm": 1.3334758098994104, "learning_rate": 7.573017554355355e-07, "loss": 0.6318, "step": 15040 }, { "epoch": 0.9650416933932008, "grad_norm": 0.8978971885207064, "learning_rate": 7.436122323288497e-07, "loss": 0.6035, "step": 15045 }, { "epoch": 0.9653624118024374, "grad_norm": 0.8103686748723528, "learning_rate": 7.300471093782624e-07, "loss": 0.6194, "step": 15050 }, { "epoch": 0.9656831302116742, "grad_norm": 0.753034703476334, "learning_rate": 7.166064035852405e-07, "loss": 0.6241, "step": 15055 }, { "epoch": 0.9660038486209108, "grad_norm": 0.8194295630630289, "learning_rate": 7.032901317953089e-07, "loss": 0.804, "step": 15060 }, { "epoch": 0.9663245670301476, "grad_norm": 0.6380479125093319, "learning_rate": 6.900983106980396e-07, "loss": 0.4591, "step": 15065 }, { "epoch": 0.9666452854393842, "grad_norm": 0.6010950679928249, "learning_rate": 6.770309568270183e-07, "loss": 0.5964, "step": 15070 }, { "epoch": 0.9669660038486209, "grad_norm": 0.6142851169104145, "learning_rate": 6.640880865598331e-07, "loss": 0.515, "step": 15075 }, { "epoch": 0.9672867222578576, "grad_norm": 0.5969279751540932, "learning_rate": 6.512697161180859e-07, "loss": 0.5795, "step": 15080 }, { "epoch": 0.9676074406670943, "grad_norm": 1.1554904145083251, "learning_rate": 6.38575861567281e-07, "loss": 0.7483, "step": 15085 }, { "epoch": 0.967928159076331, "grad_norm": 0.7865746542213344, "learning_rate": 6.260065388169256e-07, "loss": 0.5557, "step": 15090 }, { "epoch": 0.9682488774855676, "grad_norm": 1.1050848806521416, "learning_rate": 6.135617636204072e-07, "loss": 0.5939, "step": 15095 }, { "epoch": 0.9685695958948044, "grad_norm": 0.7070536160439901, "learning_rate": 6.01241551575027e-07, "loss": 0.6985, "step": 15100 }, { "epoch": 0.968890314304041, "grad_norm": 1.105194184766872, "learning_rate": 5.890459181219776e-07, "loss": 0.7083, "step": 15105 }, { "epoch": 0.9692110327132777, "grad_norm": 1.2744464352233527, "learning_rate": 5.769748785463103e-07, "loss": 0.6397, "step": 15110 }, { "epoch": 0.9695317511225144, "grad_norm": 0.9272062316818276, "learning_rate": 5.650284479769008e-07, "loss": 0.7676, "step": 15115 }, { "epoch": 0.9698524695317511, "grad_norm": 0.7995773908927787, "learning_rate": 5.532066413864834e-07, "loss": 0.6971, "step": 15120 }, { "epoch": 0.9701731879409878, "grad_norm": 0.38586358236871543, "learning_rate": 5.415094735915838e-07, "loss": 0.6707, "step": 15125 }, { "epoch": 0.9704939063502245, "grad_norm": 0.9134739108193013, "learning_rate": 5.299369592524972e-07, "loss": 0.7099, "step": 15130 }, { "epoch": 0.9708146247594612, "grad_norm": 1.1214413150852183, "learning_rate": 5.184891128733216e-07, "loss": 0.5773, "step": 15135 }, { "epoch": 0.9711353431686979, "grad_norm": 0.9080341063196368, "learning_rate": 5.071659488018688e-07, "loss": 0.5541, "step": 15140 }, { "epoch": 0.9714560615779346, "grad_norm": 0.6396326113379124, "learning_rate": 4.959674812297089e-07, "loss": 0.7547, "step": 15145 }, { "epoch": 0.9717767799871713, "grad_norm": 0.6247330527268826, "learning_rate": 4.848937241921369e-07, "loss": 0.7347, "step": 15150 }, { "epoch": 0.9720974983964079, "grad_norm": 0.7413180396760661, "learning_rate": 4.7394469156810674e-07, "loss": 0.6324, "step": 15155 }, { "epoch": 0.9724182168056447, "grad_norm": 0.8191285127812412, "learning_rate": 4.6312039708028553e-07, "loss": 0.6501, "step": 15160 }, { "epoch": 0.9727389352148813, "grad_norm": 1.5646180696875727, "learning_rate": 4.5242085429499923e-07, "loss": 0.7018, "step": 15165 }, { "epoch": 0.9730596536241181, "grad_norm": 1.05700452006374, "learning_rate": 4.4184607662220987e-07, "loss": 0.702, "step": 15170 }, { "epoch": 0.9733803720333547, "grad_norm": 0.6341783140741876, "learning_rate": 4.313960773155046e-07, "loss": 0.636, "step": 15175 }, { "epoch": 0.9737010904425915, "grad_norm": 0.7888859139283535, "learning_rate": 4.2107086947209553e-07, "loss": 0.6313, "step": 15180 }, { "epoch": 0.9740218088518281, "grad_norm": 0.9191085670941561, "learning_rate": 4.1087046603279777e-07, "loss": 0.6221, "step": 15185 }, { "epoch": 0.9743425272610647, "grad_norm": 0.747755641512419, "learning_rate": 4.007948797819738e-07, "loss": 0.7214, "step": 15190 }, { "epoch": 0.9746632456703015, "grad_norm": 0.977703835187041, "learning_rate": 3.90844123347589e-07, "loss": 0.6226, "step": 15195 }, { "epoch": 0.9749839640795381, "grad_norm": 1.0760333069724886, "learning_rate": 3.8101820920114494e-07, "loss": 0.5479, "step": 15200 }, { "epoch": 0.9753046824887749, "grad_norm": 0.6944511489853861, "learning_rate": 3.713171496576573e-07, "loss": 0.5499, "step": 15205 }, { "epoch": 0.9756254008980115, "grad_norm": 0.8427188819091052, "learning_rate": 3.617409568756669e-07, "loss": 0.7567, "step": 15210 }, { "epoch": 0.9759461193072483, "grad_norm": 0.8552901758457413, "learning_rate": 3.5228964285722864e-07, "loss": 0.5683, "step": 15215 }, { "epoch": 0.9762668377164849, "grad_norm": 1.3132456382472737, "learning_rate": 3.429632194478782e-07, "loss": 0.6284, "step": 15220 }, { "epoch": 0.9765875561257216, "grad_norm": 0.7318279273617357, "learning_rate": 3.337616983366321e-07, "loss": 0.5582, "step": 15225 }, { "epoch": 0.9769082745349583, "grad_norm": 0.6573550653291185, "learning_rate": 3.246850910559318e-07, "loss": 0.5491, "step": 15230 }, { "epoch": 0.977228992944195, "grad_norm": 0.8242113768294678, "learning_rate": 3.157334089816888e-07, "loss": 0.7255, "step": 15235 }, { "epoch": 0.9775497113534317, "grad_norm": 0.9030228435778539, "learning_rate": 3.0690666333325067e-07, "loss": 0.5873, "step": 15240 }, { "epoch": 0.9778704297626684, "grad_norm": 0.565513303166446, "learning_rate": 2.9820486517335713e-07, "loss": 0.598, "step": 15245 }, { "epoch": 0.9781911481719051, "grad_norm": 0.6147817142778307, "learning_rate": 2.896280254081618e-07, "loss": 0.7145, "step": 15250 }, { "epoch": 0.9785118665811418, "grad_norm": 0.8743323298527471, "learning_rate": 2.811761547871994e-07, "loss": 0.6756, "step": 15255 }, { "epoch": 0.9788325849903784, "grad_norm": 1.1307500659483494, "learning_rate": 2.728492639033742e-07, "loss": 0.6188, "step": 15260 }, { "epoch": 0.9791533033996151, "grad_norm": 0.7125463266714677, "learning_rate": 2.6464736319297136e-07, "loss": 0.6278, "step": 15265 }, { "epoch": 0.9794740218088518, "grad_norm": 0.5910469031411075, "learning_rate": 2.5657046293560137e-07, "loss": 0.6905, "step": 15270 }, { "epoch": 0.9797947402180885, "grad_norm": 0.7878661937473239, "learning_rate": 2.4861857325421123e-07, "loss": 0.7325, "step": 15275 }, { "epoch": 0.9801154586273252, "grad_norm": 0.8286733473521487, "learning_rate": 2.4079170411507315e-07, "loss": 0.7773, "step": 15280 }, { "epoch": 0.9804361770365619, "grad_norm": 0.9685767265903029, "learning_rate": 2.3308986532778464e-07, "loss": 0.646, "step": 15285 }, { "epoch": 0.9807568954457986, "grad_norm": 0.9361901486165769, "learning_rate": 2.255130665452243e-07, "loss": 0.6598, "step": 15290 }, { "epoch": 0.9810776138550352, "grad_norm": 0.9598712869867538, "learning_rate": 2.180613172635404e-07, "loss": 0.5625, "step": 15295 }, { "epoch": 0.981398332264272, "grad_norm": 0.53570267588639, "learning_rate": 2.1073462682217325e-07, "loss": 0.5784, "step": 15300 }, { "epoch": 0.9817190506735086, "grad_norm": 0.7566088957917948, "learning_rate": 2.0353300440382194e-07, "loss": 0.6119, "step": 15305 }, { "epoch": 0.9820397690827454, "grad_norm": 1.146329754716512, "learning_rate": 1.9645645903444422e-07, "loss": 0.7188, "step": 15310 }, { "epoch": 0.982360487491982, "grad_norm": 0.8370973588336825, "learning_rate": 1.895049995832232e-07, "loss": 0.7563, "step": 15315 }, { "epoch": 0.9826812059012188, "grad_norm": 0.9434580889772379, "learning_rate": 1.8267863476255643e-07, "loss": 0.7839, "step": 15320 }, { "epoch": 0.9830019243104554, "grad_norm": 0.8804750628505544, "learning_rate": 1.7597737312810004e-07, "loss": 0.4332, "step": 15325 }, { "epoch": 0.9833226427196922, "grad_norm": 0.7320489722005881, "learning_rate": 1.694012230786579e-07, "loss": 0.7652, "step": 15330 }, { "epoch": 0.9836433611289288, "grad_norm": 0.7366362970085942, "learning_rate": 1.6295019285628154e-07, "loss": 0.7341, "step": 15335 }, { "epoch": 0.9839640795381654, "grad_norm": 1.0140709106862729, "learning_rate": 1.5662429054618122e-07, "loss": 0.4945, "step": 15340 }, { "epoch": 0.9842847979474022, "grad_norm": 1.9484887809729772, "learning_rate": 1.504235240767371e-07, "loss": 0.6308, "step": 15345 }, { "epoch": 0.9846055163566388, "grad_norm": 0.9619117197899885, "learning_rate": 1.4434790121951036e-07, "loss": 0.6099, "step": 15350 }, { "epoch": 0.9849262347658756, "grad_norm": 0.9949706333975902, "learning_rate": 1.3839742958920987e-07, "loss": 0.5725, "step": 15355 }, { "epoch": 0.9852469531751122, "grad_norm": 0.9242186083511401, "learning_rate": 1.3257211664368106e-07, "loss": 0.6308, "step": 15360 }, { "epoch": 0.985567671584349, "grad_norm": 1.0782239190960032, "learning_rate": 1.2687196968392822e-07, "loss": 0.6935, "step": 15365 }, { "epoch": 0.9858883899935856, "grad_norm": 0.8111644243864005, "learning_rate": 1.2129699585404774e-07, "loss": 0.7241, "step": 15370 }, { "epoch": 0.9862091084028223, "grad_norm": 0.7276347564310323, "learning_rate": 1.1584720214129485e-07, "loss": 0.6842, "step": 15375 }, { "epoch": 0.986529826812059, "grad_norm": 1.036558622735431, "learning_rate": 1.1052259537599474e-07, "loss": 0.7109, "step": 15380 }, { "epoch": 0.9868505452212957, "grad_norm": 0.8442723448288622, "learning_rate": 1.053231822315981e-07, "loss": 0.5197, "step": 15385 }, { "epoch": 0.9871712636305324, "grad_norm": 0.7755592771907561, "learning_rate": 1.0024896922464777e-07, "loss": 0.5958, "step": 15390 }, { "epoch": 0.9874919820397691, "grad_norm": 1.0235862204819772, "learning_rate": 9.529996271475661e-08, "loss": 0.7323, "step": 15395 }, { "epoch": 0.9878127004490058, "grad_norm": 0.6802556392432448, "learning_rate": 9.047616890461852e-08, "loss": 0.6661, "step": 15400 }, { "epoch": 0.9881334188582425, "grad_norm": 0.7642842609623561, "learning_rate": 8.57775938399974e-08, "loss": 0.6418, "step": 15405 }, { "epoch": 0.9884541372674791, "grad_norm": 0.7629833080692018, "learning_rate": 8.1204243409716e-08, "loss": 0.71, "step": 15410 }, { "epoch": 0.9887748556767159, "grad_norm": 0.8028551912844719, "learning_rate": 7.675612334566706e-08, "loss": 0.6261, "step": 15415 }, { "epoch": 0.9890955740859525, "grad_norm": 0.8568280018874693, "learning_rate": 7.24332392227578e-08, "loss": 0.7818, "step": 15420 }, { "epoch": 0.9894162924951893, "grad_norm": 0.9435010043749265, "learning_rate": 6.823559645896538e-08, "loss": 0.7135, "step": 15425 }, { "epoch": 0.9897370109044259, "grad_norm": 0.8536947904193946, "learning_rate": 6.416320031527035e-08, "loss": 0.6909, "step": 15430 }, { "epoch": 0.9900577293136626, "grad_norm": 0.6375751055715156, "learning_rate": 6.02160558957121e-08, "loss": 0.7567, "step": 15435 }, { "epoch": 0.9903784477228993, "grad_norm": 0.722851635446421, "learning_rate": 5.639416814731124e-08, "loss": 0.595, "step": 15440 }, { "epoch": 0.9906991661321359, "grad_norm": 0.6530835942019998, "learning_rate": 5.269754186013609e-08, "loss": 0.6185, "step": 15445 }, { "epoch": 0.9910198845413727, "grad_norm": 1.0508657841447764, "learning_rate": 4.912618166723615e-08, "loss": 0.5615, "step": 15450 }, { "epoch": 0.9913406029506093, "grad_norm": 0.89657789016663, "learning_rate": 4.5680092044686486e-08, "loss": 0.686, "step": 15455 }, { "epoch": 0.9916613213598461, "grad_norm": 1.0049970608249212, "learning_rate": 4.235927731153222e-08, "loss": 0.5976, "step": 15460 }, { "epoch": 0.9919820397690827, "grad_norm": 0.5955235189985802, "learning_rate": 3.916374162983294e-08, "loss": 0.4921, "step": 15465 }, { "epoch": 0.9923027581783195, "grad_norm": 1.0006472782878193, "learning_rate": 3.6093489004618286e-08, "loss": 0.6268, "step": 15470 }, { "epoch": 0.9926234765875561, "grad_norm": 0.7931648933621266, "learning_rate": 3.314852328389906e-08, "loss": 0.6005, "step": 15475 }, { "epoch": 0.9929441949967928, "grad_norm": 0.9041277771423232, "learning_rate": 3.032884815866721e-08, "loss": 0.5324, "step": 15480 }, { "epoch": 0.9932649134060295, "grad_norm": 0.9494072939119311, "learning_rate": 2.7634467162873657e-08, "loss": 0.7065, "step": 15485 }, { "epoch": 0.9935856318152662, "grad_norm": 0.6280167222373476, "learning_rate": 2.506538367345046e-08, "loss": 0.6061, "step": 15490 }, { "epoch": 0.9939063502245029, "grad_norm": 0.9951079606352037, "learning_rate": 2.2621600910288644e-08, "loss": 0.6444, "step": 15495 }, { "epoch": 0.9942270686337396, "grad_norm": 0.5695988637172767, "learning_rate": 2.0303121936227077e-08, "loss": 0.5318, "step": 15500 }, { "epoch": 0.9945477870429763, "grad_norm": 0.7104107796380682, "learning_rate": 1.8109949657074687e-08, "loss": 0.584, "step": 15505 }, { "epoch": 0.994868505452213, "grad_norm": 0.9519239668806431, "learning_rate": 1.6042086821566048e-08, "loss": 0.6069, "step": 15510 }, { "epoch": 0.9951892238614497, "grad_norm": 1.0531482909821168, "learning_rate": 1.409953602140579e-08, "loss": 0.6419, "step": 15515 }, { "epoch": 0.9955099422706863, "grad_norm": 0.8960669638227693, "learning_rate": 1.2282299691235289e-08, "loss": 0.6139, "step": 15520 }, { "epoch": 0.995830660679923, "grad_norm": 1.4364607207448494, "learning_rate": 1.059038010863267e-08, "loss": 0.557, "step": 15525 }, { "epoch": 0.9961513790891597, "grad_norm": 0.7870715712258225, "learning_rate": 9.02377939412391e-09, "loss": 0.6829, "step": 15530 }, { "epoch": 0.9964720974983964, "grad_norm": 0.6681758560958523, "learning_rate": 7.582499511160635e-09, "loss": 0.6894, "step": 15535 }, { "epoch": 0.9967928159076331, "grad_norm": 0.7692932903463889, "learning_rate": 6.266542266120112e-09, "loss": 0.6775, "step": 15540 }, { "epoch": 0.9971135343168698, "grad_norm": 1.2971219190629335, "learning_rate": 5.0759093083385665e-09, "loss": 0.6272, "step": 15545 }, { "epoch": 0.9974342527261065, "grad_norm": 0.6500496471556959, "learning_rate": 4.010602130033458e-09, "loss": 0.6068, "step": 15550 }, { "epoch": 0.9977549711353432, "grad_norm": 0.686167526298323, "learning_rate": 3.0706220664034057e-09, "loss": 0.6119, "step": 15555 }, { "epoch": 0.9980756895445798, "grad_norm": 0.7818918449822959, "learning_rate": 2.255970295539367e-09, "loss": 0.6275, "step": 15560 }, { "epoch": 0.9983964079538166, "grad_norm": 0.7349503006612832, "learning_rate": 1.5666478384579464e-09, "loss": 0.7661, "step": 15565 }, { "epoch": 0.9987171263630532, "grad_norm": 0.7447321528022689, "learning_rate": 1.0026555591013952e-09, "loss": 0.7204, "step": 15570 }, { "epoch": 0.99903784477229, "grad_norm": 0.5813111087659052, "learning_rate": 5.639941643376112e-10, "loss": 0.6803, "step": 15575 }, { "epoch": 0.9993585631815266, "grad_norm": 0.7896016109360797, "learning_rate": 2.5066420393793365e-10, "loss": 0.7841, "step": 15580 }, { "epoch": 0.9996792815907634, "grad_norm": 0.8103436457382939, "learning_rate": 6.266607062155316e-11, "loss": 0.5852, "step": 15585 }, { "epoch": 1.0, "grad_norm": 0.9796619082932287, "learning_rate": 0.0, "loss": 0.7238, "step": 15590 }, { "epoch": 1.0, "step": 15590, "total_flos": 1.6764562374852608e+16, "train_loss": 0.0, "train_runtime": 0.0156, "train_samples_per_second": 6662118.554, "train_steps_per_second": 104099.609 } ], "logging_steps": 5, "max_steps": 1624, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6764562374852608e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }