{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.999235928347058, "eval_steps": 500, "global_step": 35334, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001273452754902793, "grad_norm": 3.466355800628662, "learning_rate": 1.9999999011847117e-05, "loss": 0.1462, "step": 5 }, { "epoch": 0.002546905509805586, "grad_norm": 1.5165133476257324, "learning_rate": 1.9999996047388665e-05, "loss": 0.0884, "step": 10 }, { "epoch": 0.003820358264708379, "grad_norm": 1.0625194311141968, "learning_rate": 1.9999991106625233e-05, "loss": 0.0752, "step": 15 }, { "epoch": 0.005093811019611172, "grad_norm": 2.032374858856201, "learning_rate": 1.999998418955779e-05, "loss": 0.0673, "step": 20 }, { "epoch": 0.006367263774513966, "grad_norm": 1.020270824432373, "learning_rate": 1.9999975296187703e-05, "loss": 0.0558, "step": 25 }, { "epoch": 0.007640716529416758, "grad_norm": 1.215514898300171, "learning_rate": 1.999996442651674e-05, "loss": 0.0497, "step": 30 }, { "epoch": 0.008914169284319552, "grad_norm": 1.0068773031234741, "learning_rate": 1.9999951580547035e-05, "loss": 0.0505, "step": 35 }, { "epoch": 0.010187622039222344, "grad_norm": 1.7578357458114624, "learning_rate": 1.999993675828114e-05, "loss": 0.0606, "step": 40 }, { "epoch": 0.011461074794125137, "grad_norm": 1.3710927963256836, "learning_rate": 1.9999919959721977e-05, "loss": 0.0504, "step": 45 }, { "epoch": 0.012734527549027931, "grad_norm": 1.6805496215820312, "learning_rate": 1.9999901184872866e-05, "loss": 0.0758, "step": 50 }, { "epoch": 0.014007980303930724, "grad_norm": 1.1195398569107056, "learning_rate": 1.999988043373752e-05, "loss": 0.0449, "step": 55 }, { "epoch": 0.015281433058833517, "grad_norm": 1.8067474365234375, "learning_rate": 1.999985770632004e-05, "loss": 0.0571, "step": 60 }, { "epoch": 0.01655488581373631, "grad_norm": 1.0832034349441528, "learning_rate": 1.9999833002624916e-05, "loss": 0.062, "step": 65 }, { "epoch": 0.017828338568639104, "grad_norm": 0.5964593291282654, "learning_rate": 1.999980632265703e-05, "loss": 0.0435, "step": 70 }, { "epoch": 0.019101791323541898, "grad_norm": 0.7557662725448608, "learning_rate": 1.9999777666421655e-05, "loss": 0.0414, "step": 75 }, { "epoch": 0.02037524407844469, "grad_norm": 1.2284084558486938, "learning_rate": 1.999974703392446e-05, "loss": 0.045, "step": 80 }, { "epoch": 0.021648696833347483, "grad_norm": 1.0732548236846924, "learning_rate": 1.9999714425171487e-05, "loss": 0.0559, "step": 85 }, { "epoch": 0.022922149588250274, "grad_norm": 1.3787697553634644, "learning_rate": 1.9999679840169192e-05, "loss": 0.0551, "step": 90 }, { "epoch": 0.02419560234315307, "grad_norm": 1.5668123960494995, "learning_rate": 1.9999643278924404e-05, "loss": 0.0545, "step": 95 }, { "epoch": 0.025469055098055863, "grad_norm": 1.631627082824707, "learning_rate": 1.9999604741444346e-05, "loss": 0.0578, "step": 100 }, { "epoch": 0.026742507852958654, "grad_norm": 1.4916422367095947, "learning_rate": 1.999956422773664e-05, "loss": 0.0551, "step": 105 }, { "epoch": 0.028015960607861448, "grad_norm": 0.7466166019439697, "learning_rate": 1.9999521737809297e-05, "loss": 0.0399, "step": 110 }, { "epoch": 0.029289413362764243, "grad_norm": 4.368441581726074, "learning_rate": 1.9999477271670704e-05, "loss": 0.0869, "step": 115 }, { "epoch": 0.030562866117667033, "grad_norm": 0.8793487548828125, "learning_rate": 1.9999430829329653e-05, "loss": 0.0387, "step": 120 }, { "epoch": 0.031836318872569824, "grad_norm": 2.1954450607299805, "learning_rate": 1.9999382410795324e-05, "loss": 0.0699, "step": 125 }, { "epoch": 0.03310977162747262, "grad_norm": 0.7789003849029541, "learning_rate": 1.9999332016077283e-05, "loss": 0.0609, "step": 130 }, { "epoch": 0.03438322438237541, "grad_norm": 1.1194804906845093, "learning_rate": 1.9999279645185493e-05, "loss": 0.0415, "step": 135 }, { "epoch": 0.03565667713727821, "grad_norm": 1.6949464082717896, "learning_rate": 1.9999225298130296e-05, "loss": 0.0544, "step": 140 }, { "epoch": 0.036930129892181, "grad_norm": 0.9878089427947998, "learning_rate": 1.9999168974922446e-05, "loss": 0.0642, "step": 145 }, { "epoch": 0.038203582647083796, "grad_norm": 1.1248618364334106, "learning_rate": 1.9999110675573066e-05, "loss": 0.0572, "step": 150 }, { "epoch": 0.039477035401986584, "grad_norm": 1.9152965545654297, "learning_rate": 1.9999050400093674e-05, "loss": 0.0505, "step": 155 }, { "epoch": 0.04075048815688938, "grad_norm": 0.9803286790847778, "learning_rate": 1.9998988148496192e-05, "loss": 0.0302, "step": 160 }, { "epoch": 0.04202394091179217, "grad_norm": 0.9934186935424805, "learning_rate": 1.9998923920792914e-05, "loss": 0.0519, "step": 165 }, { "epoch": 0.04329739366669497, "grad_norm": 1.0393081903457642, "learning_rate": 1.999885771699654e-05, "loss": 0.0459, "step": 170 }, { "epoch": 0.04457084642159776, "grad_norm": 0.9923281669616699, "learning_rate": 1.9998789537120152e-05, "loss": 0.0424, "step": 175 }, { "epoch": 0.04584429917650055, "grad_norm": 0.9962205290794373, "learning_rate": 1.999871938117722e-05, "loss": 0.048, "step": 180 }, { "epoch": 0.04711775193140334, "grad_norm": 0.8948476314544678, "learning_rate": 1.9998647249181617e-05, "loss": 0.0294, "step": 185 }, { "epoch": 0.04839120468630614, "grad_norm": 0.8511443138122559, "learning_rate": 1.999857314114759e-05, "loss": 0.0435, "step": 190 }, { "epoch": 0.04966465744120893, "grad_norm": 1.1598560810089111, "learning_rate": 1.999849705708979e-05, "loss": 0.0646, "step": 195 }, { "epoch": 0.050938110196111726, "grad_norm": 1.0035039186477661, "learning_rate": 1.9998418997023252e-05, "loss": 0.0383, "step": 200 }, { "epoch": 0.05221156295101452, "grad_norm": 0.7833221554756165, "learning_rate": 1.9998338960963407e-05, "loss": 0.0377, "step": 205 }, { "epoch": 0.05348501570591731, "grad_norm": 0.9863020181655884, "learning_rate": 1.999825694892606e-05, "loss": 0.0485, "step": 210 }, { "epoch": 0.0547584684608201, "grad_norm": 1.6799232959747314, "learning_rate": 1.999817296092744e-05, "loss": 0.0423, "step": 215 }, { "epoch": 0.056031921215722896, "grad_norm": 1.6994788646697998, "learning_rate": 1.9998086996984128e-05, "loss": 0.0677, "step": 220 }, { "epoch": 0.05730537397062569, "grad_norm": 1.504747986793518, "learning_rate": 1.999799905711312e-05, "loss": 0.0368, "step": 225 }, { "epoch": 0.058578826725528485, "grad_norm": 1.6293238401412964, "learning_rate": 1.999790914133179e-05, "loss": 0.0569, "step": 230 }, { "epoch": 0.05985227948043128, "grad_norm": 0.5110263228416443, "learning_rate": 1.999781724965792e-05, "loss": 0.0369, "step": 235 }, { "epoch": 0.06112573223533407, "grad_norm": 1.0569170713424683, "learning_rate": 1.9997723382109657e-05, "loss": 0.0404, "step": 240 }, { "epoch": 0.06239918499023686, "grad_norm": 0.7583103775978088, "learning_rate": 1.9997627538705565e-05, "loss": 0.0317, "step": 245 }, { "epoch": 0.06367263774513965, "grad_norm": 0.8281612992286682, "learning_rate": 1.9997529719464573e-05, "loss": 0.0375, "step": 250 }, { "epoch": 0.06494609050004245, "grad_norm": 0.7210308313369751, "learning_rate": 1.999742992440602e-05, "loss": 0.047, "step": 255 }, { "epoch": 0.06621954325494524, "grad_norm": 1.027665615081787, "learning_rate": 1.999732815354963e-05, "loss": 0.0478, "step": 260 }, { "epoch": 0.06749299600984804, "grad_norm": 1.1793278455734253, "learning_rate": 1.9997224406915513e-05, "loss": 0.0532, "step": 265 }, { "epoch": 0.06876644876475083, "grad_norm": 0.7181020975112915, "learning_rate": 1.9997118684524172e-05, "loss": 0.0473, "step": 270 }, { "epoch": 0.07003990151965363, "grad_norm": 1.9288009405136108, "learning_rate": 1.99970109863965e-05, "loss": 0.0369, "step": 275 }, { "epoch": 0.07131335427455641, "grad_norm": 1.6436256170272827, "learning_rate": 1.9996901312553786e-05, "loss": 0.0483, "step": 280 }, { "epoch": 0.0725868070294592, "grad_norm": 1.5444090366363525, "learning_rate": 1.99967896630177e-05, "loss": 0.06, "step": 285 }, { "epoch": 0.073860259784362, "grad_norm": 1.482207179069519, "learning_rate": 1.9996676037810316e-05, "loss": 0.0375, "step": 290 }, { "epoch": 0.07513371253926479, "grad_norm": 0.8338231444358826, "learning_rate": 1.9996560436954077e-05, "loss": 0.0519, "step": 295 }, { "epoch": 0.07640716529416759, "grad_norm": 1.27544105052948, "learning_rate": 1.999644286047184e-05, "loss": 0.0361, "step": 300 }, { "epoch": 0.07768061804907038, "grad_norm": 0.7690101861953735, "learning_rate": 1.9996323308386837e-05, "loss": 0.033, "step": 305 }, { "epoch": 0.07895407080397317, "grad_norm": 2.2271475791931152, "learning_rate": 1.999620178072269e-05, "loss": 0.0396, "step": 310 }, { "epoch": 0.08022752355887597, "grad_norm": 1.2305625677108765, "learning_rate": 1.9996078277503433e-05, "loss": 0.0436, "step": 315 }, { "epoch": 0.08150097631377876, "grad_norm": 1.3137006759643555, "learning_rate": 1.9995952798753457e-05, "loss": 0.0552, "step": 320 }, { "epoch": 0.08277442906868156, "grad_norm": 1.2099368572235107, "learning_rate": 1.9995825344497567e-05, "loss": 0.0427, "step": 325 }, { "epoch": 0.08404788182358434, "grad_norm": 0.9281789660453796, "learning_rate": 1.999569591476095e-05, "loss": 0.036, "step": 330 }, { "epoch": 0.08532133457848713, "grad_norm": 0.573714554309845, "learning_rate": 1.9995564509569187e-05, "loss": 0.0315, "step": 335 }, { "epoch": 0.08659478733338993, "grad_norm": 1.0472278594970703, "learning_rate": 1.999543112894825e-05, "loss": 0.0332, "step": 340 }, { "epoch": 0.08786824008829272, "grad_norm": 1.353329062461853, "learning_rate": 1.9995295772924498e-05, "loss": 0.0509, "step": 345 }, { "epoch": 0.08914169284319552, "grad_norm": 0.8670082688331604, "learning_rate": 1.999515844152468e-05, "loss": 0.0473, "step": 350 }, { "epoch": 0.09041514559809831, "grad_norm": 1.1126822233200073, "learning_rate": 1.999501913477593e-05, "loss": 0.0401, "step": 355 }, { "epoch": 0.0916885983530011, "grad_norm": 1.1708083152770996, "learning_rate": 1.9994877852705792e-05, "loss": 0.04, "step": 360 }, { "epoch": 0.0929620511079039, "grad_norm": 1.1511286497116089, "learning_rate": 1.9994734595342182e-05, "loss": 0.0422, "step": 365 }, { "epoch": 0.09423550386280669, "grad_norm": 1.0753926038742065, "learning_rate": 1.9994589362713413e-05, "loss": 0.0323, "step": 370 }, { "epoch": 0.09550895661770949, "grad_norm": 1.1716687679290771, "learning_rate": 1.999444215484818e-05, "loss": 0.0524, "step": 375 }, { "epoch": 0.09678240937261227, "grad_norm": 1.5494014024734497, "learning_rate": 1.999429297177559e-05, "loss": 0.0465, "step": 380 }, { "epoch": 0.09805586212751508, "grad_norm": 1.4003781080245972, "learning_rate": 1.9994141813525118e-05, "loss": 0.0485, "step": 385 }, { "epoch": 0.09932931488241786, "grad_norm": 1.029611349105835, "learning_rate": 1.999398868012663e-05, "loss": 0.0451, "step": 390 }, { "epoch": 0.10060276763732065, "grad_norm": 1.2359493970870972, "learning_rate": 1.9993833571610405e-05, "loss": 0.0472, "step": 395 }, { "epoch": 0.10187622039222345, "grad_norm": 0.8043149709701538, "learning_rate": 1.9993676488007088e-05, "loss": 0.059, "step": 400 }, { "epoch": 0.10314967314712624, "grad_norm": 1.1136410236358643, "learning_rate": 1.9993517429347724e-05, "loss": 0.0388, "step": 405 }, { "epoch": 0.10442312590202904, "grad_norm": 1.3286417722702026, "learning_rate": 1.999335639566375e-05, "loss": 0.0365, "step": 410 }, { "epoch": 0.10569657865693183, "grad_norm": 1.38053560256958, "learning_rate": 1.999319338698699e-05, "loss": 0.0563, "step": 415 }, { "epoch": 0.10697003141183462, "grad_norm": 1.4296796321868896, "learning_rate": 1.999302840334966e-05, "loss": 0.038, "step": 420 }, { "epoch": 0.10824348416673742, "grad_norm": 0.7680894732475281, "learning_rate": 1.9992861444784366e-05, "loss": 0.0269, "step": 425 }, { "epoch": 0.1095169369216402, "grad_norm": 1.010419487953186, "learning_rate": 1.9992692511324104e-05, "loss": 0.0563, "step": 430 }, { "epoch": 0.110790389676543, "grad_norm": 1.2328169345855713, "learning_rate": 1.999252160300226e-05, "loss": 0.035, "step": 435 }, { "epoch": 0.11206384243144579, "grad_norm": 1.635837435722351, "learning_rate": 1.999234871985261e-05, "loss": 0.0743, "step": 440 }, { "epoch": 0.11333729518634858, "grad_norm": 1.3944053649902344, "learning_rate": 1.9992173861909323e-05, "loss": 0.0476, "step": 445 }, { "epoch": 0.11461074794125138, "grad_norm": 1.2933045625686646, "learning_rate": 1.9991997029206956e-05, "loss": 0.0423, "step": 450 }, { "epoch": 0.11588420069615417, "grad_norm": 0.9598836898803711, "learning_rate": 1.9991818221780456e-05, "loss": 0.0355, "step": 455 }, { "epoch": 0.11715765345105697, "grad_norm": 1.641486644744873, "learning_rate": 1.9991637439665158e-05, "loss": 0.0486, "step": 460 }, { "epoch": 0.11843110620595976, "grad_norm": 1.6942174434661865, "learning_rate": 1.9991454682896794e-05, "loss": 0.0246, "step": 465 }, { "epoch": 0.11970455896086256, "grad_norm": 1.0677307844161987, "learning_rate": 1.999126995151148e-05, "loss": 0.0478, "step": 470 }, { "epoch": 0.12097801171576535, "grad_norm": 0.7371054887771606, "learning_rate": 1.999108324554573e-05, "loss": 0.0465, "step": 475 }, { "epoch": 0.12225146447066813, "grad_norm": 0.942548930644989, "learning_rate": 1.999089456503643e-05, "loss": 0.0455, "step": 480 }, { "epoch": 0.12352491722557093, "grad_norm": 0.9453840851783752, "learning_rate": 1.9990703910020882e-05, "loss": 0.0366, "step": 485 }, { "epoch": 0.12479836998047372, "grad_norm": 1.0794521570205688, "learning_rate": 1.9990511280536764e-05, "loss": 0.0258, "step": 490 }, { "epoch": 0.1260718227353765, "grad_norm": 1.5811809301376343, "learning_rate": 1.999031667662214e-05, "loss": 0.0362, "step": 495 }, { "epoch": 0.1273452754902793, "grad_norm": 1.4914147853851318, "learning_rate": 1.9990120098315468e-05, "loss": 0.0343, "step": 500 }, { "epoch": 0.1286187282451821, "grad_norm": 1.1843833923339844, "learning_rate": 1.9989921545655606e-05, "loss": 0.0439, "step": 505 }, { "epoch": 0.1298921810000849, "grad_norm": 0.9355488419532776, "learning_rate": 1.998972101868179e-05, "loss": 0.0355, "step": 510 }, { "epoch": 0.1311656337549877, "grad_norm": 1.2830522060394287, "learning_rate": 1.998951851743365e-05, "loss": 0.0365, "step": 515 }, { "epoch": 0.13243908650989047, "grad_norm": 0.9342244863510132, "learning_rate": 1.99893140419512e-05, "loss": 0.0349, "step": 520 }, { "epoch": 0.1337125392647933, "grad_norm": 1.37283456325531, "learning_rate": 1.9989107592274863e-05, "loss": 0.0631, "step": 525 }, { "epoch": 0.13498599201969608, "grad_norm": 0.9996501207351685, "learning_rate": 1.9988899168445432e-05, "loss": 0.0428, "step": 530 }, { "epoch": 0.13625944477459886, "grad_norm": 0.9964371919631958, "learning_rate": 1.9988688770504104e-05, "loss": 0.0423, "step": 535 }, { "epoch": 0.13753289752950165, "grad_norm": 0.8892298340797424, "learning_rate": 1.998847639849245e-05, "loss": 0.0361, "step": 540 }, { "epoch": 0.13880635028440444, "grad_norm": 0.8595050573348999, "learning_rate": 1.9988262052452453e-05, "loss": 0.0465, "step": 545 }, { "epoch": 0.14007980303930725, "grad_norm": 1.472139835357666, "learning_rate": 1.998804573242646e-05, "loss": 0.0446, "step": 550 }, { "epoch": 0.14135325579421004, "grad_norm": 1.0584642887115479, "learning_rate": 1.9987827438457236e-05, "loss": 0.0421, "step": 555 }, { "epoch": 0.14262670854911283, "grad_norm": 1.0089914798736572, "learning_rate": 1.998760717058792e-05, "loss": 0.037, "step": 560 }, { "epoch": 0.14390016130401562, "grad_norm": 0.8372271656990051, "learning_rate": 1.998738492886204e-05, "loss": 0.0473, "step": 565 }, { "epoch": 0.1451736140589184, "grad_norm": 0.8600888252258301, "learning_rate": 1.9987160713323516e-05, "loss": 0.0336, "step": 570 }, { "epoch": 0.14644706681382122, "grad_norm": 0.9962368607521057, "learning_rate": 1.9986934524016665e-05, "loss": 0.0409, "step": 575 }, { "epoch": 0.147720519568724, "grad_norm": 2.3541266918182373, "learning_rate": 1.998670636098618e-05, "loss": 0.0533, "step": 580 }, { "epoch": 0.1489939723236268, "grad_norm": 0.867184042930603, "learning_rate": 1.9986476224277167e-05, "loss": 0.0372, "step": 585 }, { "epoch": 0.15026742507852958, "grad_norm": 1.7438790798187256, "learning_rate": 1.99862441139351e-05, "loss": 0.0493, "step": 590 }, { "epoch": 0.15154087783343237, "grad_norm": 0.7625421285629272, "learning_rate": 1.998601003000585e-05, "loss": 0.04, "step": 595 }, { "epoch": 0.15281433058833518, "grad_norm": 1.7328354120254517, "learning_rate": 1.998577397253568e-05, "loss": 0.0461, "step": 600 }, { "epoch": 0.15408778334323797, "grad_norm": 1.5769754648208618, "learning_rate": 1.9985535941571243e-05, "loss": 0.0672, "step": 605 }, { "epoch": 0.15536123609814076, "grad_norm": 1.1992998123168945, "learning_rate": 1.9985295937159583e-05, "loss": 0.0425, "step": 610 }, { "epoch": 0.15663468885304355, "grad_norm": 1.1076910495758057, "learning_rate": 1.9985053959348128e-05, "loss": 0.0422, "step": 615 }, { "epoch": 0.15790814160794633, "grad_norm": 1.1742002964019775, "learning_rate": 1.9984810008184706e-05, "loss": 0.0279, "step": 620 }, { "epoch": 0.15918159436284915, "grad_norm": 0.8382532596588135, "learning_rate": 1.9984564083717524e-05, "loss": 0.0523, "step": 625 }, { "epoch": 0.16045504711775194, "grad_norm": 1.3141670227050781, "learning_rate": 1.9984316185995186e-05, "loss": 0.0352, "step": 630 }, { "epoch": 0.16172849987265472, "grad_norm": 1.1528003215789795, "learning_rate": 1.9984066315066686e-05, "loss": 0.0449, "step": 635 }, { "epoch": 0.1630019526275575, "grad_norm": 1.2248966693878174, "learning_rate": 1.9983814470981403e-05, "loss": 0.0481, "step": 640 }, { "epoch": 0.1642754053824603, "grad_norm": 1.101231336593628, "learning_rate": 1.998356065378911e-05, "loss": 0.0575, "step": 645 }, { "epoch": 0.16554885813736311, "grad_norm": 1.130606770515442, "learning_rate": 1.998330486353997e-05, "loss": 0.033, "step": 650 }, { "epoch": 0.1668223108922659, "grad_norm": 0.8856629133224487, "learning_rate": 1.9983047100284534e-05, "loss": 0.0375, "step": 655 }, { "epoch": 0.1680957636471687, "grad_norm": 1.0314744710922241, "learning_rate": 1.9982787364073743e-05, "loss": 0.0408, "step": 660 }, { "epoch": 0.16936921640207148, "grad_norm": 0.9101190567016602, "learning_rate": 1.9982525654958932e-05, "loss": 0.041, "step": 665 }, { "epoch": 0.17064266915697426, "grad_norm": 0.9854530692100525, "learning_rate": 1.9982261972991822e-05, "loss": 0.0391, "step": 670 }, { "epoch": 0.17191612191187708, "grad_norm": 1.0130863189697266, "learning_rate": 1.9981996318224524e-05, "loss": 0.0433, "step": 675 }, { "epoch": 0.17318957466677987, "grad_norm": 1.400662899017334, "learning_rate": 1.9981728690709536e-05, "loss": 0.031, "step": 680 }, { "epoch": 0.17446302742168265, "grad_norm": 1.1092907190322876, "learning_rate": 1.9981459090499754e-05, "loss": 0.0339, "step": 685 }, { "epoch": 0.17573648017658544, "grad_norm": 1.1013586521148682, "learning_rate": 1.9981187517648457e-05, "loss": 0.0299, "step": 690 }, { "epoch": 0.17700993293148823, "grad_norm": 0.785093367099762, "learning_rate": 1.998091397220932e-05, "loss": 0.0292, "step": 695 }, { "epoch": 0.17828338568639104, "grad_norm": 1.2091752290725708, "learning_rate": 1.9980638454236397e-05, "loss": 0.0407, "step": 700 }, { "epoch": 0.17955683844129383, "grad_norm": 0.9583770036697388, "learning_rate": 1.9980360963784143e-05, "loss": 0.0403, "step": 705 }, { "epoch": 0.18083029119619662, "grad_norm": 0.8641273379325867, "learning_rate": 1.9980081500907397e-05, "loss": 0.0466, "step": 710 }, { "epoch": 0.1821037439510994, "grad_norm": 0.9726065397262573, "learning_rate": 1.9979800065661396e-05, "loss": 0.0268, "step": 715 }, { "epoch": 0.1833771967060022, "grad_norm": 1.8063558340072632, "learning_rate": 1.9979516658101748e-05, "loss": 0.0481, "step": 720 }, { "epoch": 0.184650649460905, "grad_norm": 0.9426751136779785, "learning_rate": 1.9979231278284474e-05, "loss": 0.0388, "step": 725 }, { "epoch": 0.1859241022158078, "grad_norm": 0.8457497358322144, "learning_rate": 1.9978943926265965e-05, "loss": 0.0287, "step": 730 }, { "epoch": 0.18719755497071058, "grad_norm": 0.8229825496673584, "learning_rate": 1.997865460210302e-05, "loss": 0.0372, "step": 735 }, { "epoch": 0.18847100772561337, "grad_norm": 1.2559150457382202, "learning_rate": 1.997836330585281e-05, "loss": 0.0389, "step": 740 }, { "epoch": 0.18974446048051619, "grad_norm": 1.4974751472473145, "learning_rate": 1.997807003757291e-05, "loss": 0.0329, "step": 745 }, { "epoch": 0.19101791323541897, "grad_norm": 0.9951743483543396, "learning_rate": 1.997777479732127e-05, "loss": 0.0464, "step": 750 }, { "epoch": 0.19229136599032176, "grad_norm": 1.2213815450668335, "learning_rate": 1.9977477585156252e-05, "loss": 0.0394, "step": 755 }, { "epoch": 0.19356481874522455, "grad_norm": 0.8556913137435913, "learning_rate": 1.9977178401136585e-05, "loss": 0.0417, "step": 760 }, { "epoch": 0.19483827150012734, "grad_norm": 1.0053701400756836, "learning_rate": 1.9976877245321398e-05, "loss": 0.0389, "step": 765 }, { "epoch": 0.19611172425503015, "grad_norm": 1.630068063735962, "learning_rate": 1.9976574117770207e-05, "loss": 0.0365, "step": 770 }, { "epoch": 0.19738517700993294, "grad_norm": 1.2757736444473267, "learning_rate": 1.9976269018542926e-05, "loss": 0.0311, "step": 775 }, { "epoch": 0.19865862976483573, "grad_norm": 0.9120242595672607, "learning_rate": 1.9975961947699848e-05, "loss": 0.0554, "step": 780 }, { "epoch": 0.1999320825197385, "grad_norm": 0.943709135055542, "learning_rate": 1.9975652905301656e-05, "loss": 0.0363, "step": 785 }, { "epoch": 0.2012055352746413, "grad_norm": 0.9797458648681641, "learning_rate": 1.997534189140943e-05, "loss": 0.0301, "step": 790 }, { "epoch": 0.20247898802954412, "grad_norm": 0.6385541558265686, "learning_rate": 1.9975028906084636e-05, "loss": 0.0392, "step": 795 }, { "epoch": 0.2037524407844469, "grad_norm": 0.8503642678260803, "learning_rate": 1.997471394938913e-05, "loss": 0.0375, "step": 800 }, { "epoch": 0.2050258935393497, "grad_norm": 1.6555705070495605, "learning_rate": 1.997439702138515e-05, "loss": 0.0422, "step": 805 }, { "epoch": 0.20629934629425248, "grad_norm": 1.2707176208496094, "learning_rate": 1.9974078122135343e-05, "loss": 0.0561, "step": 810 }, { "epoch": 0.20757279904915527, "grad_norm": 1.1288506984710693, "learning_rate": 1.9973757251702722e-05, "loss": 0.0364, "step": 815 }, { "epoch": 0.20884625180405808, "grad_norm": 0.8244511485099792, "learning_rate": 1.9973434410150708e-05, "loss": 0.0426, "step": 820 }, { "epoch": 0.21011970455896087, "grad_norm": 1.2886103391647339, "learning_rate": 1.99731095975431e-05, "loss": 0.0487, "step": 825 }, { "epoch": 0.21139315731386366, "grad_norm": 0.7319647669792175, "learning_rate": 1.9972782813944097e-05, "loss": 0.0385, "step": 830 }, { "epoch": 0.21266661006876644, "grad_norm": 0.7454867362976074, "learning_rate": 1.9972454059418273e-05, "loss": 0.0366, "step": 835 }, { "epoch": 0.21394006282366923, "grad_norm": 0.9279243350028992, "learning_rate": 1.9972123334030605e-05, "loss": 0.0413, "step": 840 }, { "epoch": 0.21521351557857205, "grad_norm": 0.7262492179870605, "learning_rate": 1.9971790637846455e-05, "loss": 0.0357, "step": 845 }, { "epoch": 0.21648696833347483, "grad_norm": 1.064719796180725, "learning_rate": 1.9971455970931574e-05, "loss": 0.046, "step": 850 }, { "epoch": 0.21776042108837762, "grad_norm": 1.3461787700653076, "learning_rate": 1.9971119333352094e-05, "loss": 0.0441, "step": 855 }, { "epoch": 0.2190338738432804, "grad_norm": 1.149779200553894, "learning_rate": 1.997078072517456e-05, "loss": 0.0321, "step": 860 }, { "epoch": 0.2203073265981832, "grad_norm": 0.8859708905220032, "learning_rate": 1.9970440146465878e-05, "loss": 0.0368, "step": 865 }, { "epoch": 0.221580779353086, "grad_norm": 1.0382089614868164, "learning_rate": 1.9970097597293366e-05, "loss": 0.0257, "step": 870 }, { "epoch": 0.2228542321079888, "grad_norm": 1.0994497537612915, "learning_rate": 1.9969753077724717e-05, "loss": 0.0321, "step": 875 }, { "epoch": 0.22412768486289159, "grad_norm": 1.0020182132720947, "learning_rate": 1.9969406587828016e-05, "loss": 0.0375, "step": 880 }, { "epoch": 0.22540113761779437, "grad_norm": 1.3102329969406128, "learning_rate": 1.9969058127671745e-05, "loss": 0.0399, "step": 885 }, { "epoch": 0.22667459037269716, "grad_norm": 0.9331737160682678, "learning_rate": 1.9968707697324773e-05, "loss": 0.0362, "step": 890 }, { "epoch": 0.22794804312759998, "grad_norm": 0.8416597843170166, "learning_rate": 1.996835529685635e-05, "loss": 0.0375, "step": 895 }, { "epoch": 0.22922149588250276, "grad_norm": 1.2889596223831177, "learning_rate": 1.9968000926336118e-05, "loss": 0.041, "step": 900 }, { "epoch": 0.23049494863740555, "grad_norm": 1.1857105493545532, "learning_rate": 1.9967644585834124e-05, "loss": 0.0315, "step": 905 }, { "epoch": 0.23176840139230834, "grad_norm": 1.1093685626983643, "learning_rate": 1.9967286275420782e-05, "loss": 0.0296, "step": 910 }, { "epoch": 0.23304185414721112, "grad_norm": 0.8947302103042603, "learning_rate": 1.9966925995166903e-05, "loss": 0.0453, "step": 915 }, { "epoch": 0.23431530690211394, "grad_norm": 0.9531531929969788, "learning_rate": 1.9966563745143698e-05, "loss": 0.025, "step": 920 }, { "epoch": 0.23558875965701673, "grad_norm": 1.402091383934021, "learning_rate": 1.9966199525422753e-05, "loss": 0.0591, "step": 925 }, { "epoch": 0.23686221241191951, "grad_norm": 1.341934323310852, "learning_rate": 1.996583333607605e-05, "loss": 0.0397, "step": 930 }, { "epoch": 0.2381356651668223, "grad_norm": 1.5161454677581787, "learning_rate": 1.996546517717596e-05, "loss": 0.0366, "step": 935 }, { "epoch": 0.23940911792172512, "grad_norm": 1.5505576133728027, "learning_rate": 1.9965095048795243e-05, "loss": 0.0297, "step": 940 }, { "epoch": 0.2406825706766279, "grad_norm": 1.8009617328643799, "learning_rate": 1.9964722951007043e-05, "loss": 0.0354, "step": 945 }, { "epoch": 0.2419560234315307, "grad_norm": 0.9997638463973999, "learning_rate": 1.9964348883884904e-05, "loss": 0.0324, "step": 950 }, { "epoch": 0.24322947618643348, "grad_norm": 0.9650585055351257, "learning_rate": 1.996397284750275e-05, "loss": 0.0327, "step": 955 }, { "epoch": 0.24450292894133627, "grad_norm": 0.9496198892593384, "learning_rate": 1.99635948419349e-05, "loss": 0.0421, "step": 960 }, { "epoch": 0.24577638169623908, "grad_norm": 0.8485792279243469, "learning_rate": 1.996321486725606e-05, "loss": 0.0545, "step": 965 }, { "epoch": 0.24704983445114187, "grad_norm": 1.0837414264678955, "learning_rate": 1.9962832923541312e-05, "loss": 0.0305, "step": 970 }, { "epoch": 0.24832328720604466, "grad_norm": 1.2013055086135864, "learning_rate": 1.996244901086616e-05, "loss": 0.0277, "step": 975 }, { "epoch": 0.24959673996094744, "grad_norm": 0.9578385949134827, "learning_rate": 1.996206312930646e-05, "loss": 0.0407, "step": 980 }, { "epoch": 0.25087019271585026, "grad_norm": 1.4288047552108765, "learning_rate": 1.9961675278938484e-05, "loss": 0.0441, "step": 985 }, { "epoch": 0.252143645470753, "grad_norm": 0.9257775545120239, "learning_rate": 1.996128545983888e-05, "loss": 0.0377, "step": 990 }, { "epoch": 0.25341709822565583, "grad_norm": 1.0245916843414307, "learning_rate": 1.9960893672084687e-05, "loss": 0.0409, "step": 995 }, { "epoch": 0.2546905509805586, "grad_norm": 2.2781169414520264, "learning_rate": 1.9960499915753334e-05, "loss": 0.0398, "step": 1000 }, { "epoch": 0.2559640037354614, "grad_norm": 1.0442769527435303, "learning_rate": 1.9960104190922642e-05, "loss": 0.0438, "step": 1005 }, { "epoch": 0.2572374564903642, "grad_norm": 1.449857234954834, "learning_rate": 1.9959706497670814e-05, "loss": 0.0397, "step": 1010 }, { "epoch": 0.258510909245267, "grad_norm": 1.0809216499328613, "learning_rate": 1.9959306836076454e-05, "loss": 0.0501, "step": 1015 }, { "epoch": 0.2597843620001698, "grad_norm": 1.4394184350967407, "learning_rate": 1.995890520621854e-05, "loss": 0.0368, "step": 1020 }, { "epoch": 0.26105781475507256, "grad_norm": 1.4633712768554688, "learning_rate": 1.9958501608176447e-05, "loss": 0.04, "step": 1025 }, { "epoch": 0.2623312675099754, "grad_norm": 1.0799405574798584, "learning_rate": 1.995809604202994e-05, "loss": 0.0307, "step": 1030 }, { "epoch": 0.2636047202648782, "grad_norm": 1.0910381078720093, "learning_rate": 1.9957688507859174e-05, "loss": 0.0338, "step": 1035 }, { "epoch": 0.26487817301978095, "grad_norm": 0.7546371817588806, "learning_rate": 1.995727900574469e-05, "loss": 0.0378, "step": 1040 }, { "epoch": 0.26615162577468376, "grad_norm": 0.9852445125579834, "learning_rate": 1.9956867535767413e-05, "loss": 0.0405, "step": 1045 }, { "epoch": 0.2674250785295866, "grad_norm": 1.3188453912734985, "learning_rate": 1.9956454098008663e-05, "loss": 0.0345, "step": 1050 }, { "epoch": 0.26869853128448934, "grad_norm": 1.349234700202942, "learning_rate": 1.995603869255015e-05, "loss": 0.028, "step": 1055 }, { "epoch": 0.26997198403939215, "grad_norm": 1.5641494989395142, "learning_rate": 1.9955621319473973e-05, "loss": 0.049, "step": 1060 }, { "epoch": 0.2712454367942949, "grad_norm": 0.9970291256904602, "learning_rate": 1.9955201978862612e-05, "loss": 0.0394, "step": 1065 }, { "epoch": 0.27251888954919773, "grad_norm": 1.6527056694030762, "learning_rate": 1.9954780670798946e-05, "loss": 0.0404, "step": 1070 }, { "epoch": 0.27379234230410054, "grad_norm": 2.0670340061187744, "learning_rate": 1.995435739536624e-05, "loss": 0.0579, "step": 1075 }, { "epoch": 0.2750657950590033, "grad_norm": 1.1267619132995605, "learning_rate": 1.995393215264814e-05, "loss": 0.0517, "step": 1080 }, { "epoch": 0.2763392478139061, "grad_norm": 1.0311594009399414, "learning_rate": 1.9953504942728692e-05, "loss": 0.036, "step": 1085 }, { "epoch": 0.2776127005688089, "grad_norm": 1.7238144874572754, "learning_rate": 1.9953075765692325e-05, "loss": 0.0331, "step": 1090 }, { "epoch": 0.2788861533237117, "grad_norm": 0.7893922924995422, "learning_rate": 1.9952644621623853e-05, "loss": 0.048, "step": 1095 }, { "epoch": 0.2801596060786145, "grad_norm": 1.251125693321228, "learning_rate": 1.9952211510608488e-05, "loss": 0.0254, "step": 1100 }, { "epoch": 0.28143305883351727, "grad_norm": 0.893530011177063, "learning_rate": 1.9951776432731828e-05, "loss": 0.0336, "step": 1105 }, { "epoch": 0.2827065115884201, "grad_norm": 1.3466256856918335, "learning_rate": 1.995133938807985e-05, "loss": 0.0349, "step": 1110 }, { "epoch": 0.28397996434332284, "grad_norm": 0.8901265263557434, "learning_rate": 1.9950900376738932e-05, "loss": 0.0435, "step": 1115 }, { "epoch": 0.28525341709822566, "grad_norm": 1.3367873430252075, "learning_rate": 1.995045939879584e-05, "loss": 0.0436, "step": 1120 }, { "epoch": 0.2865268698531285, "grad_norm": 1.7295572757720947, "learning_rate": 1.9950016454337716e-05, "loss": 0.047, "step": 1125 }, { "epoch": 0.28780032260803123, "grad_norm": 1.1019790172576904, "learning_rate": 1.9949571543452107e-05, "loss": 0.0401, "step": 1130 }, { "epoch": 0.28907377536293405, "grad_norm": 1.268428087234497, "learning_rate": 1.994912466622694e-05, "loss": 0.037, "step": 1135 }, { "epoch": 0.2903472281178368, "grad_norm": 1.0059044361114502, "learning_rate": 1.9948675822750524e-05, "loss": 0.0473, "step": 1140 }, { "epoch": 0.2916206808727396, "grad_norm": 1.3931198120117188, "learning_rate": 1.9948225013111574e-05, "loss": 0.0406, "step": 1145 }, { "epoch": 0.29289413362764244, "grad_norm": 1.0314961671829224, "learning_rate": 1.994777223739918e-05, "loss": 0.0354, "step": 1150 }, { "epoch": 0.2941675863825452, "grad_norm": 2.1518166065216064, "learning_rate": 1.9947317495702818e-05, "loss": 0.0523, "step": 1155 }, { "epoch": 0.295441039137448, "grad_norm": 1.6452319622039795, "learning_rate": 1.994686078811237e-05, "loss": 0.046, "step": 1160 }, { "epoch": 0.2967144918923508, "grad_norm": 0.8489959836006165, "learning_rate": 1.9946402114718088e-05, "loss": 0.0403, "step": 1165 }, { "epoch": 0.2979879446472536, "grad_norm": 1.2215946912765503, "learning_rate": 1.9945941475610623e-05, "loss": 0.0355, "step": 1170 }, { "epoch": 0.2992613974021564, "grad_norm": 0.6831943988800049, "learning_rate": 1.994547887088101e-05, "loss": 0.0279, "step": 1175 }, { "epoch": 0.30053485015705916, "grad_norm": 0.8251394033432007, "learning_rate": 1.9945014300620675e-05, "loss": 0.0378, "step": 1180 }, { "epoch": 0.301808302911962, "grad_norm": 1.2710734605789185, "learning_rate": 1.9944547764921428e-05, "loss": 0.0505, "step": 1185 }, { "epoch": 0.30308175566686474, "grad_norm": 1.2387505769729614, "learning_rate": 1.9944079263875476e-05, "loss": 0.0352, "step": 1190 }, { "epoch": 0.30435520842176755, "grad_norm": 0.9863851070404053, "learning_rate": 1.9943608797575407e-05, "loss": 0.0294, "step": 1195 }, { "epoch": 0.30562866117667037, "grad_norm": 0.8506487011909485, "learning_rate": 1.9943136366114197e-05, "loss": 0.0238, "step": 1200 }, { "epoch": 0.30690211393157313, "grad_norm": 1.0137815475463867, "learning_rate": 1.9942661969585216e-05, "loss": 0.0317, "step": 1205 }, { "epoch": 0.30817556668647594, "grad_norm": 1.3041102886199951, "learning_rate": 1.9942185608082218e-05, "loss": 0.0399, "step": 1210 }, { "epoch": 0.3094490194413787, "grad_norm": 1.2289561033248901, "learning_rate": 1.994170728169934e-05, "loss": 0.0509, "step": 1215 }, { "epoch": 0.3107224721962815, "grad_norm": 1.085206389427185, "learning_rate": 1.994122699053113e-05, "loss": 0.0319, "step": 1220 }, { "epoch": 0.31199592495118433, "grad_norm": 1.0497052669525146, "learning_rate": 1.9940744734672496e-05, "loss": 0.0301, "step": 1225 }, { "epoch": 0.3132693777060871, "grad_norm": 1.1205796003341675, "learning_rate": 1.9940260514218747e-05, "loss": 0.0354, "step": 1230 }, { "epoch": 0.3145428304609899, "grad_norm": 0.43276360630989075, "learning_rate": 1.9939774329265583e-05, "loss": 0.03, "step": 1235 }, { "epoch": 0.31581628321589267, "grad_norm": 0.74505615234375, "learning_rate": 1.993928617990909e-05, "loss": 0.0303, "step": 1240 }, { "epoch": 0.3170897359707955, "grad_norm": 1.3216359615325928, "learning_rate": 1.9938796066245737e-05, "loss": 0.0397, "step": 1245 }, { "epoch": 0.3183631887256983, "grad_norm": 1.205856442451477, "learning_rate": 1.993830398837239e-05, "loss": 0.0453, "step": 1250 }, { "epoch": 0.31963664148060106, "grad_norm": 1.228124737739563, "learning_rate": 1.9937809946386292e-05, "loss": 0.0435, "step": 1255 }, { "epoch": 0.3209100942355039, "grad_norm": 1.0026150941848755, "learning_rate": 1.993731394038509e-05, "loss": 0.0335, "step": 1260 }, { "epoch": 0.32218354699040663, "grad_norm": 1.5915141105651855, "learning_rate": 1.9936815970466803e-05, "loss": 0.05, "step": 1265 }, { "epoch": 0.32345699974530945, "grad_norm": 0.8936623930931091, "learning_rate": 1.9936316036729847e-05, "loss": 0.0411, "step": 1270 }, { "epoch": 0.32473045250021226, "grad_norm": 1.153505802154541, "learning_rate": 1.9935814139273024e-05, "loss": 0.0383, "step": 1275 }, { "epoch": 0.326003905255115, "grad_norm": 1.1003351211547852, "learning_rate": 1.9935310278195526e-05, "loss": 0.0304, "step": 1280 }, { "epoch": 0.32727735801001784, "grad_norm": 1.0994418859481812, "learning_rate": 1.993480445359693e-05, "loss": 0.0337, "step": 1285 }, { "epoch": 0.3285508107649206, "grad_norm": 1.386269450187683, "learning_rate": 1.99342966655772e-05, "loss": 0.0477, "step": 1290 }, { "epoch": 0.3298242635198234, "grad_norm": 1.170512318611145, "learning_rate": 1.9933786914236694e-05, "loss": 0.0407, "step": 1295 }, { "epoch": 0.33109771627472623, "grad_norm": 1.3741734027862549, "learning_rate": 1.9933275199676155e-05, "loss": 0.0382, "step": 1300 }, { "epoch": 0.332371169029629, "grad_norm": 1.2107757329940796, "learning_rate": 1.9932761521996712e-05, "loss": 0.0362, "step": 1305 }, { "epoch": 0.3336446217845318, "grad_norm": 1.48622727394104, "learning_rate": 1.9932245881299884e-05, "loss": 0.0403, "step": 1310 }, { "epoch": 0.33491807453943456, "grad_norm": 1.4917116165161133, "learning_rate": 1.9931728277687572e-05, "loss": 0.0429, "step": 1315 }, { "epoch": 0.3361915272943374, "grad_norm": 1.9983936548233032, "learning_rate": 1.993120871126208e-05, "loss": 0.0331, "step": 1320 }, { "epoch": 0.3374649800492402, "grad_norm": 0.8884562849998474, "learning_rate": 1.9930687182126083e-05, "loss": 0.0415, "step": 1325 }, { "epoch": 0.33873843280414295, "grad_norm": 1.410744071006775, "learning_rate": 1.993016369038265e-05, "loss": 0.0258, "step": 1330 }, { "epoch": 0.34001188555904577, "grad_norm": 1.1505494117736816, "learning_rate": 1.9929638236135246e-05, "loss": 0.0414, "step": 1335 }, { "epoch": 0.3412853383139485, "grad_norm": 1.2576680183410645, "learning_rate": 1.9929110819487713e-05, "loss": 0.051, "step": 1340 }, { "epoch": 0.34255879106885134, "grad_norm": 0.7476629018783569, "learning_rate": 1.9928581440544284e-05, "loss": 0.0265, "step": 1345 }, { "epoch": 0.34383224382375416, "grad_norm": 1.6660497188568115, "learning_rate": 1.9928050099409582e-05, "loss": 0.0456, "step": 1350 }, { "epoch": 0.3451056965786569, "grad_norm": 1.1117491722106934, "learning_rate": 1.9927516796188614e-05, "loss": 0.0422, "step": 1355 }, { "epoch": 0.34637914933355973, "grad_norm": 1.1893068552017212, "learning_rate": 1.9926981530986776e-05, "loss": 0.0324, "step": 1360 }, { "epoch": 0.3476526020884625, "grad_norm": 0.8570944666862488, "learning_rate": 1.992644430390986e-05, "loss": 0.0323, "step": 1365 }, { "epoch": 0.3489260548433653, "grad_norm": 1.5169267654418945, "learning_rate": 1.992590511506403e-05, "loss": 0.0384, "step": 1370 }, { "epoch": 0.3501995075982681, "grad_norm": 1.2102770805358887, "learning_rate": 1.9925363964555854e-05, "loss": 0.0356, "step": 1375 }, { "epoch": 0.3514729603531709, "grad_norm": 1.27725088596344, "learning_rate": 1.9924820852492272e-05, "loss": 0.0497, "step": 1380 }, { "epoch": 0.3527464131080737, "grad_norm": 1.0390578508377075, "learning_rate": 1.9924275778980623e-05, "loss": 0.0327, "step": 1385 }, { "epoch": 0.35401986586297646, "grad_norm": 1.5946261882781982, "learning_rate": 1.9923728744128633e-05, "loss": 0.0479, "step": 1390 }, { "epoch": 0.3552933186178793, "grad_norm": 1.684098243713379, "learning_rate": 1.9923179748044406e-05, "loss": 0.0349, "step": 1395 }, { "epoch": 0.3565667713727821, "grad_norm": 1.2027508020401, "learning_rate": 1.992262879083645e-05, "loss": 0.0336, "step": 1400 }, { "epoch": 0.35784022412768485, "grad_norm": 1.2734794616699219, "learning_rate": 1.9922075872613642e-05, "loss": 0.0278, "step": 1405 }, { "epoch": 0.35911367688258766, "grad_norm": 1.5442482233047485, "learning_rate": 1.9921520993485263e-05, "loss": 0.034, "step": 1410 }, { "epoch": 0.3603871296374904, "grad_norm": 0.997783899307251, "learning_rate": 1.9920964153560967e-05, "loss": 0.033, "step": 1415 }, { "epoch": 0.36166058239239324, "grad_norm": 1.2022120952606201, "learning_rate": 1.9920405352950806e-05, "loss": 0.0446, "step": 1420 }, { "epoch": 0.36293403514729605, "grad_norm": 1.127021312713623, "learning_rate": 1.991984459176522e-05, "loss": 0.039, "step": 1425 }, { "epoch": 0.3642074879021988, "grad_norm": 0.7869279980659485, "learning_rate": 1.9919281870115024e-05, "loss": 0.0437, "step": 1430 }, { "epoch": 0.3654809406571016, "grad_norm": 1.0355610847473145, "learning_rate": 1.9918717188111434e-05, "loss": 0.0286, "step": 1435 }, { "epoch": 0.3667543934120044, "grad_norm": 1.276656150817871, "learning_rate": 1.991815054586605e-05, "loss": 0.041, "step": 1440 }, { "epoch": 0.3680278461669072, "grad_norm": 0.7893885970115662, "learning_rate": 1.9917581943490852e-05, "loss": 0.0368, "step": 1445 }, { "epoch": 0.36930129892181, "grad_norm": 1.126471996307373, "learning_rate": 1.9917011381098225e-05, "loss": 0.0292, "step": 1450 }, { "epoch": 0.3705747516767128, "grad_norm": 1.547493577003479, "learning_rate": 1.9916438858800913e-05, "loss": 0.0429, "step": 1455 }, { "epoch": 0.3718482044316156, "grad_norm": 0.9190046191215515, "learning_rate": 1.9915864376712077e-05, "loss": 0.037, "step": 1460 }, { "epoch": 0.3731216571865184, "grad_norm": 1.3909088373184204, "learning_rate": 1.9915287934945247e-05, "loss": 0.0331, "step": 1465 }, { "epoch": 0.37439510994142117, "grad_norm": 1.3110547065734863, "learning_rate": 1.9914709533614347e-05, "loss": 0.0361, "step": 1470 }, { "epoch": 0.375668562696324, "grad_norm": 1.1418012380599976, "learning_rate": 1.9914129172833684e-05, "loss": 0.0361, "step": 1475 }, { "epoch": 0.37694201545122674, "grad_norm": 1.949927806854248, "learning_rate": 1.991354685271796e-05, "loss": 0.0404, "step": 1480 }, { "epoch": 0.37821546820612956, "grad_norm": 1.9466365575790405, "learning_rate": 1.9912962573382254e-05, "loss": 0.0455, "step": 1485 }, { "epoch": 0.37948892096103237, "grad_norm": 0.971671462059021, "learning_rate": 1.9912376334942042e-05, "loss": 0.0271, "step": 1490 }, { "epoch": 0.38076237371593513, "grad_norm": 1.4761439561843872, "learning_rate": 1.991178813751318e-05, "loss": 0.0359, "step": 1495 }, { "epoch": 0.38203582647083795, "grad_norm": 1.090395212173462, "learning_rate": 1.9911197981211913e-05, "loss": 0.0356, "step": 1500 }, { "epoch": 0.3833092792257407, "grad_norm": 1.2824065685272217, "learning_rate": 1.9910605866154874e-05, "loss": 0.0506, "step": 1505 }, { "epoch": 0.3845827319806435, "grad_norm": 1.1579900979995728, "learning_rate": 1.9910011792459086e-05, "loss": 0.0395, "step": 1510 }, { "epoch": 0.38585618473554634, "grad_norm": 1.1031315326690674, "learning_rate": 1.9909415760241954e-05, "loss": 0.0351, "step": 1515 }, { "epoch": 0.3871296374904491, "grad_norm": 0.9843181371688843, "learning_rate": 1.9908817769621273e-05, "loss": 0.0355, "step": 1520 }, { "epoch": 0.3884030902453519, "grad_norm": 1.0832459926605225, "learning_rate": 1.9908217820715224e-05, "loss": 0.041, "step": 1525 }, { "epoch": 0.38967654300025467, "grad_norm": 0.6370882987976074, "learning_rate": 1.990761591364237e-05, "loss": 0.0372, "step": 1530 }, { "epoch": 0.3909499957551575, "grad_norm": 0.5911097526550293, "learning_rate": 1.990701204852168e-05, "loss": 0.0273, "step": 1535 }, { "epoch": 0.3922234485100603, "grad_norm": 1.5890871286392212, "learning_rate": 1.9906406225472482e-05, "loss": 0.0412, "step": 1540 }, { "epoch": 0.39349690126496306, "grad_norm": 1.023247480392456, "learning_rate": 1.9905798444614516e-05, "loss": 0.0338, "step": 1545 }, { "epoch": 0.3947703540198659, "grad_norm": 1.4817147254943848, "learning_rate": 1.9905188706067885e-05, "loss": 0.0349, "step": 1550 }, { "epoch": 0.39604380677476864, "grad_norm": 1.5372731685638428, "learning_rate": 1.9904577009953104e-05, "loss": 0.0373, "step": 1555 }, { "epoch": 0.39731725952967145, "grad_norm": 1.8084254264831543, "learning_rate": 1.9903963356391057e-05, "loss": 0.0326, "step": 1560 }, { "epoch": 0.39859071228457427, "grad_norm": 1.1608226299285889, "learning_rate": 1.9903347745503026e-05, "loss": 0.0363, "step": 1565 }, { "epoch": 0.399864165039477, "grad_norm": 1.1610970497131348, "learning_rate": 1.990273017741067e-05, "loss": 0.0401, "step": 1570 }, { "epoch": 0.40113761779437984, "grad_norm": 0.9726022481918335, "learning_rate": 1.9902110652236035e-05, "loss": 0.0461, "step": 1575 }, { "epoch": 0.4024110705492826, "grad_norm": 1.094936728477478, "learning_rate": 1.9901489170101565e-05, "loss": 0.0478, "step": 1580 }, { "epoch": 0.4036845233041854, "grad_norm": 1.519685983657837, "learning_rate": 1.9900865731130087e-05, "loss": 0.0351, "step": 1585 }, { "epoch": 0.40495797605908823, "grad_norm": 1.186555027961731, "learning_rate": 1.9900240335444802e-05, "loss": 0.0337, "step": 1590 }, { "epoch": 0.406231428813991, "grad_norm": 1.236475944519043, "learning_rate": 1.9899612983169314e-05, "loss": 0.0532, "step": 1595 }, { "epoch": 0.4075048815688938, "grad_norm": 0.9421612024307251, "learning_rate": 1.9898983674427606e-05, "loss": 0.0414, "step": 1600 }, { "epoch": 0.40877833432379657, "grad_norm": 0.9245516657829285, "learning_rate": 1.9898352409344045e-05, "loss": 0.0314, "step": 1605 }, { "epoch": 0.4100517870786994, "grad_norm": 0.8419141173362732, "learning_rate": 1.989771918804339e-05, "loss": 0.0353, "step": 1610 }, { "epoch": 0.4113252398336022, "grad_norm": 1.3227187395095825, "learning_rate": 1.9897084010650788e-05, "loss": 0.0446, "step": 1615 }, { "epoch": 0.41259869258850496, "grad_norm": 0.9008974432945251, "learning_rate": 1.989644687729177e-05, "loss": 0.0385, "step": 1620 }, { "epoch": 0.41387214534340777, "grad_norm": 1.5303794145584106, "learning_rate": 1.9895807788092244e-05, "loss": 0.0318, "step": 1625 }, { "epoch": 0.41514559809831053, "grad_norm": 0.9204017519950867, "learning_rate": 1.9895166743178525e-05, "loss": 0.0419, "step": 1630 }, { "epoch": 0.41641905085321335, "grad_norm": 1.387237787246704, "learning_rate": 1.98945237426773e-05, "loss": 0.0305, "step": 1635 }, { "epoch": 0.41769250360811616, "grad_norm": 1.2721889019012451, "learning_rate": 1.989387878671564e-05, "loss": 0.0354, "step": 1640 }, { "epoch": 0.4189659563630189, "grad_norm": 1.065119981765747, "learning_rate": 1.989323187542101e-05, "loss": 0.0369, "step": 1645 }, { "epoch": 0.42023940911792174, "grad_norm": 0.7318292856216431, "learning_rate": 1.9892583008921263e-05, "loss": 0.0416, "step": 1650 }, { "epoch": 0.4215128618728245, "grad_norm": 1.427072286605835, "learning_rate": 1.9891932187344632e-05, "loss": 0.0335, "step": 1655 }, { "epoch": 0.4227863146277273, "grad_norm": 1.6212133169174194, "learning_rate": 1.989127941081974e-05, "loss": 0.0403, "step": 1660 }, { "epoch": 0.4240597673826301, "grad_norm": 1.6069116592407227, "learning_rate": 1.9890624679475598e-05, "loss": 0.0259, "step": 1665 }, { "epoch": 0.4253332201375329, "grad_norm": 1.0963709354400635, "learning_rate": 1.9889967993441597e-05, "loss": 0.0323, "step": 1670 }, { "epoch": 0.4266066728924357, "grad_norm": 1.169898509979248, "learning_rate": 1.988930935284752e-05, "loss": 0.0462, "step": 1675 }, { "epoch": 0.42788012564733846, "grad_norm": 1.5681673288345337, "learning_rate": 1.9888648757823535e-05, "loss": 0.0406, "step": 1680 }, { "epoch": 0.4291535784022413, "grad_norm": 1.466962456703186, "learning_rate": 1.9887986208500197e-05, "loss": 0.0478, "step": 1685 }, { "epoch": 0.4304270311571441, "grad_norm": 0.9215574264526367, "learning_rate": 1.988732170500844e-05, "loss": 0.0376, "step": 1690 }, { "epoch": 0.43170048391204685, "grad_norm": 1.0645416975021362, "learning_rate": 1.9886655247479596e-05, "loss": 0.0355, "step": 1695 }, { "epoch": 0.43297393666694967, "grad_norm": 1.1869854927062988, "learning_rate": 1.9885986836045376e-05, "loss": 0.0342, "step": 1700 }, { "epoch": 0.4342473894218524, "grad_norm": 0.7181638479232788, "learning_rate": 1.988531647083788e-05, "loss": 0.0367, "step": 1705 }, { "epoch": 0.43552084217675524, "grad_norm": 1.1184104681015015, "learning_rate": 1.988464415198959e-05, "loss": 0.0441, "step": 1710 }, { "epoch": 0.43679429493165806, "grad_norm": 1.2239627838134766, "learning_rate": 1.988396987963338e-05, "loss": 0.0461, "step": 1715 }, { "epoch": 0.4380677476865608, "grad_norm": 1.1527738571166992, "learning_rate": 1.98832936539025e-05, "loss": 0.0358, "step": 1720 }, { "epoch": 0.43934120044146363, "grad_norm": 1.220777988433838, "learning_rate": 1.98826154749306e-05, "loss": 0.0437, "step": 1725 }, { "epoch": 0.4406146531963664, "grad_norm": 0.9740056395530701, "learning_rate": 1.9881935342851706e-05, "loss": 0.0396, "step": 1730 }, { "epoch": 0.4418881059512692, "grad_norm": 1.093568205833435, "learning_rate": 1.9881253257800234e-05, "loss": 0.0416, "step": 1735 }, { "epoch": 0.443161558706172, "grad_norm": 0.8391780853271484, "learning_rate": 1.9880569219910982e-05, "loss": 0.037, "step": 1740 }, { "epoch": 0.4444350114610748, "grad_norm": 0.8893064260482788, "learning_rate": 1.987988322931914e-05, "loss": 0.0361, "step": 1745 }, { "epoch": 0.4457084642159776, "grad_norm": 1.3479975461959839, "learning_rate": 1.9879195286160283e-05, "loss": 0.0359, "step": 1750 }, { "epoch": 0.44698191697088036, "grad_norm": 1.2296359539031982, "learning_rate": 1.987850539057036e-05, "loss": 0.0445, "step": 1755 }, { "epoch": 0.44825536972578317, "grad_norm": 1.1338824033737183, "learning_rate": 1.9877813542685726e-05, "loss": 0.0384, "step": 1760 }, { "epoch": 0.449528822480686, "grad_norm": 1.4367777109146118, "learning_rate": 1.9877119742643108e-05, "loss": 0.0389, "step": 1765 }, { "epoch": 0.45080227523558875, "grad_norm": 1.1655769348144531, "learning_rate": 1.9876423990579615e-05, "loss": 0.0356, "step": 1770 }, { "epoch": 0.45207572799049156, "grad_norm": 0.5674152970314026, "learning_rate": 1.9875726286632762e-05, "loss": 0.0286, "step": 1775 }, { "epoch": 0.4533491807453943, "grad_norm": 1.3553210496902466, "learning_rate": 1.9875026630940426e-05, "loss": 0.0386, "step": 1780 }, { "epoch": 0.45462263350029714, "grad_norm": 1.4366786479949951, "learning_rate": 1.9874325023640885e-05, "loss": 0.04, "step": 1785 }, { "epoch": 0.45589608625519995, "grad_norm": 1.216312050819397, "learning_rate": 1.98736214648728e-05, "loss": 0.0368, "step": 1790 }, { "epoch": 0.4571695390101027, "grad_norm": 1.5136430263519287, "learning_rate": 1.9872915954775208e-05, "loss": 0.0313, "step": 1795 }, { "epoch": 0.4584429917650055, "grad_norm": 0.9734575748443604, "learning_rate": 1.9872208493487546e-05, "loss": 0.0265, "step": 1800 }, { "epoch": 0.4597164445199083, "grad_norm": 1.0922114849090576, "learning_rate": 1.9871499081149627e-05, "loss": 0.0324, "step": 1805 }, { "epoch": 0.4609898972748111, "grad_norm": 1.4529361724853516, "learning_rate": 1.9870787717901657e-05, "loss": 0.038, "step": 1810 }, { "epoch": 0.4622633500297139, "grad_norm": 1.2980631589889526, "learning_rate": 1.9870074403884215e-05, "loss": 0.0405, "step": 1815 }, { "epoch": 0.4635368027846167, "grad_norm": 1.1739604473114014, "learning_rate": 1.9869359139238282e-05, "loss": 0.0395, "step": 1820 }, { "epoch": 0.4648102555395195, "grad_norm": 1.748847484588623, "learning_rate": 1.986864192410521e-05, "loss": 0.0282, "step": 1825 }, { "epoch": 0.46608370829442225, "grad_norm": 1.5405009984970093, "learning_rate": 1.986792275862675e-05, "loss": 0.0439, "step": 1830 }, { "epoch": 0.46735716104932506, "grad_norm": 1.0016268491744995, "learning_rate": 1.9867201642945022e-05, "loss": 0.0307, "step": 1835 }, { "epoch": 0.4686306138042279, "grad_norm": 1.2169989347457886, "learning_rate": 1.9866478577202545e-05, "loss": 0.0385, "step": 1840 }, { "epoch": 0.46990406655913064, "grad_norm": 1.0618561506271362, "learning_rate": 1.9865753561542224e-05, "loss": 0.0349, "step": 1845 }, { "epoch": 0.47117751931403346, "grad_norm": 1.172888159751892, "learning_rate": 1.9865026596107336e-05, "loss": 0.03, "step": 1850 }, { "epoch": 0.47245097206893627, "grad_norm": 1.5621144771575928, "learning_rate": 1.986429768104155e-05, "loss": 0.0509, "step": 1855 }, { "epoch": 0.47372442482383903, "grad_norm": 1.3402786254882812, "learning_rate": 1.9863566816488933e-05, "loss": 0.0451, "step": 1860 }, { "epoch": 0.47499787757874185, "grad_norm": 1.4807661771774292, "learning_rate": 1.9862834002593915e-05, "loss": 0.0418, "step": 1865 }, { "epoch": 0.4762713303336446, "grad_norm": 1.6028817892074585, "learning_rate": 1.9862099239501332e-05, "loss": 0.0372, "step": 1870 }, { "epoch": 0.4775447830885474, "grad_norm": 1.2107126712799072, "learning_rate": 1.9861362527356386e-05, "loss": 0.0344, "step": 1875 }, { "epoch": 0.47881823584345024, "grad_norm": 1.7305898666381836, "learning_rate": 1.9860623866304682e-05, "loss": 0.0415, "step": 1880 }, { "epoch": 0.480091688598353, "grad_norm": 1.2759405374526978, "learning_rate": 1.9859883256492198e-05, "loss": 0.0459, "step": 1885 }, { "epoch": 0.4813651413532558, "grad_norm": 1.2357087135314941, "learning_rate": 1.98591406980653e-05, "loss": 0.0355, "step": 1890 }, { "epoch": 0.48263859410815857, "grad_norm": 1.2198361158370972, "learning_rate": 1.9858396191170744e-05, "loss": 0.04, "step": 1895 }, { "epoch": 0.4839120468630614, "grad_norm": 1.943733811378479, "learning_rate": 1.9857649735955667e-05, "loss": 0.0307, "step": 1900 }, { "epoch": 0.4851854996179642, "grad_norm": 1.0657562017440796, "learning_rate": 1.9856901332567587e-05, "loss": 0.0353, "step": 1905 }, { "epoch": 0.48645895237286696, "grad_norm": 1.4122188091278076, "learning_rate": 1.9856150981154415e-05, "loss": 0.0326, "step": 1910 }, { "epoch": 0.4877324051277698, "grad_norm": 1.3346021175384521, "learning_rate": 1.985539868186444e-05, "loss": 0.0265, "step": 1915 }, { "epoch": 0.48900585788267253, "grad_norm": 1.3799077272415161, "learning_rate": 1.9854644434846344e-05, "loss": 0.0286, "step": 1920 }, { "epoch": 0.49027931063757535, "grad_norm": 1.3778138160705566, "learning_rate": 1.985388824024919e-05, "loss": 0.0375, "step": 1925 }, { "epoch": 0.49155276339247816, "grad_norm": 0.6485750675201416, "learning_rate": 1.985313009822242e-05, "loss": 0.0248, "step": 1930 }, { "epoch": 0.4928262161473809, "grad_norm": 1.3397918939590454, "learning_rate": 1.9852370008915867e-05, "loss": 0.0398, "step": 1935 }, { "epoch": 0.49409966890228374, "grad_norm": 1.2807776927947998, "learning_rate": 1.985160797247975e-05, "loss": 0.0439, "step": 1940 }, { "epoch": 0.4953731216571865, "grad_norm": 0.8738381862640381, "learning_rate": 1.985084398906467e-05, "loss": 0.0353, "step": 1945 }, { "epoch": 0.4966465744120893, "grad_norm": 1.1986572742462158, "learning_rate": 1.9850078058821615e-05, "loss": 0.0474, "step": 1950 }, { "epoch": 0.49792002716699213, "grad_norm": 1.4015617370605469, "learning_rate": 1.9849310181901952e-05, "loss": 0.0363, "step": 1955 }, { "epoch": 0.4991934799218949, "grad_norm": 0.9313846230506897, "learning_rate": 1.9848540358457443e-05, "loss": 0.0361, "step": 1960 }, { "epoch": 0.5004669326767976, "grad_norm": 0.9862497448921204, "learning_rate": 1.9847768588640223e-05, "loss": 0.0363, "step": 1965 }, { "epoch": 0.5017403854317005, "grad_norm": 1.333674669265747, "learning_rate": 1.984699487260282e-05, "loss": 0.0431, "step": 1970 }, { "epoch": 0.5030138381866033, "grad_norm": 1.1792182922363281, "learning_rate": 1.984621921049814e-05, "loss": 0.0393, "step": 1975 }, { "epoch": 0.504287290941506, "grad_norm": 1.344711184501648, "learning_rate": 1.984544160247949e-05, "loss": 0.0366, "step": 1980 }, { "epoch": 0.5055607436964089, "grad_norm": 0.7210054397583008, "learning_rate": 1.9844662048700535e-05, "loss": 0.0323, "step": 1985 }, { "epoch": 0.5068341964513117, "grad_norm": 0.8848633766174316, "learning_rate": 1.984388054931534e-05, "loss": 0.0408, "step": 1990 }, { "epoch": 0.5081076492062144, "grad_norm": 0.9046012163162231, "learning_rate": 1.9843097104478365e-05, "loss": 0.0294, "step": 1995 }, { "epoch": 0.5093811019611172, "grad_norm": 0.5804577469825745, "learning_rate": 1.9842311714344428e-05, "loss": 0.0212, "step": 2000 }, { "epoch": 0.5106545547160201, "grad_norm": 1.1361442804336548, "learning_rate": 1.9841524379068757e-05, "loss": 0.0384, "step": 2005 }, { "epoch": 0.5119280074709228, "grad_norm": 1.0004663467407227, "learning_rate": 1.9840735098806946e-05, "loss": 0.0416, "step": 2010 }, { "epoch": 0.5132014602258256, "grad_norm": 1.5288423299789429, "learning_rate": 1.9839943873714987e-05, "loss": 0.0382, "step": 2015 }, { "epoch": 0.5144749129807284, "grad_norm": 2.2737574577331543, "learning_rate": 1.983915070394925e-05, "loss": 0.0399, "step": 2020 }, { "epoch": 0.5157483657356312, "grad_norm": 1.0503613948822021, "learning_rate": 1.9838355589666482e-05, "loss": 0.0395, "step": 2025 }, { "epoch": 0.517021818490534, "grad_norm": 1.2248318195343018, "learning_rate": 1.983755853102383e-05, "loss": 0.0419, "step": 2030 }, { "epoch": 0.5182952712454368, "grad_norm": 1.0895376205444336, "learning_rate": 1.983675952817881e-05, "loss": 0.0306, "step": 2035 }, { "epoch": 0.5195687240003396, "grad_norm": 1.6180204153060913, "learning_rate": 1.983595858128934e-05, "loss": 0.0418, "step": 2040 }, { "epoch": 0.5208421767552424, "grad_norm": 0.7777670621871948, "learning_rate": 1.98351556905137e-05, "loss": 0.0249, "step": 2045 }, { "epoch": 0.5221156295101451, "grad_norm": 1.6059905290603638, "learning_rate": 1.9834350856010572e-05, "loss": 0.039, "step": 2050 }, { "epoch": 0.523389082265048, "grad_norm": 0.8104087114334106, "learning_rate": 1.9833544077939018e-05, "loss": 0.0442, "step": 2055 }, { "epoch": 0.5246625350199507, "grad_norm": 1.5324957370758057, "learning_rate": 1.9832735356458474e-05, "loss": 0.0332, "step": 2060 }, { "epoch": 0.5259359877748535, "grad_norm": 1.2856297492980957, "learning_rate": 1.983192469172878e-05, "loss": 0.0352, "step": 2065 }, { "epoch": 0.5272094405297564, "grad_norm": 1.383690595626831, "learning_rate": 1.9831112083910133e-05, "loss": 0.0419, "step": 2070 }, { "epoch": 0.5284828932846591, "grad_norm": 1.0936410427093506, "learning_rate": 1.983029753316314e-05, "loss": 0.0244, "step": 2075 }, { "epoch": 0.5297563460395619, "grad_norm": 1.3171464204788208, "learning_rate": 1.982948103964878e-05, "loss": 0.0394, "step": 2080 }, { "epoch": 0.5310297987944648, "grad_norm": 1.6192882061004639, "learning_rate": 1.982866260352842e-05, "loss": 0.0334, "step": 2085 }, { "epoch": 0.5323032515493675, "grad_norm": 1.348465919494629, "learning_rate": 1.9827842224963794e-05, "loss": 0.0398, "step": 2090 }, { "epoch": 0.5335767043042703, "grad_norm": 1.8514788150787354, "learning_rate": 1.982701990411705e-05, "loss": 0.0463, "step": 2095 }, { "epoch": 0.5348501570591732, "grad_norm": 0.5796524882316589, "learning_rate": 1.9826195641150693e-05, "loss": 0.0257, "step": 2100 }, { "epoch": 0.5361236098140759, "grad_norm": 0.9356208443641663, "learning_rate": 1.982536943622763e-05, "loss": 0.0359, "step": 2105 }, { "epoch": 0.5373970625689787, "grad_norm": 1.366754412651062, "learning_rate": 1.9824541289511137e-05, "loss": 0.0373, "step": 2110 }, { "epoch": 0.5386705153238814, "grad_norm": 1.4289402961730957, "learning_rate": 1.982371120116489e-05, "loss": 0.0325, "step": 2115 }, { "epoch": 0.5399439680787843, "grad_norm": 1.7749016284942627, "learning_rate": 1.982287917135293e-05, "loss": 0.0391, "step": 2120 }, { "epoch": 0.5412174208336871, "grad_norm": 1.0387176275253296, "learning_rate": 1.9822045200239697e-05, "loss": 0.0435, "step": 2125 }, { "epoch": 0.5424908735885898, "grad_norm": 2.0169737339019775, "learning_rate": 1.9821209287990013e-05, "loss": 0.0346, "step": 2130 }, { "epoch": 0.5437643263434927, "grad_norm": 0.8469727635383606, "learning_rate": 1.9820371434769074e-05, "loss": 0.0323, "step": 2135 }, { "epoch": 0.5450377790983955, "grad_norm": 0.9233104586601257, "learning_rate": 1.9819531640742468e-05, "loss": 0.036, "step": 2140 }, { "epoch": 0.5463112318532982, "grad_norm": 1.201727271080017, "learning_rate": 1.981868990607616e-05, "loss": 0.0334, "step": 2145 }, { "epoch": 0.5475846846082011, "grad_norm": 1.0059622526168823, "learning_rate": 1.9817846230936504e-05, "loss": 0.0344, "step": 2150 }, { "epoch": 0.5488581373631038, "grad_norm": 1.1997060775756836, "learning_rate": 1.981700061549024e-05, "loss": 0.0342, "step": 2155 }, { "epoch": 0.5501315901180066, "grad_norm": 0.9228559136390686, "learning_rate": 1.9816153059904487e-05, "loss": 0.0296, "step": 2160 }, { "epoch": 0.5514050428729094, "grad_norm": 1.1175479888916016, "learning_rate": 1.981530356434674e-05, "loss": 0.0382, "step": 2165 }, { "epoch": 0.5526784956278122, "grad_norm": 1.6349087953567505, "learning_rate": 1.9814452128984897e-05, "loss": 0.0391, "step": 2170 }, { "epoch": 0.553951948382715, "grad_norm": 1.4246751070022583, "learning_rate": 1.981359875398722e-05, "loss": 0.0444, "step": 2175 }, { "epoch": 0.5552254011376178, "grad_norm": 1.3589873313903809, "learning_rate": 1.9812743439522362e-05, "loss": 0.0376, "step": 2180 }, { "epoch": 0.5564988538925206, "grad_norm": 1.2454755306243896, "learning_rate": 1.9811886185759364e-05, "loss": 0.0378, "step": 2185 }, { "epoch": 0.5577723066474234, "grad_norm": 1.0729985237121582, "learning_rate": 1.981102699286764e-05, "loss": 0.0352, "step": 2190 }, { "epoch": 0.5590457594023261, "grad_norm": 0.9496451020240784, "learning_rate": 1.9810165861016997e-05, "loss": 0.0321, "step": 2195 }, { "epoch": 0.560319212157229, "grad_norm": 1.0008935928344727, "learning_rate": 1.980930279037762e-05, "loss": 0.0361, "step": 2200 }, { "epoch": 0.5615926649121318, "grad_norm": 1.4709370136260986, "learning_rate": 1.9808437781120075e-05, "loss": 0.0317, "step": 2205 }, { "epoch": 0.5628661176670345, "grad_norm": 1.225254774093628, "learning_rate": 1.980757083341532e-05, "loss": 0.0293, "step": 2210 }, { "epoch": 0.5641395704219373, "grad_norm": 1.3001477718353271, "learning_rate": 1.980670194743468e-05, "loss": 0.0361, "step": 2215 }, { "epoch": 0.5654130231768402, "grad_norm": 0.999884307384491, "learning_rate": 1.980583112334989e-05, "loss": 0.0315, "step": 2220 }, { "epoch": 0.5666864759317429, "grad_norm": 0.9319555759429932, "learning_rate": 1.9804958361333037e-05, "loss": 0.0374, "step": 2225 }, { "epoch": 0.5679599286866457, "grad_norm": 1.168541431427002, "learning_rate": 1.9804083661556607e-05, "loss": 0.0383, "step": 2230 }, { "epoch": 0.5692333814415486, "grad_norm": 1.2035280466079712, "learning_rate": 1.9803207024193476e-05, "loss": 0.0263, "step": 2235 }, { "epoch": 0.5705068341964513, "grad_norm": 1.3974257707595825, "learning_rate": 1.980232844941689e-05, "loss": 0.0362, "step": 2240 }, { "epoch": 0.5717802869513541, "grad_norm": 1.1986823081970215, "learning_rate": 1.980144793740048e-05, "loss": 0.0358, "step": 2245 }, { "epoch": 0.573053739706257, "grad_norm": 1.503623127937317, "learning_rate": 1.980056548831826e-05, "loss": 0.0393, "step": 2250 }, { "epoch": 0.5743271924611597, "grad_norm": 1.2667828798294067, "learning_rate": 1.979968110234464e-05, "loss": 0.0419, "step": 2255 }, { "epoch": 0.5756006452160625, "grad_norm": 1.1705119609832764, "learning_rate": 1.9798794779654388e-05, "loss": 0.0282, "step": 2260 }, { "epoch": 0.5768740979709652, "grad_norm": 0.9133334755897522, "learning_rate": 1.979790652042268e-05, "loss": 0.0303, "step": 2265 }, { "epoch": 0.5781475507258681, "grad_norm": 1.17423677444458, "learning_rate": 1.9797016324825054e-05, "loss": 0.0391, "step": 2270 }, { "epoch": 0.5794210034807709, "grad_norm": 1.0737946033477783, "learning_rate": 1.9796124193037444e-05, "loss": 0.0541, "step": 2275 }, { "epoch": 0.5806944562356736, "grad_norm": 1.3645612001419067, "learning_rate": 1.9795230125236164e-05, "loss": 0.0442, "step": 2280 }, { "epoch": 0.5819679089905765, "grad_norm": 0.7566941380500793, "learning_rate": 1.9794334121597906e-05, "loss": 0.028, "step": 2285 }, { "epoch": 0.5832413617454792, "grad_norm": 1.1330622434616089, "learning_rate": 1.979343618229975e-05, "loss": 0.0343, "step": 2290 }, { "epoch": 0.584514814500382, "grad_norm": 0.918860137462616, "learning_rate": 1.9792536307519156e-05, "loss": 0.0406, "step": 2295 }, { "epoch": 0.5857882672552849, "grad_norm": 0.9272119998931885, "learning_rate": 1.979163449743396e-05, "loss": 0.0345, "step": 2300 }, { "epoch": 0.5870617200101876, "grad_norm": 1.3765863180160522, "learning_rate": 1.9790730752222404e-05, "loss": 0.0266, "step": 2305 }, { "epoch": 0.5883351727650904, "grad_norm": 1.0156831741333008, "learning_rate": 1.978982507206308e-05, "loss": 0.0385, "step": 2310 }, { "epoch": 0.5896086255199932, "grad_norm": 1.1811867952346802, "learning_rate": 1.978891745713498e-05, "loss": 0.0385, "step": 2315 }, { "epoch": 0.590882078274896, "grad_norm": 1.43423330783844, "learning_rate": 1.9788007907617484e-05, "loss": 0.032, "step": 2320 }, { "epoch": 0.5921555310297988, "grad_norm": 1.3102521896362305, "learning_rate": 1.978709642369034e-05, "loss": 0.0396, "step": 2325 }, { "epoch": 0.5934289837847015, "grad_norm": 1.131756067276001, "learning_rate": 1.978618300553369e-05, "loss": 0.0314, "step": 2330 }, { "epoch": 0.5947024365396044, "grad_norm": 1.5807991027832031, "learning_rate": 1.9785267653328047e-05, "loss": 0.043, "step": 2335 }, { "epoch": 0.5959758892945072, "grad_norm": 2.6601336002349854, "learning_rate": 1.9784350367254322e-05, "loss": 0.0317, "step": 2340 }, { "epoch": 0.5972493420494099, "grad_norm": 1.1144720315933228, "learning_rate": 1.978343114749379e-05, "loss": 0.036, "step": 2345 }, { "epoch": 0.5985227948043128, "grad_norm": 1.146256923675537, "learning_rate": 1.978250999422812e-05, "loss": 0.0289, "step": 2350 }, { "epoch": 0.5997962475592156, "grad_norm": 0.9613630771636963, "learning_rate": 1.978158690763936e-05, "loss": 0.0262, "step": 2355 }, { "epoch": 0.6010697003141183, "grad_norm": 1.1513783931732178, "learning_rate": 1.9780661887909943e-05, "loss": 0.0333, "step": 2360 }, { "epoch": 0.6023431530690211, "grad_norm": 1.5187700986862183, "learning_rate": 1.9779734935222676e-05, "loss": 0.0356, "step": 2365 }, { "epoch": 0.603616605823924, "grad_norm": 1.2530393600463867, "learning_rate": 1.9778806049760757e-05, "loss": 0.0393, "step": 2370 }, { "epoch": 0.6048900585788267, "grad_norm": 0.8498803377151489, "learning_rate": 1.977787523170776e-05, "loss": 0.0427, "step": 2375 }, { "epoch": 0.6061635113337295, "grad_norm": 0.8923577070236206, "learning_rate": 1.9776942481247646e-05, "loss": 0.04, "step": 2380 }, { "epoch": 0.6074369640886323, "grad_norm": 1.1086701154708862, "learning_rate": 1.977600779856475e-05, "loss": 0.0377, "step": 2385 }, { "epoch": 0.6087104168435351, "grad_norm": 1.350811243057251, "learning_rate": 1.97750711838438e-05, "loss": 0.0378, "step": 2390 }, { "epoch": 0.6099838695984379, "grad_norm": 1.3863540887832642, "learning_rate": 1.9774132637269895e-05, "loss": 0.045, "step": 2395 }, { "epoch": 0.6112573223533407, "grad_norm": 1.61689293384552, "learning_rate": 1.9773192159028522e-05, "loss": 0.0447, "step": 2400 }, { "epoch": 0.6125307751082435, "grad_norm": 1.3397938013076782, "learning_rate": 1.9772249749305546e-05, "loss": 0.0398, "step": 2405 }, { "epoch": 0.6138042278631463, "grad_norm": 1.5392061471939087, "learning_rate": 1.9771305408287223e-05, "loss": 0.0415, "step": 2410 }, { "epoch": 0.615077680618049, "grad_norm": 0.5506386160850525, "learning_rate": 1.9770359136160178e-05, "loss": 0.027, "step": 2415 }, { "epoch": 0.6163511333729519, "grad_norm": 1.1953445672988892, "learning_rate": 1.9769600728185922e-05, "loss": 0.048, "step": 2420 }, { "epoch": 0.6176245861278546, "grad_norm": 1.1449947357177734, "learning_rate": 1.9768650980534704e-05, "loss": 0.0324, "step": 2425 }, { "epoch": 0.6188980388827574, "grad_norm": 2.3826406002044678, "learning_rate": 1.9767699302299365e-05, "loss": 0.0381, "step": 2430 }, { "epoch": 0.6201714916376603, "grad_norm": 1.6260267496109009, "learning_rate": 1.976674569366798e-05, "loss": 0.0386, "step": 2435 }, { "epoch": 0.621444944392563, "grad_norm": 1.0759896039962769, "learning_rate": 1.9765790154829013e-05, "loss": 0.0387, "step": 2440 }, { "epoch": 0.6227183971474658, "grad_norm": 1.7840641736984253, "learning_rate": 1.9764832685971304e-05, "loss": 0.0388, "step": 2445 }, { "epoch": 0.6239918499023687, "grad_norm": 1.5872762203216553, "learning_rate": 1.9763873287284087e-05, "loss": 0.0392, "step": 2450 }, { "epoch": 0.6252653026572714, "grad_norm": 1.622179627418518, "learning_rate": 1.9762911958956964e-05, "loss": 0.0382, "step": 2455 }, { "epoch": 0.6265387554121742, "grad_norm": 0.8734006285667419, "learning_rate": 1.9761948701179917e-05, "loss": 0.0387, "step": 2460 }, { "epoch": 0.627812208167077, "grad_norm": 1.260137915611267, "learning_rate": 1.9760983514143323e-05, "loss": 0.0268, "step": 2465 }, { "epoch": 0.6290856609219798, "grad_norm": 0.8690651059150696, "learning_rate": 1.9760016398037934e-05, "loss": 0.0339, "step": 2470 }, { "epoch": 0.6303591136768826, "grad_norm": 1.0145021677017212, "learning_rate": 1.9759047353054873e-05, "loss": 0.0427, "step": 2475 }, { "epoch": 0.6316325664317853, "grad_norm": 1.1046979427337646, "learning_rate": 1.9758076379385658e-05, "loss": 0.0383, "step": 2480 }, { "epoch": 0.6329060191866882, "grad_norm": 1.0659704208374023, "learning_rate": 1.9757103477222183e-05, "loss": 0.0481, "step": 2485 }, { "epoch": 0.634179471941591, "grad_norm": 1.6692711114883423, "learning_rate": 1.9756128646756725e-05, "loss": 0.0366, "step": 2490 }, { "epoch": 0.6354529246964937, "grad_norm": 1.832512617111206, "learning_rate": 1.9755151888181936e-05, "loss": 0.0328, "step": 2495 }, { "epoch": 0.6367263774513966, "grad_norm": 1.0718320608139038, "learning_rate": 1.9754173201690855e-05, "loss": 0.0419, "step": 2500 }, { "epoch": 0.6379998302062994, "grad_norm": 0.6238051652908325, "learning_rate": 1.9753192587476903e-05, "loss": 0.0404, "step": 2505 }, { "epoch": 0.6392732829612021, "grad_norm": 1.0461281538009644, "learning_rate": 1.9752210045733875e-05, "loss": 0.034, "step": 2510 }, { "epoch": 0.640546735716105, "grad_norm": 1.52308988571167, "learning_rate": 1.9751225576655955e-05, "loss": 0.0455, "step": 2515 }, { "epoch": 0.6418201884710077, "grad_norm": 0.9863409996032715, "learning_rate": 1.9750239180437702e-05, "loss": 0.0418, "step": 2520 }, { "epoch": 0.6430936412259105, "grad_norm": 1.2704236507415771, "learning_rate": 1.974925085727406e-05, "loss": 0.044, "step": 2525 }, { "epoch": 0.6443670939808133, "grad_norm": 1.3287547826766968, "learning_rate": 1.974826060736035e-05, "loss": 0.0364, "step": 2530 }, { "epoch": 0.6456405467357161, "grad_norm": 2.5462863445281982, "learning_rate": 1.9747268430892276e-05, "loss": 0.0372, "step": 2535 }, { "epoch": 0.6469139994906189, "grad_norm": 1.3015474081039429, "learning_rate": 1.9746274328065923e-05, "loss": 0.0325, "step": 2540 }, { "epoch": 0.6481874522455217, "grad_norm": 1.6808050870895386, "learning_rate": 1.9745278299077754e-05, "loss": 0.0356, "step": 2545 }, { "epoch": 0.6494609050004245, "grad_norm": 1.3935303688049316, "learning_rate": 1.974428034412462e-05, "loss": 0.0361, "step": 2550 }, { "epoch": 0.6507343577553273, "grad_norm": 1.1990400552749634, "learning_rate": 1.9743280463403743e-05, "loss": 0.0425, "step": 2555 }, { "epoch": 0.65200781051023, "grad_norm": 1.036121129989624, "learning_rate": 1.974227865711273e-05, "loss": 0.0343, "step": 2560 }, { "epoch": 0.6532812632651329, "grad_norm": 0.5093967318534851, "learning_rate": 1.974127492544957e-05, "loss": 0.0257, "step": 2565 }, { "epoch": 0.6545547160200357, "grad_norm": 1.5872493982315063, "learning_rate": 1.974026926861263e-05, "loss": 0.0249, "step": 2570 }, { "epoch": 0.6558281687749384, "grad_norm": 1.396511197090149, "learning_rate": 1.9739261686800662e-05, "loss": 0.0333, "step": 2575 }, { "epoch": 0.6571016215298412, "grad_norm": 0.8602501153945923, "learning_rate": 1.9738252180212788e-05, "loss": 0.0329, "step": 2580 }, { "epoch": 0.6583750742847441, "grad_norm": 1.2520087957382202, "learning_rate": 1.9737240749048526e-05, "loss": 0.0319, "step": 2585 }, { "epoch": 0.6596485270396468, "grad_norm": 1.241092324256897, "learning_rate": 1.9736227393507756e-05, "loss": 0.0302, "step": 2590 }, { "epoch": 0.6609219797945496, "grad_norm": 1.767581820487976, "learning_rate": 1.9735212113790755e-05, "loss": 0.036, "step": 2595 }, { "epoch": 0.6621954325494525, "grad_norm": 1.5686488151550293, "learning_rate": 1.9734194910098174e-05, "loss": 0.0386, "step": 2600 }, { "epoch": 0.6634688853043552, "grad_norm": 1.5252093076705933, "learning_rate": 1.973317578263104e-05, "loss": 0.0366, "step": 2605 }, { "epoch": 0.664742338059258, "grad_norm": 0.8803490996360779, "learning_rate": 1.9732154731590767e-05, "loss": 0.0372, "step": 2610 }, { "epoch": 0.6660157908141608, "grad_norm": 2.0801219940185547, "learning_rate": 1.973113175717914e-05, "loss": 0.0357, "step": 2615 }, { "epoch": 0.6672892435690636, "grad_norm": 1.1355782747268677, "learning_rate": 1.9730106859598335e-05, "loss": 0.0339, "step": 2620 }, { "epoch": 0.6685626963239664, "grad_norm": 1.1129006147384644, "learning_rate": 1.9729080039050905e-05, "loss": 0.0411, "step": 2625 }, { "epoch": 0.6698361490788691, "grad_norm": 1.4002480506896973, "learning_rate": 1.9728051295739776e-05, "loss": 0.0309, "step": 2630 }, { "epoch": 0.671109601833772, "grad_norm": 1.5330166816711426, "learning_rate": 1.9727020629868263e-05, "loss": 0.0313, "step": 2635 }, { "epoch": 0.6723830545886748, "grad_norm": 1.096602201461792, "learning_rate": 1.972598804164005e-05, "loss": 0.0434, "step": 2640 }, { "epoch": 0.6736565073435775, "grad_norm": 1.1342099905014038, "learning_rate": 1.9724953531259217e-05, "loss": 0.0343, "step": 2645 }, { "epoch": 0.6749299600984804, "grad_norm": 0.9752647876739502, "learning_rate": 1.972391709893021e-05, "loss": 0.0393, "step": 2650 }, { "epoch": 0.6762034128533831, "grad_norm": 1.571738839149475, "learning_rate": 1.9722878744857864e-05, "loss": 0.0358, "step": 2655 }, { "epoch": 0.6774768656082859, "grad_norm": 1.8935844898223877, "learning_rate": 1.972183846924738e-05, "loss": 0.0392, "step": 2660 }, { "epoch": 0.6787503183631888, "grad_norm": 1.186568021774292, "learning_rate": 1.972079627230436e-05, "loss": 0.0387, "step": 2665 }, { "epoch": 0.6800237711180915, "grad_norm": 0.6946138739585876, "learning_rate": 1.971975215423477e-05, "loss": 0.0488, "step": 2670 }, { "epoch": 0.6812972238729943, "grad_norm": 1.9565045833587646, "learning_rate": 1.9718706115244956e-05, "loss": 0.0395, "step": 2675 }, { "epoch": 0.682570676627897, "grad_norm": 0.9313160181045532, "learning_rate": 1.9717658155541647e-05, "loss": 0.0428, "step": 2680 }, { "epoch": 0.6838441293827999, "grad_norm": 1.0497124195098877, "learning_rate": 1.9716608275331958e-05, "loss": 0.0392, "step": 2685 }, { "epoch": 0.6851175821377027, "grad_norm": 1.277603030204773, "learning_rate": 1.9715556474823375e-05, "loss": 0.0329, "step": 2690 }, { "epoch": 0.6863910348926054, "grad_norm": 1.3205182552337646, "learning_rate": 1.9714502754223762e-05, "loss": 0.0404, "step": 2695 }, { "epoch": 0.6876644876475083, "grad_norm": 1.0443772077560425, "learning_rate": 1.971344711374137e-05, "loss": 0.0328, "step": 2700 }, { "epoch": 0.6889379404024111, "grad_norm": 1.0011409521102905, "learning_rate": 1.9712389553584825e-05, "loss": 0.0316, "step": 2705 }, { "epoch": 0.6902113931573138, "grad_norm": 1.7460126876831055, "learning_rate": 1.9711330073963135e-05, "loss": 0.0301, "step": 2710 }, { "epoch": 0.6914848459122167, "grad_norm": 1.5033446550369263, "learning_rate": 1.971026867508568e-05, "loss": 0.0451, "step": 2715 }, { "epoch": 0.6927582986671195, "grad_norm": 1.6873455047607422, "learning_rate": 1.9709205357162235e-05, "loss": 0.0469, "step": 2720 }, { "epoch": 0.6940317514220222, "grad_norm": 2.1593430042266846, "learning_rate": 1.9708140120402933e-05, "loss": 0.032, "step": 2725 }, { "epoch": 0.695305204176925, "grad_norm": 1.3242803812026978, "learning_rate": 1.9707072965018304e-05, "loss": 0.0376, "step": 2730 }, { "epoch": 0.6965786569318279, "grad_norm": 1.5849965810775757, "learning_rate": 1.9706003891219247e-05, "loss": 0.0389, "step": 2735 }, { "epoch": 0.6978521096867306, "grad_norm": 1.0734199285507202, "learning_rate": 1.9704932899217047e-05, "loss": 0.0341, "step": 2740 }, { "epoch": 0.6991255624416334, "grad_norm": 1.1228938102722168, "learning_rate": 1.9703859989223363e-05, "loss": 0.0364, "step": 2745 }, { "epoch": 0.7003990151965362, "grad_norm": 1.1544491052627563, "learning_rate": 1.970278516145024e-05, "loss": 0.0337, "step": 2750 }, { "epoch": 0.701672467951439, "grad_norm": 0.848433256149292, "learning_rate": 1.9701708416110085e-05, "loss": 0.0293, "step": 2755 }, { "epoch": 0.7029459207063418, "grad_norm": 1.960632562637329, "learning_rate": 1.9700629753415707e-05, "loss": 0.05, "step": 2760 }, { "epoch": 0.7042193734612446, "grad_norm": 1.420677661895752, "learning_rate": 1.9699549173580274e-05, "loss": 0.0449, "step": 2765 }, { "epoch": 0.7054928262161474, "grad_norm": 0.851005494594574, "learning_rate": 1.9698466676817348e-05, "loss": 0.0344, "step": 2770 }, { "epoch": 0.7067662789710502, "grad_norm": 1.3532646894454956, "learning_rate": 1.9697382263340862e-05, "loss": 0.0384, "step": 2775 }, { "epoch": 0.7080397317259529, "grad_norm": 1.269168496131897, "learning_rate": 1.969629593336513e-05, "loss": 0.0318, "step": 2780 }, { "epoch": 0.7093131844808558, "grad_norm": 1.3737903833389282, "learning_rate": 1.9695207687104843e-05, "loss": 0.0357, "step": 2785 }, { "epoch": 0.7105866372357585, "grad_norm": 1.086531400680542, "learning_rate": 1.969411752477507e-05, "loss": 0.0327, "step": 2790 }, { "epoch": 0.7118600899906613, "grad_norm": 2.2555532455444336, "learning_rate": 1.969302544659126e-05, "loss": 0.0314, "step": 2795 }, { "epoch": 0.7131335427455642, "grad_norm": 1.4022468328475952, "learning_rate": 1.9691931452769246e-05, "loss": 0.0322, "step": 2800 }, { "epoch": 0.7144069955004669, "grad_norm": 1.066648006439209, "learning_rate": 1.969083554352523e-05, "loss": 0.0293, "step": 2805 }, { "epoch": 0.7156804482553697, "grad_norm": 1.2992030382156372, "learning_rate": 1.9689737719075803e-05, "loss": 0.0363, "step": 2810 }, { "epoch": 0.7169539010102726, "grad_norm": 1.1669676303863525, "learning_rate": 1.9688637979637918e-05, "loss": 0.0325, "step": 2815 }, { "epoch": 0.7182273537651753, "grad_norm": 1.6483014822006226, "learning_rate": 1.968753632542893e-05, "loss": 0.0419, "step": 2820 }, { "epoch": 0.7195008065200781, "grad_norm": 1.4626743793487549, "learning_rate": 1.9686432756666545e-05, "loss": 0.0381, "step": 2825 }, { "epoch": 0.7207742592749808, "grad_norm": 1.590528130531311, "learning_rate": 1.968532727356888e-05, "loss": 0.0458, "step": 2830 }, { "epoch": 0.7220477120298837, "grad_norm": 1.1771763563156128, "learning_rate": 1.9684219876354394e-05, "loss": 0.0322, "step": 2835 }, { "epoch": 0.7233211647847865, "grad_norm": 0.9354103803634644, "learning_rate": 1.9683110565241957e-05, "loss": 0.0337, "step": 2840 }, { "epoch": 0.7245946175396892, "grad_norm": 1.1489481925964355, "learning_rate": 1.968199934045079e-05, "loss": 0.035, "step": 2845 }, { "epoch": 0.7258680702945921, "grad_norm": 1.637197732925415, "learning_rate": 1.968088620220052e-05, "loss": 0.0478, "step": 2850 }, { "epoch": 0.7271415230494949, "grad_norm": 1.6253273487091064, "learning_rate": 1.9679771150711124e-05, "loss": 0.0304, "step": 2855 }, { "epoch": 0.7284149758043976, "grad_norm": 1.1681939363479614, "learning_rate": 1.9678654186202974e-05, "loss": 0.0319, "step": 2860 }, { "epoch": 0.7296884285593005, "grad_norm": 1.5114961862564087, "learning_rate": 1.967753530889682e-05, "loss": 0.0424, "step": 2865 }, { "epoch": 0.7309618813142033, "grad_norm": 0.9299352169036865, "learning_rate": 1.9676414519013782e-05, "loss": 0.0291, "step": 2870 }, { "epoch": 0.732235334069106, "grad_norm": 0.8693564534187317, "learning_rate": 1.9675291816775365e-05, "loss": 0.0275, "step": 2875 }, { "epoch": 0.7335087868240088, "grad_norm": 1.3793867826461792, "learning_rate": 1.9674167202403448e-05, "loss": 0.0472, "step": 2880 }, { "epoch": 0.7347822395789116, "grad_norm": 0.45853665471076965, "learning_rate": 1.967304067612029e-05, "loss": 0.0234, "step": 2885 }, { "epoch": 0.7360556923338144, "grad_norm": 0.8654637336730957, "learning_rate": 1.967191223814853e-05, "loss": 0.0363, "step": 2890 }, { "epoch": 0.7373291450887172, "grad_norm": 1.307783603668213, "learning_rate": 1.9670781888711176e-05, "loss": 0.0395, "step": 2895 }, { "epoch": 0.73860259784362, "grad_norm": 1.2077933549880981, "learning_rate": 1.9669649628031623e-05, "loss": 0.0362, "step": 2900 }, { "epoch": 0.7398760505985228, "grad_norm": 1.3181003332138062, "learning_rate": 1.9668515456333638e-05, "loss": 0.0457, "step": 2905 }, { "epoch": 0.7411495033534256, "grad_norm": 0.9811201095581055, "learning_rate": 1.966737937384137e-05, "loss": 0.0326, "step": 2910 }, { "epoch": 0.7424229561083284, "grad_norm": 1.2181751728057861, "learning_rate": 1.9666241380779342e-05, "loss": 0.0431, "step": 2915 }, { "epoch": 0.7436964088632312, "grad_norm": 1.8638619184494019, "learning_rate": 1.9665101477372457e-05, "loss": 0.0298, "step": 2920 }, { "epoch": 0.7449698616181339, "grad_norm": 1.2473053932189941, "learning_rate": 1.9663959663845994e-05, "loss": 0.0323, "step": 2925 }, { "epoch": 0.7462433143730368, "grad_norm": 1.1460069417953491, "learning_rate": 1.9662815940425616e-05, "loss": 0.0372, "step": 2930 }, { "epoch": 0.7475167671279396, "grad_norm": 1.22262704372406, "learning_rate": 1.966167030733735e-05, "loss": 0.0312, "step": 2935 }, { "epoch": 0.7487902198828423, "grad_norm": 1.5022858381271362, "learning_rate": 1.9660522764807613e-05, "loss": 0.0406, "step": 2940 }, { "epoch": 0.7500636726377451, "grad_norm": 1.2410728931427002, "learning_rate": 1.9659373313063194e-05, "loss": 0.0355, "step": 2945 }, { "epoch": 0.751337125392648, "grad_norm": 1.057208776473999, "learning_rate": 1.9658221952331257e-05, "loss": 0.0284, "step": 2950 }, { "epoch": 0.7526105781475507, "grad_norm": 0.6561054587364197, "learning_rate": 1.9657068682839345e-05, "loss": 0.0397, "step": 2955 }, { "epoch": 0.7538840309024535, "grad_norm": 1.080721378326416, "learning_rate": 1.9655913504815382e-05, "loss": 0.0286, "step": 2960 }, { "epoch": 0.7551574836573564, "grad_norm": 1.0722219944000244, "learning_rate": 1.965475641848767e-05, "loss": 0.0314, "step": 2965 }, { "epoch": 0.7564309364122591, "grad_norm": 2.019986391067505, "learning_rate": 1.965359742408488e-05, "loss": 0.035, "step": 2970 }, { "epoch": 0.7577043891671619, "grad_norm": 1.2861512899398804, "learning_rate": 1.9652436521836064e-05, "loss": 0.0323, "step": 2975 }, { "epoch": 0.7589778419220647, "grad_norm": 1.21466064453125, "learning_rate": 1.9651273711970656e-05, "loss": 0.0387, "step": 2980 }, { "epoch": 0.7602512946769675, "grad_norm": 0.8863744735717773, "learning_rate": 1.9650108994718456e-05, "loss": 0.0383, "step": 2985 }, { "epoch": 0.7615247474318703, "grad_norm": 2.0616090297698975, "learning_rate": 1.9648942370309658e-05, "loss": 0.0219, "step": 2990 }, { "epoch": 0.762798200186773, "grad_norm": 0.8084316849708557, "learning_rate": 1.9647773838974812e-05, "loss": 0.0284, "step": 2995 }, { "epoch": 0.7640716529416759, "grad_norm": 1.148572564125061, "learning_rate": 1.9646603400944862e-05, "loss": 0.0379, "step": 3000 }, { "epoch": 0.7653451056965787, "grad_norm": 1.1203052997589111, "learning_rate": 1.9645431056451122e-05, "loss": 0.0346, "step": 3005 }, { "epoch": 0.7666185584514814, "grad_norm": 1.5004984140396118, "learning_rate": 1.9644256805725282e-05, "loss": 0.0418, "step": 3010 }, { "epoch": 0.7678920112063843, "grad_norm": 1.2111212015151978, "learning_rate": 1.9643080648999406e-05, "loss": 0.0293, "step": 3015 }, { "epoch": 0.769165463961287, "grad_norm": 1.4282681941986084, "learning_rate": 1.9641902586505946e-05, "loss": 0.0405, "step": 3020 }, { "epoch": 0.7704389167161898, "grad_norm": 1.8651585578918457, "learning_rate": 1.964072261847772e-05, "loss": 0.0369, "step": 3025 }, { "epoch": 0.7717123694710927, "grad_norm": 1.7189292907714844, "learning_rate": 1.963954074514792e-05, "loss": 0.0325, "step": 3030 }, { "epoch": 0.7729858222259954, "grad_norm": 1.5556915998458862, "learning_rate": 1.963835696675013e-05, "loss": 0.0316, "step": 3035 }, { "epoch": 0.7742592749808982, "grad_norm": 1.515021800994873, "learning_rate": 1.9637171283518292e-05, "loss": 0.0391, "step": 3040 }, { "epoch": 0.775532727735801, "grad_norm": 1.1754788160324097, "learning_rate": 1.9635983695686743e-05, "loss": 0.0383, "step": 3045 }, { "epoch": 0.7768061804907038, "grad_norm": 1.8147343397140503, "learning_rate": 1.9634794203490176e-05, "loss": 0.036, "step": 3050 }, { "epoch": 0.7780796332456066, "grad_norm": 1.6012656688690186, "learning_rate": 1.963360280716368e-05, "loss": 0.0298, "step": 3055 }, { "epoch": 0.7793530860005093, "grad_norm": 1.2951397895812988, "learning_rate": 1.9632409506942703e-05, "loss": 0.0329, "step": 3060 }, { "epoch": 0.7806265387554122, "grad_norm": 1.1627789735794067, "learning_rate": 1.9631214303063087e-05, "loss": 0.0302, "step": 3065 }, { "epoch": 0.781899991510315, "grad_norm": 1.7361358404159546, "learning_rate": 1.9630017195761034e-05, "loss": 0.0372, "step": 3070 }, { "epoch": 0.7831734442652177, "grad_norm": 1.2082151174545288, "learning_rate": 1.962881818527313e-05, "loss": 0.0371, "step": 3075 }, { "epoch": 0.7844468970201206, "grad_norm": 0.8100277185440063, "learning_rate": 1.9627617271836337e-05, "loss": 0.0233, "step": 3080 }, { "epoch": 0.7857203497750234, "grad_norm": 2.3646676540374756, "learning_rate": 1.9626414455687995e-05, "loss": 0.0267, "step": 3085 }, { "epoch": 0.7869938025299261, "grad_norm": 1.637605905532837, "learning_rate": 1.962520973706581e-05, "loss": 0.0359, "step": 3090 }, { "epoch": 0.7882672552848289, "grad_norm": 0.7886127829551697, "learning_rate": 1.962400311620788e-05, "loss": 0.0319, "step": 3095 }, { "epoch": 0.7895407080397318, "grad_norm": 1.1923365592956543, "learning_rate": 1.9622794593352663e-05, "loss": 0.0284, "step": 3100 }, { "epoch": 0.7908141607946345, "grad_norm": 1.665776014328003, "learning_rate": 1.9621584168739004e-05, "loss": 0.0386, "step": 3105 }, { "epoch": 0.7920876135495373, "grad_norm": 1.053982138633728, "learning_rate": 1.962037184260612e-05, "loss": 0.0328, "step": 3110 }, { "epoch": 0.7933610663044401, "grad_norm": 2.2537941932678223, "learning_rate": 1.96191576151936e-05, "loss": 0.0326, "step": 3115 }, { "epoch": 0.7946345190593429, "grad_norm": 1.4013298749923706, "learning_rate": 1.9617941486741418e-05, "loss": 0.0302, "step": 3120 }, { "epoch": 0.7959079718142457, "grad_norm": 1.075559377670288, "learning_rate": 1.9616723457489916e-05, "loss": 0.0404, "step": 3125 }, { "epoch": 0.7971814245691485, "grad_norm": 1.376800298690796, "learning_rate": 1.961550352767981e-05, "loss": 0.0316, "step": 3130 }, { "epoch": 0.7984548773240513, "grad_norm": 1.069680094718933, "learning_rate": 1.96142816975522e-05, "loss": 0.0345, "step": 3135 }, { "epoch": 0.799728330078954, "grad_norm": 1.2389860153198242, "learning_rate": 1.9613057967348556e-05, "loss": 0.0373, "step": 3140 }, { "epoch": 0.8010017828338568, "grad_norm": 1.611585259437561, "learning_rate": 1.961183233731072e-05, "loss": 0.0403, "step": 3145 }, { "epoch": 0.8022752355887597, "grad_norm": 1.6233959197998047, "learning_rate": 1.961060480768092e-05, "loss": 0.0341, "step": 3150 }, { "epoch": 0.8035486883436624, "grad_norm": 1.9482567310333252, "learning_rate": 1.9609375378701753e-05, "loss": 0.0367, "step": 3155 }, { "epoch": 0.8048221410985652, "grad_norm": 1.331463098526001, "learning_rate": 1.9608144050616192e-05, "loss": 0.0308, "step": 3160 }, { "epoch": 0.8060955938534681, "grad_norm": 1.2388510704040527, "learning_rate": 1.9606910823667578e-05, "loss": 0.0369, "step": 3165 }, { "epoch": 0.8073690466083708, "grad_norm": 1.4131979942321777, "learning_rate": 1.9605675698099645e-05, "loss": 0.0302, "step": 3170 }, { "epoch": 0.8086424993632736, "grad_norm": 1.0130943059921265, "learning_rate": 1.9604438674156483e-05, "loss": 0.031, "step": 3175 }, { "epoch": 0.8099159521181765, "grad_norm": 1.2865222692489624, "learning_rate": 1.960319975208257e-05, "loss": 0.0307, "step": 3180 }, { "epoch": 0.8111894048730792, "grad_norm": 1.598451018333435, "learning_rate": 1.960195893212275e-05, "loss": 0.0362, "step": 3185 }, { "epoch": 0.812462857627982, "grad_norm": 2.119769811630249, "learning_rate": 1.9600716214522255e-05, "loss": 0.0283, "step": 3190 }, { "epoch": 0.8137363103828847, "grad_norm": 1.4705471992492676, "learning_rate": 1.959947159952668e-05, "loss": 0.0305, "step": 3195 }, { "epoch": 0.8150097631377876, "grad_norm": 1.7324062585830688, "learning_rate": 1.9598225087381996e-05, "loss": 0.0279, "step": 3200 }, { "epoch": 0.8162832158926904, "grad_norm": 2.457919120788574, "learning_rate": 1.959697667833456e-05, "loss": 0.0475, "step": 3205 }, { "epoch": 0.8175566686475931, "grad_norm": 1.349716067314148, "learning_rate": 1.9595726372631082e-05, "loss": 0.036, "step": 3210 }, { "epoch": 0.818830121402496, "grad_norm": 0.8924099802970886, "learning_rate": 1.9594474170518675e-05, "loss": 0.0301, "step": 3215 }, { "epoch": 0.8201035741573988, "grad_norm": 1.5180528163909912, "learning_rate": 1.9593220072244804e-05, "loss": 0.0314, "step": 3220 }, { "epoch": 0.8213770269123015, "grad_norm": 0.9750965237617493, "learning_rate": 1.9591964078057325e-05, "loss": 0.0358, "step": 3225 }, { "epoch": 0.8226504796672044, "grad_norm": 1.720260739326477, "learning_rate": 1.9590706188204448e-05, "loss": 0.0382, "step": 3230 }, { "epoch": 0.8239239324221072, "grad_norm": 1.246989369392395, "learning_rate": 1.958944640293478e-05, "loss": 0.0324, "step": 3235 }, { "epoch": 0.8251973851770099, "grad_norm": 1.794500708580017, "learning_rate": 1.9588184722497297e-05, "loss": 0.0288, "step": 3240 }, { "epoch": 0.8264708379319127, "grad_norm": 1.6349878311157227, "learning_rate": 1.9586921147141334e-05, "loss": 0.0324, "step": 3245 }, { "epoch": 0.8277442906868155, "grad_norm": 2.1274733543395996, "learning_rate": 1.9585655677116618e-05, "loss": 0.0331, "step": 3250 }, { "epoch": 0.8290177434417183, "grad_norm": 1.5854339599609375, "learning_rate": 1.9584388312673246e-05, "loss": 0.0314, "step": 3255 }, { "epoch": 0.8302911961966211, "grad_norm": 2.334608316421509, "learning_rate": 1.9583119054061686e-05, "loss": 0.0435, "step": 3260 }, { "epoch": 0.8315646489515239, "grad_norm": 1.0409579277038574, "learning_rate": 1.9581847901532783e-05, "loss": 0.0321, "step": 3265 }, { "epoch": 0.8328381017064267, "grad_norm": 1.4236372709274292, "learning_rate": 1.9580574855337752e-05, "loss": 0.0322, "step": 3270 }, { "epoch": 0.8341115544613295, "grad_norm": 1.0730125904083252, "learning_rate": 1.9579299915728195e-05, "loss": 0.0318, "step": 3275 }, { "epoch": 0.8353850072162323, "grad_norm": 1.2766821384429932, "learning_rate": 1.957802308295607e-05, "loss": 0.042, "step": 3280 }, { "epoch": 0.8366584599711351, "grad_norm": 1.4519259929656982, "learning_rate": 1.957674435727372e-05, "loss": 0.0349, "step": 3285 }, { "epoch": 0.8379319127260378, "grad_norm": 1.2695451974868774, "learning_rate": 1.9575463738933865e-05, "loss": 0.0397, "step": 3290 }, { "epoch": 0.8392053654809407, "grad_norm": 1.1725728511810303, "learning_rate": 1.957418122818959e-05, "loss": 0.034, "step": 3295 }, { "epoch": 0.8404788182358435, "grad_norm": 1.3905757665634155, "learning_rate": 1.9572896825294358e-05, "loss": 0.0339, "step": 3300 }, { "epoch": 0.8417522709907462, "grad_norm": 1.173440933227539, "learning_rate": 1.9571610530502008e-05, "loss": 0.0354, "step": 3305 }, { "epoch": 0.843025723745649, "grad_norm": 1.2336695194244385, "learning_rate": 1.957032234406675e-05, "loss": 0.0334, "step": 3310 }, { "epoch": 0.8442991765005519, "grad_norm": 0.7201148271560669, "learning_rate": 1.9569032266243173e-05, "loss": 0.0319, "step": 3315 }, { "epoch": 0.8455726292554546, "grad_norm": 1.132002353668213, "learning_rate": 1.9567740297286232e-05, "loss": 0.0373, "step": 3320 }, { "epoch": 0.8468460820103574, "grad_norm": 1.0714781284332275, "learning_rate": 1.9566446437451255e-05, "loss": 0.0299, "step": 3325 }, { "epoch": 0.8481195347652603, "grad_norm": 1.1837102174758911, "learning_rate": 1.956515068699396e-05, "loss": 0.0368, "step": 3330 }, { "epoch": 0.849392987520163, "grad_norm": 0.9477264285087585, "learning_rate": 1.9563853046170416e-05, "loss": 0.0353, "step": 3335 }, { "epoch": 0.8506664402750658, "grad_norm": 1.8632867336273193, "learning_rate": 1.9562553515237087e-05, "loss": 0.0435, "step": 3340 }, { "epoch": 0.8519398930299686, "grad_norm": 2.8691978454589844, "learning_rate": 1.9561252094450788e-05, "loss": 0.0353, "step": 3345 }, { "epoch": 0.8532133457848714, "grad_norm": 1.583402156829834, "learning_rate": 1.9559948784068726e-05, "loss": 0.0395, "step": 3350 }, { "epoch": 0.8544867985397742, "grad_norm": 1.2038613557815552, "learning_rate": 1.9558643584348478e-05, "loss": 0.0344, "step": 3355 }, { "epoch": 0.8557602512946769, "grad_norm": 1.2842981815338135, "learning_rate": 1.9557336495547988e-05, "loss": 0.0377, "step": 3360 }, { "epoch": 0.8570337040495798, "grad_norm": 1.2354360818862915, "learning_rate": 1.9556027517925574e-05, "loss": 0.0338, "step": 3365 }, { "epoch": 0.8583071568044826, "grad_norm": 1.1341556310653687, "learning_rate": 1.9554716651739935e-05, "loss": 0.0309, "step": 3370 }, { "epoch": 0.8595806095593853, "grad_norm": 1.8231191635131836, "learning_rate": 1.9553403897250132e-05, "loss": 0.0437, "step": 3375 }, { "epoch": 0.8608540623142882, "grad_norm": 1.0181570053100586, "learning_rate": 1.9552089254715613e-05, "loss": 0.0348, "step": 3380 }, { "epoch": 0.8621275150691909, "grad_norm": 1.1691778898239136, "learning_rate": 1.9550772724396186e-05, "loss": 0.0276, "step": 3385 }, { "epoch": 0.8634009678240937, "grad_norm": 1.496671438217163, "learning_rate": 1.954945430655204e-05, "loss": 0.0362, "step": 3390 }, { "epoch": 0.8646744205789966, "grad_norm": 1.1433228254318237, "learning_rate": 1.9548134001443734e-05, "loss": 0.036, "step": 3395 }, { "epoch": 0.8659478733338993, "grad_norm": 1.8127363920211792, "learning_rate": 1.95468118093322e-05, "loss": 0.036, "step": 3400 }, { "epoch": 0.8672213260888021, "grad_norm": 1.0988972187042236, "learning_rate": 1.9545487730478746e-05, "loss": 0.0302, "step": 3405 }, { "epoch": 0.8684947788437049, "grad_norm": 1.7967636585235596, "learning_rate": 1.9544161765145046e-05, "loss": 0.0356, "step": 3410 }, { "epoch": 0.8697682315986077, "grad_norm": 0.8552435040473938, "learning_rate": 1.9542833913593154e-05, "loss": 0.0294, "step": 3415 }, { "epoch": 0.8710416843535105, "grad_norm": 1.220694899559021, "learning_rate": 1.95415041760855e-05, "loss": 0.0323, "step": 3420 }, { "epoch": 0.8723151371084132, "grad_norm": 1.4603036642074585, "learning_rate": 1.9540172552884873e-05, "loss": 0.0376, "step": 3425 }, { "epoch": 0.8735885898633161, "grad_norm": 1.3997256755828857, "learning_rate": 1.9538839044254443e-05, "loss": 0.0386, "step": 3430 }, { "epoch": 0.8748620426182189, "grad_norm": 1.8967336416244507, "learning_rate": 1.953750365045775e-05, "loss": 0.0411, "step": 3435 }, { "epoch": 0.8761354953731216, "grad_norm": 1.4496850967407227, "learning_rate": 1.9536166371758717e-05, "loss": 0.0396, "step": 3440 }, { "epoch": 0.8774089481280245, "grad_norm": 1.1708320379257202, "learning_rate": 1.9534827208421627e-05, "loss": 0.0319, "step": 3445 }, { "epoch": 0.8786824008829273, "grad_norm": 1.071077823638916, "learning_rate": 1.9533486160711136e-05, "loss": 0.0355, "step": 3450 }, { "epoch": 0.87995585363783, "grad_norm": 1.2360773086547852, "learning_rate": 1.953214322889228e-05, "loss": 0.0357, "step": 3455 }, { "epoch": 0.8812293063927328, "grad_norm": 1.276350736618042, "learning_rate": 1.9530798413230462e-05, "loss": 0.0327, "step": 3460 }, { "epoch": 0.8825027591476357, "grad_norm": 1.3998210430145264, "learning_rate": 1.952945171399146e-05, "loss": 0.0372, "step": 3465 }, { "epoch": 0.8837762119025384, "grad_norm": 2.201674461364746, "learning_rate": 1.9528103131441423e-05, "loss": 0.0363, "step": 3470 }, { "epoch": 0.8850496646574412, "grad_norm": 1.9209771156311035, "learning_rate": 1.952675266584687e-05, "loss": 0.0387, "step": 3475 }, { "epoch": 0.886323117412344, "grad_norm": 1.2965327501296997, "learning_rate": 1.9525400317474696e-05, "loss": 0.0386, "step": 3480 }, { "epoch": 0.8875965701672468, "grad_norm": 2.03071665763855, "learning_rate": 1.9524046086592165e-05, "loss": 0.0368, "step": 3485 }, { "epoch": 0.8888700229221496, "grad_norm": 1.5060806274414062, "learning_rate": 1.9522689973466918e-05, "loss": 0.0383, "step": 3490 }, { "epoch": 0.8901434756770524, "grad_norm": 3.797322988510132, "learning_rate": 1.9521331978366958e-05, "loss": 0.0216, "step": 3495 }, { "epoch": 0.8914169284319552, "grad_norm": 1.2911851406097412, "learning_rate": 1.9519972101560673e-05, "loss": 0.0306, "step": 3500 }, { "epoch": 0.892690381186858, "grad_norm": 1.6762653589248657, "learning_rate": 1.951861034331681e-05, "loss": 0.0403, "step": 3505 }, { "epoch": 0.8939638339417607, "grad_norm": 1.1146074533462524, "learning_rate": 1.95172467039045e-05, "loss": 0.0397, "step": 3510 }, { "epoch": 0.8952372866966636, "grad_norm": 1.5786906480789185, "learning_rate": 1.9515881183593235e-05, "loss": 0.0364, "step": 3515 }, { "epoch": 0.8965107394515663, "grad_norm": 0.60141921043396, "learning_rate": 1.951451378265289e-05, "loss": 0.027, "step": 3520 }, { "epoch": 0.8977841922064691, "grad_norm": 1.4273477792739868, "learning_rate": 1.9513144501353697e-05, "loss": 0.0411, "step": 3525 }, { "epoch": 0.899057644961372, "grad_norm": 1.2802141904830933, "learning_rate": 1.9511773339966273e-05, "loss": 0.0321, "step": 3530 }, { "epoch": 0.9003310977162747, "grad_norm": 1.2320600748062134, "learning_rate": 1.9510400298761604e-05, "loss": 0.0409, "step": 3535 }, { "epoch": 0.9016045504711775, "grad_norm": 1.3309319019317627, "learning_rate": 1.9509025378011035e-05, "loss": 0.0393, "step": 3540 }, { "epoch": 0.9028780032260804, "grad_norm": 1.3263518810272217, "learning_rate": 1.95076485779863e-05, "loss": 0.0273, "step": 3545 }, { "epoch": 0.9041514559809831, "grad_norm": 1.7848888635635376, "learning_rate": 1.95062698989595e-05, "loss": 0.0349, "step": 3550 }, { "epoch": 0.9054249087358859, "grad_norm": 1.4738317728042603, "learning_rate": 1.9504889341203096e-05, "loss": 0.0214, "step": 3555 }, { "epoch": 0.9066983614907886, "grad_norm": 1.6150400638580322, "learning_rate": 1.950350690498993e-05, "loss": 0.0358, "step": 3560 }, { "epoch": 0.9079718142456915, "grad_norm": 0.9393568634986877, "learning_rate": 1.950212259059322e-05, "loss": 0.0364, "step": 3565 }, { "epoch": 0.9092452670005943, "grad_norm": 1.3422833681106567, "learning_rate": 1.9500736398286543e-05, "loss": 0.0351, "step": 3570 }, { "epoch": 0.910518719755497, "grad_norm": 1.566400408744812, "learning_rate": 1.9499348328343854e-05, "loss": 0.0332, "step": 3575 }, { "epoch": 0.9117921725103999, "grad_norm": 1.4837932586669922, "learning_rate": 1.9497958381039476e-05, "loss": 0.0397, "step": 3580 }, { "epoch": 0.9130656252653027, "grad_norm": 0.7840548753738403, "learning_rate": 1.949656655664811e-05, "loss": 0.0324, "step": 3585 }, { "epoch": 0.9143390780202054, "grad_norm": 0.8490258455276489, "learning_rate": 1.949517285544482e-05, "loss": 0.0253, "step": 3590 }, { "epoch": 0.9156125307751083, "grad_norm": 1.2944254875183105, "learning_rate": 1.9493777277705047e-05, "loss": 0.0457, "step": 3595 }, { "epoch": 0.916885983530011, "grad_norm": 2.0361807346343994, "learning_rate": 1.94923798237046e-05, "loss": 0.0281, "step": 3600 }, { "epoch": 0.9181594362849138, "grad_norm": 1.4456555843353271, "learning_rate": 1.949098049371965e-05, "loss": 0.0283, "step": 3605 }, { "epoch": 0.9194328890398166, "grad_norm": 1.3256417512893677, "learning_rate": 1.9489579288026754e-05, "loss": 0.0383, "step": 3610 }, { "epoch": 0.9207063417947194, "grad_norm": 1.289277195930481, "learning_rate": 1.9488176206902836e-05, "loss": 0.0269, "step": 3615 }, { "epoch": 0.9219797945496222, "grad_norm": 1.7029519081115723, "learning_rate": 1.9486771250625182e-05, "loss": 0.0317, "step": 3620 }, { "epoch": 0.923253247304525, "grad_norm": 1.2632783651351929, "learning_rate": 1.9485364419471454e-05, "loss": 0.0333, "step": 3625 }, { "epoch": 0.9245267000594278, "grad_norm": 1.611345648765564, "learning_rate": 1.9483955713719694e-05, "loss": 0.03, "step": 3630 }, { "epoch": 0.9258001528143306, "grad_norm": 1.0237045288085938, "learning_rate": 1.9482545133648296e-05, "loss": 0.0309, "step": 3635 }, { "epoch": 0.9270736055692333, "grad_norm": 1.0124350786209106, "learning_rate": 1.9481132679536037e-05, "loss": 0.0267, "step": 3640 }, { "epoch": 0.9283470583241362, "grad_norm": 1.842441201210022, "learning_rate": 1.9479718351662057e-05, "loss": 0.0334, "step": 3645 }, { "epoch": 0.929620511079039, "grad_norm": 1.2253848314285278, "learning_rate": 1.947830215030588e-05, "loss": 0.033, "step": 3650 }, { "epoch": 0.9308939638339417, "grad_norm": 1.3963775634765625, "learning_rate": 1.947688407574738e-05, "loss": 0.0416, "step": 3655 }, { "epoch": 0.9321674165888445, "grad_norm": 1.0739110708236694, "learning_rate": 1.9475464128266818e-05, "loss": 0.0345, "step": 3660 }, { "epoch": 0.9334408693437474, "grad_norm": 0.908584713935852, "learning_rate": 1.9474042308144818e-05, "loss": 0.0488, "step": 3665 }, { "epoch": 0.9347143220986501, "grad_norm": 1.7742162942886353, "learning_rate": 1.9472618615662374e-05, "loss": 0.0446, "step": 3670 }, { "epoch": 0.9359877748535529, "grad_norm": 1.3654981851577759, "learning_rate": 1.9471193051100852e-05, "loss": 0.0322, "step": 3675 }, { "epoch": 0.9372612276084558, "grad_norm": 2.424875497817993, "learning_rate": 1.946976561474199e-05, "loss": 0.031, "step": 3680 }, { "epoch": 0.9385346803633585, "grad_norm": 1.8025472164154053, "learning_rate": 1.946833630686789e-05, "loss": 0.0378, "step": 3685 }, { "epoch": 0.9398081331182613, "grad_norm": 1.6025618314743042, "learning_rate": 1.9466905127761024e-05, "loss": 0.0375, "step": 3690 }, { "epoch": 0.9410815858731642, "grad_norm": 1.1544350385665894, "learning_rate": 1.9465472077704243e-05, "loss": 0.032, "step": 3695 }, { "epoch": 0.9423550386280669, "grad_norm": 1.2241606712341309, "learning_rate": 1.9464037156980758e-05, "loss": 0.0475, "step": 3700 }, { "epoch": 0.9436284913829697, "grad_norm": 1.574996829032898, "learning_rate": 1.9462600365874153e-05, "loss": 0.0303, "step": 3705 }, { "epoch": 0.9449019441378725, "grad_norm": 1.297092318534851, "learning_rate": 1.9461161704668382e-05, "loss": 0.0336, "step": 3710 }, { "epoch": 0.9461753968927753, "grad_norm": 1.6205108165740967, "learning_rate": 1.9459721173647768e-05, "loss": 0.0384, "step": 3715 }, { "epoch": 0.9474488496476781, "grad_norm": 1.5787454843521118, "learning_rate": 1.9458278773097006e-05, "loss": 0.0407, "step": 3720 }, { "epoch": 0.9487223024025808, "grad_norm": 1.05707848072052, "learning_rate": 1.945683450330116e-05, "loss": 0.0285, "step": 3725 }, { "epoch": 0.9499957551574837, "grad_norm": 1.5921543836593628, "learning_rate": 1.9455388364545658e-05, "loss": 0.0322, "step": 3730 }, { "epoch": 0.9512692079123864, "grad_norm": 1.4923707246780396, "learning_rate": 1.94539403571163e-05, "loss": 0.0368, "step": 3735 }, { "epoch": 0.9525426606672892, "grad_norm": 1.2545040845870972, "learning_rate": 1.945249048129926e-05, "loss": 0.0292, "step": 3740 }, { "epoch": 0.9538161134221921, "grad_norm": 1.2409275770187378, "learning_rate": 1.9451038737381078e-05, "loss": 0.0247, "step": 3745 }, { "epoch": 0.9550895661770948, "grad_norm": 1.4763306379318237, "learning_rate": 1.944958512564866e-05, "loss": 0.0349, "step": 3750 }, { "epoch": 0.9563630189319976, "grad_norm": 1.2238115072250366, "learning_rate": 1.9448129646389287e-05, "loss": 0.0436, "step": 3755 }, { "epoch": 0.9576364716869005, "grad_norm": 1.2004365921020508, "learning_rate": 1.9446672299890607e-05, "loss": 0.0265, "step": 3760 }, { "epoch": 0.9589099244418032, "grad_norm": 0.703353226184845, "learning_rate": 1.9445213086440634e-05, "loss": 0.0345, "step": 3765 }, { "epoch": 0.960183377196706, "grad_norm": 1.1060692071914673, "learning_rate": 1.9443752006327753e-05, "loss": 0.0285, "step": 3770 }, { "epoch": 0.9614568299516087, "grad_norm": 0.7906731367111206, "learning_rate": 1.9442289059840718e-05, "loss": 0.0327, "step": 3775 }, { "epoch": 0.9627302827065116, "grad_norm": 0.9884161949157715, "learning_rate": 1.9440824247268654e-05, "loss": 0.0279, "step": 3780 }, { "epoch": 0.9640037354614144, "grad_norm": 0.9332820773124695, "learning_rate": 1.9439357568901047e-05, "loss": 0.0324, "step": 3785 }, { "epoch": 0.9652771882163171, "grad_norm": 1.1775380373001099, "learning_rate": 1.9437889025027766e-05, "loss": 0.0358, "step": 3790 }, { "epoch": 0.96655064097122, "grad_norm": 1.3310000896453857, "learning_rate": 1.9436418615939034e-05, "loss": 0.0394, "step": 3795 }, { "epoch": 0.9678240937261228, "grad_norm": 1.1284977197647095, "learning_rate": 1.943494634192545e-05, "loss": 0.036, "step": 3800 }, { "epoch": 0.9690975464810255, "grad_norm": 1.2835444211959839, "learning_rate": 1.9433472203277986e-05, "loss": 0.0337, "step": 3805 }, { "epoch": 0.9703709992359284, "grad_norm": 1.6167970895767212, "learning_rate": 1.943199620028797e-05, "loss": 0.0283, "step": 3810 }, { "epoch": 0.9716444519908312, "grad_norm": 1.4940444231033325, "learning_rate": 1.9430518333247108e-05, "loss": 0.0273, "step": 3815 }, { "epoch": 0.9729179047457339, "grad_norm": 1.460748314857483, "learning_rate": 1.942903860244747e-05, "loss": 0.0367, "step": 3820 }, { "epoch": 0.9741913575006367, "grad_norm": 0.6897012591362, "learning_rate": 1.9427557008181494e-05, "loss": 0.0302, "step": 3825 }, { "epoch": 0.9754648102555395, "grad_norm": 1.4491699934005737, "learning_rate": 1.9426073550741998e-05, "loss": 0.0316, "step": 3830 }, { "epoch": 0.9767382630104423, "grad_norm": 3.7850501537323, "learning_rate": 1.9424588230422148e-05, "loss": 0.0344, "step": 3835 }, { "epoch": 0.9780117157653451, "grad_norm": 1.6921662092208862, "learning_rate": 1.9423101047515492e-05, "loss": 0.0353, "step": 3840 }, { "epoch": 0.9792851685202479, "grad_norm": 1.4057652950286865, "learning_rate": 1.9421612002315947e-05, "loss": 0.0282, "step": 3845 }, { "epoch": 0.9805586212751507, "grad_norm": 1.5773046016693115, "learning_rate": 1.9420121095117786e-05, "loss": 0.0293, "step": 3850 }, { "epoch": 0.9818320740300535, "grad_norm": 1.085999608039856, "learning_rate": 1.9418628326215665e-05, "loss": 0.0365, "step": 3855 }, { "epoch": 0.9831055267849563, "grad_norm": 1.2486598491668701, "learning_rate": 1.9417133695904598e-05, "loss": 0.0364, "step": 3860 }, { "epoch": 0.9843789795398591, "grad_norm": 1.2816048860549927, "learning_rate": 1.941563720447997e-05, "loss": 0.0293, "step": 3865 }, { "epoch": 0.9856524322947618, "grad_norm": 2.1788816452026367, "learning_rate": 1.9414138852237533e-05, "loss": 0.0362, "step": 3870 }, { "epoch": 0.9869258850496646, "grad_norm": 1.096404790878296, "learning_rate": 1.9412638639473407e-05, "loss": 0.0436, "step": 3875 }, { "epoch": 0.9881993378045675, "grad_norm": 1.5066503286361694, "learning_rate": 1.941113656648408e-05, "loss": 0.038, "step": 3880 }, { "epoch": 0.9894727905594702, "grad_norm": 1.518890619277954, "learning_rate": 1.940963263356641e-05, "loss": 0.037, "step": 3885 }, { "epoch": 0.990746243314373, "grad_norm": 0.971547544002533, "learning_rate": 1.9408126841017617e-05, "loss": 0.029, "step": 3890 }, { "epoch": 0.9920196960692759, "grad_norm": 1.3719186782836914, "learning_rate": 1.9406619189135288e-05, "loss": 0.0336, "step": 3895 }, { "epoch": 0.9932931488241786, "grad_norm": 2.0515329837799072, "learning_rate": 1.940510967821739e-05, "loss": 0.0447, "step": 3900 }, { "epoch": 0.9945666015790814, "grad_norm": 1.2394585609436035, "learning_rate": 1.9403598308562247e-05, "loss": 0.0333, "step": 3905 }, { "epoch": 0.9958400543339843, "grad_norm": 4.610649108886719, "learning_rate": 1.9402085080468548e-05, "loss": 0.0378, "step": 3910 }, { "epoch": 0.997113507088887, "grad_norm": 1.672906756401062, "learning_rate": 1.940056999423535e-05, "loss": 0.0412, "step": 3915 }, { "epoch": 0.9983869598437898, "grad_norm": 1.1755341291427612, "learning_rate": 1.9399053050162087e-05, "loss": 0.0382, "step": 3920 }, { "epoch": 0.9996604125986925, "grad_norm": 1.2448986768722534, "learning_rate": 1.9397534248548552e-05, "loss": 0.0367, "step": 3925 }, { "epoch": 1.0009338653535953, "grad_norm": 1.2264411449432373, "learning_rate": 1.9396013589694903e-05, "loss": 0.0259, "step": 3930 }, { "epoch": 1.0022073181084983, "grad_norm": 1.1448924541473389, "learning_rate": 1.9394491073901677e-05, "loss": 0.0232, "step": 3935 }, { "epoch": 1.003480770863401, "grad_norm": 1.3758710622787476, "learning_rate": 1.939296670146976e-05, "loss": 0.0197, "step": 3940 }, { "epoch": 1.0047542236183038, "grad_norm": 1.0926223993301392, "learning_rate": 1.939144047270042e-05, "loss": 0.018, "step": 3945 }, { "epoch": 1.0060276763732066, "grad_norm": 1.390634298324585, "learning_rate": 1.9389912387895284e-05, "loss": 0.0133, "step": 3950 }, { "epoch": 1.0073011291281093, "grad_norm": 0.9754273295402527, "learning_rate": 1.938838244735635e-05, "loss": 0.0194, "step": 3955 }, { "epoch": 1.008574581883012, "grad_norm": 0.8942519426345825, "learning_rate": 1.938685065138598e-05, "loss": 0.0207, "step": 3960 }, { "epoch": 1.0098480346379148, "grad_norm": 0.36096516251564026, "learning_rate": 1.9385317000286906e-05, "loss": 0.0212, "step": 3965 }, { "epoch": 1.0111214873928178, "grad_norm": 1.0688881874084473, "learning_rate": 1.9383781494362225e-05, "loss": 0.0213, "step": 3970 }, { "epoch": 1.0123949401477206, "grad_norm": 1.5073938369750977, "learning_rate": 1.9382244133915393e-05, "loss": 0.0208, "step": 3975 }, { "epoch": 1.0136683929026233, "grad_norm": 5.144012928009033, "learning_rate": 1.9380704919250248e-05, "loss": 0.0316, "step": 3980 }, { "epoch": 1.014941845657526, "grad_norm": 1.0689911842346191, "learning_rate": 1.9379163850670975e-05, "loss": 0.0248, "step": 3985 }, { "epoch": 1.0162152984124289, "grad_norm": 0.8407047986984253, "learning_rate": 1.937762092848215e-05, "loss": 0.0137, "step": 3990 }, { "epoch": 1.0174887511673316, "grad_norm": 0.9527506232261658, "learning_rate": 1.9376076152988692e-05, "loss": 0.0255, "step": 3995 }, { "epoch": 1.0187622039222344, "grad_norm": 1.1279563903808594, "learning_rate": 1.9374529524495898e-05, "loss": 0.0273, "step": 4000 }, { "epoch": 1.0200356566771374, "grad_norm": 1.6104861497879028, "learning_rate": 1.937298104330943e-05, "loss": 0.0253, "step": 4005 }, { "epoch": 1.0213091094320401, "grad_norm": 1.3667393922805786, "learning_rate": 1.9371430709735314e-05, "loss": 0.0159, "step": 4010 }, { "epoch": 1.0225825621869429, "grad_norm": 1.5494400262832642, "learning_rate": 1.9369878524079947e-05, "loss": 0.021, "step": 4015 }, { "epoch": 1.0238560149418456, "grad_norm": 1.1787192821502686, "learning_rate": 1.9368324486650082e-05, "loss": 0.0201, "step": 4020 }, { "epoch": 1.0251294676967484, "grad_norm": 1.1111611127853394, "learning_rate": 1.9366768597752853e-05, "loss": 0.0244, "step": 4025 }, { "epoch": 1.0264029204516512, "grad_norm": 1.5378721952438354, "learning_rate": 1.9365210857695743e-05, "loss": 0.0219, "step": 4030 }, { "epoch": 1.0276763732065541, "grad_norm": 1.071866750717163, "learning_rate": 1.936365126678661e-05, "loss": 0.0209, "step": 4035 }, { "epoch": 1.028949825961457, "grad_norm": 0.9547898769378662, "learning_rate": 1.9362089825333683e-05, "loss": 0.0196, "step": 4040 }, { "epoch": 1.0302232787163597, "grad_norm": 1.6523330211639404, "learning_rate": 1.9360526533645546e-05, "loss": 0.0262, "step": 4045 }, { "epoch": 1.0314967314712624, "grad_norm": 1.7345911264419556, "learning_rate": 1.935896139203115e-05, "loss": 0.016, "step": 4050 }, { "epoch": 1.0327701842261652, "grad_norm": 1.325411081314087, "learning_rate": 1.935739440079982e-05, "loss": 0.0226, "step": 4055 }, { "epoch": 1.034043636981068, "grad_norm": 1.510317087173462, "learning_rate": 1.9355825560261246e-05, "loss": 0.0198, "step": 4060 }, { "epoch": 1.0353170897359707, "grad_norm": 1.5144071578979492, "learning_rate": 1.9354254870725468e-05, "loss": 0.0203, "step": 4065 }, { "epoch": 1.0365905424908737, "grad_norm": 0.7016156315803528, "learning_rate": 1.9352682332502905e-05, "loss": 0.0162, "step": 4070 }, { "epoch": 1.0378639952457764, "grad_norm": 1.2648762464523315, "learning_rate": 1.9351107945904343e-05, "loss": 0.015, "step": 4075 }, { "epoch": 1.0391374480006792, "grad_norm": 0.8037465214729309, "learning_rate": 1.934953171124093e-05, "loss": 0.0256, "step": 4080 }, { "epoch": 1.040410900755582, "grad_norm": 2.8918728828430176, "learning_rate": 1.934795362882417e-05, "loss": 0.0238, "step": 4085 }, { "epoch": 1.0416843535104847, "grad_norm": 0.9029564261436462, "learning_rate": 1.934637369896595e-05, "loss": 0.0204, "step": 4090 }, { "epoch": 1.0429578062653875, "grad_norm": 1.233101725578308, "learning_rate": 1.9344791921978504e-05, "loss": 0.0206, "step": 4095 }, { "epoch": 1.0442312590202905, "grad_norm": 1.3172751665115356, "learning_rate": 1.9343208298174446e-05, "loss": 0.02, "step": 4100 }, { "epoch": 1.0455047117751932, "grad_norm": 1.420318603515625, "learning_rate": 1.9341622827866745e-05, "loss": 0.0189, "step": 4105 }, { "epoch": 1.046778164530096, "grad_norm": 1.5208293199539185, "learning_rate": 1.934003551136874e-05, "loss": 0.021, "step": 4110 }, { "epoch": 1.0480516172849987, "grad_norm": 1.277449369430542, "learning_rate": 1.9338446348994126e-05, "loss": 0.0152, "step": 4115 }, { "epoch": 1.0493250700399015, "grad_norm": 1.378671646118164, "learning_rate": 1.933685534105698e-05, "loss": 0.024, "step": 4120 }, { "epoch": 1.0505985227948043, "grad_norm": 1.1641845703125, "learning_rate": 1.9335262487871733e-05, "loss": 0.0197, "step": 4125 }, { "epoch": 1.051871975549707, "grad_norm": 1.003162145614624, "learning_rate": 1.9333667789753175e-05, "loss": 0.0212, "step": 4130 }, { "epoch": 1.05314542830461, "grad_norm": 1.1961324214935303, "learning_rate": 1.9332071247016476e-05, "loss": 0.0202, "step": 4135 }, { "epoch": 1.0544188810595128, "grad_norm": 1.8282638788223267, "learning_rate": 1.9330472859977153e-05, "loss": 0.0201, "step": 4140 }, { "epoch": 1.0556923338144155, "grad_norm": 1.4578503370285034, "learning_rate": 1.9328872628951102e-05, "loss": 0.0183, "step": 4145 }, { "epoch": 1.0569657865693183, "grad_norm": 1.340516209602356, "learning_rate": 1.932727055425457e-05, "loss": 0.0221, "step": 4150 }, { "epoch": 1.058239239324221, "grad_norm": 1.1550495624542236, "learning_rate": 1.9325666636204187e-05, "loss": 0.0169, "step": 4155 }, { "epoch": 1.0595126920791238, "grad_norm": 1.1059026718139648, "learning_rate": 1.932406087511693e-05, "loss": 0.02, "step": 4160 }, { "epoch": 1.0607861448340266, "grad_norm": 1.6554921865463257, "learning_rate": 1.9322453271310143e-05, "loss": 0.0204, "step": 4165 }, { "epoch": 1.0620595975889295, "grad_norm": 1.1121129989624023, "learning_rate": 1.9320843825101544e-05, "loss": 0.0157, "step": 4170 }, { "epoch": 1.0633330503438323, "grad_norm": 1.296303153038025, "learning_rate": 1.9319232536809208e-05, "loss": 0.0216, "step": 4175 }, { "epoch": 1.064606503098735, "grad_norm": 1.0438318252563477, "learning_rate": 1.931761940675157e-05, "loss": 0.021, "step": 4180 }, { "epoch": 1.0658799558536378, "grad_norm": 1.2042728662490845, "learning_rate": 1.9316004435247436e-05, "loss": 0.0216, "step": 4185 }, { "epoch": 1.0671534086085406, "grad_norm": 1.1089670658111572, "learning_rate": 1.931438762261598e-05, "loss": 0.0125, "step": 4190 }, { "epoch": 1.0684268613634433, "grad_norm": 1.1684101819992065, "learning_rate": 1.9312768969176726e-05, "loss": 0.0216, "step": 4195 }, { "epoch": 1.069700314118346, "grad_norm": 1.2978224754333496, "learning_rate": 1.931114847524957e-05, "loss": 0.0191, "step": 4200 }, { "epoch": 1.070973766873249, "grad_norm": 1.324310302734375, "learning_rate": 1.9309526141154778e-05, "loss": 0.0247, "step": 4205 }, { "epoch": 1.0722472196281518, "grad_norm": 0.5934655666351318, "learning_rate": 1.9307901967212965e-05, "loss": 0.0153, "step": 4210 }, { "epoch": 1.0735206723830546, "grad_norm": 0.6221123933792114, "learning_rate": 1.9306275953745116e-05, "loss": 0.0232, "step": 4215 }, { "epoch": 1.0747941251379574, "grad_norm": 0.9198203086853027, "learning_rate": 1.930464810107259e-05, "loss": 0.0176, "step": 4220 }, { "epoch": 1.0760675778928601, "grad_norm": 0.8774814605712891, "learning_rate": 1.9303018409517093e-05, "loss": 0.0195, "step": 4225 }, { "epoch": 1.0773410306477629, "grad_norm": 0.7387356162071228, "learning_rate": 1.930138687940071e-05, "loss": 0.0122, "step": 4230 }, { "epoch": 1.0786144834026659, "grad_norm": 1.2341587543487549, "learning_rate": 1.9299753511045873e-05, "loss": 0.0241, "step": 4235 }, { "epoch": 1.0798879361575686, "grad_norm": 1.4728200435638428, "learning_rate": 1.9298118304775384e-05, "loss": 0.0217, "step": 4240 }, { "epoch": 1.0811613889124714, "grad_norm": 1.5673774480819702, "learning_rate": 1.9296481260912418e-05, "loss": 0.0213, "step": 4245 }, { "epoch": 1.0824348416673741, "grad_norm": 1.0509681701660156, "learning_rate": 1.92948423797805e-05, "loss": 0.0166, "step": 4250 }, { "epoch": 1.083708294422277, "grad_norm": 1.257749319076538, "learning_rate": 1.9293201661703524e-05, "loss": 0.0207, "step": 4255 }, { "epoch": 1.0849817471771797, "grad_norm": 1.0975127220153809, "learning_rate": 1.9291559107005748e-05, "loss": 0.0173, "step": 4260 }, { "epoch": 1.0862551999320824, "grad_norm": 1.351738691329956, "learning_rate": 1.9289914716011785e-05, "loss": 0.018, "step": 4265 }, { "epoch": 1.0875286526869854, "grad_norm": 1.166131615638733, "learning_rate": 1.9288268489046627e-05, "loss": 0.0193, "step": 4270 }, { "epoch": 1.0888021054418882, "grad_norm": 1.1667312383651733, "learning_rate": 1.928662042643561e-05, "loss": 0.0162, "step": 4275 }, { "epoch": 1.090075558196791, "grad_norm": 1.0171962976455688, "learning_rate": 1.9284970528504443e-05, "loss": 0.0248, "step": 4280 }, { "epoch": 1.0913490109516937, "grad_norm": 1.5037243366241455, "learning_rate": 1.9283318795579197e-05, "loss": 0.0199, "step": 4285 }, { "epoch": 1.0926224637065964, "grad_norm": 1.2331693172454834, "learning_rate": 1.9281665227986307e-05, "loss": 0.0188, "step": 4290 }, { "epoch": 1.0938959164614992, "grad_norm": 1.3276026248931885, "learning_rate": 1.9280009826052568e-05, "loss": 0.0247, "step": 4295 }, { "epoch": 1.0951693692164022, "grad_norm": 1.0425255298614502, "learning_rate": 1.9278352590105137e-05, "loss": 0.0164, "step": 4300 }, { "epoch": 1.096442821971305, "grad_norm": 1.2473636865615845, "learning_rate": 1.9276693520471533e-05, "loss": 0.023, "step": 4305 }, { "epoch": 1.0977162747262077, "grad_norm": 1.1218444108963013, "learning_rate": 1.9275032617479644e-05, "loss": 0.0125, "step": 4310 }, { "epoch": 1.0989897274811105, "grad_norm": 1.1788579225540161, "learning_rate": 1.9273369881457707e-05, "loss": 0.0209, "step": 4315 }, { "epoch": 1.1002631802360132, "grad_norm": 0.8140197992324829, "learning_rate": 1.927170531273434e-05, "loss": 0.0166, "step": 4320 }, { "epoch": 1.101536632990916, "grad_norm": 1.2207391262054443, "learning_rate": 1.9270038911638502e-05, "loss": 0.0276, "step": 4325 }, { "epoch": 1.1028100857458187, "grad_norm": 1.1271432638168335, "learning_rate": 1.926837067849953e-05, "loss": 0.0205, "step": 4330 }, { "epoch": 1.1040835385007217, "grad_norm": 1.5167170763015747, "learning_rate": 1.9266700613647123e-05, "loss": 0.0249, "step": 4335 }, { "epoch": 1.1053569912556245, "grad_norm": 0.9444289207458496, "learning_rate": 1.9265028717411325e-05, "loss": 0.017, "step": 4340 }, { "epoch": 1.1066304440105272, "grad_norm": 0.8096743226051331, "learning_rate": 1.9263354990122565e-05, "loss": 0.0262, "step": 4345 }, { "epoch": 1.10790389676543, "grad_norm": 0.6470052599906921, "learning_rate": 1.9261679432111617e-05, "loss": 0.0209, "step": 4350 }, { "epoch": 1.1091773495203328, "grad_norm": 0.924764096736908, "learning_rate": 1.926000204370962e-05, "loss": 0.0196, "step": 4355 }, { "epoch": 1.1104508022752355, "grad_norm": 1.3661537170410156, "learning_rate": 1.9258322825248085e-05, "loss": 0.0238, "step": 4360 }, { "epoch": 1.1117242550301385, "grad_norm": 1.141640067100525, "learning_rate": 1.9256641777058868e-05, "loss": 0.0178, "step": 4365 }, { "epoch": 1.1129977077850413, "grad_norm": 0.9224767684936523, "learning_rate": 1.9254958899474207e-05, "loss": 0.0174, "step": 4370 }, { "epoch": 1.114271160539944, "grad_norm": 1.6747952699661255, "learning_rate": 1.9253274192826677e-05, "loss": 0.0241, "step": 4375 }, { "epoch": 1.1155446132948468, "grad_norm": 1.0787923336029053, "learning_rate": 1.925158765744924e-05, "loss": 0.0258, "step": 4380 }, { "epoch": 1.1168180660497495, "grad_norm": 1.0783635377883911, "learning_rate": 1.9249899293675194e-05, "loss": 0.0305, "step": 4385 }, { "epoch": 1.1180915188046523, "grad_norm": 0.906731903553009, "learning_rate": 1.9248209101838223e-05, "loss": 0.0166, "step": 4390 }, { "epoch": 1.119364971559555, "grad_norm": 0.6971744298934937, "learning_rate": 1.924651708227235e-05, "loss": 0.0139, "step": 4395 }, { "epoch": 1.120638424314458, "grad_norm": 1.244194507598877, "learning_rate": 1.924482323531198e-05, "loss": 0.0155, "step": 4400 }, { "epoch": 1.1219118770693608, "grad_norm": 1.2933225631713867, "learning_rate": 1.9243127561291867e-05, "loss": 0.0163, "step": 4405 }, { "epoch": 1.1231853298242636, "grad_norm": 1.5102524757385254, "learning_rate": 1.924143006054712e-05, "loss": 0.0173, "step": 4410 }, { "epoch": 1.1244587825791663, "grad_norm": 0.7560238242149353, "learning_rate": 1.9239730733413222e-05, "loss": 0.0175, "step": 4415 }, { "epoch": 1.125732235334069, "grad_norm": 1.477671504020691, "learning_rate": 1.9238029580226016e-05, "loss": 0.0194, "step": 4420 }, { "epoch": 1.1270056880889718, "grad_norm": 1.2348523139953613, "learning_rate": 1.9236326601321698e-05, "loss": 0.0177, "step": 4425 }, { "epoch": 1.1282791408438748, "grad_norm": 0.8397320508956909, "learning_rate": 1.923462179703683e-05, "loss": 0.0254, "step": 4430 }, { "epoch": 1.1295525935987776, "grad_norm": 1.0892181396484375, "learning_rate": 1.9232915167708327e-05, "loss": 0.0178, "step": 4435 }, { "epoch": 1.1308260463536803, "grad_norm": 1.2701536417007446, "learning_rate": 1.923120671367348e-05, "loss": 0.0233, "step": 4440 }, { "epoch": 1.132099499108583, "grad_norm": 1.0259586572647095, "learning_rate": 1.9229496435269932e-05, "loss": 0.019, "step": 4445 }, { "epoch": 1.1333729518634859, "grad_norm": 1.0278618335723877, "learning_rate": 1.922778433283568e-05, "loss": 0.0194, "step": 4450 }, { "epoch": 1.1346464046183886, "grad_norm": 1.688132643699646, "learning_rate": 1.9226070406709087e-05, "loss": 0.0355, "step": 4455 }, { "epoch": 1.1359198573732914, "grad_norm": 1.1048635244369507, "learning_rate": 1.9224354657228886e-05, "loss": 0.0212, "step": 4460 }, { "epoch": 1.1371933101281941, "grad_norm": 1.3358954191207886, "learning_rate": 1.9222637084734153e-05, "loss": 0.023, "step": 4465 }, { "epoch": 1.1384667628830971, "grad_norm": 1.603843331336975, "learning_rate": 1.922091768956434e-05, "loss": 0.02, "step": 4470 }, { "epoch": 1.1397402156379999, "grad_norm": 1.1198612451553345, "learning_rate": 1.9219196472059244e-05, "loss": 0.0149, "step": 4475 }, { "epoch": 1.1410136683929026, "grad_norm": 1.1108295917510986, "learning_rate": 1.9217473432559035e-05, "loss": 0.0204, "step": 4480 }, { "epoch": 1.1422871211478054, "grad_norm": 2.192941665649414, "learning_rate": 1.921574857140424e-05, "loss": 0.0102, "step": 4485 }, { "epoch": 1.1435605739027082, "grad_norm": 1.5239717960357666, "learning_rate": 1.921402188893574e-05, "loss": 0.0274, "step": 4490 }, { "epoch": 1.144834026657611, "grad_norm": 1.4134660959243774, "learning_rate": 1.9212293385494784e-05, "loss": 0.018, "step": 4495 }, { "epoch": 1.146107479412514, "grad_norm": 1.0701360702514648, "learning_rate": 1.921056306142297e-05, "loss": 0.0251, "step": 4500 }, { "epoch": 1.1473809321674167, "grad_norm": 1.3872921466827393, "learning_rate": 1.9208830917062274e-05, "loss": 0.0179, "step": 4505 }, { "epoch": 1.1486543849223194, "grad_norm": 0.8656800985336304, "learning_rate": 1.9207096952755012e-05, "loss": 0.0183, "step": 4510 }, { "epoch": 1.1499278376772222, "grad_norm": 1.6063001155853271, "learning_rate": 1.9205361168843876e-05, "loss": 0.0183, "step": 4515 }, { "epoch": 1.151201290432125, "grad_norm": 0.9124038219451904, "learning_rate": 1.9203623565671902e-05, "loss": 0.0176, "step": 4520 }, { "epoch": 1.1524747431870277, "grad_norm": 0.9038102030754089, "learning_rate": 1.9201884143582496e-05, "loss": 0.025, "step": 4525 }, { "epoch": 1.1537481959419305, "grad_norm": 0.9471694231033325, "learning_rate": 1.9200142902919423e-05, "loss": 0.0191, "step": 4530 }, { "epoch": 1.1550216486968334, "grad_norm": 1.1794400215148926, "learning_rate": 1.9198399844026803e-05, "loss": 0.0258, "step": 4535 }, { "epoch": 1.1562951014517362, "grad_norm": 1.1797575950622559, "learning_rate": 1.919665496724912e-05, "loss": 0.0253, "step": 4540 }, { "epoch": 1.157568554206639, "grad_norm": 1.1485353708267212, "learning_rate": 1.9194908272931214e-05, "loss": 0.0251, "step": 4545 }, { "epoch": 1.1588420069615417, "grad_norm": 1.2936326265335083, "learning_rate": 1.9193159761418285e-05, "loss": 0.0237, "step": 4550 }, { "epoch": 1.1601154597164445, "grad_norm": 0.6758003234863281, "learning_rate": 1.9191409433055893e-05, "loss": 0.0177, "step": 4555 }, { "epoch": 1.1613889124713472, "grad_norm": 0.8626788258552551, "learning_rate": 1.9189657288189956e-05, "loss": 0.0198, "step": 4560 }, { "epoch": 1.1626623652262502, "grad_norm": 1.3120365142822266, "learning_rate": 1.918790332716675e-05, "loss": 0.022, "step": 4565 }, { "epoch": 1.163935817981153, "grad_norm": 1.229615569114685, "learning_rate": 1.9186147550332913e-05, "loss": 0.022, "step": 4570 }, { "epoch": 1.1652092707360557, "grad_norm": 2.0075972080230713, "learning_rate": 1.9184389958035445e-05, "loss": 0.0241, "step": 4575 }, { "epoch": 1.1664827234909585, "grad_norm": 1.2449009418487549, "learning_rate": 1.918263055062169e-05, "loss": 0.0242, "step": 4580 }, { "epoch": 1.1677561762458613, "grad_norm": 1.165674090385437, "learning_rate": 1.9180869328439363e-05, "loss": 0.0223, "step": 4585 }, { "epoch": 1.169029629000764, "grad_norm": 1.0494978427886963, "learning_rate": 1.917910629183654e-05, "loss": 0.0185, "step": 4590 }, { "epoch": 1.1703030817556668, "grad_norm": 1.5481860637664795, "learning_rate": 1.917734144116165e-05, "loss": 0.0189, "step": 4595 }, { "epoch": 1.1715765345105695, "grad_norm": 1.4607293605804443, "learning_rate": 1.917557477676348e-05, "loss": 0.021, "step": 4600 }, { "epoch": 1.1728499872654725, "grad_norm": 1.762658715248108, "learning_rate": 1.9173806298991174e-05, "loss": 0.022, "step": 4605 }, { "epoch": 1.1741234400203753, "grad_norm": 1.4564285278320312, "learning_rate": 1.9172036008194245e-05, "loss": 0.0202, "step": 4610 }, { "epoch": 1.175396892775278, "grad_norm": 1.2554692029953003, "learning_rate": 1.917026390472255e-05, "loss": 0.0188, "step": 4615 }, { "epoch": 1.1766703455301808, "grad_norm": 0.8013851046562195, "learning_rate": 1.9168489988926312e-05, "loss": 0.0264, "step": 4620 }, { "epoch": 1.1779437982850836, "grad_norm": 1.7867295742034912, "learning_rate": 1.9166714261156116e-05, "loss": 0.0215, "step": 4625 }, { "epoch": 1.1792172510399865, "grad_norm": 1.026876449584961, "learning_rate": 1.9164936721762895e-05, "loss": 0.0248, "step": 4630 }, { "epoch": 1.1804907037948893, "grad_norm": 0.5855617523193359, "learning_rate": 1.9163157371097943e-05, "loss": 0.0221, "step": 4635 }, { "epoch": 1.181764156549792, "grad_norm": 0.795111358165741, "learning_rate": 1.916137620951292e-05, "loss": 0.0183, "step": 4640 }, { "epoch": 1.1830376093046948, "grad_norm": 0.7708178162574768, "learning_rate": 1.9159593237359834e-05, "loss": 0.02, "step": 4645 }, { "epoch": 1.1843110620595976, "grad_norm": 1.0919077396392822, "learning_rate": 1.9157808454991052e-05, "loss": 0.0176, "step": 4650 }, { "epoch": 1.1855845148145003, "grad_norm": 0.9403235912322998, "learning_rate": 1.915602186275931e-05, "loss": 0.0146, "step": 4655 }, { "epoch": 1.186857967569403, "grad_norm": 1.5524553060531616, "learning_rate": 1.915423346101769e-05, "loss": 0.029, "step": 4660 }, { "epoch": 1.1881314203243059, "grad_norm": 2.016056537628174, "learning_rate": 1.915244325011963e-05, "loss": 0.0198, "step": 4665 }, { "epoch": 1.1894048730792088, "grad_norm": 1.463931679725647, "learning_rate": 1.9150651230418937e-05, "loss": 0.0203, "step": 4670 }, { "epoch": 1.1906783258341116, "grad_norm": 1.166615605354309, "learning_rate": 1.9148857402269768e-05, "loss": 0.018, "step": 4675 }, { "epoch": 1.1919517785890144, "grad_norm": 2.0339906215667725, "learning_rate": 1.914706176602663e-05, "loss": 0.0225, "step": 4680 }, { "epoch": 1.1932252313439171, "grad_norm": 0.4010128974914551, "learning_rate": 1.914526432204441e-05, "loss": 0.0174, "step": 4685 }, { "epoch": 1.1944986840988199, "grad_norm": 1.4358991384506226, "learning_rate": 1.914346507067832e-05, "loss": 0.0204, "step": 4690 }, { "epoch": 1.1957721368537226, "grad_norm": 1.241199016571045, "learning_rate": 1.914166401228396e-05, "loss": 0.0175, "step": 4695 }, { "epoch": 1.1970455896086256, "grad_norm": 1.849613904953003, "learning_rate": 1.9139861147217277e-05, "loss": 0.027, "step": 4700 }, { "epoch": 1.1983190423635284, "grad_norm": 1.5722010135650635, "learning_rate": 1.9138056475834563e-05, "loss": 0.0219, "step": 4705 }, { "epoch": 1.1995924951184311, "grad_norm": 0.9386172294616699, "learning_rate": 1.913624999849248e-05, "loss": 0.0218, "step": 4710 }, { "epoch": 1.200865947873334, "grad_norm": 1.395753026008606, "learning_rate": 1.913444171554804e-05, "loss": 0.0244, "step": 4715 }, { "epoch": 1.2021394006282367, "grad_norm": 1.368572473526001, "learning_rate": 1.913263162735862e-05, "loss": 0.0223, "step": 4720 }, { "epoch": 1.2034128533831394, "grad_norm": 1.038437843322754, "learning_rate": 1.9130819734281947e-05, "loss": 0.0188, "step": 4725 }, { "epoch": 1.2046863061380422, "grad_norm": 0.979710578918457, "learning_rate": 1.9129006036676102e-05, "loss": 0.0214, "step": 4730 }, { "epoch": 1.2059597588929452, "grad_norm": 0.7724849581718445, "learning_rate": 1.912719053489954e-05, "loss": 0.0203, "step": 4735 }, { "epoch": 1.207233211647848, "grad_norm": 1.3319976329803467, "learning_rate": 1.912537322931104e-05, "loss": 0.0239, "step": 4740 }, { "epoch": 1.2085066644027507, "grad_norm": 0.9777044057846069, "learning_rate": 1.9123554120269773e-05, "loss": 0.0206, "step": 4745 }, { "epoch": 1.2097801171576534, "grad_norm": 1.6819127798080444, "learning_rate": 1.912173320813525e-05, "loss": 0.0221, "step": 4750 }, { "epoch": 1.2110535699125562, "grad_norm": 1.1199252605438232, "learning_rate": 1.911991049326733e-05, "loss": 0.0147, "step": 4755 }, { "epoch": 1.212327022667459, "grad_norm": 0.9031738042831421, "learning_rate": 1.911808597602624e-05, "loss": 0.0227, "step": 4760 }, { "epoch": 1.213600475422362, "grad_norm": 1.0475668907165527, "learning_rate": 1.9116259656772566e-05, "loss": 0.0285, "step": 4765 }, { "epoch": 1.2148739281772647, "grad_norm": 1.2270095348358154, "learning_rate": 1.9114431535867238e-05, "loss": 0.0224, "step": 4770 }, { "epoch": 1.2161473809321675, "grad_norm": 0.7645114064216614, "learning_rate": 1.911260161367155e-05, "loss": 0.017, "step": 4775 }, { "epoch": 1.2174208336870702, "grad_norm": 1.1455127000808716, "learning_rate": 1.9110769890547153e-05, "loss": 0.0176, "step": 4780 }, { "epoch": 1.218694286441973, "grad_norm": 1.3060660362243652, "learning_rate": 1.910893636685605e-05, "loss": 0.0167, "step": 4785 }, { "epoch": 1.2199677391968757, "grad_norm": 1.3960593938827515, "learning_rate": 1.9107101042960605e-05, "loss": 0.0222, "step": 4790 }, { "epoch": 1.2212411919517785, "grad_norm": 0.9525294303894043, "learning_rate": 1.9105263919223526e-05, "loss": 0.0206, "step": 4795 }, { "epoch": 1.2225146447066815, "grad_norm": 1.4201587438583374, "learning_rate": 1.910342499600789e-05, "loss": 0.0198, "step": 4800 }, { "epoch": 1.2237880974615842, "grad_norm": 1.5103648900985718, "learning_rate": 1.9101584273677124e-05, "loss": 0.0217, "step": 4805 }, { "epoch": 1.225061550216487, "grad_norm": 1.1689082384109497, "learning_rate": 1.9099741752595013e-05, "loss": 0.0229, "step": 4810 }, { "epoch": 1.2263350029713898, "grad_norm": 1.3712701797485352, "learning_rate": 1.909789743312569e-05, "loss": 0.0222, "step": 4815 }, { "epoch": 1.2276084557262925, "grad_norm": 1.259521722793579, "learning_rate": 1.9096051315633656e-05, "loss": 0.0235, "step": 4820 }, { "epoch": 1.2288819084811953, "grad_norm": 1.0272331237792969, "learning_rate": 1.9094203400483756e-05, "loss": 0.0152, "step": 4825 }, { "epoch": 1.2301553612360983, "grad_norm": 1.022386908531189, "learning_rate": 1.9092353688041196e-05, "loss": 0.0246, "step": 4830 }, { "epoch": 1.231428813991001, "grad_norm": 1.298867106437683, "learning_rate": 1.909050217867153e-05, "loss": 0.0307, "step": 4835 }, { "epoch": 1.2327022667459038, "grad_norm": 1.237997055053711, "learning_rate": 1.9088648872740682e-05, "loss": 0.0221, "step": 4840 }, { "epoch": 1.2339757195008065, "grad_norm": 0.6130354404449463, "learning_rate": 1.9086793770614918e-05, "loss": 0.0136, "step": 4845 }, { "epoch": 1.2352491722557093, "grad_norm": 1.3668276071548462, "learning_rate": 1.9084936872660864e-05, "loss": 0.0188, "step": 4850 }, { "epoch": 1.236522625010612, "grad_norm": 1.2560975551605225, "learning_rate": 1.9083078179245494e-05, "loss": 0.0186, "step": 4855 }, { "epoch": 1.2377960777655148, "grad_norm": 1.314291000366211, "learning_rate": 1.908121769073615e-05, "loss": 0.0188, "step": 4860 }, { "epoch": 1.2390695305204176, "grad_norm": 1.517416000366211, "learning_rate": 1.9079355407500516e-05, "loss": 0.0189, "step": 4865 }, { "epoch": 1.2403429832753206, "grad_norm": 0.742119312286377, "learning_rate": 1.9077491329906638e-05, "loss": 0.0215, "step": 4870 }, { "epoch": 1.2416164360302233, "grad_norm": 1.1282964944839478, "learning_rate": 1.907562545832292e-05, "loss": 0.0235, "step": 4875 }, { "epoch": 1.242889888785126, "grad_norm": 1.0772508382797241, "learning_rate": 1.9073757793118107e-05, "loss": 0.0197, "step": 4880 }, { "epoch": 1.2441633415400288, "grad_norm": 1.4147839546203613, "learning_rate": 1.907188833466131e-05, "loss": 0.0233, "step": 4885 }, { "epoch": 1.2454367942949316, "grad_norm": 1.0712999105453491, "learning_rate": 1.9070017083321996e-05, "loss": 0.0173, "step": 4890 }, { "epoch": 1.2467102470498346, "grad_norm": 0.7819250822067261, "learning_rate": 1.9068144039469975e-05, "loss": 0.0195, "step": 4895 }, { "epoch": 1.2479836998047373, "grad_norm": 0.8998242020606995, "learning_rate": 1.9066269203475416e-05, "loss": 0.0199, "step": 4900 }, { "epoch": 1.24925715255964, "grad_norm": 1.430192232131958, "learning_rate": 1.906439257570885e-05, "loss": 0.0243, "step": 4905 }, { "epoch": 1.2505306053145429, "grad_norm": 0.646056592464447, "learning_rate": 1.9062514156541153e-05, "loss": 0.0246, "step": 4910 }, { "epoch": 1.2518040580694456, "grad_norm": 1.387162446975708, "learning_rate": 1.906063394634356e-05, "loss": 0.0248, "step": 4915 }, { "epoch": 1.2530775108243484, "grad_norm": 0.6589651703834534, "learning_rate": 1.9058751945487656e-05, "loss": 0.0161, "step": 4920 }, { "epoch": 1.2543509635792511, "grad_norm": 1.0574009418487549, "learning_rate": 1.9056868154345384e-05, "loss": 0.016, "step": 4925 }, { "epoch": 1.255624416334154, "grad_norm": 1.0452383756637573, "learning_rate": 1.905498257328904e-05, "loss": 0.0275, "step": 4930 }, { "epoch": 1.2568978690890569, "grad_norm": 0.9951068162918091, "learning_rate": 1.9053472819956235e-05, "loss": 0.0228, "step": 4935 }, { "epoch": 1.2581713218439596, "grad_norm": 0.9752163887023926, "learning_rate": 1.9051584017993865e-05, "loss": 0.0228, "step": 4940 }, { "epoch": 1.2594447745988624, "grad_norm": 0.6273873448371887, "learning_rate": 1.904969342716173e-05, "loss": 0.0201, "step": 4945 }, { "epoch": 1.2607182273537652, "grad_norm": 0.9930721521377563, "learning_rate": 1.904780104783346e-05, "loss": 0.0254, "step": 4950 }, { "epoch": 1.261991680108668, "grad_norm": 0.9400107264518738, "learning_rate": 1.9045906880383057e-05, "loss": 0.0198, "step": 4955 }, { "epoch": 1.263265132863571, "grad_norm": 0.7937906980514526, "learning_rate": 1.904401092518486e-05, "loss": 0.0212, "step": 4960 }, { "epoch": 1.2645385856184737, "grad_norm": 1.777734637260437, "learning_rate": 1.9042113182613572e-05, "loss": 0.0262, "step": 4965 }, { "epoch": 1.2658120383733764, "grad_norm": 1.2574753761291504, "learning_rate": 1.9040213653044245e-05, "loss": 0.0228, "step": 4970 }, { "epoch": 1.2670854911282792, "grad_norm": 0.9530244469642639, "learning_rate": 1.9038312336852277e-05, "loss": 0.016, "step": 4975 }, { "epoch": 1.268358943883182, "grad_norm": 0.8230563998222351, "learning_rate": 1.9036409234413435e-05, "loss": 0.0219, "step": 4980 }, { "epoch": 1.2696323966380847, "grad_norm": 1.5094126462936401, "learning_rate": 1.9034504346103825e-05, "loss": 0.0162, "step": 4985 }, { "epoch": 1.2709058493929875, "grad_norm": 1.170581340789795, "learning_rate": 1.9032597672299913e-05, "loss": 0.0221, "step": 4990 }, { "epoch": 1.2721793021478902, "grad_norm": 1.1557250022888184, "learning_rate": 1.9030689213378516e-05, "loss": 0.0228, "step": 4995 }, { "epoch": 1.273452754902793, "grad_norm": 1.1548019647598267, "learning_rate": 1.9028778969716803e-05, "loss": 0.0216, "step": 5000 }, { "epoch": 1.274726207657696, "grad_norm": 1.2715482711791992, "learning_rate": 1.9026866941692297e-05, "loss": 0.0187, "step": 5005 }, { "epoch": 1.2759996604125987, "grad_norm": 1.4899641275405884, "learning_rate": 1.9024953129682875e-05, "loss": 0.0211, "step": 5010 }, { "epoch": 1.2772731131675015, "grad_norm": 0.318001925945282, "learning_rate": 1.902303753406676e-05, "loss": 0.0143, "step": 5015 }, { "epoch": 1.2785465659224042, "grad_norm": 1.1046833992004395, "learning_rate": 1.902112015522254e-05, "loss": 0.0165, "step": 5020 }, { "epoch": 1.279820018677307, "grad_norm": 1.1754765510559082, "learning_rate": 1.901920099352914e-05, "loss": 0.0134, "step": 5025 }, { "epoch": 1.28109347143221, "grad_norm": 1.557288408279419, "learning_rate": 1.901728004936585e-05, "loss": 0.0292, "step": 5030 }, { "epoch": 1.2823669241871127, "grad_norm": 1.6069178581237793, "learning_rate": 1.9015357323112302e-05, "loss": 0.0262, "step": 5035 }, { "epoch": 1.2836403769420155, "grad_norm": 1.4662667512893677, "learning_rate": 1.901343281514849e-05, "loss": 0.0222, "step": 5040 }, { "epoch": 1.2849138296969183, "grad_norm": 0.851110577583313, "learning_rate": 1.9011506525854756e-05, "loss": 0.0182, "step": 5045 }, { "epoch": 1.286187282451821, "grad_norm": 1.369783878326416, "learning_rate": 1.9009578455611794e-05, "loss": 0.0266, "step": 5050 }, { "epoch": 1.2874607352067238, "grad_norm": 1.0297743082046509, "learning_rate": 1.9007648604800642e-05, "loss": 0.0239, "step": 5055 }, { "epoch": 1.2887341879616265, "grad_norm": 0.8531782627105713, "learning_rate": 1.9005716973802707e-05, "loss": 0.0195, "step": 5060 }, { "epoch": 1.2900076407165293, "grad_norm": 1.0122785568237305, "learning_rate": 1.9003783562999736e-05, "loss": 0.0208, "step": 5065 }, { "epoch": 1.2912810934714323, "grad_norm": 1.3761394023895264, "learning_rate": 1.900184837277383e-05, "loss": 0.0157, "step": 5070 }, { "epoch": 1.292554546226335, "grad_norm": 0.7998246550559998, "learning_rate": 1.8999911403507435e-05, "loss": 0.0149, "step": 5075 }, { "epoch": 1.2938279989812378, "grad_norm": 0.9128049612045288, "learning_rate": 1.8997972655583362e-05, "loss": 0.0217, "step": 5080 }, { "epoch": 1.2951014517361406, "grad_norm": 2.3307042121887207, "learning_rate": 1.899603212938477e-05, "loss": 0.0229, "step": 5085 }, { "epoch": 1.2963749044910433, "grad_norm": 1.687964916229248, "learning_rate": 1.899408982529516e-05, "loss": 0.0206, "step": 5090 }, { "epoch": 1.2976483572459463, "grad_norm": 1.0245517492294312, "learning_rate": 1.899214574369839e-05, "loss": 0.0232, "step": 5095 }, { "epoch": 1.298921810000849, "grad_norm": 0.9775306582450867, "learning_rate": 1.899019988497868e-05, "loss": 0.0186, "step": 5100 }, { "epoch": 1.3001952627557518, "grad_norm": 1.1876394748687744, "learning_rate": 1.8988252249520576e-05, "loss": 0.0279, "step": 5105 }, { "epoch": 1.3014687155106546, "grad_norm": 1.0229690074920654, "learning_rate": 1.8986302837709002e-05, "loss": 0.0268, "step": 5110 }, { "epoch": 1.3027421682655573, "grad_norm": 0.8029558062553406, "learning_rate": 1.8984351649929217e-05, "loss": 0.0159, "step": 5115 }, { "epoch": 1.30401562102046, "grad_norm": 0.7057946920394897, "learning_rate": 1.8982398686566836e-05, "loss": 0.0154, "step": 5120 }, { "epoch": 1.3052890737753629, "grad_norm": 0.8902968764305115, "learning_rate": 1.8980443948007826e-05, "loss": 0.0213, "step": 5125 }, { "epoch": 1.3065625265302656, "grad_norm": 1.680289626121521, "learning_rate": 1.8978487434638496e-05, "loss": 0.0302, "step": 5130 }, { "epoch": 1.3078359792851686, "grad_norm": 1.1451873779296875, "learning_rate": 1.8976529146845524e-05, "loss": 0.0209, "step": 5135 }, { "epoch": 1.3091094320400714, "grad_norm": 1.089601993560791, "learning_rate": 1.897456908501592e-05, "loss": 0.0209, "step": 5140 }, { "epoch": 1.3103828847949741, "grad_norm": 0.9715773463249207, "learning_rate": 1.8972607249537053e-05, "loss": 0.0222, "step": 5145 }, { "epoch": 1.3116563375498769, "grad_norm": 1.687667727470398, "learning_rate": 1.8970643640796642e-05, "loss": 0.0278, "step": 5150 }, { "epoch": 1.3129297903047796, "grad_norm": 3.0884833335876465, "learning_rate": 1.8968678259182758e-05, "loss": 0.0285, "step": 5155 }, { "epoch": 1.3142032430596826, "grad_norm": 1.5954716205596924, "learning_rate": 1.8966711105083818e-05, "loss": 0.032, "step": 5160 }, { "epoch": 1.3154766958145854, "grad_norm": 0.9171507358551025, "learning_rate": 1.8964742178888595e-05, "loss": 0.0242, "step": 5165 }, { "epoch": 1.3167501485694881, "grad_norm": 2.67092227935791, "learning_rate": 1.8962771480986207e-05, "loss": 0.0249, "step": 5170 }, { "epoch": 1.318023601324391, "grad_norm": 2.0291996002197266, "learning_rate": 1.896079901176612e-05, "loss": 0.0225, "step": 5175 }, { "epoch": 1.3192970540792937, "grad_norm": 1.0665879249572754, "learning_rate": 1.8958824771618163e-05, "loss": 0.0223, "step": 5180 }, { "epoch": 1.3205705068341964, "grad_norm": 1.1821733713150024, "learning_rate": 1.89568487609325e-05, "loss": 0.0178, "step": 5185 }, { "epoch": 1.3218439595890992, "grad_norm": 0.8934772610664368, "learning_rate": 1.8954870980099653e-05, "loss": 0.0221, "step": 5190 }, { "epoch": 1.323117412344002, "grad_norm": 1.4893866777420044, "learning_rate": 1.8952891429510494e-05, "loss": 0.024, "step": 5195 }, { "epoch": 1.324390865098905, "grad_norm": 1.0125619173049927, "learning_rate": 1.8950910109556237e-05, "loss": 0.0231, "step": 5200 }, { "epoch": 1.3256643178538077, "grad_norm": 0.8151080012321472, "learning_rate": 1.894892702062846e-05, "loss": 0.0238, "step": 5205 }, { "epoch": 1.3269377706087104, "grad_norm": 1.1680233478546143, "learning_rate": 1.8946942163119072e-05, "loss": 0.0183, "step": 5210 }, { "epoch": 1.3282112233636132, "grad_norm": 1.7735196352005005, "learning_rate": 1.8944955537420348e-05, "loss": 0.0233, "step": 5215 }, { "epoch": 1.329484676118516, "grad_norm": 1.0010303258895874, "learning_rate": 1.89429671439249e-05, "loss": 0.0186, "step": 5220 }, { "epoch": 1.330758128873419, "grad_norm": 1.2619481086730957, "learning_rate": 1.8940976983025705e-05, "loss": 0.0246, "step": 5225 }, { "epoch": 1.3320315816283217, "grad_norm": 1.2270805835723877, "learning_rate": 1.8938985055116073e-05, "loss": 0.019, "step": 5230 }, { "epoch": 1.3333050343832245, "grad_norm": 1.3569000959396362, "learning_rate": 1.893699136058967e-05, "loss": 0.0198, "step": 5235 }, { "epoch": 1.3345784871381272, "grad_norm": 1.0439015626907349, "learning_rate": 1.8934995899840512e-05, "loss": 0.0212, "step": 5240 }, { "epoch": 1.33585193989303, "grad_norm": 1.232969880104065, "learning_rate": 1.8932998673262967e-05, "loss": 0.0294, "step": 5245 }, { "epoch": 1.3371253926479327, "grad_norm": 1.260736107826233, "learning_rate": 1.8930999681251743e-05, "loss": 0.0194, "step": 5250 }, { "epoch": 1.3383988454028355, "grad_norm": 2.2700629234313965, "learning_rate": 1.8928998924201903e-05, "loss": 0.0233, "step": 5255 }, { "epoch": 1.3396722981577383, "grad_norm": 1.4779093265533447, "learning_rate": 1.8926996402508856e-05, "loss": 0.0209, "step": 5260 }, { "epoch": 1.340945750912641, "grad_norm": 2.42828631401062, "learning_rate": 1.8924992116568367e-05, "loss": 0.0329, "step": 5265 }, { "epoch": 1.342219203667544, "grad_norm": 1.0374460220336914, "learning_rate": 1.8922986066776537e-05, "loss": 0.0202, "step": 5270 }, { "epoch": 1.3434926564224468, "grad_norm": 1.6559638977050781, "learning_rate": 1.8920978253529833e-05, "loss": 0.0242, "step": 5275 }, { "epoch": 1.3447661091773495, "grad_norm": 1.0095148086547852, "learning_rate": 1.8918968677225046e-05, "loss": 0.0226, "step": 5280 }, { "epoch": 1.3460395619322523, "grad_norm": 1.8345106840133667, "learning_rate": 1.8916957338259342e-05, "loss": 0.0224, "step": 5285 }, { "epoch": 1.347313014687155, "grad_norm": 0.9734413027763367, "learning_rate": 1.8914944237030218e-05, "loss": 0.0183, "step": 5290 }, { "epoch": 1.348586467442058, "grad_norm": 1.7570017576217651, "learning_rate": 1.891292937393552e-05, "loss": 0.0307, "step": 5295 }, { "epoch": 1.3498599201969608, "grad_norm": 1.4814512729644775, "learning_rate": 1.891091274937346e-05, "loss": 0.0221, "step": 5300 }, { "epoch": 1.3511333729518635, "grad_norm": 1.1059551239013672, "learning_rate": 1.8908894363742573e-05, "loss": 0.0249, "step": 5305 }, { "epoch": 1.3524068257067663, "grad_norm": 1.2400660514831543, "learning_rate": 1.8906874217441756e-05, "loss": 0.0311, "step": 5310 }, { "epoch": 1.353680278461669, "grad_norm": 0.5060182213783264, "learning_rate": 1.890485231087025e-05, "loss": 0.0189, "step": 5315 }, { "epoch": 1.3549537312165718, "grad_norm": 1.248148798942566, "learning_rate": 1.890282864442765e-05, "loss": 0.0173, "step": 5320 }, { "epoch": 1.3562271839714746, "grad_norm": 1.2993056774139404, "learning_rate": 1.8900803218513896e-05, "loss": 0.0218, "step": 5325 }, { "epoch": 1.3575006367263773, "grad_norm": 1.7565158605575562, "learning_rate": 1.8898776033529268e-05, "loss": 0.0247, "step": 5330 }, { "epoch": 1.3587740894812803, "grad_norm": 1.131869912147522, "learning_rate": 1.8896747089874404e-05, "loss": 0.0184, "step": 5335 }, { "epoch": 1.360047542236183, "grad_norm": 0.9453079700469971, "learning_rate": 1.8894716387950283e-05, "loss": 0.0201, "step": 5340 }, { "epoch": 1.3613209949910858, "grad_norm": 1.1278555393218994, "learning_rate": 1.8892683928158232e-05, "loss": 0.0194, "step": 5345 }, { "epoch": 1.3625944477459886, "grad_norm": 1.8124022483825684, "learning_rate": 1.8890649710899932e-05, "loss": 0.0229, "step": 5350 }, { "epoch": 1.3638679005008914, "grad_norm": 1.258699655532837, "learning_rate": 1.8888613736577404e-05, "loss": 0.0216, "step": 5355 }, { "epoch": 1.3651413532557943, "grad_norm": 1.4795223474502563, "learning_rate": 1.8886576005593017e-05, "loss": 0.0205, "step": 5360 }, { "epoch": 1.366414806010697, "grad_norm": 0.6910355091094971, "learning_rate": 1.8884536518349493e-05, "loss": 0.0228, "step": 5365 }, { "epoch": 1.3676882587655999, "grad_norm": 1.4786925315856934, "learning_rate": 1.8882495275249896e-05, "loss": 0.0303, "step": 5370 }, { "epoch": 1.3689617115205026, "grad_norm": 0.9715499877929688, "learning_rate": 1.8880452276697636e-05, "loss": 0.0188, "step": 5375 }, { "epoch": 1.3702351642754054, "grad_norm": 0.9410167932510376, "learning_rate": 1.8878407523096473e-05, "loss": 0.0172, "step": 5380 }, { "epoch": 1.3715086170303081, "grad_norm": 1.071064829826355, "learning_rate": 1.887636101485051e-05, "loss": 0.0159, "step": 5385 }, { "epoch": 1.372782069785211, "grad_norm": 1.1901546716690063, "learning_rate": 1.8874312752364208e-05, "loss": 0.0242, "step": 5390 }, { "epoch": 1.3740555225401136, "grad_norm": 1.4030156135559082, "learning_rate": 1.8872262736042356e-05, "loss": 0.0214, "step": 5395 }, { "epoch": 1.3753289752950166, "grad_norm": 1.647448182106018, "learning_rate": 1.8870210966290106e-05, "loss": 0.0175, "step": 5400 }, { "epoch": 1.3766024280499194, "grad_norm": 1.2167015075683594, "learning_rate": 1.886815744351295e-05, "loss": 0.0197, "step": 5405 }, { "epoch": 1.3778758808048222, "grad_norm": 0.8728576898574829, "learning_rate": 1.8866102168116725e-05, "loss": 0.0212, "step": 5410 }, { "epoch": 1.379149333559725, "grad_norm": 1.0810128450393677, "learning_rate": 1.886404514050762e-05, "loss": 0.0245, "step": 5415 }, { "epoch": 1.3804227863146277, "grad_norm": 1.3120814561843872, "learning_rate": 1.886198636109216e-05, "loss": 0.0199, "step": 5420 }, { "epoch": 1.3816962390695307, "grad_norm": 1.4202557802200317, "learning_rate": 1.8859925830277227e-05, "loss": 0.0246, "step": 5425 }, { "epoch": 1.3829696918244334, "grad_norm": 1.6016991138458252, "learning_rate": 1.8857863548470045e-05, "loss": 0.0272, "step": 5430 }, { "epoch": 1.3842431445793362, "grad_norm": 1.5459157228469849, "learning_rate": 1.8855799516078185e-05, "loss": 0.023, "step": 5435 }, { "epoch": 1.385516597334239, "grad_norm": 1.1806159019470215, "learning_rate": 1.885373373350956e-05, "loss": 0.0164, "step": 5440 }, { "epoch": 1.3867900500891417, "grad_norm": 1.4058042764663696, "learning_rate": 1.8851666201172434e-05, "loss": 0.0187, "step": 5445 }, { "epoch": 1.3880635028440445, "grad_norm": 1.9052833318710327, "learning_rate": 1.8849596919475412e-05, "loss": 0.0258, "step": 5450 }, { "epoch": 1.3893369555989472, "grad_norm": 1.2304062843322754, "learning_rate": 1.884752588882745e-05, "loss": 0.0239, "step": 5455 }, { "epoch": 1.39061040835385, "grad_norm": 1.2426061630249023, "learning_rate": 1.8845453109637847e-05, "loss": 0.0177, "step": 5460 }, { "epoch": 1.3918838611087527, "grad_norm": 1.273996114730835, "learning_rate": 1.8843378582316245e-05, "loss": 0.0241, "step": 5465 }, { "epoch": 1.3931573138636557, "grad_norm": 1.5751532316207886, "learning_rate": 1.8841302307272634e-05, "loss": 0.0318, "step": 5470 }, { "epoch": 1.3944307666185585, "grad_norm": 1.6480339765548706, "learning_rate": 1.8839224284917353e-05, "loss": 0.0203, "step": 5475 }, { "epoch": 1.3957042193734612, "grad_norm": 1.437923789024353, "learning_rate": 1.8837144515661085e-05, "loss": 0.0241, "step": 5480 }, { "epoch": 1.396977672128364, "grad_norm": 1.5244892835617065, "learning_rate": 1.8835062999914843e-05, "loss": 0.0233, "step": 5485 }, { "epoch": 1.3982511248832667, "grad_norm": 0.7699630260467529, "learning_rate": 1.8832979738090012e-05, "loss": 0.0253, "step": 5490 }, { "epoch": 1.3995245776381697, "grad_norm": 1.256838321685791, "learning_rate": 1.8830894730598307e-05, "loss": 0.018, "step": 5495 }, { "epoch": 1.4007980303930725, "grad_norm": 0.8181470632553101, "learning_rate": 1.8828807977851776e-05, "loss": 0.0151, "step": 5500 }, { "epoch": 1.4020714831479753, "grad_norm": 1.5273463726043701, "learning_rate": 1.8826719480262842e-05, "loss": 0.0273, "step": 5505 }, { "epoch": 1.403344935902878, "grad_norm": 1.1422758102416992, "learning_rate": 1.8824629238244248e-05, "loss": 0.0239, "step": 5510 }, { "epoch": 1.4046183886577808, "grad_norm": 0.9546235799789429, "learning_rate": 1.8822537252209087e-05, "loss": 0.0237, "step": 5515 }, { "epoch": 1.4058918414126835, "grad_norm": 1.6181542873382568, "learning_rate": 1.8820443522570805e-05, "loss": 0.0252, "step": 5520 }, { "epoch": 1.4071652941675863, "grad_norm": 1.0578491687774658, "learning_rate": 1.8818348049743186e-05, "loss": 0.0288, "step": 5525 }, { "epoch": 1.408438746922489, "grad_norm": 1.29648756980896, "learning_rate": 1.8816250834140355e-05, "loss": 0.0242, "step": 5530 }, { "epoch": 1.409712199677392, "grad_norm": 0.8099473118782043, "learning_rate": 1.881415187617679e-05, "loss": 0.0186, "step": 5535 }, { "epoch": 1.4109856524322948, "grad_norm": 0.5774282217025757, "learning_rate": 1.8812051176267307e-05, "loss": 0.0283, "step": 5540 }, { "epoch": 1.4122591051871975, "grad_norm": 1.586630940437317, "learning_rate": 1.880994873482707e-05, "loss": 0.0225, "step": 5545 }, { "epoch": 1.4135325579421003, "grad_norm": 0.9706496596336365, "learning_rate": 1.880784455227159e-05, "loss": 0.0209, "step": 5550 }, { "epoch": 1.414806010697003, "grad_norm": 1.8703625202178955, "learning_rate": 1.8805738629016708e-05, "loss": 0.0238, "step": 5555 }, { "epoch": 1.416079463451906, "grad_norm": 1.1969608068466187, "learning_rate": 1.880363096547863e-05, "loss": 0.0219, "step": 5560 }, { "epoch": 1.4173529162068088, "grad_norm": 1.500819444656372, "learning_rate": 1.8801521562073886e-05, "loss": 0.0332, "step": 5565 }, { "epoch": 1.4186263689617116, "grad_norm": 1.2063051462173462, "learning_rate": 1.879941041921936e-05, "loss": 0.0218, "step": 5570 }, { "epoch": 1.4198998217166143, "grad_norm": 1.921568512916565, "learning_rate": 1.8797297537332283e-05, "loss": 0.0277, "step": 5575 }, { "epoch": 1.421173274471517, "grad_norm": 1.2881320714950562, "learning_rate": 1.879518291683022e-05, "loss": 0.0284, "step": 5580 }, { "epoch": 1.4224467272264198, "grad_norm": 1.7128103971481323, "learning_rate": 1.8793066558131085e-05, "loss": 0.0266, "step": 5585 }, { "epoch": 1.4237201799813226, "grad_norm": 0.8883779644966125, "learning_rate": 1.8790948461653144e-05, "loss": 0.0247, "step": 5590 }, { "epoch": 1.4249936327362254, "grad_norm": 0.8467413783073425, "learning_rate": 1.8788828627814985e-05, "loss": 0.0187, "step": 5595 }, { "epoch": 1.4262670854911284, "grad_norm": 1.1781072616577148, "learning_rate": 1.8786707057035556e-05, "loss": 0.0167, "step": 5600 }, { "epoch": 1.427540538246031, "grad_norm": 1.536344289779663, "learning_rate": 1.8784583749734147e-05, "loss": 0.0225, "step": 5605 }, { "epoch": 1.4288139910009339, "grad_norm": 1.541422724723816, "learning_rate": 1.878245870633039e-05, "loss": 0.0318, "step": 5610 }, { "epoch": 1.4300874437558366, "grad_norm": 1.1874114274978638, "learning_rate": 1.8780331927244255e-05, "loss": 0.029, "step": 5615 }, { "epoch": 1.4313608965107394, "grad_norm": 0.8654482364654541, "learning_rate": 1.877820341289606e-05, "loss": 0.0184, "step": 5620 }, { "epoch": 1.4326343492656424, "grad_norm": 1.6417146921157837, "learning_rate": 1.8776073163706462e-05, "loss": 0.024, "step": 5625 }, { "epoch": 1.4339078020205451, "grad_norm": 0.943899393081665, "learning_rate": 1.8773941180096466e-05, "loss": 0.0249, "step": 5630 }, { "epoch": 1.435181254775448, "grad_norm": 0.9967460036277771, "learning_rate": 1.8771807462487417e-05, "loss": 0.0251, "step": 5635 }, { "epoch": 1.4364547075303506, "grad_norm": 1.344178318977356, "learning_rate": 1.8769672011301e-05, "loss": 0.0232, "step": 5640 }, { "epoch": 1.4377281602852534, "grad_norm": 1.4939885139465332, "learning_rate": 1.876753482695925e-05, "loss": 0.0246, "step": 5645 }, { "epoch": 1.4390016130401562, "grad_norm": 1.1837810277938843, "learning_rate": 1.8765395909884535e-05, "loss": 0.0182, "step": 5650 }, { "epoch": 1.440275065795059, "grad_norm": 1.260888695716858, "learning_rate": 1.8763255260499576e-05, "loss": 0.0217, "step": 5655 }, { "epoch": 1.4415485185499617, "grad_norm": 1.7025341987609863, "learning_rate": 1.8761112879227428e-05, "loss": 0.0213, "step": 5660 }, { "epoch": 1.4428219713048647, "grad_norm": 1.166237711906433, "learning_rate": 1.875896876649149e-05, "loss": 0.0246, "step": 5665 }, { "epoch": 1.4440954240597674, "grad_norm": 0.7954833507537842, "learning_rate": 1.8756822922715504e-05, "loss": 0.0184, "step": 5670 }, { "epoch": 1.4453688768146702, "grad_norm": 1.1551620960235596, "learning_rate": 1.875467534832356e-05, "loss": 0.0261, "step": 5675 }, { "epoch": 1.446642329569573, "grad_norm": 1.1532433032989502, "learning_rate": 1.8752526043740075e-05, "loss": 0.0202, "step": 5680 }, { "epoch": 1.4479157823244757, "grad_norm": 0.8227391839027405, "learning_rate": 1.8750375009389822e-05, "loss": 0.024, "step": 5685 }, { "epoch": 1.4491892350793787, "grad_norm": 0.6152703166007996, "learning_rate": 1.8748222245697913e-05, "loss": 0.0219, "step": 5690 }, { "epoch": 1.4504626878342814, "grad_norm": 1.1339900493621826, "learning_rate": 1.8746067753089797e-05, "loss": 0.0243, "step": 5695 }, { "epoch": 1.4517361405891842, "grad_norm": 0.723228931427002, "learning_rate": 1.874391153199127e-05, "loss": 0.0201, "step": 5700 }, { "epoch": 1.453009593344087, "grad_norm": 2.4031543731689453, "learning_rate": 1.874175358282847e-05, "loss": 0.0285, "step": 5705 }, { "epoch": 1.4542830460989897, "grad_norm": 0.9625338315963745, "learning_rate": 1.8739593906027864e-05, "loss": 0.0323, "step": 5710 }, { "epoch": 1.4555564988538925, "grad_norm": 1.7577120065689087, "learning_rate": 1.8737432502016277e-05, "loss": 0.0244, "step": 5715 }, { "epoch": 1.4568299516087952, "grad_norm": 1.5195587873458862, "learning_rate": 1.873526937122087e-05, "loss": 0.0243, "step": 5720 }, { "epoch": 1.458103404363698, "grad_norm": 0.7995648384094238, "learning_rate": 1.873310451406914e-05, "loss": 0.018, "step": 5725 }, { "epoch": 1.4593768571186008, "grad_norm": 1.2406035661697388, "learning_rate": 1.8730937930988927e-05, "loss": 0.02, "step": 5730 }, { "epoch": 1.4606503098735037, "grad_norm": 1.281814694404602, "learning_rate": 1.8728769622408423e-05, "loss": 0.03, "step": 5735 }, { "epoch": 1.4619237626284065, "grad_norm": 1.1910829544067383, "learning_rate": 1.8726599588756144e-05, "loss": 0.0203, "step": 5740 }, { "epoch": 1.4631972153833093, "grad_norm": 1.0027446746826172, "learning_rate": 1.8724427830460958e-05, "loss": 0.0197, "step": 5745 }, { "epoch": 1.464470668138212, "grad_norm": 1.7947478294372559, "learning_rate": 1.8722254347952068e-05, "loss": 0.0263, "step": 5750 }, { "epoch": 1.4657441208931148, "grad_norm": 0.9373995661735535, "learning_rate": 1.8720079141659027e-05, "loss": 0.0187, "step": 5755 }, { "epoch": 1.4670175736480178, "grad_norm": 0.7823398113250732, "learning_rate": 1.8717902212011714e-05, "loss": 0.0196, "step": 5760 }, { "epoch": 1.4682910264029205, "grad_norm": 0.985473096370697, "learning_rate": 1.8715723559440363e-05, "loss": 0.0212, "step": 5765 }, { "epoch": 1.4695644791578233, "grad_norm": 1.1476221084594727, "learning_rate": 1.871354318437554e-05, "loss": 0.0226, "step": 5770 }, { "epoch": 1.470837931912726, "grad_norm": 1.1522365808486938, "learning_rate": 1.8711361087248153e-05, "loss": 0.0204, "step": 5775 }, { "epoch": 1.4721113846676288, "grad_norm": 1.4182112216949463, "learning_rate": 1.8709177268489455e-05, "loss": 0.0234, "step": 5780 }, { "epoch": 1.4733848374225316, "grad_norm": 1.8470662832260132, "learning_rate": 1.870699172853103e-05, "loss": 0.0241, "step": 5785 }, { "epoch": 1.4746582901774343, "grad_norm": 1.592654824256897, "learning_rate": 1.8704804467804813e-05, "loss": 0.028, "step": 5790 }, { "epoch": 1.475931742932337, "grad_norm": 1.3710384368896484, "learning_rate": 1.870261548674307e-05, "loss": 0.0214, "step": 5795 }, { "epoch": 1.47720519568724, "grad_norm": 1.738387107849121, "learning_rate": 1.8700424785778408e-05, "loss": 0.0261, "step": 5800 }, { "epoch": 1.4784786484421428, "grad_norm": 1.8522320985794067, "learning_rate": 1.8698232365343783e-05, "loss": 0.0257, "step": 5805 }, { "epoch": 1.4797521011970456, "grad_norm": 1.6947803497314453, "learning_rate": 1.8696038225872478e-05, "loss": 0.0209, "step": 5810 }, { "epoch": 1.4810255539519483, "grad_norm": 0.6898067593574524, "learning_rate": 1.869384236779813e-05, "loss": 0.0282, "step": 5815 }, { "epoch": 1.482299006706851, "grad_norm": 1.309984803199768, "learning_rate": 1.8691644791554698e-05, "loss": 0.0241, "step": 5820 }, { "epoch": 1.483572459461754, "grad_norm": 1.1833584308624268, "learning_rate": 1.86894454975765e-05, "loss": 0.0248, "step": 5825 }, { "epoch": 1.4848459122166568, "grad_norm": 1.5719469785690308, "learning_rate": 1.8687244486298173e-05, "loss": 0.0259, "step": 5830 }, { "epoch": 1.4861193649715596, "grad_norm": 1.4268513917922974, "learning_rate": 1.8685041758154716e-05, "loss": 0.0196, "step": 5835 }, { "epoch": 1.4873928177264624, "grad_norm": 1.5658396482467651, "learning_rate": 1.8682837313581444e-05, "loss": 0.0234, "step": 5840 }, { "epoch": 1.4886662704813651, "grad_norm": 1.309866189956665, "learning_rate": 1.8680631153014032e-05, "loss": 0.0219, "step": 5845 }, { "epoch": 1.4899397232362679, "grad_norm": 1.4578948020935059, "learning_rate": 1.8678423276888477e-05, "loss": 0.0239, "step": 5850 }, { "epoch": 1.4912131759911706, "grad_norm": 1.093474268913269, "learning_rate": 1.867621368564113e-05, "loss": 0.0263, "step": 5855 }, { "epoch": 1.4924866287460734, "grad_norm": 1.2476252317428589, "learning_rate": 1.867400237970867e-05, "loss": 0.0223, "step": 5860 }, { "epoch": 1.4937600815009764, "grad_norm": 1.9275285005569458, "learning_rate": 1.8671789359528118e-05, "loss": 0.0252, "step": 5865 }, { "epoch": 1.4950335342558791, "grad_norm": 1.5439419746398926, "learning_rate": 1.8669574625536837e-05, "loss": 0.025, "step": 5870 }, { "epoch": 1.496306987010782, "grad_norm": 1.44623863697052, "learning_rate": 1.8667358178172525e-05, "loss": 0.0349, "step": 5875 }, { "epoch": 1.4975804397656847, "grad_norm": 1.1004509925842285, "learning_rate": 1.866514001787322e-05, "loss": 0.0211, "step": 5880 }, { "epoch": 1.4988538925205874, "grad_norm": 1.0976898670196533, "learning_rate": 1.8662920145077298e-05, "loss": 0.0268, "step": 5885 }, { "epoch": 1.5001273452754904, "grad_norm": 1.2302378416061401, "learning_rate": 1.866069856022347e-05, "loss": 0.0213, "step": 5890 }, { "epoch": 1.5014007980303932, "grad_norm": 1.7920118570327759, "learning_rate": 1.8658475263750796e-05, "loss": 0.0206, "step": 5895 }, { "epoch": 1.502674250785296, "grad_norm": 0.9816076159477234, "learning_rate": 1.8656250256098666e-05, "loss": 0.0214, "step": 5900 }, { "epoch": 1.5039477035401987, "grad_norm": 1.5619319677352905, "learning_rate": 1.86540235377068e-05, "loss": 0.0237, "step": 5905 }, { "epoch": 1.5052211562951014, "grad_norm": 0.8891597986221313, "learning_rate": 1.8651795109015278e-05, "loss": 0.0201, "step": 5910 }, { "epoch": 1.5064946090500042, "grad_norm": 1.5431758165359497, "learning_rate": 1.8649564970464503e-05, "loss": 0.0232, "step": 5915 }, { "epoch": 1.507768061804907, "grad_norm": 0.9594223499298096, "learning_rate": 1.8647333122495212e-05, "loss": 0.0267, "step": 5920 }, { "epoch": 1.5090415145598097, "grad_norm": 1.7949880361557007, "learning_rate": 1.8645099565548496e-05, "loss": 0.0275, "step": 5925 }, { "epoch": 1.5103149673147125, "grad_norm": 1.2860842943191528, "learning_rate": 1.8642864300065767e-05, "loss": 0.0204, "step": 5930 }, { "epoch": 1.5115884200696155, "grad_norm": 0.8846557140350342, "learning_rate": 1.864062732648878e-05, "loss": 0.023, "step": 5935 }, { "epoch": 1.5128618728245182, "grad_norm": 0.9746060967445374, "learning_rate": 1.8638388645259638e-05, "loss": 0.0246, "step": 5940 }, { "epoch": 1.514135325579421, "grad_norm": 2.010822057723999, "learning_rate": 1.8636148256820763e-05, "loss": 0.0311, "step": 5945 }, { "epoch": 1.5154087783343237, "grad_norm": 1.533766746520996, "learning_rate": 1.8633906161614934e-05, "loss": 0.0319, "step": 5950 }, { "epoch": 1.5166822310892267, "grad_norm": 1.7310080528259277, "learning_rate": 1.863166236008525e-05, "loss": 0.0218, "step": 5955 }, { "epoch": 1.5179556838441295, "grad_norm": 0.8189731240272522, "learning_rate": 1.862941685267516e-05, "loss": 0.0242, "step": 5960 }, { "epoch": 1.5192291365990322, "grad_norm": 1.1442556381225586, "learning_rate": 1.8627169639828443e-05, "loss": 0.0231, "step": 5965 }, { "epoch": 1.520502589353935, "grad_norm": 1.1481983661651611, "learning_rate": 1.8624920721989214e-05, "loss": 0.0214, "step": 5970 }, { "epoch": 1.5217760421088378, "grad_norm": 0.8838867545127869, "learning_rate": 1.862267009960193e-05, "loss": 0.0202, "step": 5975 }, { "epoch": 1.5230494948637405, "grad_norm": 1.3259570598602295, "learning_rate": 1.8620417773111384e-05, "loss": 0.02, "step": 5980 }, { "epoch": 1.5243229476186433, "grad_norm": 1.2382099628448486, "learning_rate": 1.8618163742962708e-05, "loss": 0.0237, "step": 5985 }, { "epoch": 1.525596400373546, "grad_norm": 0.9390523433685303, "learning_rate": 1.8615908009601357e-05, "loss": 0.0194, "step": 5990 }, { "epoch": 1.5268698531284488, "grad_norm": 1.209328293800354, "learning_rate": 1.8613650573473144e-05, "loss": 0.0265, "step": 5995 }, { "epoch": 1.5281433058833516, "grad_norm": 1.1630326509475708, "learning_rate": 1.8611391435024197e-05, "loss": 0.0314, "step": 6000 }, { "epoch": 1.5294167586382545, "grad_norm": 1.720635175704956, "learning_rate": 1.8609130594701002e-05, "loss": 0.0293, "step": 6005 }, { "epoch": 1.5306902113931573, "grad_norm": 1.1176849603652954, "learning_rate": 1.860686805295036e-05, "loss": 0.029, "step": 6010 }, { "epoch": 1.53196366414806, "grad_norm": 1.0458412170410156, "learning_rate": 1.8604603810219425e-05, "loss": 0.0193, "step": 6015 }, { "epoch": 1.533237116902963, "grad_norm": 1.0760481357574463, "learning_rate": 1.8602337866955678e-05, "loss": 0.0213, "step": 6020 }, { "epoch": 1.5345105696578658, "grad_norm": 1.7129069566726685, "learning_rate": 1.860007022360694e-05, "loss": 0.0137, "step": 6025 }, { "epoch": 1.5357840224127686, "grad_norm": 1.0682419538497925, "learning_rate": 1.8597800880621365e-05, "loss": 0.0182, "step": 6030 }, { "epoch": 1.5370574751676713, "grad_norm": 0.8999855518341064, "learning_rate": 1.8595529838447443e-05, "loss": 0.0256, "step": 6035 }, { "epoch": 1.538330927922574, "grad_norm": 1.2589900493621826, "learning_rate": 1.8593257097534007e-05, "loss": 0.025, "step": 6040 }, { "epoch": 1.5396043806774768, "grad_norm": 1.0684521198272705, "learning_rate": 1.8590982658330218e-05, "loss": 0.0229, "step": 6045 }, { "epoch": 1.5408778334323796, "grad_norm": 2.1269338130950928, "learning_rate": 1.858870652128557e-05, "loss": 0.0227, "step": 6050 }, { "epoch": 1.5421512861872824, "grad_norm": 1.4520543813705444, "learning_rate": 1.8586428686849905e-05, "loss": 0.025, "step": 6055 }, { "epoch": 1.5434247389421851, "grad_norm": 1.808072566986084, "learning_rate": 1.8584149155473383e-05, "loss": 0.0362, "step": 6060 }, { "epoch": 1.5446981916970879, "grad_norm": 1.511757254600525, "learning_rate": 1.8581867927606518e-05, "loss": 0.0253, "step": 6065 }, { "epoch": 1.5459716444519909, "grad_norm": 0.7824207544326782, "learning_rate": 1.8579585003700148e-05, "loss": 0.0174, "step": 6070 }, { "epoch": 1.5472450972068936, "grad_norm": 0.8727839589118958, "learning_rate": 1.8577300384205447e-05, "loss": 0.027, "step": 6075 }, { "epoch": 1.5485185499617964, "grad_norm": 1.2028013467788696, "learning_rate": 1.8575014069573923e-05, "loss": 0.0266, "step": 6080 }, { "epoch": 1.5497920027166994, "grad_norm": 1.6659213304519653, "learning_rate": 1.857272606025743e-05, "loss": 0.0233, "step": 6085 }, { "epoch": 1.5510654554716021, "grad_norm": 1.070666790008545, "learning_rate": 1.857043635670814e-05, "loss": 0.0233, "step": 6090 }, { "epoch": 1.5523389082265049, "grad_norm": 1.1199408769607544, "learning_rate": 1.8568144959378572e-05, "loss": 0.0257, "step": 6095 }, { "epoch": 1.5536123609814076, "grad_norm": 1.2268332242965698, "learning_rate": 1.856585186872158e-05, "loss": 0.0245, "step": 6100 }, { "epoch": 1.5548858137363104, "grad_norm": 1.456573247909546, "learning_rate": 1.856355708519034e-05, "loss": 0.0338, "step": 6105 }, { "epoch": 1.5561592664912132, "grad_norm": 1.2423803806304932, "learning_rate": 1.856126060923838e-05, "loss": 0.0237, "step": 6110 }, { "epoch": 1.557432719246116, "grad_norm": 0.6702473163604736, "learning_rate": 1.8558962441319547e-05, "loss": 0.0234, "step": 6115 }, { "epoch": 1.5587061720010187, "grad_norm": 1.5024490356445312, "learning_rate": 1.8556662581888037e-05, "loss": 0.0205, "step": 6120 }, { "epoch": 1.5599796247559214, "grad_norm": 1.4882538318634033, "learning_rate": 1.8554361031398366e-05, "loss": 0.0278, "step": 6125 }, { "epoch": 1.5612530775108242, "grad_norm": 1.4641661643981934, "learning_rate": 1.855205779030539e-05, "loss": 0.026, "step": 6130 }, { "epoch": 1.5625265302657272, "grad_norm": 1.6282750368118286, "learning_rate": 1.8549752859064303e-05, "loss": 0.0209, "step": 6135 }, { "epoch": 1.56379998302063, "grad_norm": 1.5002096891403198, "learning_rate": 1.8547446238130632e-05, "loss": 0.0202, "step": 6140 }, { "epoch": 1.5650734357755327, "grad_norm": 1.7381964921951294, "learning_rate": 1.8545137927960234e-05, "loss": 0.0269, "step": 6145 }, { "epoch": 1.5663468885304355, "grad_norm": 1.4147193431854248, "learning_rate": 1.85428279290093e-05, "loss": 0.0273, "step": 6150 }, { "epoch": 1.5676203412853384, "grad_norm": 1.8497728109359741, "learning_rate": 1.854051624173436e-05, "loss": 0.0285, "step": 6155 }, { "epoch": 1.5688937940402412, "grad_norm": 1.6840671300888062, "learning_rate": 1.8538202866592266e-05, "loss": 0.0237, "step": 6160 }, { "epoch": 1.570167246795144, "grad_norm": 1.3586578369140625, "learning_rate": 1.8535887804040222e-05, "loss": 0.0279, "step": 6165 }, { "epoch": 1.5714406995500467, "grad_norm": 1.6926907300949097, "learning_rate": 1.8533571054535748e-05, "loss": 0.0268, "step": 6170 }, { "epoch": 1.5727141523049495, "grad_norm": 1.1809699535369873, "learning_rate": 1.8531252618536704e-05, "loss": 0.0255, "step": 6175 }, { "epoch": 1.5739876050598522, "grad_norm": 1.0481526851654053, "learning_rate": 1.8528932496501293e-05, "loss": 0.0225, "step": 6180 }, { "epoch": 1.575261057814755, "grad_norm": 0.8398872017860413, "learning_rate": 1.852661068888803e-05, "loss": 0.0271, "step": 6185 }, { "epoch": 1.5765345105696578, "grad_norm": 1.3476440906524658, "learning_rate": 1.8524287196155784e-05, "loss": 0.0238, "step": 6190 }, { "epoch": 1.5778079633245605, "grad_norm": 1.9240022897720337, "learning_rate": 1.8521962018763748e-05, "loss": 0.0297, "step": 6195 }, { "epoch": 1.5790814160794635, "grad_norm": 1.029764175415039, "learning_rate": 1.8519635157171443e-05, "loss": 0.0311, "step": 6200 }, { "epoch": 1.5803548688343663, "grad_norm": 1.213796854019165, "learning_rate": 1.851730661183873e-05, "loss": 0.0291, "step": 6205 }, { "epoch": 1.581628321589269, "grad_norm": 1.407727599143982, "learning_rate": 1.85149763832258e-05, "loss": 0.0304, "step": 6210 }, { "epoch": 1.5829017743441718, "grad_norm": 0.8989773392677307, "learning_rate": 1.851264447179318e-05, "loss": 0.0266, "step": 6215 }, { "epoch": 1.5841752270990748, "grad_norm": 1.725836992263794, "learning_rate": 1.8510310878001724e-05, "loss": 0.0264, "step": 6220 }, { "epoch": 1.5854486798539775, "grad_norm": 1.48646080493927, "learning_rate": 1.8507975602312628e-05, "loss": 0.0214, "step": 6225 }, { "epoch": 1.5867221326088803, "grad_norm": 0.8131179809570312, "learning_rate": 1.8505638645187403e-05, "loss": 0.017, "step": 6230 }, { "epoch": 1.587995585363783, "grad_norm": 1.264231562614441, "learning_rate": 1.850330000708791e-05, "loss": 0.026, "step": 6235 }, { "epoch": 1.5892690381186858, "grad_norm": 1.4154671430587769, "learning_rate": 1.8500959688476337e-05, "loss": 0.0305, "step": 6240 }, { "epoch": 1.5905424908735886, "grad_norm": 1.1607767343521118, "learning_rate": 1.84986176898152e-05, "loss": 0.0277, "step": 6245 }, { "epoch": 1.5918159436284913, "grad_norm": 0.8372571468353271, "learning_rate": 1.8496274011567346e-05, "loss": 0.0265, "step": 6250 }, { "epoch": 1.593089396383394, "grad_norm": 1.3373456001281738, "learning_rate": 1.8493928654195965e-05, "loss": 0.0197, "step": 6255 }, { "epoch": 1.5943628491382968, "grad_norm": 1.0512429475784302, "learning_rate": 1.8491581618164566e-05, "loss": 0.0196, "step": 6260 }, { "epoch": 1.5956363018931996, "grad_norm": 1.2518543004989624, "learning_rate": 1.8489232903936997e-05, "loss": 0.0282, "step": 6265 }, { "epoch": 1.5969097546481026, "grad_norm": 1.310891032218933, "learning_rate": 1.8486882511977432e-05, "loss": 0.0228, "step": 6270 }, { "epoch": 1.5981832074030053, "grad_norm": 1.40314781665802, "learning_rate": 1.8484530442750387e-05, "loss": 0.024, "step": 6275 }, { "epoch": 1.599456660157908, "grad_norm": 0.8866240382194519, "learning_rate": 1.8482176696720695e-05, "loss": 0.0239, "step": 6280 }, { "epoch": 1.600730112912811, "grad_norm": 1.941116452217102, "learning_rate": 1.8479821274353534e-05, "loss": 0.0212, "step": 6285 }, { "epoch": 1.6020035656677138, "grad_norm": 1.0560799837112427, "learning_rate": 1.847746417611441e-05, "loss": 0.022, "step": 6290 }, { "epoch": 1.6032770184226166, "grad_norm": 0.9549133777618408, "learning_rate": 1.847510540246915e-05, "loss": 0.0216, "step": 6295 }, { "epoch": 1.6045504711775194, "grad_norm": 1.9535746574401855, "learning_rate": 1.8472744953883925e-05, "loss": 0.0207, "step": 6300 }, { "epoch": 1.6058239239324221, "grad_norm": 1.1994503736495972, "learning_rate": 1.847038283082523e-05, "loss": 0.0332, "step": 6305 }, { "epoch": 1.6070973766873249, "grad_norm": 1.7314928770065308, "learning_rate": 1.8468019033759893e-05, "loss": 0.0214, "step": 6310 }, { "epoch": 1.6083708294422276, "grad_norm": 1.4588466882705688, "learning_rate": 1.846565356315507e-05, "loss": 0.023, "step": 6315 }, { "epoch": 1.6096442821971304, "grad_norm": 0.8399770855903625, "learning_rate": 1.8463286419478256e-05, "loss": 0.0236, "step": 6320 }, { "epoch": 1.6109177349520332, "grad_norm": 1.7154335975646973, "learning_rate": 1.846091760319727e-05, "loss": 0.0277, "step": 6325 }, { "epoch": 1.612191187706936, "grad_norm": 1.8444459438323975, "learning_rate": 1.8458547114780254e-05, "loss": 0.0262, "step": 6330 }, { "epoch": 1.613464640461839, "grad_norm": 1.2066746950149536, "learning_rate": 1.8456174954695697e-05, "loss": 0.0341, "step": 6335 }, { "epoch": 1.6147380932167417, "grad_norm": 0.8099359273910522, "learning_rate": 1.8453801123412415e-05, "loss": 0.017, "step": 6340 }, { "epoch": 1.6160115459716444, "grad_norm": 1.6393593549728394, "learning_rate": 1.8451425621399538e-05, "loss": 0.0232, "step": 6345 }, { "epoch": 1.6172849987265474, "grad_norm": 1.1385905742645264, "learning_rate": 1.8449048449126542e-05, "loss": 0.0252, "step": 6350 }, { "epoch": 1.6185584514814502, "grad_norm": 1.021136999130249, "learning_rate": 1.8446669607063234e-05, "loss": 0.0209, "step": 6355 }, { "epoch": 1.619831904236353, "grad_norm": 1.4915812015533447, "learning_rate": 1.844428909567974e-05, "loss": 0.0287, "step": 6360 }, { "epoch": 1.6211053569912557, "grad_norm": 1.4841256141662598, "learning_rate": 1.8441906915446525e-05, "loss": 0.0285, "step": 6365 }, { "epoch": 1.6223788097461584, "grad_norm": 1.380545735359192, "learning_rate": 1.8439523066834377e-05, "loss": 0.0258, "step": 6370 }, { "epoch": 1.6236522625010612, "grad_norm": 1.2644317150115967, "learning_rate": 1.843713755031442e-05, "loss": 0.0219, "step": 6375 }, { "epoch": 1.624925715255964, "grad_norm": 0.9924350380897522, "learning_rate": 1.843475036635811e-05, "loss": 0.0171, "step": 6380 }, { "epoch": 1.6261991680108667, "grad_norm": 1.0666592121124268, "learning_rate": 1.8432361515437217e-05, "loss": 0.0204, "step": 6385 }, { "epoch": 1.6274726207657695, "grad_norm": 1.833279013633728, "learning_rate": 1.842997099802386e-05, "loss": 0.0244, "step": 6390 }, { "epoch": 1.6287460735206722, "grad_norm": 0.9659563302993774, "learning_rate": 1.8427578814590473e-05, "loss": 0.0245, "step": 6395 }, { "epoch": 1.6300195262755752, "grad_norm": 1.1397185325622559, "learning_rate": 1.842518496560983e-05, "loss": 0.0254, "step": 6400 }, { "epoch": 1.631292979030478, "grad_norm": 1.5517922639846802, "learning_rate": 1.842278945155502e-05, "loss": 0.026, "step": 6405 }, { "epoch": 1.6325664317853807, "grad_norm": 1.5141046047210693, "learning_rate": 1.8420392272899476e-05, "loss": 0.0254, "step": 6410 }, { "epoch": 1.6338398845402835, "grad_norm": 0.9616473913192749, "learning_rate": 1.8417993430116954e-05, "loss": 0.0248, "step": 6415 }, { "epoch": 1.6351133372951865, "grad_norm": 1.865405559539795, "learning_rate": 1.8415592923681536e-05, "loss": 0.0308, "step": 6420 }, { "epoch": 1.6363867900500892, "grad_norm": 1.4259822368621826, "learning_rate": 1.841319075406764e-05, "loss": 0.0278, "step": 6425 }, { "epoch": 1.637660242804992, "grad_norm": 1.459154725074768, "learning_rate": 1.841078692175e-05, "loss": 0.0288, "step": 6430 }, { "epoch": 1.6389336955598948, "grad_norm": 0.4226120412349701, "learning_rate": 1.8408381427203696e-05, "loss": 0.0269, "step": 6435 }, { "epoch": 1.6402071483147975, "grad_norm": 1.9429343938827515, "learning_rate": 1.8405974270904123e-05, "loss": 0.0211, "step": 6440 }, { "epoch": 1.6414806010697003, "grad_norm": 1.3337866067886353, "learning_rate": 1.8403565453327007e-05, "loss": 0.0258, "step": 6445 }, { "epoch": 1.642754053824603, "grad_norm": 1.6298739910125732, "learning_rate": 1.840115497494841e-05, "loss": 0.029, "step": 6450 }, { "epoch": 1.6440275065795058, "grad_norm": 1.799607515335083, "learning_rate": 1.8398742836244707e-05, "loss": 0.0247, "step": 6455 }, { "epoch": 1.6453009593344086, "grad_norm": 1.2037841081619263, "learning_rate": 1.839632903769262e-05, "loss": 0.0214, "step": 6460 }, { "epoch": 1.6465744120893113, "grad_norm": 1.3110535144805908, "learning_rate": 1.839391357976918e-05, "loss": 0.0219, "step": 6465 }, { "epoch": 1.6478478648442143, "grad_norm": 1.4213957786560059, "learning_rate": 1.8391496462951764e-05, "loss": 0.0183, "step": 6470 }, { "epoch": 1.649121317599117, "grad_norm": 0.8650819659233093, "learning_rate": 1.838907768771806e-05, "loss": 0.0231, "step": 6475 }, { "epoch": 1.6503947703540198, "grad_norm": 2.2444417476654053, "learning_rate": 1.83866572545461e-05, "loss": 0.0228, "step": 6480 }, { "epoch": 1.6516682231089228, "grad_norm": 0.7553511261940002, "learning_rate": 1.8384235163914233e-05, "loss": 0.019, "step": 6485 }, { "epoch": 1.6529416758638256, "grad_norm": 1.1486551761627197, "learning_rate": 1.8381811416301133e-05, "loss": 0.0287, "step": 6490 }, { "epoch": 1.6542151286187283, "grad_norm": 0.7495598196983337, "learning_rate": 1.8379386012185813e-05, "loss": 0.0226, "step": 6495 }, { "epoch": 1.655488581373631, "grad_norm": 1.0411831140518188, "learning_rate": 1.8376958952047606e-05, "loss": 0.0238, "step": 6500 }, { "epoch": 1.6567620341285338, "grad_norm": 1.280569076538086, "learning_rate": 1.8374530236366167e-05, "loss": 0.0295, "step": 6505 }, { "epoch": 1.6580354868834366, "grad_norm": 1.158399224281311, "learning_rate": 1.8372099865621496e-05, "loss": 0.0259, "step": 6510 }, { "epoch": 1.6593089396383394, "grad_norm": 1.0873500108718872, "learning_rate": 1.83696678402939e-05, "loss": 0.0224, "step": 6515 }, { "epoch": 1.6605823923932421, "grad_norm": 1.183156132698059, "learning_rate": 1.836723416086402e-05, "loss": 0.0267, "step": 6520 }, { "epoch": 1.6618558451481449, "grad_norm": 1.1027849912643433, "learning_rate": 1.8364798827812833e-05, "loss": 0.0187, "step": 6525 }, { "epoch": 1.6631292979030476, "grad_norm": 1.0572201013565063, "learning_rate": 1.836236184162163e-05, "loss": 0.017, "step": 6530 }, { "epoch": 1.6644027506579506, "grad_norm": 1.6533379554748535, "learning_rate": 1.8359923202772037e-05, "loss": 0.0186, "step": 6535 }, { "epoch": 1.6656762034128534, "grad_norm": 1.5468223094940186, "learning_rate": 1.8357482911746e-05, "loss": 0.0268, "step": 6540 }, { "epoch": 1.6669496561677561, "grad_norm": 1.8763089179992676, "learning_rate": 1.8355040969025796e-05, "loss": 0.0202, "step": 6545 }, { "epoch": 1.6682231089226591, "grad_norm": 2.319096326828003, "learning_rate": 1.8352597375094032e-05, "loss": 0.0325, "step": 6550 }, { "epoch": 1.6694965616775619, "grad_norm": 1.9518924951553345, "learning_rate": 1.8350152130433632e-05, "loss": 0.0312, "step": 6555 }, { "epoch": 1.6707700144324646, "grad_norm": 0.7690248489379883, "learning_rate": 1.834770523552785e-05, "loss": 0.0202, "step": 6560 }, { "epoch": 1.6720434671873674, "grad_norm": 1.7653558254241943, "learning_rate": 1.8345256690860274e-05, "loss": 0.0194, "step": 6565 }, { "epoch": 1.6733169199422702, "grad_norm": 0.854318380355835, "learning_rate": 1.8342806496914806e-05, "loss": 0.0205, "step": 6570 }, { "epoch": 1.674590372697173, "grad_norm": 1.5310299396514893, "learning_rate": 1.8340354654175677e-05, "loss": 0.0251, "step": 6575 }, { "epoch": 1.6758638254520757, "grad_norm": 1.3955819606781006, "learning_rate": 1.8337901163127455e-05, "loss": 0.0231, "step": 6580 }, { "epoch": 1.6771372782069784, "grad_norm": 1.7867450714111328, "learning_rate": 1.8335446024255015e-05, "loss": 0.0229, "step": 6585 }, { "epoch": 1.6784107309618812, "grad_norm": 1.6994699239730835, "learning_rate": 1.8332989238043575e-05, "loss": 0.0198, "step": 6590 }, { "epoch": 1.679684183716784, "grad_norm": 1.4294517040252686, "learning_rate": 1.8330530804978668e-05, "loss": 0.0286, "step": 6595 }, { "epoch": 1.680957636471687, "grad_norm": 1.2955738306045532, "learning_rate": 1.8328070725546152e-05, "loss": 0.0224, "step": 6600 }, { "epoch": 1.6822310892265897, "grad_norm": 1.4117785692214966, "learning_rate": 1.832560900023222e-05, "loss": 0.0272, "step": 6605 }, { "epoch": 1.6835045419814925, "grad_norm": 0.8185126781463623, "learning_rate": 1.832314562952338e-05, "loss": 0.0226, "step": 6610 }, { "epoch": 1.6847779947363954, "grad_norm": 1.2733162641525269, "learning_rate": 1.8320680613906476e-05, "loss": 0.0254, "step": 6615 }, { "epoch": 1.6860514474912982, "grad_norm": 1.6049821376800537, "learning_rate": 1.8318213953868656e-05, "loss": 0.0291, "step": 6620 }, { "epoch": 1.687324900246201, "grad_norm": 1.246999740600586, "learning_rate": 1.8315745649897424e-05, "loss": 0.0215, "step": 6625 }, { "epoch": 1.6885983530011037, "grad_norm": 0.6692451238632202, "learning_rate": 1.8313275702480583e-05, "loss": 0.0239, "step": 6630 }, { "epoch": 1.6898718057560065, "grad_norm": 1.0274238586425781, "learning_rate": 1.8310804112106276e-05, "loss": 0.0306, "step": 6635 }, { "epoch": 1.6911452585109092, "grad_norm": 1.194385290145874, "learning_rate": 1.8308330879262955e-05, "loss": 0.0181, "step": 6640 }, { "epoch": 1.692418711265812, "grad_norm": 1.0685012340545654, "learning_rate": 1.8305856004439415e-05, "loss": 0.0231, "step": 6645 }, { "epoch": 1.6936921640207148, "grad_norm": 1.4262362718582153, "learning_rate": 1.830337948812477e-05, "loss": 0.0236, "step": 6650 }, { "epoch": 1.6949656167756175, "grad_norm": 1.2602248191833496, "learning_rate": 1.8300901330808442e-05, "loss": 0.0171, "step": 6655 }, { "epoch": 1.6962390695305203, "grad_norm": 1.4441887140274048, "learning_rate": 1.82984215329802e-05, "loss": 0.0312, "step": 6660 }, { "epoch": 1.6975125222854233, "grad_norm": 2.1181857585906982, "learning_rate": 1.829594009513013e-05, "loss": 0.0259, "step": 6665 }, { "epoch": 1.698785975040326, "grad_norm": 0.9508811831474304, "learning_rate": 1.8293457017748633e-05, "loss": 0.0186, "step": 6670 }, { "epoch": 1.7000594277952288, "grad_norm": 1.5141568183898926, "learning_rate": 1.8290972301326447e-05, "loss": 0.0195, "step": 6675 }, { "epoch": 1.7013328805501315, "grad_norm": 1.5596239566802979, "learning_rate": 1.8288485946354623e-05, "loss": 0.0199, "step": 6680 }, { "epoch": 1.7026063333050345, "grad_norm": 1.382277488708496, "learning_rate": 1.8285997953324546e-05, "loss": 0.0205, "step": 6685 }, { "epoch": 1.7038797860599373, "grad_norm": 0.9691487550735474, "learning_rate": 1.8283508322727915e-05, "loss": 0.0261, "step": 6690 }, { "epoch": 1.70515323881484, "grad_norm": 0.5288546681404114, "learning_rate": 1.828101705505676e-05, "loss": 0.0202, "step": 6695 }, { "epoch": 1.7064266915697428, "grad_norm": 1.1831198930740356, "learning_rate": 1.827852415080343e-05, "loss": 0.028, "step": 6700 }, { "epoch": 1.7077001443246456, "grad_norm": 1.004948377609253, "learning_rate": 1.8276029610460598e-05, "loss": 0.0205, "step": 6705 }, { "epoch": 1.7089735970795483, "grad_norm": 1.2932958602905273, "learning_rate": 1.8273533434521262e-05, "loss": 0.0305, "step": 6710 }, { "epoch": 1.710247049834451, "grad_norm": 0.6455768346786499, "learning_rate": 1.8271035623478744e-05, "loss": 0.017, "step": 6715 }, { "epoch": 1.7115205025893538, "grad_norm": 1.0195858478546143, "learning_rate": 1.826853617782669e-05, "loss": 0.0205, "step": 6720 }, { "epoch": 1.7127939553442566, "grad_norm": 0.4957391917705536, "learning_rate": 1.8266035098059066e-05, "loss": 0.0178, "step": 6725 }, { "epoch": 1.7140674080991594, "grad_norm": 0.798285186290741, "learning_rate": 1.8263532384670157e-05, "loss": 0.0247, "step": 6730 }, { "epoch": 1.7153408608540623, "grad_norm": 1.2891851663589478, "learning_rate": 1.826102803815458e-05, "loss": 0.0189, "step": 6735 }, { "epoch": 1.716614313608965, "grad_norm": 1.082024335861206, "learning_rate": 1.8258522059007268e-05, "loss": 0.0245, "step": 6740 }, { "epoch": 1.7178877663638679, "grad_norm": 1.004134178161621, "learning_rate": 1.8256014447723477e-05, "loss": 0.0254, "step": 6745 }, { "epoch": 1.7191612191187708, "grad_norm": 1.5665807723999023, "learning_rate": 1.8253505204798798e-05, "loss": 0.0229, "step": 6750 }, { "epoch": 1.7204346718736736, "grad_norm": 1.103563666343689, "learning_rate": 1.8250994330729126e-05, "loss": 0.0238, "step": 6755 }, { "epoch": 1.7217081246285764, "grad_norm": 2.601050615310669, "learning_rate": 1.8248481826010685e-05, "loss": 0.0295, "step": 6760 }, { "epoch": 1.7229815773834791, "grad_norm": 1.3313089609146118, "learning_rate": 1.8245967691140027e-05, "loss": 0.0217, "step": 6765 }, { "epoch": 1.7242550301383819, "grad_norm": 1.1237602233886719, "learning_rate": 1.824345192661402e-05, "loss": 0.0292, "step": 6770 }, { "epoch": 1.7255284828932846, "grad_norm": 1.2046335935592651, "learning_rate": 1.8240934532929857e-05, "loss": 0.0206, "step": 6775 }, { "epoch": 1.7268019356481874, "grad_norm": 1.4709291458129883, "learning_rate": 1.823841551058505e-05, "loss": 0.0328, "step": 6780 }, { "epoch": 1.7280753884030902, "grad_norm": 1.5220979452133179, "learning_rate": 1.8235894860077437e-05, "loss": 0.0302, "step": 6785 }, { "epoch": 1.729348841157993, "grad_norm": 1.836383581161499, "learning_rate": 1.8233372581905174e-05, "loss": 0.0249, "step": 6790 }, { "epoch": 1.7306222939128957, "grad_norm": 1.4265848398208618, "learning_rate": 1.8230848676566743e-05, "loss": 0.0285, "step": 6795 }, { "epoch": 1.7318957466677987, "grad_norm": 1.3852695226669312, "learning_rate": 1.8228323144560944e-05, "loss": 0.0239, "step": 6800 }, { "epoch": 1.7331691994227014, "grad_norm": 1.6205666065216064, "learning_rate": 1.8225795986386896e-05, "loss": 0.0292, "step": 6805 }, { "epoch": 1.7344426521776042, "grad_norm": 1.6503397226333618, "learning_rate": 1.8223267202544045e-05, "loss": 0.026, "step": 6810 }, { "epoch": 1.7357161049325072, "grad_norm": 1.3755848407745361, "learning_rate": 1.8220736793532156e-05, "loss": 0.0248, "step": 6815 }, { "epoch": 1.73698955768741, "grad_norm": 1.719343662261963, "learning_rate": 1.8218204759851318e-05, "loss": 0.0279, "step": 6820 }, { "epoch": 1.7382630104423127, "grad_norm": 1.174233317375183, "learning_rate": 1.8215671102001937e-05, "loss": 0.0202, "step": 6825 }, { "epoch": 1.7395364631972154, "grad_norm": 1.7808598279953003, "learning_rate": 1.8213135820484735e-05, "loss": 0.022, "step": 6830 }, { "epoch": 1.7408099159521182, "grad_norm": 1.355789303779602, "learning_rate": 1.8210598915800766e-05, "loss": 0.0202, "step": 6835 }, { "epoch": 1.742083368707021, "grad_norm": 1.6282981634140015, "learning_rate": 1.8208060388451403e-05, "loss": 0.027, "step": 6840 }, { "epoch": 1.7433568214619237, "grad_norm": 1.0231965780258179, "learning_rate": 1.820552023893833e-05, "loss": 0.0198, "step": 6845 }, { "epoch": 1.7446302742168265, "grad_norm": 1.749927043914795, "learning_rate": 1.8202978467763562e-05, "loss": 0.0325, "step": 6850 }, { "epoch": 1.7459037269717292, "grad_norm": 0.9259490966796875, "learning_rate": 1.8200435075429434e-05, "loss": 0.0159, "step": 6855 }, { "epoch": 1.747177179726632, "grad_norm": 1.0704177618026733, "learning_rate": 1.8197890062438593e-05, "loss": 0.0263, "step": 6860 }, { "epoch": 1.748450632481535, "grad_norm": 0.8437046408653259, "learning_rate": 1.819534342929401e-05, "loss": 0.0218, "step": 6865 }, { "epoch": 1.7497240852364377, "grad_norm": 0.6589095592498779, "learning_rate": 1.8192795176498985e-05, "loss": 0.0195, "step": 6870 }, { "epoch": 1.7509975379913405, "grad_norm": 0.5547426342964172, "learning_rate": 1.8190245304557124e-05, "loss": 0.0219, "step": 6875 }, { "epoch": 1.7522709907462433, "grad_norm": 0.7897186279296875, "learning_rate": 1.8187693813972363e-05, "loss": 0.0124, "step": 6880 }, { "epoch": 1.7535444435011462, "grad_norm": 1.0272847414016724, "learning_rate": 1.818514070524895e-05, "loss": 0.0269, "step": 6885 }, { "epoch": 1.754817896256049, "grad_norm": 0.9471070766448975, "learning_rate": 1.8182585978891464e-05, "loss": 0.0228, "step": 6890 }, { "epoch": 1.7560913490109518, "grad_norm": 1.3691500425338745, "learning_rate": 1.818002963540479e-05, "loss": 0.0178, "step": 6895 }, { "epoch": 1.7573648017658545, "grad_norm": 1.2658369541168213, "learning_rate": 1.8177471675294148e-05, "loss": 0.0174, "step": 6900 }, { "epoch": 1.7586382545207573, "grad_norm": 1.1216679811477661, "learning_rate": 1.817491209906506e-05, "loss": 0.0185, "step": 6905 }, { "epoch": 1.75991170727566, "grad_norm": 1.3807287216186523, "learning_rate": 1.8172350907223385e-05, "loss": 0.0247, "step": 6910 }, { "epoch": 1.7611851600305628, "grad_norm": 1.1140365600585938, "learning_rate": 1.8169788100275288e-05, "loss": 0.0148, "step": 6915 }, { "epoch": 1.7624586127854656, "grad_norm": 1.0117321014404297, "learning_rate": 1.8167223678727257e-05, "loss": 0.0251, "step": 6920 }, { "epoch": 1.7637320655403683, "grad_norm": 0.5260195732116699, "learning_rate": 1.8164657643086104e-05, "loss": 0.0233, "step": 6925 }, { "epoch": 1.765005518295271, "grad_norm": 0.9684842228889465, "learning_rate": 1.8162089993858956e-05, "loss": 0.0189, "step": 6930 }, { "epoch": 1.766278971050174, "grad_norm": 2.3046741485595703, "learning_rate": 1.8159520731553252e-05, "loss": 0.0232, "step": 6935 }, { "epoch": 1.7675524238050768, "grad_norm": 1.6537145376205444, "learning_rate": 1.8156949856676766e-05, "loss": 0.0294, "step": 6940 }, { "epoch": 1.7688258765599796, "grad_norm": 1.9051313400268555, "learning_rate": 1.8154377369737573e-05, "loss": 0.0222, "step": 6945 }, { "epoch": 1.7700993293148826, "grad_norm": 0.6530911326408386, "learning_rate": 1.8151803271244083e-05, "loss": 0.0229, "step": 6950 }, { "epoch": 1.7713727820697853, "grad_norm": 1.5228102207183838, "learning_rate": 1.8149227561705013e-05, "loss": 0.0221, "step": 6955 }, { "epoch": 1.772646234824688, "grad_norm": 0.7987802028656006, "learning_rate": 1.81466502416294e-05, "loss": 0.0208, "step": 6960 }, { "epoch": 1.7739196875795908, "grad_norm": 1.407116174697876, "learning_rate": 1.8144071311526604e-05, "loss": 0.0236, "step": 6965 }, { "epoch": 1.7751931403344936, "grad_norm": 1.2076177597045898, "learning_rate": 1.81414907719063e-05, "loss": 0.0236, "step": 6970 }, { "epoch": 1.7764665930893964, "grad_norm": 1.3521751165390015, "learning_rate": 1.8138908623278483e-05, "loss": 0.0243, "step": 6975 }, { "epoch": 1.7777400458442991, "grad_norm": 1.3134273290634155, "learning_rate": 1.813632486615346e-05, "loss": 0.0285, "step": 6980 }, { "epoch": 1.7790134985992019, "grad_norm": 1.85429847240448, "learning_rate": 1.8133739501041864e-05, "loss": 0.0213, "step": 6985 }, { "epoch": 1.7802869513541046, "grad_norm": 1.080438494682312, "learning_rate": 1.8131152528454642e-05, "loss": 0.0209, "step": 6990 }, { "epoch": 1.7815604041090074, "grad_norm": 1.937800407409668, "learning_rate": 1.812856394890306e-05, "loss": 0.0233, "step": 6995 }, { "epoch": 1.7828338568639104, "grad_norm": 1.649713158607483, "learning_rate": 1.8125973762898694e-05, "loss": 0.0235, "step": 7000 }, { "epoch": 1.7841073096188131, "grad_norm": 1.0485399961471558, "learning_rate": 1.8123381970953452e-05, "loss": 0.0185, "step": 7005 }, { "epoch": 1.785380762373716, "grad_norm": 1.4346171617507935, "learning_rate": 1.8120788573579546e-05, "loss": 0.0222, "step": 7010 }, { "epoch": 1.7866542151286189, "grad_norm": 1.3830103874206543, "learning_rate": 1.8118193571289516e-05, "loss": 0.02, "step": 7015 }, { "epoch": 1.7879276678835216, "grad_norm": 1.0331827402114868, "learning_rate": 1.8115596964596206e-05, "loss": 0.0236, "step": 7020 }, { "epoch": 1.7892011206384244, "grad_norm": 1.391168236732483, "learning_rate": 1.811299875401279e-05, "loss": 0.025, "step": 7025 }, { "epoch": 1.7904745733933272, "grad_norm": 1.3030999898910522, "learning_rate": 1.8110398940052757e-05, "loss": 0.0203, "step": 7030 }, { "epoch": 1.79174802614823, "grad_norm": 1.1144421100616455, "learning_rate": 1.8107797523229905e-05, "loss": 0.0213, "step": 7035 }, { "epoch": 1.7930214789031327, "grad_norm": 0.7229811549186707, "learning_rate": 1.810519450405835e-05, "loss": 0.0222, "step": 7040 }, { "epoch": 1.7942949316580354, "grad_norm": 1.035588026046753, "learning_rate": 1.8102589883052534e-05, "loss": 0.018, "step": 7045 }, { "epoch": 1.7955683844129382, "grad_norm": 1.5122931003570557, "learning_rate": 1.8099983660727213e-05, "loss": 0.0335, "step": 7050 }, { "epoch": 1.796841837167841, "grad_norm": 1.4510844945907593, "learning_rate": 1.8097375837597446e-05, "loss": 0.0271, "step": 7055 }, { "epoch": 1.7981152899227437, "grad_norm": 1.4161913394927979, "learning_rate": 1.8094766414178625e-05, "loss": 0.0246, "step": 7060 }, { "epoch": 1.7993887426776467, "grad_norm": 1.412573218345642, "learning_rate": 1.8092155390986454e-05, "loss": 0.0229, "step": 7065 }, { "epoch": 1.8006621954325495, "grad_norm": 1.0414832830429077, "learning_rate": 1.808954276853695e-05, "loss": 0.0254, "step": 7070 }, { "epoch": 1.8019356481874522, "grad_norm": 1.289208173751831, "learning_rate": 1.8086928547346437e-05, "loss": 0.0237, "step": 7075 }, { "epoch": 1.8032091009423552, "grad_norm": 1.302159070968628, "learning_rate": 1.8084312727931575e-05, "loss": 0.0231, "step": 7080 }, { "epoch": 1.804482553697258, "grad_norm": 0.7440974712371826, "learning_rate": 1.808169531080933e-05, "loss": 0.0183, "step": 7085 }, { "epoch": 1.8057560064521607, "grad_norm": 0.8840768337249756, "learning_rate": 1.8079076296496984e-05, "loss": 0.0267, "step": 7090 }, { "epoch": 1.8070294592070635, "grad_norm": 1.781747817993164, "learning_rate": 1.807645568551213e-05, "loss": 0.0212, "step": 7095 }, { "epoch": 1.8083029119619662, "grad_norm": 0.9093767404556274, "learning_rate": 1.8073833478372682e-05, "loss": 0.0228, "step": 7100 }, { "epoch": 1.809576364716869, "grad_norm": 1.2678864002227783, "learning_rate": 1.807120967559687e-05, "loss": 0.0199, "step": 7105 }, { "epoch": 1.8108498174717718, "grad_norm": 1.4470880031585693, "learning_rate": 1.806858427770324e-05, "loss": 0.0237, "step": 7110 }, { "epoch": 1.8121232702266745, "grad_norm": 0.880888819694519, "learning_rate": 1.8065957285210642e-05, "loss": 0.0309, "step": 7115 }, { "epoch": 1.8133967229815773, "grad_norm": 1.8777129650115967, "learning_rate": 1.8063328698638257e-05, "loss": 0.0225, "step": 7120 }, { "epoch": 1.81467017573648, "grad_norm": 1.2467365264892578, "learning_rate": 1.8060698518505573e-05, "loss": 0.0212, "step": 7125 }, { "epoch": 1.815943628491383, "grad_norm": 1.1183959245681763, "learning_rate": 1.8058066745332398e-05, "loss": 0.0226, "step": 7130 }, { "epoch": 1.8172170812462858, "grad_norm": 1.4305412769317627, "learning_rate": 1.805543337963884e-05, "loss": 0.0369, "step": 7135 }, { "epoch": 1.8184905340011885, "grad_norm": 1.31026291847229, "learning_rate": 1.8052798421945345e-05, "loss": 0.0265, "step": 7140 }, { "epoch": 1.8197639867560913, "grad_norm": 1.8434633016586304, "learning_rate": 1.805016187277265e-05, "loss": 0.0264, "step": 7145 }, { "epoch": 1.8210374395109943, "grad_norm": 1.3513569831848145, "learning_rate": 1.8047523732641828e-05, "loss": 0.0211, "step": 7150 }, { "epoch": 1.822310892265897, "grad_norm": 1.5636441707611084, "learning_rate": 1.804488400207425e-05, "loss": 0.027, "step": 7155 }, { "epoch": 1.8235843450207998, "grad_norm": 1.1751806735992432, "learning_rate": 1.804224268159161e-05, "loss": 0.0296, "step": 7160 }, { "epoch": 1.8248577977757026, "grad_norm": 1.1618655920028687, "learning_rate": 1.803959977171591e-05, "loss": 0.02, "step": 7165 }, { "epoch": 1.8261312505306053, "grad_norm": 1.1760915517807007, "learning_rate": 1.8036955272969473e-05, "loss": 0.0261, "step": 7170 }, { "epoch": 1.827404703285508, "grad_norm": 0.9220340847969055, "learning_rate": 1.803430918587493e-05, "loss": 0.0199, "step": 7175 }, { "epoch": 1.8286781560404108, "grad_norm": 1.241605281829834, "learning_rate": 1.8031661510955233e-05, "loss": 0.0221, "step": 7180 }, { "epoch": 1.8299516087953136, "grad_norm": 0.9271199107170105, "learning_rate": 1.8029012248733643e-05, "loss": 0.0206, "step": 7185 }, { "epoch": 1.8312250615502164, "grad_norm": 1.0243264436721802, "learning_rate": 1.802636139973373e-05, "loss": 0.0225, "step": 7190 }, { "epoch": 1.8324985143051191, "grad_norm": 1.1771150827407837, "learning_rate": 1.8023708964479388e-05, "loss": 0.0292, "step": 7195 }, { "epoch": 1.833771967060022, "grad_norm": 0.8258827924728394, "learning_rate": 1.802105494349482e-05, "loss": 0.0251, "step": 7200 }, { "epoch": 1.8350454198149249, "grad_norm": 1.463733434677124, "learning_rate": 1.8018399337304536e-05, "loss": 0.0301, "step": 7205 }, { "epoch": 1.8363188725698276, "grad_norm": 1.259400725364685, "learning_rate": 1.801574214643337e-05, "loss": 0.0149, "step": 7210 }, { "epoch": 1.8375923253247306, "grad_norm": 1.09787118434906, "learning_rate": 1.8013083371406463e-05, "loss": 0.0268, "step": 7215 }, { "epoch": 1.8388657780796334, "grad_norm": 1.0874499082565308, "learning_rate": 1.801042301274927e-05, "loss": 0.0224, "step": 7220 }, { "epoch": 1.8401392308345361, "grad_norm": 0.9938299655914307, "learning_rate": 1.800776107098756e-05, "loss": 0.0228, "step": 7225 }, { "epoch": 1.8414126835894389, "grad_norm": 1.344496488571167, "learning_rate": 1.800509754664741e-05, "loss": 0.0226, "step": 7230 }, { "epoch": 1.8426861363443416, "grad_norm": 1.1008360385894775, "learning_rate": 1.8002432440255217e-05, "loss": 0.0195, "step": 7235 }, { "epoch": 1.8439595890992444, "grad_norm": 1.5192662477493286, "learning_rate": 1.799976575233769e-05, "loss": 0.027, "step": 7240 }, { "epoch": 1.8452330418541472, "grad_norm": 1.4185985326766968, "learning_rate": 1.7997097483421845e-05, "loss": 0.0253, "step": 7245 }, { "epoch": 1.84650649460905, "grad_norm": 2.0425257682800293, "learning_rate": 1.7994427634035016e-05, "loss": 0.0259, "step": 7250 }, { "epoch": 1.8477799473639527, "grad_norm": 0.9695214629173279, "learning_rate": 1.799175620470484e-05, "loss": 0.0217, "step": 7255 }, { "epoch": 1.8490534001188554, "grad_norm": 1.3336758613586426, "learning_rate": 1.798908319595928e-05, "loss": 0.0276, "step": 7260 }, { "epoch": 1.8503268528737584, "grad_norm": 1.8092154264450073, "learning_rate": 1.79864086083266e-05, "loss": 0.0344, "step": 7265 }, { "epoch": 1.8516003056286612, "grad_norm": 1.2417622804641724, "learning_rate": 1.7983732442335387e-05, "loss": 0.0307, "step": 7270 }, { "epoch": 1.852873758383564, "grad_norm": 1.5692145824432373, "learning_rate": 1.798105469851453e-05, "loss": 0.0241, "step": 7275 }, { "epoch": 1.854147211138467, "grad_norm": 0.9228546023368835, "learning_rate": 1.7978375377393228e-05, "loss": 0.0268, "step": 7280 }, { "epoch": 1.8554206638933697, "grad_norm": 1.4384911060333252, "learning_rate": 1.7975694479501e-05, "loss": 0.0299, "step": 7285 }, { "epoch": 1.8566941166482724, "grad_norm": 0.8649702072143555, "learning_rate": 1.7973012005367677e-05, "loss": 0.0273, "step": 7290 }, { "epoch": 1.8579675694031752, "grad_norm": 1.5085978507995605, "learning_rate": 1.7970327955523394e-05, "loss": 0.03, "step": 7295 }, { "epoch": 1.859241022158078, "grad_norm": 1.751267910003662, "learning_rate": 1.79676423304986e-05, "loss": 0.0304, "step": 7300 }, { "epoch": 1.8605144749129807, "grad_norm": 1.5766706466674805, "learning_rate": 1.796495513082406e-05, "loss": 0.0213, "step": 7305 }, { "epoch": 1.8617879276678835, "grad_norm": 1.6846559047698975, "learning_rate": 1.7962266357030848e-05, "loss": 0.0268, "step": 7310 }, { "epoch": 1.8630613804227862, "grad_norm": 1.3787713050842285, "learning_rate": 1.7959576009650342e-05, "loss": 0.0204, "step": 7315 }, { "epoch": 1.864334833177689, "grad_norm": 1.8629173040390015, "learning_rate": 1.7956884089214247e-05, "loss": 0.0293, "step": 7320 }, { "epoch": 1.8656082859325918, "grad_norm": 1.7554137706756592, "learning_rate": 1.7954190596254556e-05, "loss": 0.0248, "step": 7325 }, { "epoch": 1.8668817386874947, "grad_norm": 1.7077467441558838, "learning_rate": 1.7951495531303594e-05, "loss": 0.0192, "step": 7330 }, { "epoch": 1.8681551914423975, "grad_norm": 1.631169319152832, "learning_rate": 1.7948798894893986e-05, "loss": 0.0278, "step": 7335 }, { "epoch": 1.8694286441973003, "grad_norm": 0.9570033550262451, "learning_rate": 1.794610068755867e-05, "loss": 0.0314, "step": 7340 }, { "epoch": 1.870702096952203, "grad_norm": 1.5283706188201904, "learning_rate": 1.7943400909830894e-05, "loss": 0.0346, "step": 7345 }, { "epoch": 1.871975549707106, "grad_norm": 1.0224785804748535, "learning_rate": 1.794069956224422e-05, "loss": 0.0218, "step": 7350 }, { "epoch": 1.8732490024620088, "grad_norm": 1.175443172454834, "learning_rate": 1.7937996645332507e-05, "loss": 0.0213, "step": 7355 }, { "epoch": 1.8745224552169115, "grad_norm": 1.7500783205032349, "learning_rate": 1.7935292159629948e-05, "loss": 0.0334, "step": 7360 }, { "epoch": 1.8757959079718143, "grad_norm": 1.5327636003494263, "learning_rate": 1.793258610567102e-05, "loss": 0.0275, "step": 7365 }, { "epoch": 1.877069360726717, "grad_norm": 1.3822015523910522, "learning_rate": 1.7929878483990525e-05, "loss": 0.0255, "step": 7370 }, { "epoch": 1.8783428134816198, "grad_norm": 1.1853119134902954, "learning_rate": 1.7927169295123577e-05, "loss": 0.0248, "step": 7375 }, { "epoch": 1.8796162662365226, "grad_norm": 1.2430839538574219, "learning_rate": 1.7924458539605594e-05, "loss": 0.0268, "step": 7380 }, { "epoch": 1.8808897189914253, "grad_norm": 1.3179031610488892, "learning_rate": 1.7921746217972298e-05, "loss": 0.0239, "step": 7385 }, { "epoch": 1.882163171746328, "grad_norm": 1.2140836715698242, "learning_rate": 1.7919032330759732e-05, "loss": 0.0312, "step": 7390 }, { "epoch": 1.8834366245012308, "grad_norm": 1.3391408920288086, "learning_rate": 1.791631687850424e-05, "loss": 0.0222, "step": 7395 }, { "epoch": 1.8847100772561338, "grad_norm": 1.3449715375900269, "learning_rate": 1.7913599861742475e-05, "loss": 0.0255, "step": 7400 }, { "epoch": 1.8859835300110366, "grad_norm": 0.9960212707519531, "learning_rate": 1.7910881281011413e-05, "loss": 0.031, "step": 7405 }, { "epoch": 1.8872569827659393, "grad_norm": 1.9147775173187256, "learning_rate": 1.7908161136848323e-05, "loss": 0.0351, "step": 7410 }, { "epoch": 1.8885304355208423, "grad_norm": 0.7754691243171692, "learning_rate": 1.7905439429790785e-05, "loss": 0.0166, "step": 7415 }, { "epoch": 1.889803888275745, "grad_norm": 2.4296181201934814, "learning_rate": 1.7902716160376697e-05, "loss": 0.0293, "step": 7420 }, { "epoch": 1.8910773410306478, "grad_norm": 1.4662001132965088, "learning_rate": 1.789999132914426e-05, "loss": 0.0233, "step": 7425 }, { "epoch": 1.8923507937855506, "grad_norm": 1.0428143739700317, "learning_rate": 1.789726493663198e-05, "loss": 0.0244, "step": 7430 }, { "epoch": 1.8936242465404534, "grad_norm": 1.6337813138961792, "learning_rate": 1.789453698337868e-05, "loss": 0.0194, "step": 7435 }, { "epoch": 1.8948976992953561, "grad_norm": 1.974807620048523, "learning_rate": 1.789180746992348e-05, "loss": 0.0299, "step": 7440 }, { "epoch": 1.8961711520502589, "grad_norm": 1.212672472000122, "learning_rate": 1.7889076396805824e-05, "loss": 0.0233, "step": 7445 }, { "epoch": 1.8974446048051616, "grad_norm": 1.4597694873809814, "learning_rate": 1.7886343764565453e-05, "loss": 0.0215, "step": 7450 }, { "epoch": 1.8987180575600644, "grad_norm": 1.3019565343856812, "learning_rate": 1.7883609573742413e-05, "loss": 0.0292, "step": 7455 }, { "epoch": 1.8999915103149672, "grad_norm": 1.785765290260315, "learning_rate": 1.788087382487707e-05, "loss": 0.029, "step": 7460 }, { "epoch": 1.9012649630698701, "grad_norm": 1.6007781028747559, "learning_rate": 1.7878136518510093e-05, "loss": 0.022, "step": 7465 }, { "epoch": 1.902538415824773, "grad_norm": 0.995947539806366, "learning_rate": 1.7875397655182448e-05, "loss": 0.0227, "step": 7470 }, { "epoch": 1.9038118685796757, "grad_norm": 1.4861403703689575, "learning_rate": 1.7872657235435428e-05, "loss": 0.0274, "step": 7475 }, { "epoch": 1.9050853213345786, "grad_norm": 1.6706410646438599, "learning_rate": 1.786991525981062e-05, "loss": 0.0223, "step": 7480 }, { "epoch": 1.9063587740894814, "grad_norm": 3.6364328861236572, "learning_rate": 1.786717172884992e-05, "loss": 0.0238, "step": 7485 }, { "epoch": 1.9076322268443842, "grad_norm": 1.057255506515503, "learning_rate": 1.7864426643095537e-05, "loss": 0.0205, "step": 7490 }, { "epoch": 1.908905679599287, "grad_norm": 1.6496115922927856, "learning_rate": 1.7861680003089984e-05, "loss": 0.0285, "step": 7495 }, { "epoch": 1.9101791323541897, "grad_norm": 1.3797416687011719, "learning_rate": 1.785893180937608e-05, "loss": 0.0267, "step": 7500 }, { "epoch": 1.9114525851090924, "grad_norm": 1.288743495941162, "learning_rate": 1.785618206249695e-05, "loss": 0.0206, "step": 7505 }, { "epoch": 1.9127260378639952, "grad_norm": 1.0903544425964355, "learning_rate": 1.785343076299603e-05, "loss": 0.0208, "step": 7510 }, { "epoch": 1.913999490618898, "grad_norm": 2.1678779125213623, "learning_rate": 1.7850677911417062e-05, "loss": 0.0326, "step": 7515 }, { "epoch": 1.9152729433738007, "grad_norm": 1.3707489967346191, "learning_rate": 1.7847923508304094e-05, "loss": 0.0259, "step": 7520 }, { "epoch": 1.9165463961287035, "grad_norm": 1.1979131698608398, "learning_rate": 1.7845167554201477e-05, "loss": 0.0189, "step": 7525 }, { "epoch": 1.9178198488836065, "grad_norm": 1.3659882545471191, "learning_rate": 1.7842410049653872e-05, "loss": 0.0274, "step": 7530 }, { "epoch": 1.9190933016385092, "grad_norm": 1.6734553575515747, "learning_rate": 1.783965099520625e-05, "loss": 0.0306, "step": 7535 }, { "epoch": 1.920366754393412, "grad_norm": 0.7147119045257568, "learning_rate": 1.7836890391403884e-05, "loss": 0.0216, "step": 7540 }, { "epoch": 1.921640207148315, "grad_norm": 1.5158406496047974, "learning_rate": 1.7834128238792346e-05, "loss": 0.0297, "step": 7545 }, { "epoch": 1.9229136599032177, "grad_norm": 1.6594040393829346, "learning_rate": 1.783136453791753e-05, "loss": 0.0203, "step": 7550 }, { "epoch": 1.9241871126581205, "grad_norm": 1.331146240234375, "learning_rate": 1.782859928932563e-05, "loss": 0.0176, "step": 7555 }, { "epoch": 1.9254605654130232, "grad_norm": 1.5705866813659668, "learning_rate": 1.7825832493563133e-05, "loss": 0.0262, "step": 7560 }, { "epoch": 1.926734018167926, "grad_norm": 1.151930809020996, "learning_rate": 1.7823064151176855e-05, "loss": 0.0259, "step": 7565 }, { "epoch": 1.9280074709228288, "grad_norm": 1.1326842308044434, "learning_rate": 1.7820294262713895e-05, "loss": 0.0251, "step": 7570 }, { "epoch": 1.9292809236777315, "grad_norm": 1.5830457210540771, "learning_rate": 1.781752282872167e-05, "loss": 0.022, "step": 7575 }, { "epoch": 1.9305543764326343, "grad_norm": 1.30061936378479, "learning_rate": 1.7814749849747906e-05, "loss": 0.0209, "step": 7580 }, { "epoch": 1.931827829187537, "grad_norm": 1.4022482633590698, "learning_rate": 1.7811975326340622e-05, "loss": 0.032, "step": 7585 }, { "epoch": 1.9331012819424398, "grad_norm": 0.6149163246154785, "learning_rate": 1.780919925904815e-05, "loss": 0.0231, "step": 7590 }, { "epoch": 1.9343747346973428, "grad_norm": 1.2584731578826904, "learning_rate": 1.7806421648419124e-05, "loss": 0.0237, "step": 7595 }, { "epoch": 1.9356481874522455, "grad_norm": 1.2915430068969727, "learning_rate": 1.7803642495002492e-05, "loss": 0.0184, "step": 7600 }, { "epoch": 1.9369216402071483, "grad_norm": 1.1724607944488525, "learning_rate": 1.7800861799347494e-05, "loss": 0.0226, "step": 7605 }, { "epoch": 1.938195092962051, "grad_norm": 1.099954605102539, "learning_rate": 1.7798079562003678e-05, "loss": 0.0229, "step": 7610 }, { "epoch": 1.939468545716954, "grad_norm": 0.8862691521644592, "learning_rate": 1.7795295783520904e-05, "loss": 0.0233, "step": 7615 }, { "epoch": 1.9407419984718568, "grad_norm": 1.85702645778656, "learning_rate": 1.7792510464449334e-05, "loss": 0.027, "step": 7620 }, { "epoch": 1.9420154512267596, "grad_norm": 1.8711299896240234, "learning_rate": 1.7789723605339427e-05, "loss": 0.0288, "step": 7625 }, { "epoch": 1.9432889039816623, "grad_norm": 0.930159330368042, "learning_rate": 1.778693520674195e-05, "loss": 0.0267, "step": 7630 }, { "epoch": 1.944562356736565, "grad_norm": 1.5911080837249756, "learning_rate": 1.7784145269207983e-05, "loss": 0.0225, "step": 7635 }, { "epoch": 1.9458358094914678, "grad_norm": 1.072916030883789, "learning_rate": 1.7781353793288897e-05, "loss": 0.0174, "step": 7640 }, { "epoch": 1.9471092622463706, "grad_norm": 1.5635690689086914, "learning_rate": 1.7778560779536375e-05, "loss": 0.0292, "step": 7645 }, { "epoch": 1.9483827150012734, "grad_norm": 1.4520618915557861, "learning_rate": 1.77757662285024e-05, "loss": 0.0231, "step": 7650 }, { "epoch": 1.9496561677561761, "grad_norm": 0.9907071590423584, "learning_rate": 1.7772970140739266e-05, "loss": 0.0191, "step": 7655 }, { "epoch": 1.9509296205110789, "grad_norm": 1.7163584232330322, "learning_rate": 1.777017251679956e-05, "loss": 0.025, "step": 7660 }, { "epoch": 1.9522030732659819, "grad_norm": 0.8963913321495056, "learning_rate": 1.776737335723618e-05, "loss": 0.0259, "step": 7665 }, { "epoch": 1.9534765260208846, "grad_norm": 1.507224440574646, "learning_rate": 1.7764572662602324e-05, "loss": 0.0279, "step": 7670 }, { "epoch": 1.9547499787757874, "grad_norm": 0.5537788271903992, "learning_rate": 1.77617704334515e-05, "loss": 0.0177, "step": 7675 }, { "epoch": 1.9560234315306904, "grad_norm": 0.9899587631225586, "learning_rate": 1.7758966670337508e-05, "loss": 0.0208, "step": 7680 }, { "epoch": 1.9572968842855931, "grad_norm": 1.4404184818267822, "learning_rate": 1.7756161373814462e-05, "loss": 0.0273, "step": 7685 }, { "epoch": 1.9585703370404959, "grad_norm": 1.4121865034103394, "learning_rate": 1.7753354544436772e-05, "loss": 0.0245, "step": 7690 }, { "epoch": 1.9598437897953986, "grad_norm": 1.3773221969604492, "learning_rate": 1.775054618275915e-05, "loss": 0.0235, "step": 7695 }, { "epoch": 1.9611172425503014, "grad_norm": 0.9200499653816223, "learning_rate": 1.774773628933662e-05, "loss": 0.0226, "step": 7700 }, { "epoch": 1.9623906953052042, "grad_norm": 0.9141042232513428, "learning_rate": 1.7744924864724503e-05, "loss": 0.0223, "step": 7705 }, { "epoch": 1.963664148060107, "grad_norm": 0.9527604579925537, "learning_rate": 1.774211190947842e-05, "loss": 0.0217, "step": 7710 }, { "epoch": 1.9649376008150097, "grad_norm": 1.2777838706970215, "learning_rate": 1.7739297424154293e-05, "loss": 0.0261, "step": 7715 }, { "epoch": 1.9662110535699124, "grad_norm": 1.174538493156433, "learning_rate": 1.7736481409308357e-05, "loss": 0.0266, "step": 7720 }, { "epoch": 1.9674845063248152, "grad_norm": 1.194578766822815, "learning_rate": 1.773366386549714e-05, "loss": 0.0306, "step": 7725 }, { "epoch": 1.9687579590797182, "grad_norm": 1.1604176759719849, "learning_rate": 1.7730844793277472e-05, "loss": 0.0184, "step": 7730 }, { "epoch": 1.970031411834621, "grad_norm": 1.2383441925048828, "learning_rate": 1.7728024193206493e-05, "loss": 0.0249, "step": 7735 }, { "epoch": 1.9713048645895237, "grad_norm": 0.9952653646469116, "learning_rate": 1.772520206584164e-05, "loss": 0.0232, "step": 7740 }, { "epoch": 1.9725783173444267, "grad_norm": 1.7404839992523193, "learning_rate": 1.7722378411740644e-05, "loss": 0.0326, "step": 7745 }, { "epoch": 1.9738517700993294, "grad_norm": 2.4112231731414795, "learning_rate": 1.7719553231461555e-05, "loss": 0.0294, "step": 7750 }, { "epoch": 1.9751252228542322, "grad_norm": 1.7381013631820679, "learning_rate": 1.7716726525562707e-05, "loss": 0.0315, "step": 7755 }, { "epoch": 1.976398675609135, "grad_norm": 1.02225923538208, "learning_rate": 1.771389829460275e-05, "loss": 0.0267, "step": 7760 }, { "epoch": 1.9776721283640377, "grad_norm": 1.023283839225769, "learning_rate": 1.7711068539140624e-05, "loss": 0.021, "step": 7765 }, { "epoch": 1.9789455811189405, "grad_norm": 1.3114469051361084, "learning_rate": 1.7708237259735578e-05, "loss": 0.0307, "step": 7770 }, { "epoch": 1.9802190338738432, "grad_norm": 1.4453884363174438, "learning_rate": 1.770540445694716e-05, "loss": 0.023, "step": 7775 }, { "epoch": 1.981492486628746, "grad_norm": 1.4767889976501465, "learning_rate": 1.7702570131335217e-05, "loss": 0.0286, "step": 7780 }, { "epoch": 1.9827659393836488, "grad_norm": 1.0959255695343018, "learning_rate": 1.7699734283459897e-05, "loss": 0.0179, "step": 7785 }, { "epoch": 1.9840393921385515, "grad_norm": 2.1368470191955566, "learning_rate": 1.769689691388165e-05, "loss": 0.03, "step": 7790 }, { "epoch": 1.9853128448934545, "grad_norm": 1.6199222803115845, "learning_rate": 1.769405802316123e-05, "loss": 0.0228, "step": 7795 }, { "epoch": 1.9865862976483573, "grad_norm": 1.4632142782211304, "learning_rate": 1.7691217611859688e-05, "loss": 0.0275, "step": 7800 }, { "epoch": 1.98785975040326, "grad_norm": 1.2029310464859009, "learning_rate": 1.7688375680538376e-05, "loss": 0.0231, "step": 7805 }, { "epoch": 1.9891332031581628, "grad_norm": 1.701050043106079, "learning_rate": 1.7685532229758947e-05, "loss": 0.0244, "step": 7810 }, { "epoch": 1.9904066559130658, "grad_norm": 1.176560640335083, "learning_rate": 1.7682687260083347e-05, "loss": 0.0295, "step": 7815 }, { "epoch": 1.9916801086679685, "grad_norm": 1.312325358390808, "learning_rate": 1.767984077207384e-05, "loss": 0.0182, "step": 7820 }, { "epoch": 1.9929535614228713, "grad_norm": 1.0242286920547485, "learning_rate": 1.7676992766292972e-05, "loss": 0.0225, "step": 7825 }, { "epoch": 1.994227014177774, "grad_norm": 1.027729868888855, "learning_rate": 1.7674143243303602e-05, "loss": 0.0293, "step": 7830 }, { "epoch": 1.9955004669326768, "grad_norm": 2.6367831230163574, "learning_rate": 1.7671292203668872e-05, "loss": 0.0285, "step": 7835 }, { "epoch": 1.9967739196875796, "grad_norm": 1.9728991985321045, "learning_rate": 1.7668439647952244e-05, "loss": 0.0243, "step": 7840 }, { "epoch": 1.9980473724424823, "grad_norm": 0.8383434414863586, "learning_rate": 1.766558557671747e-05, "loss": 0.0259, "step": 7845 }, { "epoch": 1.999320825197385, "grad_norm": 1.1861428022384644, "learning_rate": 1.7662729990528594e-05, "loss": 0.0274, "step": 7850 }, { "epoch": 2.000594277952288, "grad_norm": 0.8599772453308105, "learning_rate": 1.765987288994998e-05, "loss": 0.0208, "step": 7855 }, { "epoch": 2.0018677307071906, "grad_norm": 1.3460429906845093, "learning_rate": 1.7657014275546264e-05, "loss": 0.017, "step": 7860 }, { "epoch": 2.0031411834620934, "grad_norm": 1.077890157699585, "learning_rate": 1.7654154147882407e-05, "loss": 0.015, "step": 7865 }, { "epoch": 2.0044146362169966, "grad_norm": 1.0787503719329834, "learning_rate": 1.7651292507523646e-05, "loss": 0.0115, "step": 7870 }, { "epoch": 2.0056880889718993, "grad_norm": 1.3393564224243164, "learning_rate": 1.7648429355035545e-05, "loss": 0.0119, "step": 7875 }, { "epoch": 2.006961541726802, "grad_norm": 1.4248096942901611, "learning_rate": 1.7645564690983936e-05, "loss": 0.0126, "step": 7880 }, { "epoch": 2.008234994481705, "grad_norm": 0.706970751285553, "learning_rate": 1.7642698515934972e-05, "loss": 0.013, "step": 7885 }, { "epoch": 2.0095084472366076, "grad_norm": 1.3820228576660156, "learning_rate": 1.763983083045509e-05, "loss": 0.0135, "step": 7890 }, { "epoch": 2.0107818999915104, "grad_norm": 1.3771398067474365, "learning_rate": 1.763696163511104e-05, "loss": 0.0137, "step": 7895 }, { "epoch": 2.012055352746413, "grad_norm": 0.6278359889984131, "learning_rate": 1.763409093046986e-05, "loss": 0.0114, "step": 7900 }, { "epoch": 2.013328805501316, "grad_norm": 1.7917864322662354, "learning_rate": 1.7631218717098887e-05, "loss": 0.0179, "step": 7905 }, { "epoch": 2.0146022582562186, "grad_norm": 0.6342817544937134, "learning_rate": 1.762834499556576e-05, "loss": 0.0106, "step": 7910 }, { "epoch": 2.0158757110111214, "grad_norm": 1.6018803119659424, "learning_rate": 1.7625469766438414e-05, "loss": 0.0159, "step": 7915 }, { "epoch": 2.017149163766024, "grad_norm": 0.5128383636474609, "learning_rate": 1.762259303028508e-05, "loss": 0.0133, "step": 7920 }, { "epoch": 2.018422616520927, "grad_norm": 1.0018569231033325, "learning_rate": 1.7619714787674293e-05, "loss": 0.0151, "step": 7925 }, { "epoch": 2.0196960692758297, "grad_norm": 2.0253496170043945, "learning_rate": 1.761683503917488e-05, "loss": 0.0144, "step": 7930 }, { "epoch": 2.020969522030733, "grad_norm": 1.037458062171936, "learning_rate": 1.7613953785355965e-05, "loss": 0.0158, "step": 7935 }, { "epoch": 2.0222429747856356, "grad_norm": 0.49814292788505554, "learning_rate": 1.7611071026786975e-05, "loss": 0.0127, "step": 7940 }, { "epoch": 2.0235164275405384, "grad_norm": 0.8783616423606873, "learning_rate": 1.7608186764037634e-05, "loss": 0.018, "step": 7945 }, { "epoch": 2.024789880295441, "grad_norm": 1.1543452739715576, "learning_rate": 1.7605300997677952e-05, "loss": 0.012, "step": 7950 }, { "epoch": 2.026063333050344, "grad_norm": 0.829683244228363, "learning_rate": 1.7602413728278252e-05, "loss": 0.0151, "step": 7955 }, { "epoch": 2.0273367858052467, "grad_norm": 1.181227684020996, "learning_rate": 1.7599524956409145e-05, "loss": 0.0148, "step": 7960 }, { "epoch": 2.0286102385601494, "grad_norm": 0.7017868757247925, "learning_rate": 1.7596634682641535e-05, "loss": 0.0082, "step": 7965 }, { "epoch": 2.029883691315052, "grad_norm": 0.9979165196418762, "learning_rate": 1.7593742907546637e-05, "loss": 0.0159, "step": 7970 }, { "epoch": 2.031157144069955, "grad_norm": 0.6710853576660156, "learning_rate": 1.759084963169595e-05, "loss": 0.0115, "step": 7975 }, { "epoch": 2.0324305968248577, "grad_norm": 1.736250400543213, "learning_rate": 1.7587954855661275e-05, "loss": 0.016, "step": 7980 }, { "epoch": 2.0337040495797605, "grad_norm": 0.7714343667030334, "learning_rate": 1.7585058580014705e-05, "loss": 0.0198, "step": 7985 }, { "epoch": 2.0349775023346632, "grad_norm": 1.0195940732955933, "learning_rate": 1.7582160805328636e-05, "loss": 0.016, "step": 7990 }, { "epoch": 2.036250955089566, "grad_norm": 0.8934540748596191, "learning_rate": 1.7579261532175757e-05, "loss": 0.0132, "step": 7995 }, { "epoch": 2.0375244078444688, "grad_norm": 1.505995512008667, "learning_rate": 1.757636076112905e-05, "loss": 0.014, "step": 8000 }, { "epoch": 2.038797860599372, "grad_norm": 1.135851502418518, "learning_rate": 1.7573458492761802e-05, "loss": 0.0135, "step": 8005 }, { "epoch": 2.0400713133542747, "grad_norm": 0.4444623291492462, "learning_rate": 1.757055472764758e-05, "loss": 0.0097, "step": 8010 }, { "epoch": 2.0413447661091775, "grad_norm": 1.4772287607192993, "learning_rate": 1.7567649466360266e-05, "loss": 0.0154, "step": 8015 }, { "epoch": 2.0426182188640802, "grad_norm": 1.251905083656311, "learning_rate": 1.7564742709474027e-05, "loss": 0.0122, "step": 8020 }, { "epoch": 2.043891671618983, "grad_norm": 1.760085105895996, "learning_rate": 1.756183445756332e-05, "loss": 0.0089, "step": 8025 }, { "epoch": 2.0451651243738858, "grad_norm": 1.3013392686843872, "learning_rate": 1.7558924711202914e-05, "loss": 0.0154, "step": 8030 }, { "epoch": 2.0464385771287885, "grad_norm": 1.2460204362869263, "learning_rate": 1.755601347096786e-05, "loss": 0.011, "step": 8035 }, { "epoch": 2.0477120298836913, "grad_norm": 0.6671959161758423, "learning_rate": 1.7553100737433507e-05, "loss": 0.0183, "step": 8040 }, { "epoch": 2.048985482638594, "grad_norm": 0.8280413150787354, "learning_rate": 1.75501865111755e-05, "loss": 0.0112, "step": 8045 }, { "epoch": 2.050258935393497, "grad_norm": 0.6177046895027161, "learning_rate": 1.754727079276978e-05, "loss": 0.0133, "step": 8050 }, { "epoch": 2.0515323881483996, "grad_norm": 1.2020342350006104, "learning_rate": 1.7544353582792585e-05, "loss": 0.0133, "step": 8055 }, { "epoch": 2.0528058409033023, "grad_norm": 0.6260668039321899, "learning_rate": 1.754143488182044e-05, "loss": 0.0141, "step": 8060 }, { "epoch": 2.054079293658205, "grad_norm": 0.9405205249786377, "learning_rate": 1.7538514690430172e-05, "loss": 0.0111, "step": 8065 }, { "epoch": 2.0553527464131083, "grad_norm": 1.1396284103393555, "learning_rate": 1.7535593009198896e-05, "loss": 0.0202, "step": 8070 }, { "epoch": 2.056626199168011, "grad_norm": 0.8497107028961182, "learning_rate": 1.7532669838704036e-05, "loss": 0.0132, "step": 8075 }, { "epoch": 2.057899651922914, "grad_norm": 0.47472047805786133, "learning_rate": 1.752974517952329e-05, "loss": 0.0099, "step": 8080 }, { "epoch": 2.0591731046778166, "grad_norm": 1.0787973403930664, "learning_rate": 1.7526819032234664e-05, "loss": 0.0117, "step": 8085 }, { "epoch": 2.0604465574327193, "grad_norm": 2.109929084777832, "learning_rate": 1.752389139741645e-05, "loss": 0.0154, "step": 8090 }, { "epoch": 2.061720010187622, "grad_norm": 1.2537307739257812, "learning_rate": 1.7520962275647245e-05, "loss": 0.0135, "step": 8095 }, { "epoch": 2.062993462942525, "grad_norm": 1.1533606052398682, "learning_rate": 1.751803166750593e-05, "loss": 0.0126, "step": 8100 }, { "epoch": 2.0642669156974276, "grad_norm": 2.1867313385009766, "learning_rate": 1.7515099573571682e-05, "loss": 0.0168, "step": 8105 }, { "epoch": 2.0655403684523304, "grad_norm": 1.2132171392440796, "learning_rate": 1.751216599442397e-05, "loss": 0.0115, "step": 8110 }, { "epoch": 2.066813821207233, "grad_norm": 1.1416152715682983, "learning_rate": 1.7509230930642567e-05, "loss": 0.0097, "step": 8115 }, { "epoch": 2.068087273962136, "grad_norm": 1.7390198707580566, "learning_rate": 1.7506294382807524e-05, "loss": 0.0159, "step": 8120 }, { "epoch": 2.0693607267170386, "grad_norm": 1.7911527156829834, "learning_rate": 1.7503356351499194e-05, "loss": 0.014, "step": 8125 }, { "epoch": 2.0706341794719414, "grad_norm": 1.1043930053710938, "learning_rate": 1.750041683729822e-05, "loss": 0.0098, "step": 8130 }, { "epoch": 2.0719076322268446, "grad_norm": 0.5642349720001221, "learning_rate": 1.749747584078555e-05, "loss": 0.0095, "step": 8135 }, { "epoch": 2.0731810849817474, "grad_norm": 0.8172025084495544, "learning_rate": 1.7494533362542397e-05, "loss": 0.0102, "step": 8140 }, { "epoch": 2.07445453773665, "grad_norm": 1.212604284286499, "learning_rate": 1.7491589403150302e-05, "loss": 0.0161, "step": 8145 }, { "epoch": 2.075727990491553, "grad_norm": 1.0377304553985596, "learning_rate": 1.7488643963191073e-05, "loss": 0.0137, "step": 8150 }, { "epoch": 2.0770014432464556, "grad_norm": 1.1217060089111328, "learning_rate": 1.7485697043246822e-05, "loss": 0.0113, "step": 8155 }, { "epoch": 2.0782748960013584, "grad_norm": 1.0280382633209229, "learning_rate": 1.7482748643899944e-05, "loss": 0.0173, "step": 8160 }, { "epoch": 2.079548348756261, "grad_norm": 1.319213628768921, "learning_rate": 1.7479798765733142e-05, "loss": 0.0183, "step": 8165 }, { "epoch": 2.080821801511164, "grad_norm": 0.4625626802444458, "learning_rate": 1.7476847409329397e-05, "loss": 0.0125, "step": 8170 }, { "epoch": 2.0820952542660667, "grad_norm": 1.3423347473144531, "learning_rate": 1.7473894575271987e-05, "loss": 0.0118, "step": 8175 }, { "epoch": 2.0833687070209694, "grad_norm": 1.4290236234664917, "learning_rate": 1.7470940264144485e-05, "loss": 0.016, "step": 8180 }, { "epoch": 2.084642159775872, "grad_norm": 0.9330148100852966, "learning_rate": 1.746798447653075e-05, "loss": 0.0118, "step": 8185 }, { "epoch": 2.085915612530775, "grad_norm": 0.8158690333366394, "learning_rate": 1.746502721301494e-05, "loss": 0.0115, "step": 8190 }, { "epoch": 2.0871890652856777, "grad_norm": 1.2999238967895508, "learning_rate": 1.7462068474181494e-05, "loss": 0.0134, "step": 8195 }, { "epoch": 2.088462518040581, "grad_norm": 1.1359000205993652, "learning_rate": 1.7459108260615157e-05, "loss": 0.0144, "step": 8200 }, { "epoch": 2.0897359707954837, "grad_norm": 1.0853239297866821, "learning_rate": 1.7456146572900955e-05, "loss": 0.0185, "step": 8205 }, { "epoch": 2.0910094235503864, "grad_norm": 0.3116706907749176, "learning_rate": 1.7453183411624203e-05, "loss": 0.0084, "step": 8210 }, { "epoch": 2.092282876305289, "grad_norm": 0.9793471693992615, "learning_rate": 1.7450218777370522e-05, "loss": 0.0127, "step": 8215 }, { "epoch": 2.093556329060192, "grad_norm": 1.430109977722168, "learning_rate": 1.7447252670725804e-05, "loss": 0.0216, "step": 8220 }, { "epoch": 2.0948297818150947, "grad_norm": 0.5504494905471802, "learning_rate": 1.744428509227625e-05, "loss": 0.0121, "step": 8225 }, { "epoch": 2.0961032345699975, "grad_norm": 1.262308120727539, "learning_rate": 1.744131604260834e-05, "loss": 0.0143, "step": 8230 }, { "epoch": 2.0973766873249002, "grad_norm": 0.6129161715507507, "learning_rate": 1.7438345522308855e-05, "loss": 0.009, "step": 8235 }, { "epoch": 2.098650140079803, "grad_norm": 0.6881235837936401, "learning_rate": 1.7435373531964855e-05, "loss": 0.0149, "step": 8240 }, { "epoch": 2.0999235928347058, "grad_norm": 0.9226350784301758, "learning_rate": 1.7432400072163696e-05, "loss": 0.0135, "step": 8245 }, { "epoch": 2.1011970455896085, "grad_norm": 0.9710733294487, "learning_rate": 1.7429425143493027e-05, "loss": 0.0164, "step": 8250 }, { "epoch": 2.1024704983445113, "grad_norm": 1.0524216890335083, "learning_rate": 1.742644874654078e-05, "loss": 0.0187, "step": 8255 }, { "epoch": 2.103743951099414, "grad_norm": 0.8626943230628967, "learning_rate": 1.7423470881895192e-05, "loss": 0.0091, "step": 8260 }, { "epoch": 2.105017403854317, "grad_norm": 0.9012951850891113, "learning_rate": 1.7420491550144768e-05, "loss": 0.0118, "step": 8265 }, { "epoch": 2.10629085660922, "grad_norm": 1.0513906478881836, "learning_rate": 1.7417510751878324e-05, "loss": 0.0134, "step": 8270 }, { "epoch": 2.1075643093641228, "grad_norm": 1.1833096742630005, "learning_rate": 1.7414528487684953e-05, "loss": 0.0172, "step": 8275 }, { "epoch": 2.1088377621190255, "grad_norm": 0.817855715751648, "learning_rate": 1.7411544758154046e-05, "loss": 0.0137, "step": 8280 }, { "epoch": 2.1101112148739283, "grad_norm": 1.8256402015686035, "learning_rate": 1.740855956387527e-05, "loss": 0.0147, "step": 8285 }, { "epoch": 2.111384667628831, "grad_norm": 1.8472589254379272, "learning_rate": 1.7405572905438598e-05, "loss": 0.0187, "step": 8290 }, { "epoch": 2.112658120383734, "grad_norm": 1.4309955835342407, "learning_rate": 1.7402584783434286e-05, "loss": 0.0216, "step": 8295 }, { "epoch": 2.1139315731386366, "grad_norm": 1.0938764810562134, "learning_rate": 1.7399595198452875e-05, "loss": 0.013, "step": 8300 }, { "epoch": 2.1152050258935393, "grad_norm": 0.8491358160972595, "learning_rate": 1.73966041510852e-05, "loss": 0.0159, "step": 8305 }, { "epoch": 2.116478478648442, "grad_norm": 1.0204308032989502, "learning_rate": 1.739361164192238e-05, "loss": 0.0142, "step": 8310 }, { "epoch": 2.117751931403345, "grad_norm": 0.7937861680984497, "learning_rate": 1.7390617671555828e-05, "loss": 0.0135, "step": 8315 }, { "epoch": 2.1190253841582476, "grad_norm": 1.183362364768982, "learning_rate": 1.7387622240577248e-05, "loss": 0.0146, "step": 8320 }, { "epoch": 2.1202988369131504, "grad_norm": 1.0345358848571777, "learning_rate": 1.7384625349578623e-05, "loss": 0.0127, "step": 8325 }, { "epoch": 2.121572289668053, "grad_norm": 0.9565869569778442, "learning_rate": 1.7381626999152235e-05, "loss": 0.0145, "step": 8330 }, { "epoch": 2.1228457424229563, "grad_norm": 0.8700705766677856, "learning_rate": 1.7378627189890645e-05, "loss": 0.0158, "step": 8335 }, { "epoch": 2.124119195177859, "grad_norm": 1.2281070947647095, "learning_rate": 1.737562592238671e-05, "loss": 0.0171, "step": 8340 }, { "epoch": 2.125392647932762, "grad_norm": 1.0974371433258057, "learning_rate": 1.7372623197233577e-05, "loss": 0.0119, "step": 8345 }, { "epoch": 2.1266661006876646, "grad_norm": 0.5822365283966064, "learning_rate": 1.736961901502467e-05, "loss": 0.0134, "step": 8350 }, { "epoch": 2.1279395534425674, "grad_norm": 0.9788495898246765, "learning_rate": 1.7366613376353702e-05, "loss": 0.0124, "step": 8355 }, { "epoch": 2.12921300619747, "grad_norm": 0.5540366172790527, "learning_rate": 1.7363606281814693e-05, "loss": 0.0092, "step": 8360 }, { "epoch": 2.130486458952373, "grad_norm": 0.9288337230682373, "learning_rate": 1.7360597732001925e-05, "loss": 0.0116, "step": 8365 }, { "epoch": 2.1317599117072756, "grad_norm": 1.4739068746566772, "learning_rate": 1.7357587727509987e-05, "loss": 0.013, "step": 8370 }, { "epoch": 2.1330333644621784, "grad_norm": 1.3295719623565674, "learning_rate": 1.735457626893374e-05, "loss": 0.0196, "step": 8375 }, { "epoch": 2.134306817217081, "grad_norm": 0.8673897385597229, "learning_rate": 1.7351563356868347e-05, "loss": 0.0133, "step": 8380 }, { "epoch": 2.135580269971984, "grad_norm": 1.3991187810897827, "learning_rate": 1.7348548991909252e-05, "loss": 0.0135, "step": 8385 }, { "epoch": 2.1368537227268867, "grad_norm": 1.2181707620620728, "learning_rate": 1.7345533174652182e-05, "loss": 0.0188, "step": 8390 }, { "epoch": 2.1381271754817894, "grad_norm": 0.545448899269104, "learning_rate": 1.7342515905693157e-05, "loss": 0.0093, "step": 8395 }, { "epoch": 2.139400628236692, "grad_norm": 1.1488672494888306, "learning_rate": 1.733949718562848e-05, "loss": 0.0116, "step": 8400 }, { "epoch": 2.1406740809915954, "grad_norm": 0.8133335113525391, "learning_rate": 1.7336477015054743e-05, "loss": 0.0134, "step": 8405 }, { "epoch": 2.141947533746498, "grad_norm": 0.9924989938735962, "learning_rate": 1.7333455394568824e-05, "loss": 0.0137, "step": 8410 }, { "epoch": 2.143220986501401, "grad_norm": 1.3198695182800293, "learning_rate": 1.7330432324767885e-05, "loss": 0.0156, "step": 8415 }, { "epoch": 2.1444944392563037, "grad_norm": 1.2578600645065308, "learning_rate": 1.7327407806249383e-05, "loss": 0.0089, "step": 8420 }, { "epoch": 2.1457678920112064, "grad_norm": 0.893928050994873, "learning_rate": 1.732438183961105e-05, "loss": 0.0117, "step": 8425 }, { "epoch": 2.147041344766109, "grad_norm": 0.8674365282058716, "learning_rate": 1.7321354425450915e-05, "loss": 0.0143, "step": 8430 }, { "epoch": 2.148314797521012, "grad_norm": 1.5332592725753784, "learning_rate": 1.731832556436728e-05, "loss": 0.0146, "step": 8435 }, { "epoch": 2.1495882502759147, "grad_norm": 1.2148042917251587, "learning_rate": 1.7315295256958747e-05, "loss": 0.0158, "step": 8440 }, { "epoch": 2.1508617030308175, "grad_norm": 0.9383816719055176, "learning_rate": 1.7312263503824195e-05, "loss": 0.0134, "step": 8445 }, { "epoch": 2.1521351557857202, "grad_norm": 0.43908804655075073, "learning_rate": 1.730923030556279e-05, "loss": 0.0141, "step": 8450 }, { "epoch": 2.153408608540623, "grad_norm": 0.8386489748954773, "learning_rate": 1.7306195662773988e-05, "loss": 0.0114, "step": 8455 }, { "epoch": 2.1546820612955258, "grad_norm": 0.8712080717086792, "learning_rate": 1.730315957605752e-05, "loss": 0.0121, "step": 8460 }, { "epoch": 2.1559555140504285, "grad_norm": 0.8529504537582397, "learning_rate": 1.7300122046013422e-05, "loss": 0.0117, "step": 8465 }, { "epoch": 2.1572289668053317, "grad_norm": 1.3797869682312012, "learning_rate": 1.729708307324199e-05, "loss": 0.0109, "step": 8470 }, { "epoch": 2.1585024195602345, "grad_norm": 1.3759307861328125, "learning_rate": 1.7294042658343828e-05, "loss": 0.0119, "step": 8475 }, { "epoch": 2.1597758723151372, "grad_norm": 0.84576815366745, "learning_rate": 1.7291000801919807e-05, "loss": 0.009, "step": 8480 }, { "epoch": 2.16104932507004, "grad_norm": 0.7139631509780884, "learning_rate": 1.7287957504571096e-05, "loss": 0.019, "step": 8485 }, { "epoch": 2.1623227778249428, "grad_norm": 1.1419365406036377, "learning_rate": 1.7284912766899142e-05, "loss": 0.0102, "step": 8490 }, { "epoch": 2.1635962305798455, "grad_norm": 1.2674903869628906, "learning_rate": 1.7281866589505676e-05, "loss": 0.0158, "step": 8495 }, { "epoch": 2.1648696833347483, "grad_norm": 0.589185357093811, "learning_rate": 1.727881897299272e-05, "loss": 0.015, "step": 8500 }, { "epoch": 2.166143136089651, "grad_norm": 1.0308537483215332, "learning_rate": 1.7275769917962575e-05, "loss": 0.0162, "step": 8505 }, { "epoch": 2.167416588844554, "grad_norm": 1.1168477535247803, "learning_rate": 1.7272719425017826e-05, "loss": 0.0139, "step": 8510 }, { "epoch": 2.1686900415994566, "grad_norm": 0.8296324014663696, "learning_rate": 1.726966749476134e-05, "loss": 0.012, "step": 8515 }, { "epoch": 2.1699634943543593, "grad_norm": 1.4797626733779907, "learning_rate": 1.7266614127796282e-05, "loss": 0.0142, "step": 8520 }, { "epoch": 2.171236947109262, "grad_norm": 0.7270941734313965, "learning_rate": 1.7263559324726082e-05, "loss": 0.0134, "step": 8525 }, { "epoch": 2.172510399864165, "grad_norm": 1.5897393226623535, "learning_rate": 1.7260503086154465e-05, "loss": 0.0165, "step": 8530 }, { "epoch": 2.173783852619068, "grad_norm": 1.4263957738876343, "learning_rate": 1.7257445412685438e-05, "loss": 0.0158, "step": 8535 }, { "epoch": 2.175057305373971, "grad_norm": 0.7539745569229126, "learning_rate": 1.7254386304923293e-05, "loss": 0.0108, "step": 8540 }, { "epoch": 2.1763307581288736, "grad_norm": 0.8240822553634644, "learning_rate": 1.72513257634726e-05, "loss": 0.0145, "step": 8545 }, { "epoch": 2.1776042108837763, "grad_norm": 0.990456223487854, "learning_rate": 1.724826378893821e-05, "loss": 0.0121, "step": 8550 }, { "epoch": 2.178877663638679, "grad_norm": 1.375853419303894, "learning_rate": 1.724520038192527e-05, "loss": 0.0134, "step": 8555 }, { "epoch": 2.180151116393582, "grad_norm": 1.083631157875061, "learning_rate": 1.7242135543039204e-05, "loss": 0.0099, "step": 8560 }, { "epoch": 2.1814245691484846, "grad_norm": 0.617284893989563, "learning_rate": 1.723906927288572e-05, "loss": 0.0096, "step": 8565 }, { "epoch": 2.1826980219033874, "grad_norm": 0.5659587979316711, "learning_rate": 1.7236001572070798e-05, "loss": 0.0166, "step": 8570 }, { "epoch": 2.18397147465829, "grad_norm": 0.5109277367591858, "learning_rate": 1.7232932441200712e-05, "loss": 0.012, "step": 8575 }, { "epoch": 2.185244927413193, "grad_norm": 0.9401317238807678, "learning_rate": 1.722986188088202e-05, "loss": 0.0115, "step": 8580 }, { "epoch": 2.1865183801680956, "grad_norm": 1.10249662399292, "learning_rate": 1.7226789891721558e-05, "loss": 0.015, "step": 8585 }, { "epoch": 2.1877918329229984, "grad_norm": 1.1674567461013794, "learning_rate": 1.722371647432644e-05, "loss": 0.0137, "step": 8590 }, { "epoch": 2.189065285677901, "grad_norm": 0.9033075571060181, "learning_rate": 1.722064162930407e-05, "loss": 0.0142, "step": 8595 }, { "epoch": 2.1903387384328044, "grad_norm": 0.944720447063446, "learning_rate": 1.7217565357262136e-05, "loss": 0.0118, "step": 8600 }, { "epoch": 2.191612191187707, "grad_norm": 0.5715442895889282, "learning_rate": 1.72144876588086e-05, "loss": 0.0138, "step": 8605 }, { "epoch": 2.19288564394261, "grad_norm": 1.2077230215072632, "learning_rate": 1.7211408534551706e-05, "loss": 0.0202, "step": 8610 }, { "epoch": 2.1941590966975126, "grad_norm": 1.634812355041504, "learning_rate": 1.7208327985099985e-05, "loss": 0.0131, "step": 8615 }, { "epoch": 2.1954325494524154, "grad_norm": 0.7721237540245056, "learning_rate": 1.7205246011062252e-05, "loss": 0.0103, "step": 8620 }, { "epoch": 2.196706002207318, "grad_norm": 1.1785237789154053, "learning_rate": 1.7202162613047596e-05, "loss": 0.0133, "step": 8625 }, { "epoch": 2.197979454962221, "grad_norm": 1.2012888193130493, "learning_rate": 1.7199077791665388e-05, "loss": 0.0147, "step": 8630 }, { "epoch": 2.1992529077171237, "grad_norm": 1.1325188875198364, "learning_rate": 1.7195991547525287e-05, "loss": 0.0199, "step": 8635 }, { "epoch": 2.2005263604720264, "grad_norm": 0.4911632537841797, "learning_rate": 1.719290388123723e-05, "loss": 0.0105, "step": 8640 }, { "epoch": 2.201799813226929, "grad_norm": 1.167069673538208, "learning_rate": 1.718981479341143e-05, "loss": 0.0137, "step": 8645 }, { "epoch": 2.203073265981832, "grad_norm": 0.3480061888694763, "learning_rate": 1.7186724284658387e-05, "loss": 0.0174, "step": 8650 }, { "epoch": 2.2043467187367347, "grad_norm": 1.4316976070404053, "learning_rate": 1.7183632355588883e-05, "loss": 0.0174, "step": 8655 }, { "epoch": 2.2056201714916375, "grad_norm": 0.8127975463867188, "learning_rate": 1.7180539006813973e-05, "loss": 0.0156, "step": 8660 }, { "epoch": 2.2068936242465407, "grad_norm": 1.3017079830169678, "learning_rate": 1.7177444238945e-05, "loss": 0.0096, "step": 8665 }, { "epoch": 2.2081670770014434, "grad_norm": 1.323657512664795, "learning_rate": 1.7174348052593584e-05, "loss": 0.0123, "step": 8670 }, { "epoch": 2.209440529756346, "grad_norm": 1.534178376197815, "learning_rate": 1.7171250448371625e-05, "loss": 0.0114, "step": 8675 }, { "epoch": 2.210713982511249, "grad_norm": 1.4250092506408691, "learning_rate": 1.716815142689131e-05, "loss": 0.016, "step": 8680 }, { "epoch": 2.2119874352661517, "grad_norm": 1.0152482986450195, "learning_rate": 1.7165050988765093e-05, "loss": 0.0156, "step": 8685 }, { "epoch": 2.2132608880210545, "grad_norm": 0.6930704116821289, "learning_rate": 1.716194913460572e-05, "loss": 0.0105, "step": 8690 }, { "epoch": 2.2145343407759572, "grad_norm": 1.17677640914917, "learning_rate": 1.715884586502621e-05, "loss": 0.0139, "step": 8695 }, { "epoch": 2.21580779353086, "grad_norm": 1.3526352643966675, "learning_rate": 1.7155741180639864e-05, "loss": 0.0132, "step": 8700 }, { "epoch": 2.2170812462857628, "grad_norm": 0.6844810247421265, "learning_rate": 1.7152635082060262e-05, "loss": 0.0118, "step": 8705 }, { "epoch": 2.2183546990406655, "grad_norm": 1.0701513290405273, "learning_rate": 1.7149527569901266e-05, "loss": 0.0178, "step": 8710 }, { "epoch": 2.2196281517955683, "grad_norm": 1.1035335063934326, "learning_rate": 1.7146418644777016e-05, "loss": 0.0126, "step": 8715 }, { "epoch": 2.220901604550471, "grad_norm": 0.9047471284866333, "learning_rate": 1.7143308307301928e-05, "loss": 0.0168, "step": 8720 }, { "epoch": 2.222175057305374, "grad_norm": 1.3160605430603027, "learning_rate": 1.71401965580907e-05, "loss": 0.0146, "step": 8725 }, { "epoch": 2.223448510060277, "grad_norm": 1.1005231142044067, "learning_rate": 1.7137083397758316e-05, "loss": 0.0116, "step": 8730 }, { "epoch": 2.2247219628151798, "grad_norm": 1.348767876625061, "learning_rate": 1.713396882692002e-05, "loss": 0.0171, "step": 8735 }, { "epoch": 2.2259954155700825, "grad_norm": 1.1545964479446411, "learning_rate": 1.7130852846191353e-05, "loss": 0.0133, "step": 8740 }, { "epoch": 2.2272688683249853, "grad_norm": 0.48281389474868774, "learning_rate": 1.7127735456188128e-05, "loss": 0.0127, "step": 8745 }, { "epoch": 2.228542321079888, "grad_norm": 1.3970293998718262, "learning_rate": 1.7124616657526434e-05, "loss": 0.0158, "step": 8750 }, { "epoch": 2.229815773834791, "grad_norm": 1.250135064125061, "learning_rate": 1.712149645082265e-05, "loss": 0.0196, "step": 8755 }, { "epoch": 2.2310892265896936, "grad_norm": 0.825657844543457, "learning_rate": 1.7118374836693407e-05, "loss": 0.0139, "step": 8760 }, { "epoch": 2.2323626793445963, "grad_norm": 0.9019083380699158, "learning_rate": 1.7115251815755644e-05, "loss": 0.0156, "step": 8765 }, { "epoch": 2.233636132099499, "grad_norm": 0.7095509767532349, "learning_rate": 1.7112127388626565e-05, "loss": 0.0103, "step": 8770 }, { "epoch": 2.234909584854402, "grad_norm": 1.3904372453689575, "learning_rate": 1.710900155592365e-05, "loss": 0.0115, "step": 8775 }, { "epoch": 2.2361830376093046, "grad_norm": 0.8407014012336731, "learning_rate": 1.7105874318264658e-05, "loss": 0.0117, "step": 8780 }, { "epoch": 2.2374564903642074, "grad_norm": 1.2257596254348755, "learning_rate": 1.7102745676267627e-05, "loss": 0.0162, "step": 8785 }, { "epoch": 2.23872994311911, "grad_norm": 0.8025373220443726, "learning_rate": 1.7099615630550875e-05, "loss": 0.014, "step": 8790 }, { "epoch": 2.240003395874013, "grad_norm": 1.4690254926681519, "learning_rate": 1.7096484181732993e-05, "loss": 0.0143, "step": 8795 }, { "epoch": 2.241276848628916, "grad_norm": 1.4885026216506958, "learning_rate": 1.7093351330432852e-05, "loss": 0.0173, "step": 8800 }, { "epoch": 2.242550301383819, "grad_norm": 1.251455307006836, "learning_rate": 1.70902170772696e-05, "loss": 0.02, "step": 8805 }, { "epoch": 2.2438237541387216, "grad_norm": 1.5796935558319092, "learning_rate": 1.708708142286265e-05, "loss": 0.0173, "step": 8810 }, { "epoch": 2.2450972068936244, "grad_norm": 1.0688700675964355, "learning_rate": 1.7083944367831723e-05, "loss": 0.0194, "step": 8815 }, { "epoch": 2.246370659648527, "grad_norm": 1.3123226165771484, "learning_rate": 1.708080591279678e-05, "loss": 0.0133, "step": 8820 }, { "epoch": 2.24764411240343, "grad_norm": 1.016269564628601, "learning_rate": 1.7077666058378088e-05, "loss": 0.0182, "step": 8825 }, { "epoch": 2.2489175651583326, "grad_norm": 1.5108741521835327, "learning_rate": 1.7074524805196167e-05, "loss": 0.015, "step": 8830 }, { "epoch": 2.2501910179132354, "grad_norm": 0.968053936958313, "learning_rate": 1.7071382153871834e-05, "loss": 0.0136, "step": 8835 }, { "epoch": 2.251464470668138, "grad_norm": 1.6006954908370972, "learning_rate": 1.7068238105026167e-05, "loss": 0.0162, "step": 8840 }, { "epoch": 2.252737923423041, "grad_norm": 1.0051493644714355, "learning_rate": 1.706509265928053e-05, "loss": 0.0135, "step": 8845 }, { "epoch": 2.2540113761779437, "grad_norm": 0.7722652554512024, "learning_rate": 1.7061945817256558e-05, "loss": 0.0164, "step": 8850 }, { "epoch": 2.2552848289328464, "grad_norm": 1.4733617305755615, "learning_rate": 1.7058797579576164e-05, "loss": 0.0114, "step": 8855 }, { "epoch": 2.2565582816877496, "grad_norm": 1.1004537343978882, "learning_rate": 1.7055647946861535e-05, "loss": 0.0206, "step": 8860 }, { "epoch": 2.257831734442652, "grad_norm": 0.9603634476661682, "learning_rate": 1.7052496919735132e-05, "loss": 0.016, "step": 8865 }, { "epoch": 2.259105187197555, "grad_norm": 0.6329435706138611, "learning_rate": 1.7049344498819698e-05, "loss": 0.0108, "step": 8870 }, { "epoch": 2.260378639952458, "grad_norm": 1.719296932220459, "learning_rate": 1.7046190684738245e-05, "loss": 0.0138, "step": 8875 }, { "epoch": 2.2616520927073607, "grad_norm": 1.1661343574523926, "learning_rate": 1.7043035478114068e-05, "loss": 0.0178, "step": 8880 }, { "epoch": 2.2629255454622634, "grad_norm": 1.2627025842666626, "learning_rate": 1.7039878879570728e-05, "loss": 0.0139, "step": 8885 }, { "epoch": 2.264198998217166, "grad_norm": 1.3140339851379395, "learning_rate": 1.7036720889732068e-05, "loss": 0.0223, "step": 8890 }, { "epoch": 2.265472450972069, "grad_norm": 1.3583265542984009, "learning_rate": 1.7033561509222197e-05, "loss": 0.0137, "step": 8895 }, { "epoch": 2.2667459037269717, "grad_norm": 0.7387733459472656, "learning_rate": 1.7030400738665516e-05, "loss": 0.0093, "step": 8900 }, { "epoch": 2.2680193564818745, "grad_norm": 1.4116058349609375, "learning_rate": 1.702723857868668e-05, "loss": 0.0125, "step": 8905 }, { "epoch": 2.2692928092367772, "grad_norm": 0.7967307567596436, "learning_rate": 1.7024075029910638e-05, "loss": 0.0189, "step": 8910 }, { "epoch": 2.27056626199168, "grad_norm": 1.2773401737213135, "learning_rate": 1.7020910092962593e-05, "loss": 0.0134, "step": 8915 }, { "epoch": 2.2718397147465828, "grad_norm": 1.6027288436889648, "learning_rate": 1.701774376846804e-05, "loss": 0.0164, "step": 8920 }, { "epoch": 2.2731131675014855, "grad_norm": 1.2075947523117065, "learning_rate": 1.701457605705274e-05, "loss": 0.016, "step": 8925 }, { "epoch": 2.2743866202563883, "grad_norm": 0.9764797687530518, "learning_rate": 1.701140695934273e-05, "loss": 0.0137, "step": 8930 }, { "epoch": 2.2756600730112915, "grad_norm": 1.1441606283187866, "learning_rate": 1.7008236475964323e-05, "loss": 0.0165, "step": 8935 }, { "epoch": 2.2769335257661942, "grad_norm": 0.7842252254486084, "learning_rate": 1.70050646075441e-05, "loss": 0.0117, "step": 8940 }, { "epoch": 2.278206978521097, "grad_norm": 1.0392619371414185, "learning_rate": 1.700189135470892e-05, "loss": 0.0122, "step": 8945 }, { "epoch": 2.2794804312759998, "grad_norm": 0.8434522747993469, "learning_rate": 1.6998716718085914e-05, "loss": 0.0132, "step": 8950 }, { "epoch": 2.2807538840309025, "grad_norm": 1.2515488862991333, "learning_rate": 1.699554069830249e-05, "loss": 0.0197, "step": 8955 }, { "epoch": 2.2820273367858053, "grad_norm": 0.6597116589546204, "learning_rate": 1.699236329598632e-05, "loss": 0.0165, "step": 8960 }, { "epoch": 2.283300789540708, "grad_norm": 0.528951108455658, "learning_rate": 1.6989184511765366e-05, "loss": 0.0163, "step": 8965 }, { "epoch": 2.284574242295611, "grad_norm": 1.0153003931045532, "learning_rate": 1.698600434626785e-05, "loss": 0.0189, "step": 8970 }, { "epoch": 2.2858476950505136, "grad_norm": 0.9373816847801208, "learning_rate": 1.6982822800122263e-05, "loss": 0.0122, "step": 8975 }, { "epoch": 2.2871211478054163, "grad_norm": 0.8779058456420898, "learning_rate": 1.697963987395738e-05, "loss": 0.0119, "step": 8980 }, { "epoch": 2.288394600560319, "grad_norm": 1.480857253074646, "learning_rate": 1.6976455568402248e-05, "loss": 0.0169, "step": 8985 }, { "epoch": 2.289668053315222, "grad_norm": 1.1208627223968506, "learning_rate": 1.6973269884086177e-05, "loss": 0.0169, "step": 8990 }, { "epoch": 2.2909415060701246, "grad_norm": 1.0039232969284058, "learning_rate": 1.6970082821638757e-05, "loss": 0.0148, "step": 8995 }, { "epoch": 2.292214958825028, "grad_norm": 1.2119966745376587, "learning_rate": 1.6966894381689857e-05, "loss": 0.0146, "step": 9000 }, { "epoch": 2.2934884115799306, "grad_norm": 0.9336686730384827, "learning_rate": 1.69637045648696e-05, "loss": 0.0109, "step": 9005 }, { "epoch": 2.2947618643348333, "grad_norm": 1.5424182415008545, "learning_rate": 1.6960513371808396e-05, "loss": 0.0149, "step": 9010 }, { "epoch": 2.296035317089736, "grad_norm": 1.1898759603500366, "learning_rate": 1.6957320803136924e-05, "loss": 0.0132, "step": 9015 }, { "epoch": 2.297308769844639, "grad_norm": 0.762947142124176, "learning_rate": 1.6954126859486123e-05, "loss": 0.0104, "step": 9020 }, { "epoch": 2.2985822225995416, "grad_norm": 1.050337314605713, "learning_rate": 1.695093154148723e-05, "loss": 0.0116, "step": 9025 }, { "epoch": 2.2998556753544444, "grad_norm": 0.791450560092926, "learning_rate": 1.694773484977173e-05, "loss": 0.0149, "step": 9030 }, { "epoch": 2.301129128109347, "grad_norm": 1.6135938167572021, "learning_rate": 1.694453678497138e-05, "loss": 0.016, "step": 9035 }, { "epoch": 2.30240258086425, "grad_norm": 1.0356736183166504, "learning_rate": 1.6941337347718224e-05, "loss": 0.0182, "step": 9040 }, { "epoch": 2.3036760336191526, "grad_norm": 0.8482475280761719, "learning_rate": 1.6938136538644567e-05, "loss": 0.0134, "step": 9045 }, { "epoch": 2.3049494863740554, "grad_norm": 1.2026687860488892, "learning_rate": 1.6934934358382987e-05, "loss": 0.0121, "step": 9050 }, { "epoch": 2.306222939128958, "grad_norm": 1.1053816080093384, "learning_rate": 1.693173080756633e-05, "loss": 0.0141, "step": 9055 }, { "epoch": 2.307496391883861, "grad_norm": 0.8491237163543701, "learning_rate": 1.6928525886827718e-05, "loss": 0.0139, "step": 9060 }, { "epoch": 2.308769844638764, "grad_norm": 1.8652706146240234, "learning_rate": 1.692531959680054e-05, "loss": 0.0144, "step": 9065 }, { "epoch": 2.310043297393667, "grad_norm": 0.7001760601997375, "learning_rate": 1.6922111938118458e-05, "loss": 0.0119, "step": 9070 }, { "epoch": 2.3113167501485696, "grad_norm": 0.5663195848464966, "learning_rate": 1.6918902911415404e-05, "loss": 0.0107, "step": 9075 }, { "epoch": 2.3125902029034724, "grad_norm": 1.3763446807861328, "learning_rate": 1.691569251732558e-05, "loss": 0.0169, "step": 9080 }, { "epoch": 2.313863655658375, "grad_norm": 1.2316844463348389, "learning_rate": 1.6912480756483455e-05, "loss": 0.012, "step": 9085 }, { "epoch": 2.315137108413278, "grad_norm": 0.977307140827179, "learning_rate": 1.690926762952377e-05, "loss": 0.0151, "step": 9090 }, { "epoch": 2.3164105611681807, "grad_norm": 1.198891282081604, "learning_rate": 1.6906053137081548e-05, "loss": 0.0169, "step": 9095 }, { "epoch": 2.3176840139230834, "grad_norm": 0.7629362344741821, "learning_rate": 1.6902837279792053e-05, "loss": 0.0132, "step": 9100 }, { "epoch": 2.318957466677986, "grad_norm": 0.7170091271400452, "learning_rate": 1.6899620058290854e-05, "loss": 0.0119, "step": 9105 }, { "epoch": 2.320230919432889, "grad_norm": 1.4089058637619019, "learning_rate": 1.6896401473213766e-05, "loss": 0.0183, "step": 9110 }, { "epoch": 2.3215043721877917, "grad_norm": 0.6572768092155457, "learning_rate": 1.6893181525196878e-05, "loss": 0.0091, "step": 9115 }, { "epoch": 2.3227778249426945, "grad_norm": 0.9963003396987915, "learning_rate": 1.6889960214876548e-05, "loss": 0.0136, "step": 9120 }, { "epoch": 2.3240512776975972, "grad_norm": 1.3478387594223022, "learning_rate": 1.6886737542889413e-05, "loss": 0.0136, "step": 9125 }, { "epoch": 2.3253247304525004, "grad_norm": 1.6722692251205444, "learning_rate": 1.6883513509872364e-05, "loss": 0.0187, "step": 9130 }, { "epoch": 2.326598183207403, "grad_norm": 0.8555684089660645, "learning_rate": 1.6880288116462577e-05, "loss": 0.0137, "step": 9135 }, { "epoch": 2.327871635962306, "grad_norm": 0.6156668066978455, "learning_rate": 1.6877061363297474e-05, "loss": 0.0159, "step": 9140 }, { "epoch": 2.3291450887172087, "grad_norm": 1.7643744945526123, "learning_rate": 1.6873833251014776e-05, "loss": 0.0186, "step": 9145 }, { "epoch": 2.3304185414721115, "grad_norm": 1.5348697900772095, "learning_rate": 1.687060378025245e-05, "loss": 0.0176, "step": 9150 }, { "epoch": 2.3316919942270142, "grad_norm": 0.7899874448776245, "learning_rate": 1.6867372951648734e-05, "loss": 0.0118, "step": 9155 }, { "epoch": 2.332965446981917, "grad_norm": 1.8238953351974487, "learning_rate": 1.686414076584215e-05, "loss": 0.0178, "step": 9160 }, { "epoch": 2.3342388997368198, "grad_norm": 1.3536580801010132, "learning_rate": 1.6860907223471458e-05, "loss": 0.0135, "step": 9165 }, { "epoch": 2.3355123524917225, "grad_norm": 1.5716469287872314, "learning_rate": 1.685767232517572e-05, "loss": 0.013, "step": 9170 }, { "epoch": 2.3367858052466253, "grad_norm": 0.7480042576789856, "learning_rate": 1.6854436071594255e-05, "loss": 0.0104, "step": 9175 }, { "epoch": 2.338059258001528, "grad_norm": 1.3802094459533691, "learning_rate": 1.685119846336663e-05, "loss": 0.0175, "step": 9180 }, { "epoch": 2.339332710756431, "grad_norm": 1.1441287994384766, "learning_rate": 1.68479595011327e-05, "loss": 0.0125, "step": 9185 }, { "epoch": 2.3406061635113335, "grad_norm": 1.3964345455169678, "learning_rate": 1.6844719185532593e-05, "loss": 0.0103, "step": 9190 }, { "epoch": 2.3418796162662368, "grad_norm": 1.2289605140686035, "learning_rate": 1.6841477517206685e-05, "loss": 0.0157, "step": 9195 }, { "epoch": 2.343153069021139, "grad_norm": 1.0616849660873413, "learning_rate": 1.6838234496795632e-05, "loss": 0.0148, "step": 9200 }, { "epoch": 2.3444265217760423, "grad_norm": 2.3134613037109375, "learning_rate": 1.683499012494035e-05, "loss": 0.0203, "step": 9205 }, { "epoch": 2.345699974530945, "grad_norm": 1.0168565511703491, "learning_rate": 1.6831744402282032e-05, "loss": 0.0113, "step": 9210 }, { "epoch": 2.346973427285848, "grad_norm": 1.414165735244751, "learning_rate": 1.6828497329462133e-05, "loss": 0.0132, "step": 9215 }, { "epoch": 2.3482468800407506, "grad_norm": 0.4610959589481354, "learning_rate": 1.6825248907122363e-05, "loss": 0.0103, "step": 9220 }, { "epoch": 2.3495203327956533, "grad_norm": 0.5838968753814697, "learning_rate": 1.6821999135904722e-05, "loss": 0.0143, "step": 9225 }, { "epoch": 2.350793785550556, "grad_norm": 0.6248171329498291, "learning_rate": 1.6818748016451457e-05, "loss": 0.0143, "step": 9230 }, { "epoch": 2.352067238305459, "grad_norm": 1.1831246614456177, "learning_rate": 1.681549554940509e-05, "loss": 0.0158, "step": 9235 }, { "epoch": 2.3533406910603616, "grad_norm": 0.9611029028892517, "learning_rate": 1.6812241735408408e-05, "loss": 0.0141, "step": 9240 }, { "epoch": 2.3546141438152643, "grad_norm": 1.3150328397750854, "learning_rate": 1.6808986575104464e-05, "loss": 0.0134, "step": 9245 }, { "epoch": 2.355887596570167, "grad_norm": 1.1018966436386108, "learning_rate": 1.680573006913658e-05, "loss": 0.0158, "step": 9250 }, { "epoch": 2.35716104932507, "grad_norm": 1.0275979042053223, "learning_rate": 1.680247221814834e-05, "loss": 0.0162, "step": 9255 }, { "epoch": 2.358434502079973, "grad_norm": 0.4069373607635498, "learning_rate": 1.6799213022783593e-05, "loss": 0.0171, "step": 9260 }, { "epoch": 2.3597079548348754, "grad_norm": 1.1444544792175293, "learning_rate": 1.6795952483686454e-05, "loss": 0.0116, "step": 9265 }, { "epoch": 2.3609814075897786, "grad_norm": 1.9128310680389404, "learning_rate": 1.679269060150131e-05, "loss": 0.0178, "step": 9270 }, { "epoch": 2.3622548603446814, "grad_norm": 1.1924384832382202, "learning_rate": 1.6789427376872805e-05, "loss": 0.016, "step": 9275 }, { "epoch": 2.363528313099584, "grad_norm": 2.141723871231079, "learning_rate": 1.6786162810445855e-05, "loss": 0.0153, "step": 9280 }, { "epoch": 2.364801765854487, "grad_norm": 1.230076551437378, "learning_rate": 1.6782896902865636e-05, "loss": 0.0111, "step": 9285 }, { "epoch": 2.3660752186093896, "grad_norm": 1.7460204362869263, "learning_rate": 1.677962965477759e-05, "loss": 0.0197, "step": 9290 }, { "epoch": 2.3673486713642924, "grad_norm": 0.938036322593689, "learning_rate": 1.6776361066827428e-05, "loss": 0.0161, "step": 9295 }, { "epoch": 2.368622124119195, "grad_norm": 1.3517752885818481, "learning_rate": 1.6773091139661122e-05, "loss": 0.0142, "step": 9300 }, { "epoch": 2.369895576874098, "grad_norm": 1.4316450357437134, "learning_rate": 1.676981987392491e-05, "loss": 0.0114, "step": 9305 }, { "epoch": 2.3711690296290007, "grad_norm": 1.0676943063735962, "learning_rate": 1.6766547270265294e-05, "loss": 0.0153, "step": 9310 }, { "epoch": 2.3724424823839034, "grad_norm": 1.0627250671386719, "learning_rate": 1.676327332932904e-05, "loss": 0.0111, "step": 9315 }, { "epoch": 2.373715935138806, "grad_norm": 1.0630489587783813, "learning_rate": 1.6759998051763176e-05, "loss": 0.0143, "step": 9320 }, { "epoch": 2.3749893878937094, "grad_norm": 1.0772773027420044, "learning_rate": 1.6756721438215005e-05, "loss": 0.0127, "step": 9325 }, { "epoch": 2.3762628406486117, "grad_norm": 0.3487684428691864, "learning_rate": 1.6753443489332078e-05, "loss": 0.0105, "step": 9330 }, { "epoch": 2.377536293403515, "grad_norm": 1.8999123573303223, "learning_rate": 1.675016420576222e-05, "loss": 0.0139, "step": 9335 }, { "epoch": 2.3788097461584177, "grad_norm": 0.8130879998207092, "learning_rate": 1.674688358815352e-05, "loss": 0.013, "step": 9340 }, { "epoch": 2.3800831989133204, "grad_norm": 0.7582122683525085, "learning_rate": 1.6743601637154327e-05, "loss": 0.0088, "step": 9345 }, { "epoch": 2.381356651668223, "grad_norm": 1.4402886629104614, "learning_rate": 1.6740318353413254e-05, "loss": 0.0184, "step": 9350 }, { "epoch": 2.382630104423126, "grad_norm": 0.9588368535041809, "learning_rate": 1.6737033737579176e-05, "loss": 0.011, "step": 9355 }, { "epoch": 2.3839035571780287, "grad_norm": 0.5514962673187256, "learning_rate": 1.6733747790301243e-05, "loss": 0.0137, "step": 9360 }, { "epoch": 2.3851770099329315, "grad_norm": 1.3786699771881104, "learning_rate": 1.673046051222885e-05, "loss": 0.0206, "step": 9365 }, { "epoch": 2.3864504626878342, "grad_norm": 0.89287269115448, "learning_rate": 1.6727171904011663e-05, "loss": 0.0118, "step": 9370 }, { "epoch": 2.387723915442737, "grad_norm": 1.2554993629455566, "learning_rate": 1.6723881966299617e-05, "loss": 0.0137, "step": 9375 }, { "epoch": 2.3889973681976397, "grad_norm": 0.8254138827323914, "learning_rate": 1.67205906997429e-05, "loss": 0.0143, "step": 9380 }, { "epoch": 2.3902708209525425, "grad_norm": 1.307876467704773, "learning_rate": 1.671729810499197e-05, "loss": 0.0145, "step": 9385 }, { "epoch": 2.3915442737074453, "grad_norm": 0.9892244935035706, "learning_rate": 1.6714004182697543e-05, "loss": 0.0151, "step": 9390 }, { "epoch": 2.392817726462348, "grad_norm": 1.059828758239746, "learning_rate": 1.6710708933510597e-05, "loss": 0.0229, "step": 9395 }, { "epoch": 2.3940911792172512, "grad_norm": 0.9919794797897339, "learning_rate": 1.670741235808238e-05, "loss": 0.0121, "step": 9400 }, { "epoch": 2.395364631972154, "grad_norm": 1.2529542446136475, "learning_rate": 1.670411445706439e-05, "loss": 0.017, "step": 9405 }, { "epoch": 2.3966380847270568, "grad_norm": 0.9465361833572388, "learning_rate": 1.670081523110839e-05, "loss": 0.0155, "step": 9410 }, { "epoch": 2.3979115374819595, "grad_norm": 1.367501139640808, "learning_rate": 1.6697514680866417e-05, "loss": 0.013, "step": 9415 }, { "epoch": 2.3991849902368623, "grad_norm": 0.9383494853973389, "learning_rate": 1.669421280699076e-05, "loss": 0.0145, "step": 9420 }, { "epoch": 2.400458442991765, "grad_norm": 1.1360985040664673, "learning_rate": 1.6690909610133964e-05, "loss": 0.0165, "step": 9425 }, { "epoch": 2.401731895746668, "grad_norm": 0.6432919502258301, "learning_rate": 1.668760509094884e-05, "loss": 0.0133, "step": 9430 }, { "epoch": 2.4030053485015705, "grad_norm": 1.7310737371444702, "learning_rate": 1.6684299250088473e-05, "loss": 0.0184, "step": 9435 }, { "epoch": 2.4042788012564733, "grad_norm": 0.3746192157268524, "learning_rate": 1.668099208820619e-05, "loss": 0.0139, "step": 9440 }, { "epoch": 2.405552254011376, "grad_norm": 1.0229977369308472, "learning_rate": 1.6677683605955585e-05, "loss": 0.0114, "step": 9445 }, { "epoch": 2.406825706766279, "grad_norm": 1.7075921297073364, "learning_rate": 1.6674373803990524e-05, "loss": 0.0165, "step": 9450 }, { "epoch": 2.4080991595211816, "grad_norm": 1.299294352531433, "learning_rate": 1.6671062682965118e-05, "loss": 0.019, "step": 9455 }, { "epoch": 2.4093726122760843, "grad_norm": 1.2739678621292114, "learning_rate": 1.666775024353375e-05, "loss": 0.0191, "step": 9460 }, { "epoch": 2.4106460650309876, "grad_norm": 1.0969387292861938, "learning_rate": 1.6664436486351056e-05, "loss": 0.0159, "step": 9465 }, { "epoch": 2.4119195177858903, "grad_norm": 1.1287360191345215, "learning_rate": 1.6661121412071937e-05, "loss": 0.0213, "step": 9470 }, { "epoch": 2.413192970540793, "grad_norm": 0.48234936594963074, "learning_rate": 1.665780502135155e-05, "loss": 0.0166, "step": 9475 }, { "epoch": 2.414466423295696, "grad_norm": 0.7410439848899841, "learning_rate": 1.6654487314845324e-05, "loss": 0.0168, "step": 9480 }, { "epoch": 2.4157398760505986, "grad_norm": 1.1432605981826782, "learning_rate": 1.665116829320893e-05, "loss": 0.0152, "step": 9485 }, { "epoch": 2.4170133288055013, "grad_norm": 1.3017127513885498, "learning_rate": 1.6647847957098315e-05, "loss": 0.0173, "step": 9490 }, { "epoch": 2.418286781560404, "grad_norm": 1.0865412950515747, "learning_rate": 1.6644526307169675e-05, "loss": 0.0192, "step": 9495 }, { "epoch": 2.419560234315307, "grad_norm": 1.1678227186203003, "learning_rate": 1.6641203344079468e-05, "loss": 0.0133, "step": 9500 }, { "epoch": 2.4208336870702096, "grad_norm": 0.9039960503578186, "learning_rate": 1.663787906848442e-05, "loss": 0.0156, "step": 9505 }, { "epoch": 2.4221071398251124, "grad_norm": 1.3165162801742554, "learning_rate": 1.66345534810415e-05, "loss": 0.0209, "step": 9510 }, { "epoch": 2.423380592580015, "grad_norm": 1.2296032905578613, "learning_rate": 1.6631226582407954e-05, "loss": 0.0126, "step": 9515 }, { "epoch": 2.424654045334918, "grad_norm": 0.49240636825561523, "learning_rate": 1.6627898373241275e-05, "loss": 0.0147, "step": 9520 }, { "epoch": 2.4259274980898207, "grad_norm": 1.187268853187561, "learning_rate": 1.6624568854199218e-05, "loss": 0.0146, "step": 9525 }, { "epoch": 2.427200950844724, "grad_norm": 1.3469018936157227, "learning_rate": 1.66212380259398e-05, "loss": 0.0156, "step": 9530 }, { "epoch": 2.4284744035996266, "grad_norm": Infinity, "learning_rate": 1.6618572421138117e-05, "loss": 0.0172, "step": 9535 }, { "epoch": 2.4297478563545294, "grad_norm": 1.1084973812103271, "learning_rate": 1.6615239237946473e-05, "loss": 0.0186, "step": 9540 }, { "epoch": 2.431021309109432, "grad_norm": 0.9781989455223083, "learning_rate": 1.6611904747381283e-05, "loss": 0.016, "step": 9545 }, { "epoch": 2.432294761864335, "grad_norm": 1.2986539602279663, "learning_rate": 1.6608568950101544e-05, "loss": 0.0202, "step": 9550 }, { "epoch": 2.4335682146192377, "grad_norm": 1.5111712217330933, "learning_rate": 1.6605231846766518e-05, "loss": 0.0156, "step": 9555 }, { "epoch": 2.4348416673741404, "grad_norm": 0.7666068077087402, "learning_rate": 1.660189343803571e-05, "loss": 0.0133, "step": 9560 }, { "epoch": 2.436115120129043, "grad_norm": 0.4608633518218994, "learning_rate": 1.65985537245689e-05, "loss": 0.014, "step": 9565 }, { "epoch": 2.437388572883946, "grad_norm": 1.2241610288619995, "learning_rate": 1.659521270702611e-05, "loss": 0.0213, "step": 9570 }, { "epoch": 2.4386620256388487, "grad_norm": 1.3618080615997314, "learning_rate": 1.659187038606763e-05, "loss": 0.0186, "step": 9575 }, { "epoch": 2.4399354783937515, "grad_norm": 1.8587796688079834, "learning_rate": 1.6588526762354006e-05, "loss": 0.0221, "step": 9580 }, { "epoch": 2.4412089311486542, "grad_norm": 0.9279047846794128, "learning_rate": 1.6585181836546042e-05, "loss": 0.0151, "step": 9585 }, { "epoch": 2.442482383903557, "grad_norm": 1.1884359121322632, "learning_rate": 1.6581835609304796e-05, "loss": 0.0195, "step": 9590 }, { "epoch": 2.44375583665846, "grad_norm": 1.4578886032104492, "learning_rate": 1.657848808129158e-05, "loss": 0.0199, "step": 9595 }, { "epoch": 2.445029289413363, "grad_norm": 0.49519217014312744, "learning_rate": 1.6575139253167973e-05, "loss": 0.0123, "step": 9600 }, { "epoch": 2.4463027421682657, "grad_norm": 0.9629542231559753, "learning_rate": 1.657178912559581e-05, "loss": 0.0184, "step": 9605 }, { "epoch": 2.4475761949231685, "grad_norm": 1.2287424802780151, "learning_rate": 1.6568437699237167e-05, "loss": 0.0158, "step": 9610 }, { "epoch": 2.4488496476780712, "grad_norm": 1.0823780298233032, "learning_rate": 1.6565084974754397e-05, "loss": 0.0158, "step": 9615 }, { "epoch": 2.450123100432974, "grad_norm": 0.9083195328712463, "learning_rate": 1.65617309528101e-05, "loss": 0.0119, "step": 9620 }, { "epoch": 2.4513965531878767, "grad_norm": 0.9579418301582336, "learning_rate": 1.6558375634067128e-05, "loss": 0.0131, "step": 9625 }, { "epoch": 2.4526700059427795, "grad_norm": 1.359291434288025, "learning_rate": 1.6555019019188605e-05, "loss": 0.0129, "step": 9630 }, { "epoch": 2.4539434586976823, "grad_norm": 1.1732532978057861, "learning_rate": 1.6551661108837892e-05, "loss": 0.0114, "step": 9635 }, { "epoch": 2.455216911452585, "grad_norm": 1.3746085166931152, "learning_rate": 1.6548301903678616e-05, "loss": 0.0234, "step": 9640 }, { "epoch": 2.456490364207488, "grad_norm": 1.1000820398330688, "learning_rate": 1.654494140437466e-05, "loss": 0.0134, "step": 9645 }, { "epoch": 2.4577638169623905, "grad_norm": 0.8363916277885437, "learning_rate": 1.654157961159016e-05, "loss": 0.0189, "step": 9650 }, { "epoch": 2.4590372697172933, "grad_norm": 1.067101001739502, "learning_rate": 1.653821652598951e-05, "loss": 0.0166, "step": 9655 }, { "epoch": 2.4603107224721965, "grad_norm": 1.1143996715545654, "learning_rate": 1.653485214823736e-05, "loss": 0.0155, "step": 9660 }, { "epoch": 2.461584175227099, "grad_norm": 0.850409209728241, "learning_rate": 1.6531486478998613e-05, "loss": 0.0124, "step": 9665 }, { "epoch": 2.462857627982002, "grad_norm": 1.176026463508606, "learning_rate": 1.652811951893843e-05, "loss": 0.0159, "step": 9670 }, { "epoch": 2.464131080736905, "grad_norm": 1.2701678276062012, "learning_rate": 1.6524751268722216e-05, "loss": 0.0162, "step": 9675 }, { "epoch": 2.4654045334918075, "grad_norm": 1.0674312114715576, "learning_rate": 1.6521381729015652e-05, "loss": 0.0162, "step": 9680 }, { "epoch": 2.4666779862467103, "grad_norm": 1.6388185024261475, "learning_rate": 1.6518010900484657e-05, "loss": 0.0109, "step": 9685 }, { "epoch": 2.467951439001613, "grad_norm": 1.5210140943527222, "learning_rate": 1.651463878379541e-05, "loss": 0.0125, "step": 9690 }, { "epoch": 2.469224891756516, "grad_norm": 1.0870112180709839, "learning_rate": 1.6511265379614342e-05, "loss": 0.0159, "step": 9695 }, { "epoch": 2.4704983445114186, "grad_norm": 1.1331382989883423, "learning_rate": 1.6507890688608148e-05, "loss": 0.0149, "step": 9700 }, { "epoch": 2.4717717972663213, "grad_norm": 1.5832812786102295, "learning_rate": 1.650451471144376e-05, "loss": 0.0198, "step": 9705 }, { "epoch": 2.473045250021224, "grad_norm": 1.2299119234085083, "learning_rate": 1.6501137448788384e-05, "loss": 0.0155, "step": 9710 }, { "epoch": 2.474318702776127, "grad_norm": 1.8220568895339966, "learning_rate": 1.6497758901309465e-05, "loss": 0.0209, "step": 9715 }, { "epoch": 2.4755921555310296, "grad_norm": 0.8487249612808228, "learning_rate": 1.649437906967471e-05, "loss": 0.0138, "step": 9720 }, { "epoch": 2.476865608285933, "grad_norm": 1.127465844154358, "learning_rate": 1.6490997954552074e-05, "loss": 0.0179, "step": 9725 }, { "epoch": 2.478139061040835, "grad_norm": 1.2566802501678467, "learning_rate": 1.6487615556609767e-05, "loss": 0.0178, "step": 9730 }, { "epoch": 2.4794125137957383, "grad_norm": 0.8013891577720642, "learning_rate": 1.6484231876516262e-05, "loss": 0.0209, "step": 9735 }, { "epoch": 2.480685966550641, "grad_norm": 1.0489535331726074, "learning_rate": 1.6480846914940272e-05, "loss": 0.017, "step": 9740 }, { "epoch": 2.481959419305544, "grad_norm": 1.0427929162979126, "learning_rate": 1.647746067255077e-05, "loss": 0.0134, "step": 9745 }, { "epoch": 2.4832328720604466, "grad_norm": 1.9804848432540894, "learning_rate": 1.647407315001698e-05, "loss": 0.0124, "step": 9750 }, { "epoch": 2.4845063248153494, "grad_norm": 1.5003858804702759, "learning_rate": 1.647068434800838e-05, "loss": 0.0219, "step": 9755 }, { "epoch": 2.485779777570252, "grad_norm": 1.251078724861145, "learning_rate": 1.6467294267194708e-05, "loss": 0.0173, "step": 9760 }, { "epoch": 2.487053230325155, "grad_norm": 1.3949753046035767, "learning_rate": 1.6463902908245933e-05, "loss": 0.0174, "step": 9765 }, { "epoch": 2.4883266830800577, "grad_norm": 0.6662002205848694, "learning_rate": 1.6460510271832307e-05, "loss": 0.0131, "step": 9770 }, { "epoch": 2.4896001358349604, "grad_norm": 1.1411341428756714, "learning_rate": 1.6457116358624306e-05, "loss": 0.0139, "step": 9775 }, { "epoch": 2.490873588589863, "grad_norm": 1.2769149541854858, "learning_rate": 1.645372116929268e-05, "loss": 0.0168, "step": 9780 }, { "epoch": 2.492147041344766, "grad_norm": 0.7727435231208801, "learning_rate": 1.645032470450842e-05, "loss": 0.0102, "step": 9785 }, { "epoch": 2.493420494099669, "grad_norm": 0.8384838104248047, "learning_rate": 1.6446926964942767e-05, "loss": 0.0129, "step": 9790 }, { "epoch": 2.4946939468545715, "grad_norm": 1.0090346336364746, "learning_rate": 1.6443527951267222e-05, "loss": 0.0136, "step": 9795 }, { "epoch": 2.4959673996094747, "grad_norm": 1.2123687267303467, "learning_rate": 1.6440127664153532e-05, "loss": 0.014, "step": 9800 }, { "epoch": 2.4972408523643774, "grad_norm": 1.0073033571243286, "learning_rate": 1.6436726104273702e-05, "loss": 0.0139, "step": 9805 }, { "epoch": 2.49851430511928, "grad_norm": 1.220468521118164, "learning_rate": 1.6433323272299978e-05, "loss": 0.0157, "step": 9810 }, { "epoch": 2.499787757874183, "grad_norm": 0.6916623115539551, "learning_rate": 1.642991916890487e-05, "loss": 0.0102, "step": 9815 }, { "epoch": 2.5010612106290857, "grad_norm": 1.867563009262085, "learning_rate": 1.6426513794761126e-05, "loss": 0.0149, "step": 9820 }, { "epoch": 2.5023346633839885, "grad_norm": 3.7655584812164307, "learning_rate": 1.642310715054176e-05, "loss": 0.0208, "step": 9825 }, { "epoch": 2.5036081161388912, "grad_norm": 0.7786493897438049, "learning_rate": 1.6419699236920027e-05, "loss": 0.0157, "step": 9830 }, { "epoch": 2.504881568893794, "grad_norm": 1.8416900634765625, "learning_rate": 1.6416290054569427e-05, "loss": 0.023, "step": 9835 }, { "epoch": 2.5061550216486967, "grad_norm": 0.7881823778152466, "learning_rate": 1.6412879604163728e-05, "loss": 0.0081, "step": 9840 }, { "epoch": 2.5074284744035995, "grad_norm": 0.7593820691108704, "learning_rate": 1.6409467886376932e-05, "loss": 0.0156, "step": 9845 }, { "epoch": 2.5087019271585023, "grad_norm": 1.1502586603164673, "learning_rate": 1.6406054901883307e-05, "loss": 0.0167, "step": 9850 }, { "epoch": 2.5099753799134055, "grad_norm": 1.050551176071167, "learning_rate": 1.6402640651357356e-05, "loss": 0.0223, "step": 9855 }, { "epoch": 2.511248832668308, "grad_norm": 1.2325177192687988, "learning_rate": 1.6399225135473842e-05, "loss": 0.0139, "step": 9860 }, { "epoch": 2.512522285423211, "grad_norm": 1.096028447151184, "learning_rate": 1.639580835490778e-05, "loss": 0.0143, "step": 9865 }, { "epoch": 2.5137957381781137, "grad_norm": 1.3170334100723267, "learning_rate": 1.6392390310334422e-05, "loss": 0.0206, "step": 9870 }, { "epoch": 2.5150691909330165, "grad_norm": 0.8715745210647583, "learning_rate": 1.638897100242928e-05, "loss": 0.0116, "step": 9875 }, { "epoch": 2.5163426436879193, "grad_norm": 1.2198474407196045, "learning_rate": 1.6385550431868118e-05, "loss": 0.0164, "step": 9880 }, { "epoch": 2.517616096442822, "grad_norm": 1.0241812467575073, "learning_rate": 1.638212859932694e-05, "loss": 0.0151, "step": 9885 }, { "epoch": 2.518889549197725, "grad_norm": 0.8735573887825012, "learning_rate": 1.637870550548201e-05, "loss": 0.0172, "step": 9890 }, { "epoch": 2.5201630019526275, "grad_norm": 1.0637215375900269, "learning_rate": 1.6375281151009834e-05, "loss": 0.0123, "step": 9895 }, { "epoch": 2.5214364547075303, "grad_norm": 1.1650211811065674, "learning_rate": 1.6371855536587168e-05, "loss": 0.0194, "step": 9900 }, { "epoch": 2.522709907462433, "grad_norm": 1.2745596170425415, "learning_rate": 1.6368428662891018e-05, "loss": 0.0187, "step": 9905 }, { "epoch": 2.523983360217336, "grad_norm": 1.4422121047973633, "learning_rate": 1.636500053059864e-05, "loss": 0.0167, "step": 9910 }, { "epoch": 2.5252568129722386, "grad_norm": 1.4229894876480103, "learning_rate": 1.636157114038754e-05, "loss": 0.0125, "step": 9915 }, { "epoch": 2.526530265727142, "grad_norm": 0.6195891499519348, "learning_rate": 1.6358140492935467e-05, "loss": 0.0127, "step": 9920 }, { "epoch": 2.527803718482044, "grad_norm": 1.0253502130508423, "learning_rate": 1.6354708588920422e-05, "loss": 0.0141, "step": 9925 }, { "epoch": 2.5290771712369473, "grad_norm": 2.866917848587036, "learning_rate": 1.635127542902066e-05, "loss": 0.0203, "step": 9930 }, { "epoch": 2.53035062399185, "grad_norm": 1.1899516582489014, "learning_rate": 1.6347841013914666e-05, "loss": 0.0118, "step": 9935 }, { "epoch": 2.531624076746753, "grad_norm": 1.0087660551071167, "learning_rate": 1.63444053442812e-05, "loss": 0.0165, "step": 9940 }, { "epoch": 2.5328975295016556, "grad_norm": 1.5253407955169678, "learning_rate": 1.634096842079924e-05, "loss": 0.0124, "step": 9945 }, { "epoch": 2.5341709822565583, "grad_norm": 1.0079611539840698, "learning_rate": 1.6337530244148044e-05, "loss": 0.013, "step": 9950 }, { "epoch": 2.535444435011461, "grad_norm": 1.880912184715271, "learning_rate": 1.6334090815007085e-05, "loss": 0.0165, "step": 9955 }, { "epoch": 2.536717887766364, "grad_norm": 1.2397247552871704, "learning_rate": 1.6330650134056112e-05, "loss": 0.0146, "step": 9960 }, { "epoch": 2.5379913405212666, "grad_norm": 0.7682598829269409, "learning_rate": 1.63272082019751e-05, "loss": 0.0122, "step": 9965 }, { "epoch": 2.5392647932761694, "grad_norm": 1.5487027168273926, "learning_rate": 1.6323765019444287e-05, "loss": 0.013, "step": 9970 }, { "epoch": 2.540538246031072, "grad_norm": 0.932924747467041, "learning_rate": 1.6320320587144145e-05, "loss": 0.0128, "step": 9975 }, { "epoch": 2.541811698785975, "grad_norm": 1.6069331169128418, "learning_rate": 1.6316874905755402e-05, "loss": 0.014, "step": 9980 }, { "epoch": 2.543085151540878, "grad_norm": 1.3336604833602905, "learning_rate": 1.631342797595903e-05, "loss": 0.0235, "step": 9985 }, { "epoch": 2.5443586042957804, "grad_norm": 1.3603297472000122, "learning_rate": 1.630997979843625e-05, "loss": 0.0131, "step": 9990 }, { "epoch": 2.5456320570506836, "grad_norm": 1.4977880716323853, "learning_rate": 1.6306530373868518e-05, "loss": 0.0191, "step": 9995 }, { "epoch": 2.546905509805586, "grad_norm": 0.865764856338501, "learning_rate": 1.6303079702937558e-05, "loss": 0.0128, "step": 10000 }, { "epoch": 2.548178962560489, "grad_norm": 1.4530055522918701, "learning_rate": 1.629962778632532e-05, "loss": 0.0178, "step": 10005 }, { "epoch": 2.549452415315392, "grad_norm": 1.0574480295181274, "learning_rate": 1.6296174624714013e-05, "loss": 0.0172, "step": 10010 }, { "epoch": 2.5507258680702947, "grad_norm": 0.9590906500816345, "learning_rate": 1.629272021878608e-05, "loss": 0.0186, "step": 10015 }, { "epoch": 2.5519993208251974, "grad_norm": 0.7114768624305725, "learning_rate": 1.628926456922423e-05, "loss": 0.0088, "step": 10020 }, { "epoch": 2.5532727735801, "grad_norm": 1.261226773262024, "learning_rate": 1.6285807676711394e-05, "loss": 0.0101, "step": 10025 }, { "epoch": 2.554546226335003, "grad_norm": 0.9104642271995544, "learning_rate": 1.628234954193076e-05, "loss": 0.0182, "step": 10030 }, { "epoch": 2.5558196790899057, "grad_norm": 1.6106085777282715, "learning_rate": 1.6278890165565773e-05, "loss": 0.0191, "step": 10035 }, { "epoch": 2.5570931318448085, "grad_norm": 0.988638699054718, "learning_rate": 1.6275429548300096e-05, "loss": 0.0157, "step": 10040 }, { "epoch": 2.5583665845997112, "grad_norm": 0.9883014559745789, "learning_rate": 1.6271967690817662e-05, "loss": 0.0102, "step": 10045 }, { "epoch": 2.559640037354614, "grad_norm": 1.0194250345230103, "learning_rate": 1.6268504593802635e-05, "loss": 0.0182, "step": 10050 }, { "epoch": 2.5609134901095167, "grad_norm": 0.9590712189674377, "learning_rate": 1.6265040257939432e-05, "loss": 0.0113, "step": 10055 }, { "epoch": 2.56218694286442, "grad_norm": 1.851501226425171, "learning_rate": 1.6261574683912714e-05, "loss": 0.011, "step": 10060 }, { "epoch": 2.5634603956193223, "grad_norm": 1.2340863943099976, "learning_rate": 1.6258107872407376e-05, "loss": 0.0155, "step": 10065 }, { "epoch": 2.5647338483742255, "grad_norm": 0.9489201307296753, "learning_rate": 1.6254639824108575e-05, "loss": 0.0119, "step": 10070 }, { "epoch": 2.5660073011291282, "grad_norm": 1.2376753091812134, "learning_rate": 1.6251170539701702e-05, "loss": 0.0153, "step": 10075 }, { "epoch": 2.567280753884031, "grad_norm": 1.3984489440917969, "learning_rate": 1.624770001987239e-05, "loss": 0.0109, "step": 10080 }, { "epoch": 2.5685542066389337, "grad_norm": 0.7591749429702759, "learning_rate": 1.6244228265306517e-05, "loss": 0.0189, "step": 10085 }, { "epoch": 2.5698276593938365, "grad_norm": 1.4800140857696533, "learning_rate": 1.6240755276690216e-05, "loss": 0.0164, "step": 10090 }, { "epoch": 2.5711011121487393, "grad_norm": 1.355859637260437, "learning_rate": 1.6237281054709854e-05, "loss": 0.0142, "step": 10095 }, { "epoch": 2.572374564903642, "grad_norm": 1.4784408807754517, "learning_rate": 1.623380560005204e-05, "loss": 0.0113, "step": 10100 }, { "epoch": 2.573648017658545, "grad_norm": 0.8970335721969604, "learning_rate": 1.623032891340363e-05, "loss": 0.018, "step": 10105 }, { "epoch": 2.5749214704134475, "grad_norm": 1.0426064729690552, "learning_rate": 1.622685099545172e-05, "loss": 0.0168, "step": 10110 }, { "epoch": 2.5761949231683503, "grad_norm": 0.7218124866485596, "learning_rate": 1.6223371846883665e-05, "loss": 0.0142, "step": 10115 }, { "epoch": 2.577468375923253, "grad_norm": 0.7220420241355896, "learning_rate": 1.621989146838704e-05, "loss": 0.0114, "step": 10120 }, { "epoch": 2.5787418286781563, "grad_norm": 0.8553042411804199, "learning_rate": 1.6216409860649684e-05, "loss": 0.0142, "step": 10125 }, { "epoch": 2.5800152814330586, "grad_norm": 1.7239274978637695, "learning_rate": 1.621292702435966e-05, "loss": 0.0179, "step": 10130 }, { "epoch": 2.581288734187962, "grad_norm": 1.0451754331588745, "learning_rate": 1.6209442960205286e-05, "loss": 0.0164, "step": 10135 }, { "epoch": 2.5825621869428645, "grad_norm": 1.2873603105545044, "learning_rate": 1.620595766887512e-05, "loss": 0.0196, "step": 10140 }, { "epoch": 2.5838356396977673, "grad_norm": 1.0105712413787842, "learning_rate": 1.6202471151057962e-05, "loss": 0.0156, "step": 10145 }, { "epoch": 2.58510909245267, "grad_norm": 1.0693449974060059, "learning_rate": 1.6198983407442857e-05, "loss": 0.0159, "step": 10150 }, { "epoch": 2.586382545207573, "grad_norm": 3.416125535964966, "learning_rate": 1.6195494438719087e-05, "loss": 0.0125, "step": 10155 }, { "epoch": 2.5876559979624756, "grad_norm": 0.9327288866043091, "learning_rate": 1.6192004245576177e-05, "loss": 0.0147, "step": 10160 }, { "epoch": 2.5889294507173783, "grad_norm": 1.8597617149353027, "learning_rate": 1.6188512828703902e-05, "loss": 0.0194, "step": 10165 }, { "epoch": 2.590202903472281, "grad_norm": 1.1001644134521484, "learning_rate": 1.6185020188792263e-05, "loss": 0.0191, "step": 10170 }, { "epoch": 2.591476356227184, "grad_norm": 0.8140286803245544, "learning_rate": 1.6181526326531525e-05, "loss": 0.018, "step": 10175 }, { "epoch": 2.5927498089820866, "grad_norm": 1.4987162351608276, "learning_rate": 1.6178031242612172e-05, "loss": 0.0153, "step": 10180 }, { "epoch": 2.5940232617369894, "grad_norm": 1.3844197988510132, "learning_rate": 1.6174534937724943e-05, "loss": 0.0173, "step": 10185 }, { "epoch": 2.5952967144918926, "grad_norm": 1.5325015783309937, "learning_rate": 1.6171037412560817e-05, "loss": 0.0179, "step": 10190 }, { "epoch": 2.596570167246795, "grad_norm": 0.955522894859314, "learning_rate": 1.6167538667811006e-05, "loss": 0.0171, "step": 10195 }, { "epoch": 2.597843620001698, "grad_norm": 0.8396666049957275, "learning_rate": 1.6164038704166977e-05, "loss": 0.0164, "step": 10200 }, { "epoch": 2.599117072756601, "grad_norm": 0.7276314496994019, "learning_rate": 1.616053752232042e-05, "loss": 0.0121, "step": 10205 }, { "epoch": 2.6003905255115036, "grad_norm": 0.7143403887748718, "learning_rate": 1.6157035122963285e-05, "loss": 0.0177, "step": 10210 }, { "epoch": 2.6016639782664064, "grad_norm": 1.2120096683502197, "learning_rate": 1.6153531506787746e-05, "loss": 0.0168, "step": 10215 }, { "epoch": 2.602937431021309, "grad_norm": 1.559185266494751, "learning_rate": 1.615002667448623e-05, "loss": 0.0176, "step": 10220 }, { "epoch": 2.604210883776212, "grad_norm": 1.2011380195617676, "learning_rate": 1.6146520626751397e-05, "loss": 0.0178, "step": 10225 }, { "epoch": 2.6054843365311147, "grad_norm": 1.1892247200012207, "learning_rate": 1.614301336427615e-05, "loss": 0.0176, "step": 10230 }, { "epoch": 2.6067577892860174, "grad_norm": 0.29666534066200256, "learning_rate": 1.6139504887753624e-05, "loss": 0.0185, "step": 10235 }, { "epoch": 2.60803124204092, "grad_norm": 0.5440550446510315, "learning_rate": 1.6135995197877216e-05, "loss": 0.0178, "step": 10240 }, { "epoch": 2.609304694795823, "grad_norm": 1.017486572265625, "learning_rate": 1.6132484295340536e-05, "loss": 0.0088, "step": 10245 }, { "epoch": 2.6105781475507257, "grad_norm": 1.7340039014816284, "learning_rate": 1.612897218083745e-05, "loss": 0.0156, "step": 10250 }, { "epoch": 2.611851600305629, "grad_norm": 1.0880515575408936, "learning_rate": 1.6125458855062056e-05, "loss": 0.0146, "step": 10255 }, { "epoch": 2.6131250530605312, "grad_norm": 0.8836652636528015, "learning_rate": 1.61219443187087e-05, "loss": 0.0146, "step": 10260 }, { "epoch": 2.6143985058154344, "grad_norm": 0.9027925133705139, "learning_rate": 1.611842857247196e-05, "loss": 0.0142, "step": 10265 }, { "epoch": 2.615671958570337, "grad_norm": 1.185311198234558, "learning_rate": 1.611491161704665e-05, "loss": 0.0174, "step": 10270 }, { "epoch": 2.61694541132524, "grad_norm": 1.4064494371414185, "learning_rate": 1.6111393453127835e-05, "loss": 0.018, "step": 10275 }, { "epoch": 2.6182188640801427, "grad_norm": 1.1505972146987915, "learning_rate": 1.6107874081410807e-05, "loss": 0.0145, "step": 10280 }, { "epoch": 2.6194923168350455, "grad_norm": 1.1449698209762573, "learning_rate": 1.6104353502591105e-05, "loss": 0.0124, "step": 10285 }, { "epoch": 2.6207657695899482, "grad_norm": 0.6622280478477478, "learning_rate": 1.61008317173645e-05, "loss": 0.0174, "step": 10290 }, { "epoch": 2.622039222344851, "grad_norm": 1.4143646955490112, "learning_rate": 1.6097308726427007e-05, "loss": 0.012, "step": 10295 }, { "epoch": 2.6233126750997537, "grad_norm": 0.6762780547142029, "learning_rate": 1.6093784530474872e-05, "loss": 0.0205, "step": 10300 }, { "epoch": 2.6245861278546565, "grad_norm": 1.2240090370178223, "learning_rate": 1.609025913020459e-05, "loss": 0.0167, "step": 10305 }, { "epoch": 2.6258595806095593, "grad_norm": 0.6872504353523254, "learning_rate": 1.6086732526312884e-05, "loss": 0.0152, "step": 10310 }, { "epoch": 2.627133033364462, "grad_norm": 1.1986842155456543, "learning_rate": 1.6083204719496717e-05, "loss": 0.0135, "step": 10315 }, { "epoch": 2.6284064861193652, "grad_norm": 1.9124962091445923, "learning_rate": 1.6079675710453302e-05, "loss": 0.0253, "step": 10320 }, { "epoch": 2.6296799388742675, "grad_norm": 1.2719781398773193, "learning_rate": 1.6076145499880068e-05, "loss": 0.0165, "step": 10325 }, { "epoch": 2.6309533916291707, "grad_norm": 1.2900980710983276, "learning_rate": 1.6072614088474693e-05, "loss": 0.0183, "step": 10330 }, { "epoch": 2.6322268443840735, "grad_norm": 1.24074125289917, "learning_rate": 1.6069081476935097e-05, "loss": 0.0153, "step": 10335 }, { "epoch": 2.6335002971389763, "grad_norm": 0.9373801350593567, "learning_rate": 1.606554766595943e-05, "loss": 0.0137, "step": 10340 }, { "epoch": 2.634773749893879, "grad_norm": 1.2782284021377563, "learning_rate": 1.606201265624608e-05, "loss": 0.0186, "step": 10345 }, { "epoch": 2.636047202648782, "grad_norm": 1.44431734085083, "learning_rate": 1.6058476448493673e-05, "loss": 0.0178, "step": 10350 }, { "epoch": 2.6373206554036845, "grad_norm": 1.799264669418335, "learning_rate": 1.6054939043401078e-05, "loss": 0.0244, "step": 10355 }, { "epoch": 2.6385941081585873, "grad_norm": 1.4799847602844238, "learning_rate": 1.6051400441667384e-05, "loss": 0.0169, "step": 10360 }, { "epoch": 2.63986756091349, "grad_norm": 1.3281598091125488, "learning_rate": 1.6047860643991933e-05, "loss": 0.0165, "step": 10365 }, { "epoch": 2.641141013668393, "grad_norm": 1.7919460535049438, "learning_rate": 1.6044319651074298e-05, "loss": 0.0208, "step": 10370 }, { "epoch": 2.6424144664232956, "grad_norm": 1.2256412506103516, "learning_rate": 1.6040777463614283e-05, "loss": 0.0157, "step": 10375 }, { "epoch": 2.6436879191781983, "grad_norm": 1.0350561141967773, "learning_rate": 1.603723408231194e-05, "loss": 0.0139, "step": 10380 }, { "epoch": 2.6449613719331015, "grad_norm": 1.6675766706466675, "learning_rate": 1.603368950786754e-05, "loss": 0.0169, "step": 10385 }, { "epoch": 2.646234824688004, "grad_norm": 1.0125664472579956, "learning_rate": 1.6030143740981605e-05, "loss": 0.0156, "step": 10390 }, { "epoch": 2.647508277442907, "grad_norm": 0.9311812520027161, "learning_rate": 1.602659678235489e-05, "loss": 0.0157, "step": 10395 }, { "epoch": 2.64878173019781, "grad_norm": 1.5387394428253174, "learning_rate": 1.6023048632688377e-05, "loss": 0.017, "step": 10400 }, { "epoch": 2.6500551829527126, "grad_norm": 1.4177525043487549, "learning_rate": 1.6019499292683285e-05, "loss": 0.0177, "step": 10405 }, { "epoch": 2.6513286357076153, "grad_norm": 1.3488670587539673, "learning_rate": 1.6015948763041084e-05, "loss": 0.0159, "step": 10410 }, { "epoch": 2.652602088462518, "grad_norm": 1.8314217329025269, "learning_rate": 1.6012397044463458e-05, "loss": 0.0174, "step": 10415 }, { "epoch": 2.653875541217421, "grad_norm": 0.7083155512809753, "learning_rate": 1.600884413765234e-05, "loss": 0.0131, "step": 10420 }, { "epoch": 2.6551489939723236, "grad_norm": 0.8347048759460449, "learning_rate": 1.600529004330989e-05, "loss": 0.0177, "step": 10425 }, { "epoch": 2.6564224467272264, "grad_norm": 1.5260933637619019, "learning_rate": 1.6001734762138507e-05, "loss": 0.0153, "step": 10430 }, { "epoch": 2.657695899482129, "grad_norm": 1.1393858194351196, "learning_rate": 1.599817829484082e-05, "loss": 0.0166, "step": 10435 }, { "epoch": 2.658969352237032, "grad_norm": 1.331371784210205, "learning_rate": 1.59946206421197e-05, "loss": 0.0144, "step": 10440 }, { "epoch": 2.6602428049919347, "grad_norm": 0.9645492434501648, "learning_rate": 1.599106180467825e-05, "loss": 0.0178, "step": 10445 }, { "epoch": 2.661516257746838, "grad_norm": 2.0610032081604004, "learning_rate": 1.59875017832198e-05, "loss": 0.0177, "step": 10450 }, { "epoch": 2.66278971050174, "grad_norm": 2.452064275741577, "learning_rate": 1.598394057844792e-05, "loss": 0.021, "step": 10455 }, { "epoch": 2.6640631632566434, "grad_norm": 1.4035615921020508, "learning_rate": 1.5980378191066415e-05, "loss": 0.0204, "step": 10460 }, { "epoch": 2.6653366160115457, "grad_norm": 1.298882007598877, "learning_rate": 1.5976814621779318e-05, "loss": 0.0162, "step": 10465 }, { "epoch": 2.666610068766449, "grad_norm": 2.409097194671631, "learning_rate": 1.5973249871290906e-05, "loss": 0.0122, "step": 10470 }, { "epoch": 2.6678835215213517, "grad_norm": 1.9847806692123413, "learning_rate": 1.5969683940305675e-05, "loss": 0.0199, "step": 10475 }, { "epoch": 2.6691569742762544, "grad_norm": 1.3485208749771118, "learning_rate": 1.5966116829528364e-05, "loss": 0.0161, "step": 10480 }, { "epoch": 2.670430427031157, "grad_norm": 1.1518607139587402, "learning_rate": 1.5962548539663945e-05, "loss": 0.0171, "step": 10485 }, { "epoch": 2.67170387978606, "grad_norm": 1.6843754053115845, "learning_rate": 1.5958979071417624e-05, "loss": 0.0152, "step": 10490 }, { "epoch": 2.6729773325409627, "grad_norm": 1.0463300943374634, "learning_rate": 1.5955408425494833e-05, "loss": 0.0119, "step": 10495 }, { "epoch": 2.6742507852958655, "grad_norm": 1.8730124235153198, "learning_rate": 1.595183660260124e-05, "loss": 0.0206, "step": 10500 }, { "epoch": 2.6755242380507682, "grad_norm": 1.5182993412017822, "learning_rate": 1.5948263603442748e-05, "loss": 0.0127, "step": 10505 }, { "epoch": 2.676797690805671, "grad_norm": 0.8937500715255737, "learning_rate": 1.5944689428725492e-05, "loss": 0.012, "step": 10510 }, { "epoch": 2.6780711435605737, "grad_norm": 0.7913023829460144, "learning_rate": 1.5941114079155836e-05, "loss": 0.0182, "step": 10515 }, { "epoch": 2.6793445963154765, "grad_norm": 1.3976962566375732, "learning_rate": 1.593753755544038e-05, "loss": 0.017, "step": 10520 }, { "epoch": 2.6806180490703797, "grad_norm": 0.8003461360931396, "learning_rate": 1.5933959858285954e-05, "loss": 0.0185, "step": 10525 }, { "epoch": 2.681891501825282, "grad_norm": 0.7758123874664307, "learning_rate": 1.593038098839962e-05, "loss": 0.0134, "step": 10530 }, { "epoch": 2.6831649545801852, "grad_norm": 0.5670610666275024, "learning_rate": 1.592680094648867e-05, "loss": 0.0167, "step": 10535 }, { "epoch": 2.684438407335088, "grad_norm": 1.3237898349761963, "learning_rate": 1.592321973326064e-05, "loss": 0.0156, "step": 10540 }, { "epoch": 2.6857118600899907, "grad_norm": 2.0601906776428223, "learning_rate": 1.591963734942327e-05, "loss": 0.0166, "step": 10545 }, { "epoch": 2.6869853128448935, "grad_norm": 0.6005727648735046, "learning_rate": 1.5916053795684563e-05, "loss": 0.012, "step": 10550 }, { "epoch": 2.6882587655997963, "grad_norm": 1.836086630821228, "learning_rate": 1.5912469072752736e-05, "loss": 0.0112, "step": 10555 }, { "epoch": 2.689532218354699, "grad_norm": 1.1017671823501587, "learning_rate": 1.590888318133623e-05, "loss": 0.0167, "step": 10560 }, { "epoch": 2.690805671109602, "grad_norm": 1.4640567302703857, "learning_rate": 1.590529612214374e-05, "loss": 0.0187, "step": 10565 }, { "epoch": 2.6920791238645045, "grad_norm": 1.3125741481781006, "learning_rate": 1.5901707895884176e-05, "loss": 0.0177, "step": 10570 }, { "epoch": 2.6933525766194073, "grad_norm": 1.0670984983444214, "learning_rate": 1.5898118503266674e-05, "loss": 0.016, "step": 10575 }, { "epoch": 2.69462602937431, "grad_norm": 1.2886327505111694, "learning_rate": 1.589452794500061e-05, "loss": 0.0189, "step": 10580 }, { "epoch": 2.695899482129213, "grad_norm": 1.0661747455596924, "learning_rate": 1.5890936221795593e-05, "loss": 0.017, "step": 10585 }, { "epoch": 2.697172934884116, "grad_norm": 0.9505540132522583, "learning_rate": 1.5887343334361455e-05, "loss": 0.0159, "step": 10590 }, { "epoch": 2.6984463876390183, "grad_norm": 0.7109880447387695, "learning_rate": 1.5883749283408255e-05, "loss": 0.0106, "step": 10595 }, { "epoch": 2.6997198403939215, "grad_norm": 2.3289144039154053, "learning_rate": 1.5880154069646296e-05, "loss": 0.0139, "step": 10600 }, { "epoch": 2.7009932931488243, "grad_norm": 1.816609263420105, "learning_rate": 1.5876557693786102e-05, "loss": 0.0133, "step": 10605 }, { "epoch": 2.702266745903727, "grad_norm": 0.6822256445884705, "learning_rate": 1.5872960156538417e-05, "loss": 0.0101, "step": 10610 }, { "epoch": 2.70354019865863, "grad_norm": 0.7440282106399536, "learning_rate": 1.5869361458614233e-05, "loss": 0.0161, "step": 10615 }, { "epoch": 2.7048136514135326, "grad_norm": 1.5055968761444092, "learning_rate": 1.5865761600724758e-05, "loss": 0.02, "step": 10620 }, { "epoch": 2.7060871041684353, "grad_norm": 1.001340389251709, "learning_rate": 1.586216058358144e-05, "loss": 0.0136, "step": 10625 }, { "epoch": 2.707360556923338, "grad_norm": 1.523153305053711, "learning_rate": 1.5858558407895945e-05, "loss": 0.0155, "step": 10630 }, { "epoch": 2.708634009678241, "grad_norm": 1.2985118627548218, "learning_rate": 1.5854955074380172e-05, "loss": 0.0116, "step": 10635 }, { "epoch": 2.7099074624331436, "grad_norm": 1.528686761856079, "learning_rate": 1.5851350583746256e-05, "loss": 0.0137, "step": 10640 }, { "epoch": 2.7111809151880464, "grad_norm": 1.1369520425796509, "learning_rate": 1.5847744936706548e-05, "loss": 0.0172, "step": 10645 }, { "epoch": 2.712454367942949, "grad_norm": 1.6134026050567627, "learning_rate": 1.584413813397364e-05, "loss": 0.015, "step": 10650 }, { "epoch": 2.7137278206978523, "grad_norm": 1.146395206451416, "learning_rate": 1.584053017626034e-05, "loss": 0.0161, "step": 10655 }, { "epoch": 2.7150012734527547, "grad_norm": 1.2166661024093628, "learning_rate": 1.5836921064279695e-05, "loss": 0.0211, "step": 10660 }, { "epoch": 2.716274726207658, "grad_norm": 1.69478178024292, "learning_rate": 1.5833310798744975e-05, "loss": 0.0158, "step": 10665 }, { "epoch": 2.7175481789625606, "grad_norm": 3.580169677734375, "learning_rate": 1.5829699380369682e-05, "loss": 0.0142, "step": 10670 }, { "epoch": 2.7188216317174634, "grad_norm": 0.8435219526290894, "learning_rate": 1.582608680986754e-05, "loss": 0.0095, "step": 10675 }, { "epoch": 2.720095084472366, "grad_norm": 1.3723564147949219, "learning_rate": 1.5822473087952495e-05, "loss": 0.0172, "step": 10680 }, { "epoch": 2.721368537227269, "grad_norm": 0.740059494972229, "learning_rate": 1.5818858215338744e-05, "loss": 0.0111, "step": 10685 }, { "epoch": 2.7226419899821717, "grad_norm": 0.8255486488342285, "learning_rate": 1.5815242192740686e-05, "loss": 0.0138, "step": 10690 }, { "epoch": 2.7239154427370744, "grad_norm": 1.2990361452102661, "learning_rate": 1.5811625020872967e-05, "loss": 0.0181, "step": 10695 }, { "epoch": 2.725188895491977, "grad_norm": 1.3566499948501587, "learning_rate": 1.580800670045044e-05, "loss": 0.0183, "step": 10700 }, { "epoch": 2.72646234824688, "grad_norm": 1.188035249710083, "learning_rate": 1.5804387232188202e-05, "loss": 0.0174, "step": 10705 }, { "epoch": 2.7277358010017827, "grad_norm": 2.057582378387451, "learning_rate": 1.5800766616801565e-05, "loss": 0.0152, "step": 10710 }, { "epoch": 2.7290092537566855, "grad_norm": 1.0818636417388916, "learning_rate": 1.5797144855006084e-05, "loss": 0.0131, "step": 10715 }, { "epoch": 2.7302827065115887, "grad_norm": 1.1613038778305054, "learning_rate": 1.5793521947517517e-05, "loss": 0.0225, "step": 10720 }, { "epoch": 2.731556159266491, "grad_norm": 1.1352412700653076, "learning_rate": 1.578989789505187e-05, "loss": 0.0115, "step": 10725 }, { "epoch": 2.732829612021394, "grad_norm": 0.9621849656105042, "learning_rate": 1.5786272698325365e-05, "loss": 0.0109, "step": 10730 }, { "epoch": 2.734103064776297, "grad_norm": 1.5821828842163086, "learning_rate": 1.578264635805445e-05, "loss": 0.0166, "step": 10735 }, { "epoch": 2.7353765175311997, "grad_norm": 1.6508166790008545, "learning_rate": 1.57790188749558e-05, "loss": 0.0192, "step": 10740 }, { "epoch": 2.7366499702861025, "grad_norm": 0.9790111184120178, "learning_rate": 1.5775390249746324e-05, "loss": 0.0146, "step": 10745 }, { "epoch": 2.737923423041005, "grad_norm": 1.8611313104629517, "learning_rate": 1.577176048314314e-05, "loss": 0.0183, "step": 10750 }, { "epoch": 2.739196875795908, "grad_norm": 1.2228561639785767, "learning_rate": 1.57681295758636e-05, "loss": 0.0168, "step": 10755 }, { "epoch": 2.7404703285508107, "grad_norm": 0.45614203810691833, "learning_rate": 1.576449752862529e-05, "loss": 0.0135, "step": 10760 }, { "epoch": 2.7417437813057135, "grad_norm": 0.8259633779525757, "learning_rate": 1.576086434214601e-05, "loss": 0.0145, "step": 10765 }, { "epoch": 2.7430172340606163, "grad_norm": 1.1204288005828857, "learning_rate": 1.5757230017143792e-05, "loss": 0.0159, "step": 10770 }, { "epoch": 2.744290686815519, "grad_norm": 1.2127383947372437, "learning_rate": 1.5753594554336885e-05, "loss": 0.0176, "step": 10775 }, { "epoch": 2.745564139570422, "grad_norm": 1.5748369693756104, "learning_rate": 1.5749957954443768e-05, "loss": 0.0151, "step": 10780 }, { "epoch": 2.746837592325325, "grad_norm": 1.3296458721160889, "learning_rate": 1.5746320218183148e-05, "loss": 0.0187, "step": 10785 }, { "epoch": 2.7481110450802273, "grad_norm": 1.0497132539749146, "learning_rate": 1.5742681346273946e-05, "loss": 0.0166, "step": 10790 }, { "epoch": 2.7493844978351305, "grad_norm": 0.9039579033851624, "learning_rate": 1.5739041339435323e-05, "loss": 0.0131, "step": 10795 }, { "epoch": 2.7506579505900333, "grad_norm": 1.327143907546997, "learning_rate": 1.573540019838665e-05, "loss": 0.0159, "step": 10800 }, { "epoch": 2.751931403344936, "grad_norm": 1.2444303035736084, "learning_rate": 1.573175792384753e-05, "loss": 0.0163, "step": 10805 }, { "epoch": 2.753204856099839, "grad_norm": 1.413147211074829, "learning_rate": 1.5728114516537785e-05, "loss": 0.0134, "step": 10810 }, { "epoch": 2.7544783088547415, "grad_norm": 1.215092658996582, "learning_rate": 1.572446997717747e-05, "loss": 0.0171, "step": 10815 }, { "epoch": 2.7557517616096443, "grad_norm": 1.3840967416763306, "learning_rate": 1.572082430648685e-05, "loss": 0.0229, "step": 10820 }, { "epoch": 2.757025214364547, "grad_norm": 1.440315842628479, "learning_rate": 1.5717177505186424e-05, "loss": 0.015, "step": 10825 }, { "epoch": 2.75829866711945, "grad_norm": 1.4010885953903198, "learning_rate": 1.5713529573996912e-05, "loss": 0.0117, "step": 10830 }, { "epoch": 2.7595721198743526, "grad_norm": 0.7399412393569946, "learning_rate": 1.570988051363926e-05, "loss": 0.0128, "step": 10835 }, { "epoch": 2.7608455726292553, "grad_norm": 1.746846318244934, "learning_rate": 1.570623032483463e-05, "loss": 0.0226, "step": 10840 }, { "epoch": 2.762119025384158, "grad_norm": 1.0048682689666748, "learning_rate": 1.5702579008304403e-05, "loss": 0.0115, "step": 10845 }, { "epoch": 2.7633924781390613, "grad_norm": 0.9712662100791931, "learning_rate": 1.5698926564770206e-05, "loss": 0.0137, "step": 10850 }, { "epoch": 2.7646659308939636, "grad_norm": 1.2745163440704346, "learning_rate": 1.5695272994953867e-05, "loss": 0.0164, "step": 10855 }, { "epoch": 2.765939383648867, "grad_norm": 1.0051828622817993, "learning_rate": 1.5691618299577444e-05, "loss": 0.0147, "step": 10860 }, { "epoch": 2.7672128364037696, "grad_norm": 1.1809275150299072, "learning_rate": 1.568796247936321e-05, "loss": 0.0149, "step": 10865 }, { "epoch": 2.7684862891586723, "grad_norm": 1.1123275756835938, "learning_rate": 1.5684305535033676e-05, "loss": 0.0165, "step": 10870 }, { "epoch": 2.769759741913575, "grad_norm": 1.2347079515457153, "learning_rate": 1.568064746731156e-05, "loss": 0.0128, "step": 10875 }, { "epoch": 2.771033194668478, "grad_norm": 0.5509200096130371, "learning_rate": 1.567698827691981e-05, "loss": 0.0168, "step": 10880 }, { "epoch": 2.7723066474233806, "grad_norm": 1.3377132415771484, "learning_rate": 1.5673327964581596e-05, "loss": 0.013, "step": 10885 }, { "epoch": 2.7735801001782834, "grad_norm": 0.751311719417572, "learning_rate": 1.5669666531020303e-05, "loss": 0.0123, "step": 10890 }, { "epoch": 2.774853552933186, "grad_norm": 1.5870641469955444, "learning_rate": 1.5666003976959548e-05, "loss": 0.0172, "step": 10895 }, { "epoch": 2.776127005688089, "grad_norm": 1.0827378034591675, "learning_rate": 1.566234030312316e-05, "loss": 0.0179, "step": 10900 }, { "epoch": 2.7774004584429917, "grad_norm": 1.4116692543029785, "learning_rate": 1.5658675510235193e-05, "loss": 0.0157, "step": 10905 }, { "epoch": 2.7786739111978944, "grad_norm": 1.6567314863204956, "learning_rate": 1.565500959901992e-05, "loss": 0.016, "step": 10910 }, { "epoch": 2.7799473639527976, "grad_norm": 0.9211397767066956, "learning_rate": 1.5651342570201843e-05, "loss": 0.0174, "step": 10915 }, { "epoch": 2.7812208167077, "grad_norm": 0.9430907964706421, "learning_rate": 1.5647674424505677e-05, "loss": 0.0113, "step": 10920 }, { "epoch": 2.782494269462603, "grad_norm": 0.8518361449241638, "learning_rate": 1.564400516265636e-05, "loss": 0.0156, "step": 10925 }, { "epoch": 2.7837677222175055, "grad_norm": 0.930211067199707, "learning_rate": 1.564033478537904e-05, "loss": 0.0112, "step": 10930 }, { "epoch": 2.7850411749724087, "grad_norm": 0.7611104249954224, "learning_rate": 1.5636663293399112e-05, "loss": 0.0172, "step": 10935 }, { "epoch": 2.7863146277273114, "grad_norm": 1.0545941591262817, "learning_rate": 1.5632990687442165e-05, "loss": 0.0133, "step": 10940 }, { "epoch": 2.787588080482214, "grad_norm": 1.2123072147369385, "learning_rate": 1.5629316968234024e-05, "loss": 0.0192, "step": 10945 }, { "epoch": 2.788861533237117, "grad_norm": 1.1898143291473389, "learning_rate": 1.5625642136500725e-05, "loss": 0.0128, "step": 10950 }, { "epoch": 2.7901349859920197, "grad_norm": 1.1006982326507568, "learning_rate": 1.5621966192968526e-05, "loss": 0.015, "step": 10955 }, { "epoch": 2.7914084387469225, "grad_norm": 0.9142806529998779, "learning_rate": 1.561828913836391e-05, "loss": 0.0158, "step": 10960 }, { "epoch": 2.792681891501825, "grad_norm": 1.5550332069396973, "learning_rate": 1.5614610973413565e-05, "loss": 0.0199, "step": 10965 }, { "epoch": 2.793955344256728, "grad_norm": 1.2927037477493286, "learning_rate": 1.5610931698844423e-05, "loss": 0.0177, "step": 10970 }, { "epoch": 2.7952287970116307, "grad_norm": 1.338070273399353, "learning_rate": 1.5607251315383612e-05, "loss": 0.0166, "step": 10975 }, { "epoch": 2.7965022497665335, "grad_norm": 1.4398558139801025, "learning_rate": 1.5603569823758494e-05, "loss": 0.017, "step": 10980 }, { "epoch": 2.7977757025214363, "grad_norm": 1.6990877389907837, "learning_rate": 1.559988722469664e-05, "loss": 0.0159, "step": 10985 }, { "epoch": 2.7990491552763395, "grad_norm": 1.5410538911819458, "learning_rate": 1.5596203518925844e-05, "loss": 0.0169, "step": 10990 }, { "epoch": 2.8003226080312418, "grad_norm": 0.9620311856269836, "learning_rate": 1.5592518707174122e-05, "loss": 0.0165, "step": 10995 }, { "epoch": 2.801596060786145, "grad_norm": 0.8022574782371521, "learning_rate": 1.5588832790169704e-05, "loss": 0.0176, "step": 11000 }, { "epoch": 2.8028695135410477, "grad_norm": 1.5435161590576172, "learning_rate": 1.5585145768641038e-05, "loss": 0.0176, "step": 11005 }, { "epoch": 2.8041429662959505, "grad_norm": 1.2940621376037598, "learning_rate": 1.5581457643316796e-05, "loss": 0.0182, "step": 11010 }, { "epoch": 2.8054164190508533, "grad_norm": 1.420082926750183, "learning_rate": 1.557776841492586e-05, "loss": 0.0138, "step": 11015 }, { "epoch": 2.806689871805756, "grad_norm": 1.298505425453186, "learning_rate": 1.557407808419734e-05, "loss": 0.013, "step": 11020 }, { "epoch": 2.807963324560659, "grad_norm": 1.063286542892456, "learning_rate": 1.5570386651860548e-05, "loss": 0.0142, "step": 11025 }, { "epoch": 2.8092367773155615, "grad_norm": 1.165483832359314, "learning_rate": 1.5566694118645035e-05, "loss": 0.0211, "step": 11030 }, { "epoch": 2.8105102300704643, "grad_norm": 0.9184626340866089, "learning_rate": 1.5563000485280554e-05, "loss": 0.0192, "step": 11035 }, { "epoch": 2.811783682825367, "grad_norm": 1.005933165550232, "learning_rate": 1.5559305752497083e-05, "loss": 0.0094, "step": 11040 }, { "epoch": 2.81305713558027, "grad_norm": 1.012657642364502, "learning_rate": 1.5555609921024804e-05, "loss": 0.0163, "step": 11045 }, { "epoch": 2.8143305883351726, "grad_norm": 1.9123409986495972, "learning_rate": 1.555191299159414e-05, "loss": 0.0238, "step": 11050 }, { "epoch": 2.815604041090076, "grad_norm": 0.9774526357650757, "learning_rate": 1.554821496493571e-05, "loss": 0.0172, "step": 11055 }, { "epoch": 2.816877493844978, "grad_norm": 1.0030287504196167, "learning_rate": 1.5544515841780358e-05, "loss": 0.0163, "step": 11060 }, { "epoch": 2.8181509465998813, "grad_norm": 1.0729728937149048, "learning_rate": 1.554081562285914e-05, "loss": 0.0189, "step": 11065 }, { "epoch": 2.819424399354784, "grad_norm": 1.4268800020217896, "learning_rate": 1.553711430890334e-05, "loss": 0.0115, "step": 11070 }, { "epoch": 2.820697852109687, "grad_norm": 1.1924725770950317, "learning_rate": 1.5533411900644444e-05, "loss": 0.0201, "step": 11075 }, { "epoch": 2.8219713048645896, "grad_norm": 1.307098388671875, "learning_rate": 1.5529708398814167e-05, "loss": 0.0181, "step": 11080 }, { "epoch": 2.8232447576194923, "grad_norm": 1.1501268148422241, "learning_rate": 1.5526003804144432e-05, "loss": 0.0158, "step": 11085 }, { "epoch": 2.824518210374395, "grad_norm": 1.4774305820465088, "learning_rate": 1.5522298117367375e-05, "loss": 0.0217, "step": 11090 }, { "epoch": 2.825791663129298, "grad_norm": 1.1305631399154663, "learning_rate": 1.551859133921536e-05, "loss": 0.0144, "step": 11095 }, { "epoch": 2.8270651158842006, "grad_norm": 1.283125638961792, "learning_rate": 1.551488347042096e-05, "loss": 0.0148, "step": 11100 }, { "epoch": 2.8283385686391034, "grad_norm": 1.1507362127304077, "learning_rate": 1.5511174511716958e-05, "loss": 0.0156, "step": 11105 }, { "epoch": 2.829612021394006, "grad_norm": 1.4488322734832764, "learning_rate": 1.550746446383636e-05, "loss": 0.0137, "step": 11110 }, { "epoch": 2.830885474148909, "grad_norm": 1.4301221370697021, "learning_rate": 1.5503753327512384e-05, "loss": 0.0194, "step": 11115 }, { "epoch": 2.832158926903812, "grad_norm": 1.1989166736602783, "learning_rate": 1.5500041103478464e-05, "loss": 0.0204, "step": 11120 }, { "epoch": 2.8334323796587144, "grad_norm": 0.8560177087783813, "learning_rate": 1.5496327792468254e-05, "loss": 0.013, "step": 11125 }, { "epoch": 2.8347058324136176, "grad_norm": 1.1194442510604858, "learning_rate": 1.549261339521561e-05, "loss": 0.0191, "step": 11130 }, { "epoch": 2.8359792851685204, "grad_norm": 1.1512329578399658, "learning_rate": 1.5488897912454616e-05, "loss": 0.0149, "step": 11135 }, { "epoch": 2.837252737923423, "grad_norm": 1.3729572296142578, "learning_rate": 1.548518134491956e-05, "loss": 0.0185, "step": 11140 }, { "epoch": 2.838526190678326, "grad_norm": 0.9181482791900635, "learning_rate": 1.5481463693344958e-05, "loss": 0.0117, "step": 11145 }, { "epoch": 2.8397996434332287, "grad_norm": 1.032310128211975, "learning_rate": 1.5477744958465525e-05, "loss": 0.0149, "step": 11150 }, { "epoch": 2.8410730961881314, "grad_norm": 1.1570444107055664, "learning_rate": 1.5474025141016196e-05, "loss": 0.0142, "step": 11155 }, { "epoch": 2.842346548943034, "grad_norm": 1.5869140625, "learning_rate": 1.547030424173212e-05, "loss": 0.0169, "step": 11160 }, { "epoch": 2.843620001697937, "grad_norm": 1.1697545051574707, "learning_rate": 1.546658226134867e-05, "loss": 0.017, "step": 11165 }, { "epoch": 2.8448934544528397, "grad_norm": 0.7103164792060852, "learning_rate": 1.5462859200601412e-05, "loss": 0.0112, "step": 11170 }, { "epoch": 2.8461669072077425, "grad_norm": 1.5863962173461914, "learning_rate": 1.545913506022614e-05, "loss": 0.0185, "step": 11175 }, { "epoch": 2.847440359962645, "grad_norm": 1.081742763519287, "learning_rate": 1.5455409840958863e-05, "loss": 0.0183, "step": 11180 }, { "epoch": 2.8487138127175484, "grad_norm": 0.9471403360366821, "learning_rate": 1.545168354353579e-05, "loss": 0.015, "step": 11185 }, { "epoch": 2.8499872654724507, "grad_norm": 1.1155855655670166, "learning_rate": 1.5447956168693355e-05, "loss": 0.0232, "step": 11190 }, { "epoch": 2.851260718227354, "grad_norm": 1.0420583486557007, "learning_rate": 1.5444227717168208e-05, "loss": 0.0192, "step": 11195 }, { "epoch": 2.8525341709822567, "grad_norm": 1.1612029075622559, "learning_rate": 1.544049818969719e-05, "loss": 0.0157, "step": 11200 }, { "epoch": 2.8538076237371595, "grad_norm": 0.8114510178565979, "learning_rate": 1.5436767587017384e-05, "loss": 0.0143, "step": 11205 }, { "epoch": 2.855081076492062, "grad_norm": 1.343857765197754, "learning_rate": 1.5433035909866067e-05, "loss": 0.0166, "step": 11210 }, { "epoch": 2.856354529246965, "grad_norm": 0.7991287112236023, "learning_rate": 1.5429303158980725e-05, "loss": 0.0204, "step": 11215 }, { "epoch": 2.8576279820018677, "grad_norm": 2.4141347408294678, "learning_rate": 1.5425569335099074e-05, "loss": 0.0195, "step": 11220 }, { "epoch": 2.8589014347567705, "grad_norm": 1.028533697128296, "learning_rate": 1.5421834438959024e-05, "loss": 0.0154, "step": 11225 }, { "epoch": 2.8601748875116733, "grad_norm": 1.1255912780761719, "learning_rate": 1.5418098471298712e-05, "loss": 0.021, "step": 11230 }, { "epoch": 2.861448340266576, "grad_norm": 1.206860065460205, "learning_rate": 1.5414361432856475e-05, "loss": 0.0194, "step": 11235 }, { "epoch": 2.8627217930214788, "grad_norm": 0.947636604309082, "learning_rate": 1.5410623324370865e-05, "loss": 0.0155, "step": 11240 }, { "epoch": 2.8639952457763815, "grad_norm": 1.3013689517974854, "learning_rate": 1.540688414658065e-05, "loss": 0.0185, "step": 11245 }, { "epoch": 2.8652686985312847, "grad_norm": 1.3397746086120605, "learning_rate": 1.5403143900224803e-05, "loss": 0.0213, "step": 11250 }, { "epoch": 2.866542151286187, "grad_norm": 0.5133063793182373, "learning_rate": 1.5399402586042512e-05, "loss": 0.0194, "step": 11255 }, { "epoch": 2.8678156040410903, "grad_norm": 0.6270730495452881, "learning_rate": 1.5395660204773176e-05, "loss": 0.0123, "step": 11260 }, { "epoch": 2.869089056795993, "grad_norm": 1.2616565227508545, "learning_rate": 1.53919167571564e-05, "loss": 0.0183, "step": 11265 }, { "epoch": 2.870362509550896, "grad_norm": 1.0246646404266357, "learning_rate": 1.538817224393201e-05, "loss": 0.0125, "step": 11270 }, { "epoch": 2.8716359623057985, "grad_norm": 0.9599324464797974, "learning_rate": 1.5384426665840032e-05, "loss": 0.0239, "step": 11275 }, { "epoch": 2.8729094150607013, "grad_norm": 1.109852910041809, "learning_rate": 1.5380680023620705e-05, "loss": 0.0145, "step": 11280 }, { "epoch": 2.874182867815604, "grad_norm": 1.2652844190597534, "learning_rate": 1.5376932318014487e-05, "loss": 0.0161, "step": 11285 }, { "epoch": 2.875456320570507, "grad_norm": 0.9785670638084412, "learning_rate": 1.5373183549762036e-05, "loss": 0.0189, "step": 11290 }, { "epoch": 2.8767297733254096, "grad_norm": 1.9108614921569824, "learning_rate": 1.5369433719604222e-05, "loss": 0.0121, "step": 11295 }, { "epoch": 2.8780032260803123, "grad_norm": 1.0504778623580933, "learning_rate": 1.5365682828282123e-05, "loss": 0.0174, "step": 11300 }, { "epoch": 2.879276678835215, "grad_norm": 0.7070373892784119, "learning_rate": 1.5361930876537038e-05, "loss": 0.0109, "step": 11305 }, { "epoch": 2.880550131590118, "grad_norm": 0.7963285446166992, "learning_rate": 1.535817786511046e-05, "loss": 0.0131, "step": 11310 }, { "epoch": 2.881823584345021, "grad_norm": 0.7529622316360474, "learning_rate": 1.5354423794744105e-05, "loss": 0.015, "step": 11315 }, { "epoch": 2.8830970370999234, "grad_norm": 0.8608189225196838, "learning_rate": 1.5350668666179884e-05, "loss": 0.0121, "step": 11320 }, { "epoch": 2.8843704898548266, "grad_norm": 1.4791812896728516, "learning_rate": 1.534691248015993e-05, "loss": 0.0155, "step": 11325 }, { "epoch": 2.8856439426097293, "grad_norm": 1.0135332345962524, "learning_rate": 1.534315523742658e-05, "loss": 0.0207, "step": 11330 }, { "epoch": 2.886917395364632, "grad_norm": 1.2688813209533691, "learning_rate": 1.5339396938722387e-05, "loss": 0.0155, "step": 11335 }, { "epoch": 2.888190848119535, "grad_norm": 1.0069994926452637, "learning_rate": 1.5335637584790094e-05, "loss": 0.0205, "step": 11340 }, { "epoch": 2.8894643008744376, "grad_norm": 0.699374794960022, "learning_rate": 1.5331877176372665e-05, "loss": 0.0119, "step": 11345 }, { "epoch": 2.8907377536293404, "grad_norm": 1.1187968254089355, "learning_rate": 1.532811571421328e-05, "loss": 0.017, "step": 11350 }, { "epoch": 2.892011206384243, "grad_norm": 1.2205687761306763, "learning_rate": 1.5324353199055314e-05, "loss": 0.0174, "step": 11355 }, { "epoch": 2.893284659139146, "grad_norm": 1.153671383857727, "learning_rate": 1.532058963164236e-05, "loss": 0.0162, "step": 11360 }, { "epoch": 2.8945581118940487, "grad_norm": 0.9550788402557373, "learning_rate": 1.5316825012718204e-05, "loss": 0.015, "step": 11365 }, { "epoch": 2.8958315646489514, "grad_norm": 2.227964401245117, "learning_rate": 1.5313059343026858e-05, "loss": 0.0192, "step": 11370 }, { "epoch": 2.897105017403854, "grad_norm": 1.33482825756073, "learning_rate": 1.5309292623312532e-05, "loss": 0.0224, "step": 11375 }, { "epoch": 2.8983784701587574, "grad_norm": 1.953109622001648, "learning_rate": 1.5305524854319643e-05, "loss": 0.0122, "step": 11380 }, { "epoch": 2.8996519229136597, "grad_norm": 1.9786282777786255, "learning_rate": 1.5301756036792817e-05, "loss": 0.0209, "step": 11385 }, { "epoch": 2.900925375668563, "grad_norm": 0.7011091113090515, "learning_rate": 1.529798617147689e-05, "loss": 0.0071, "step": 11390 }, { "epoch": 2.902198828423465, "grad_norm": 1.3264992237091064, "learning_rate": 1.5294215259116904e-05, "loss": 0.0135, "step": 11395 }, { "epoch": 2.9034722811783684, "grad_norm": 0.8980849981307983, "learning_rate": 1.5290443300458103e-05, "loss": 0.0167, "step": 11400 }, { "epoch": 2.904745733933271, "grad_norm": 0.8491497039794922, "learning_rate": 1.528667029624594e-05, "loss": 0.0209, "step": 11405 }, { "epoch": 2.906019186688174, "grad_norm": 1.2305554151535034, "learning_rate": 1.528289624722608e-05, "loss": 0.0179, "step": 11410 }, { "epoch": 2.9072926394430767, "grad_norm": 1.096915364265442, "learning_rate": 1.527912115414439e-05, "loss": 0.0138, "step": 11415 }, { "epoch": 2.9085660921979795, "grad_norm": 1.1351507902145386, "learning_rate": 1.5275345017746942e-05, "loss": 0.0162, "step": 11420 }, { "epoch": 2.909839544952882, "grad_norm": 1.058016061782837, "learning_rate": 1.5271567838780015e-05, "loss": 0.0173, "step": 11425 }, { "epoch": 2.911112997707785, "grad_norm": 1.5280839204788208, "learning_rate": 1.52677896179901e-05, "loss": 0.0208, "step": 11430 }, { "epoch": 2.9123864504626877, "grad_norm": 0.9989846348762512, "learning_rate": 1.5264010356123885e-05, "loss": 0.0209, "step": 11435 }, { "epoch": 2.9136599032175905, "grad_norm": 1.8454687595367432, "learning_rate": 1.526023005392827e-05, "loss": 0.0179, "step": 11440 }, { "epoch": 2.9149333559724933, "grad_norm": 1.1328314542770386, "learning_rate": 1.5256448712150355e-05, "loss": 0.0149, "step": 11445 }, { "epoch": 2.916206808727396, "grad_norm": 0.9827338457107544, "learning_rate": 1.525266633153745e-05, "loss": 0.0159, "step": 11450 }, { "epoch": 2.917480261482299, "grad_norm": 1.548566460609436, "learning_rate": 1.5248882912837073e-05, "loss": 0.0151, "step": 11455 }, { "epoch": 2.9187537142372015, "grad_norm": 0.8141647577285767, "learning_rate": 1.5245098456796935e-05, "loss": 0.0156, "step": 11460 }, { "epoch": 2.9200271669921047, "grad_norm": 1.672793984413147, "learning_rate": 1.5241312964164972e-05, "loss": 0.0137, "step": 11465 }, { "epoch": 2.9213006197470075, "grad_norm": 1.6992193460464478, "learning_rate": 1.52375264356893e-05, "loss": 0.0162, "step": 11470 }, { "epoch": 2.9225740725019103, "grad_norm": 1.6614445447921753, "learning_rate": 1.5233738872118262e-05, "loss": 0.019, "step": 11475 }, { "epoch": 2.923847525256813, "grad_norm": 0.5874881148338318, "learning_rate": 1.5229950274200396e-05, "loss": 0.0141, "step": 11480 }, { "epoch": 2.9251209780117158, "grad_norm": 1.917592167854309, "learning_rate": 1.5226160642684438e-05, "loss": 0.024, "step": 11485 }, { "epoch": 2.9263944307666185, "grad_norm": 0.7081392407417297, "learning_rate": 1.522236997831934e-05, "loss": 0.013, "step": 11490 }, { "epoch": 2.9276678835215213, "grad_norm": 0.7719967365264893, "learning_rate": 1.5218578281854255e-05, "loss": 0.0146, "step": 11495 }, { "epoch": 2.928941336276424, "grad_norm": 0.9610752463340759, "learning_rate": 1.5214785554038531e-05, "loss": 0.0162, "step": 11500 }, { "epoch": 2.930214789031327, "grad_norm": 1.6823747158050537, "learning_rate": 1.5210991795621736e-05, "loss": 0.0148, "step": 11505 }, { "epoch": 2.9314882417862296, "grad_norm": 1.1410892009735107, "learning_rate": 1.520719700735363e-05, "loss": 0.0141, "step": 11510 }, { "epoch": 2.9327616945411323, "grad_norm": 1.1702383756637573, "learning_rate": 1.5203401189984171e-05, "loss": 0.0163, "step": 11515 }, { "epoch": 2.9340351472960355, "grad_norm": 0.7768813967704773, "learning_rate": 1.5199604344263538e-05, "loss": 0.0131, "step": 11520 }, { "epoch": 2.935308600050938, "grad_norm": 0.6458883881568909, "learning_rate": 1.5195806470942103e-05, "loss": 0.0126, "step": 11525 }, { "epoch": 2.936582052805841, "grad_norm": 1.7220869064331055, "learning_rate": 1.5192007570770435e-05, "loss": 0.0206, "step": 11530 }, { "epoch": 2.937855505560744, "grad_norm": 1.231221318244934, "learning_rate": 1.5188207644499322e-05, "loss": 0.0201, "step": 11535 }, { "epoch": 2.9391289583156466, "grad_norm": 0.5061484575271606, "learning_rate": 1.518440669287974e-05, "loss": 0.0155, "step": 11540 }, { "epoch": 2.9404024110705493, "grad_norm": 1.0861648321151733, "learning_rate": 1.5180604716662875e-05, "loss": 0.0131, "step": 11545 }, { "epoch": 2.941675863825452, "grad_norm": 1.6054669618606567, "learning_rate": 1.5176801716600112e-05, "loss": 0.0174, "step": 11550 }, { "epoch": 2.942949316580355, "grad_norm": 1.4200770854949951, "learning_rate": 1.5172997693443043e-05, "loss": 0.0145, "step": 11555 }, { "epoch": 2.9442227693352576, "grad_norm": 0.5372039675712585, "learning_rate": 1.5169192647943455e-05, "loss": 0.0156, "step": 11560 }, { "epoch": 2.9454962220901604, "grad_norm": 1.2716469764709473, "learning_rate": 1.5165386580853346e-05, "loss": 0.0141, "step": 11565 }, { "epoch": 2.946769674845063, "grad_norm": 1.1080565452575684, "learning_rate": 1.5161579492924911e-05, "loss": 0.0149, "step": 11570 }, { "epoch": 2.948043127599966, "grad_norm": 1.115730881690979, "learning_rate": 1.5157771384910543e-05, "loss": 0.0222, "step": 11575 }, { "epoch": 2.9493165803548687, "grad_norm": 0.8294344544410706, "learning_rate": 1.515396225756284e-05, "loss": 0.018, "step": 11580 }, { "epoch": 2.950590033109772, "grad_norm": 0.8489083647727966, "learning_rate": 1.515015211163461e-05, "loss": 0.0172, "step": 11585 }, { "epoch": 2.951863485864674, "grad_norm": 0.96368008852005, "learning_rate": 1.5146340947878849e-05, "loss": 0.022, "step": 11590 }, { "epoch": 2.9531369386195774, "grad_norm": 0.8370074033737183, "learning_rate": 1.5142528767048754e-05, "loss": 0.0129, "step": 11595 }, { "epoch": 2.95441039137448, "grad_norm": 0.9003834128379822, "learning_rate": 1.5138715569897737e-05, "loss": 0.0144, "step": 11600 }, { "epoch": 2.955683844129383, "grad_norm": 1.4174245595932007, "learning_rate": 1.5134901357179402e-05, "loss": 0.0216, "step": 11605 }, { "epoch": 2.9569572968842857, "grad_norm": 0.933114767074585, "learning_rate": 1.5131086129647547e-05, "loss": 0.0235, "step": 11610 }, { "epoch": 2.9582307496391884, "grad_norm": 1.487002968788147, "learning_rate": 1.5127269888056186e-05, "loss": 0.0154, "step": 11615 }, { "epoch": 2.959504202394091, "grad_norm": 1.553670883178711, "learning_rate": 1.512345263315952e-05, "loss": 0.018, "step": 11620 }, { "epoch": 2.960777655148994, "grad_norm": 1.0703197717666626, "learning_rate": 1.5119634365711955e-05, "loss": 0.0239, "step": 11625 }, { "epoch": 2.9620511079038967, "grad_norm": 1.232479453086853, "learning_rate": 1.5115815086468103e-05, "loss": 0.0177, "step": 11630 }, { "epoch": 2.9633245606587995, "grad_norm": 0.6502572298049927, "learning_rate": 1.5111994796182763e-05, "loss": 0.0159, "step": 11635 }, { "epoch": 2.964598013413702, "grad_norm": 1.2529726028442383, "learning_rate": 1.5108173495610943e-05, "loss": 0.0136, "step": 11640 }, { "epoch": 2.965871466168605, "grad_norm": 1.2104743719100952, "learning_rate": 1.510435118550785e-05, "loss": 0.0156, "step": 11645 }, { "epoch": 2.967144918923508, "grad_norm": 0.902874767780304, "learning_rate": 1.5100527866628893e-05, "loss": 0.0131, "step": 11650 }, { "epoch": 2.9684183716784105, "grad_norm": 1.254024624824524, "learning_rate": 1.5096703539729669e-05, "loss": 0.0186, "step": 11655 }, { "epoch": 2.9696918244333137, "grad_norm": 1.1549859046936035, "learning_rate": 1.5092878205565991e-05, "loss": 0.0182, "step": 11660 }, { "epoch": 2.9709652771882165, "grad_norm": 0.7095955610275269, "learning_rate": 1.5089051864893855e-05, "loss": 0.0246, "step": 11665 }, { "epoch": 2.972238729943119, "grad_norm": 1.0372450351715088, "learning_rate": 1.5085224518469468e-05, "loss": 0.0166, "step": 11670 }, { "epoch": 2.973512182698022, "grad_norm": 1.0318135023117065, "learning_rate": 1.5081396167049222e-05, "loss": 0.0106, "step": 11675 }, { "epoch": 2.9747856354529247, "grad_norm": 0.9704948663711548, "learning_rate": 1.5077566811389726e-05, "loss": 0.0199, "step": 11680 }, { "epoch": 2.9760590882078275, "grad_norm": 1.5310685634613037, "learning_rate": 1.5073736452247773e-05, "loss": 0.0169, "step": 11685 }, { "epoch": 2.9773325409627303, "grad_norm": 1.7396221160888672, "learning_rate": 1.5069905090380362e-05, "loss": 0.0205, "step": 11690 }, { "epoch": 2.978605993717633, "grad_norm": 0.5688095092773438, "learning_rate": 1.5066072726544682e-05, "loss": 0.0095, "step": 11695 }, { "epoch": 2.9798794464725358, "grad_norm": 1.0928171873092651, "learning_rate": 1.5062239361498127e-05, "loss": 0.0155, "step": 11700 }, { "epoch": 2.9811528992274385, "grad_norm": 1.3637111186981201, "learning_rate": 1.5058404995998293e-05, "loss": 0.0168, "step": 11705 }, { "epoch": 2.9824263519823413, "grad_norm": 1.1806981563568115, "learning_rate": 1.5054569630802963e-05, "loss": 0.0141, "step": 11710 }, { "epoch": 2.9836998047372445, "grad_norm": 1.5680058002471924, "learning_rate": 1.505073326667012e-05, "loss": 0.018, "step": 11715 }, { "epoch": 2.984973257492147, "grad_norm": 2.1096789836883545, "learning_rate": 1.5046895904357955e-05, "loss": 0.0188, "step": 11720 }, { "epoch": 2.98624671024705, "grad_norm": 0.9980878233909607, "learning_rate": 1.5043057544624837e-05, "loss": 0.0135, "step": 11725 }, { "epoch": 2.9875201630019528, "grad_norm": 1.1690359115600586, "learning_rate": 1.5039218188229355e-05, "loss": 0.0178, "step": 11730 }, { "epoch": 2.9887936157568555, "grad_norm": 0.9407362341880798, "learning_rate": 1.5035377835930272e-05, "loss": 0.0114, "step": 11735 }, { "epoch": 2.9900670685117583, "grad_norm": 1.8606525659561157, "learning_rate": 1.5031536488486564e-05, "loss": 0.0147, "step": 11740 }, { "epoch": 2.991340521266661, "grad_norm": 1.4234479665756226, "learning_rate": 1.5027694146657404e-05, "loss": 0.0132, "step": 11745 }, { "epoch": 2.992613974021564, "grad_norm": 0.9174773693084717, "learning_rate": 1.5023850811202146e-05, "loss": 0.0188, "step": 11750 }, { "epoch": 2.9938874267764666, "grad_norm": 1.6289827823638916, "learning_rate": 1.5020006482880361e-05, "loss": 0.0153, "step": 11755 }, { "epoch": 2.9951608795313693, "grad_norm": 0.9333860874176025, "learning_rate": 1.5016161162451797e-05, "loss": 0.0147, "step": 11760 }, { "epoch": 2.996434332286272, "grad_norm": 1.3600802421569824, "learning_rate": 1.501231485067641e-05, "loss": 0.0141, "step": 11765 }, { "epoch": 2.997707785041175, "grad_norm": 0.8706961274147034, "learning_rate": 1.5008467548314354e-05, "loss": 0.0172, "step": 11770 }, { "epoch": 2.9989812377960776, "grad_norm": 1.5143283605575562, "learning_rate": 1.5004619256125967e-05, "loss": 0.0138, "step": 11775 }, { "epoch": 3.0002546905509804, "grad_norm": 0.6010201573371887, "learning_rate": 1.500076997487179e-05, "loss": 0.0102, "step": 11780 }, { "epoch": 3.001528143305883, "grad_norm": 0.6889100670814514, "learning_rate": 1.4996919705312562e-05, "loss": 0.0123, "step": 11785 }, { "epoch": 3.0028015960607863, "grad_norm": 1.0358946323394775, "learning_rate": 1.4993068448209212e-05, "loss": 0.0104, "step": 11790 }, { "epoch": 3.004075048815689, "grad_norm": 0.8985348343849182, "learning_rate": 1.4989216204322862e-05, "loss": 0.0105, "step": 11795 }, { "epoch": 3.005348501570592, "grad_norm": 0.6257368326187134, "learning_rate": 1.498536297441484e-05, "loss": 0.0083, "step": 11800 }, { "epoch": 3.0066219543254946, "grad_norm": 1.5708023309707642, "learning_rate": 1.4981508759246662e-05, "loss": 0.0061, "step": 11805 }, { "epoch": 3.0078954070803974, "grad_norm": 1.1564970016479492, "learning_rate": 1.4977653559580032e-05, "loss": 0.0071, "step": 11810 }, { "epoch": 3.0091688598353, "grad_norm": 0.72208571434021, "learning_rate": 1.4973797376176862e-05, "loss": 0.0062, "step": 11815 }, { "epoch": 3.010442312590203, "grad_norm": 0.7138368487358093, "learning_rate": 1.4969940209799248e-05, "loss": 0.0093, "step": 11820 }, { "epoch": 3.0117157653451057, "grad_norm": 1.109910249710083, "learning_rate": 1.4966082061209484e-05, "loss": 0.0112, "step": 11825 }, { "epoch": 3.0129892181000084, "grad_norm": 0.8754097819328308, "learning_rate": 1.496222293117006e-05, "loss": 0.0049, "step": 11830 }, { "epoch": 3.014262670854911, "grad_norm": 0.810592532157898, "learning_rate": 1.495836282044366e-05, "loss": 0.0059, "step": 11835 }, { "epoch": 3.015536123609814, "grad_norm": 1.2518366575241089, "learning_rate": 1.4954501729793154e-05, "loss": 0.0096, "step": 11840 }, { "epoch": 3.0168095763647167, "grad_norm": 1.0976603031158447, "learning_rate": 1.4950639659981617e-05, "loss": 0.0128, "step": 11845 }, { "epoch": 3.0180830291196195, "grad_norm": 1.1442469358444214, "learning_rate": 1.4946776611772309e-05, "loss": 0.0064, "step": 11850 }, { "epoch": 3.0193564818745227, "grad_norm": 1.1142276525497437, "learning_rate": 1.4942912585928689e-05, "loss": 0.0117, "step": 11855 }, { "epoch": 3.0206299346294254, "grad_norm": 0.5535274147987366, "learning_rate": 1.4939047583214401e-05, "loss": 0.0074, "step": 11860 }, { "epoch": 3.021903387384328, "grad_norm": 0.8812920451164246, "learning_rate": 1.4935181604393296e-05, "loss": 0.0102, "step": 11865 }, { "epoch": 3.023176840139231, "grad_norm": 0.4970581829547882, "learning_rate": 1.4931314650229405e-05, "loss": 0.0102, "step": 11870 }, { "epoch": 3.0244502928941337, "grad_norm": 0.7791002988815308, "learning_rate": 1.4927446721486953e-05, "loss": 0.0081, "step": 11875 }, { "epoch": 3.0257237456490365, "grad_norm": 1.0561214685440063, "learning_rate": 1.4923577818930369e-05, "loss": 0.0113, "step": 11880 }, { "epoch": 3.026997198403939, "grad_norm": 0.890824019908905, "learning_rate": 1.4919707943324262e-05, "loss": 0.0101, "step": 11885 }, { "epoch": 3.028270651158842, "grad_norm": 1.0708762407302856, "learning_rate": 1.4915837095433434e-05, "loss": 0.0095, "step": 11890 }, { "epoch": 3.0295441039137447, "grad_norm": 0.8133692145347595, "learning_rate": 1.491196527602289e-05, "loss": 0.0062, "step": 11895 }, { "epoch": 3.0308175566686475, "grad_norm": 0.8533095717430115, "learning_rate": 1.4908092485857818e-05, "loss": 0.0094, "step": 11900 }, { "epoch": 3.0320910094235503, "grad_norm": 0.7793605923652649, "learning_rate": 1.4904218725703597e-05, "loss": 0.0058, "step": 11905 }, { "epoch": 3.033364462178453, "grad_norm": 1.6903822422027588, "learning_rate": 1.49003439963258e-05, "loss": 0.007, "step": 11910 }, { "epoch": 3.0346379149333558, "grad_norm": 1.2587145566940308, "learning_rate": 1.4896468298490197e-05, "loss": 0.0097, "step": 11915 }, { "epoch": 3.035911367688259, "grad_norm": 1.0461549758911133, "learning_rate": 1.4892591632962738e-05, "loss": 0.0097, "step": 11920 }, { "epoch": 3.0371848204431617, "grad_norm": 0.7018455862998962, "learning_rate": 1.4888714000509577e-05, "loss": 0.0092, "step": 11925 }, { "epoch": 3.0384582731980645, "grad_norm": 0.3440871238708496, "learning_rate": 1.4884835401897051e-05, "loss": 0.0107, "step": 11930 }, { "epoch": 3.0397317259529673, "grad_norm": 1.0874868631362915, "learning_rate": 1.4880955837891684e-05, "loss": 0.0072, "step": 11935 }, { "epoch": 3.04100517870787, "grad_norm": 0.8270758986473083, "learning_rate": 1.4877075309260205e-05, "loss": 0.0095, "step": 11940 }, { "epoch": 3.0422786314627728, "grad_norm": 0.39526164531707764, "learning_rate": 1.4873193816769518e-05, "loss": 0.0079, "step": 11945 }, { "epoch": 3.0435520842176755, "grad_norm": 1.0494834184646606, "learning_rate": 1.4869311361186726e-05, "loss": 0.0107, "step": 11950 }, { "epoch": 3.0448255369725783, "grad_norm": 0.6495004296302795, "learning_rate": 1.486542794327913e-05, "loss": 0.0067, "step": 11955 }, { "epoch": 3.046098989727481, "grad_norm": 1.057328462600708, "learning_rate": 1.4861543563814197e-05, "loss": 0.0102, "step": 11960 }, { "epoch": 3.047372442482384, "grad_norm": 0.6801275014877319, "learning_rate": 1.485765822355961e-05, "loss": 0.0091, "step": 11965 }, { "epoch": 3.0486458952372866, "grad_norm": 0.9518849849700928, "learning_rate": 1.4853771923283228e-05, "loss": 0.0062, "step": 11970 }, { "epoch": 3.0499193479921893, "grad_norm": 0.8114588260650635, "learning_rate": 1.4849884663753104e-05, "loss": 0.0044, "step": 11975 }, { "epoch": 3.051192800747092, "grad_norm": 0.686006486415863, "learning_rate": 1.4845996445737476e-05, "loss": 0.0075, "step": 11980 }, { "epoch": 3.0524662535019953, "grad_norm": 0.8720389604568481, "learning_rate": 1.4842107270004775e-05, "loss": 0.0077, "step": 11985 }, { "epoch": 3.053739706256898, "grad_norm": 1.1003398895263672, "learning_rate": 1.4838217137323627e-05, "loss": 0.0093, "step": 11990 }, { "epoch": 3.055013159011801, "grad_norm": 0.23389442265033722, "learning_rate": 1.4834326048462835e-05, "loss": 0.0051, "step": 11995 }, { "epoch": 3.0562866117667036, "grad_norm": 0.8659753799438477, "learning_rate": 1.48304340041914e-05, "loss": 0.0076, "step": 12000 }, { "epoch": 3.0575600645216063, "grad_norm": 0.6240156888961792, "learning_rate": 1.4826541005278506e-05, "loss": 0.0079, "step": 12005 }, { "epoch": 3.058833517276509, "grad_norm": 0.4594862461090088, "learning_rate": 1.482264705249353e-05, "loss": 0.0057, "step": 12010 }, { "epoch": 3.060106970031412, "grad_norm": 1.7140331268310547, "learning_rate": 1.481875214660604e-05, "loss": 0.0087, "step": 12015 }, { "epoch": 3.0613804227863146, "grad_norm": 0.8907782435417175, "learning_rate": 1.4814856288385785e-05, "loss": 0.0098, "step": 12020 }, { "epoch": 3.0626538755412174, "grad_norm": 1.2499181032180786, "learning_rate": 1.4810959478602703e-05, "loss": 0.0079, "step": 12025 }, { "epoch": 3.06392732829612, "grad_norm": 0.7106938362121582, "learning_rate": 1.4807061718026927e-05, "loss": 0.0058, "step": 12030 }, { "epoch": 3.065200781051023, "grad_norm": 1.2049882411956787, "learning_rate": 1.4803163007428773e-05, "loss": 0.0071, "step": 12035 }, { "epoch": 3.0664742338059257, "grad_norm": 1.1134051084518433, "learning_rate": 1.4799263347578745e-05, "loss": 0.0046, "step": 12040 }, { "epoch": 3.0677476865608284, "grad_norm": 0.8924962878227234, "learning_rate": 1.4795362739247532e-05, "loss": 0.0089, "step": 12045 }, { "epoch": 3.069021139315731, "grad_norm": 0.6523284912109375, "learning_rate": 1.4791461183206019e-05, "loss": 0.0088, "step": 12050 }, { "epoch": 3.0702945920706344, "grad_norm": 0.6894357204437256, "learning_rate": 1.478755868022527e-05, "loss": 0.0114, "step": 12055 }, { "epoch": 3.071568044825537, "grad_norm": 0.46236148476600647, "learning_rate": 1.478365523107654e-05, "loss": 0.0054, "step": 12060 }, { "epoch": 3.07284149758044, "grad_norm": 1.5602953433990479, "learning_rate": 1.4779750836531269e-05, "loss": 0.0078, "step": 12065 }, { "epoch": 3.0741149503353427, "grad_norm": 0.8636108636856079, "learning_rate": 1.4775845497361084e-05, "loss": 0.0091, "step": 12070 }, { "epoch": 3.0753884030902454, "grad_norm": 0.6130300164222717, "learning_rate": 1.4771939214337796e-05, "loss": 0.0051, "step": 12075 }, { "epoch": 3.076661855845148, "grad_norm": 0.7933991551399231, "learning_rate": 1.4768031988233414e-05, "loss": 0.0065, "step": 12080 }, { "epoch": 3.077935308600051, "grad_norm": 0.8736277222633362, "learning_rate": 1.4764123819820123e-05, "loss": 0.0088, "step": 12085 }, { "epoch": 3.0792087613549537, "grad_norm": 0.5069186687469482, "learning_rate": 1.4760214709870291e-05, "loss": 0.0049, "step": 12090 }, { "epoch": 3.0804822141098565, "grad_norm": 1.0207083225250244, "learning_rate": 1.4756304659156485e-05, "loss": 0.006, "step": 12095 }, { "epoch": 3.081755666864759, "grad_norm": 1.0722668170928955, "learning_rate": 1.4752393668451446e-05, "loss": 0.0108, "step": 12100 }, { "epoch": 3.083029119619662, "grad_norm": 0.49081578850746155, "learning_rate": 1.4748481738528107e-05, "loss": 0.0079, "step": 12105 }, { "epoch": 3.0843025723745647, "grad_norm": 0.6894317865371704, "learning_rate": 1.4744568870159581e-05, "loss": 0.0091, "step": 12110 }, { "epoch": 3.0855760251294675, "grad_norm": 1.3880079984664917, "learning_rate": 1.4740655064119177e-05, "loss": 0.0096, "step": 12115 }, { "epoch": 3.0868494778843707, "grad_norm": 0.8070310354232788, "learning_rate": 1.473674032118038e-05, "loss": 0.0058, "step": 12120 }, { "epoch": 3.0881229306392735, "grad_norm": 0.7036991715431213, "learning_rate": 1.4732824642116863e-05, "loss": 0.0068, "step": 12125 }, { "epoch": 3.089396383394176, "grad_norm": 0.4681655466556549, "learning_rate": 1.4728908027702483e-05, "loss": 0.0091, "step": 12130 }, { "epoch": 3.090669836149079, "grad_norm": 1.4943208694458008, "learning_rate": 1.4724990478711283e-05, "loss": 0.0112, "step": 12135 }, { "epoch": 3.0919432889039817, "grad_norm": 0.7071904540061951, "learning_rate": 1.4721071995917488e-05, "loss": 0.0087, "step": 12140 }, { "epoch": 3.0932167416588845, "grad_norm": 0.7827587723731995, "learning_rate": 1.4717152580095517e-05, "loss": 0.0105, "step": 12145 }, { "epoch": 3.0944901944137873, "grad_norm": 0.5634180307388306, "learning_rate": 1.471323223201996e-05, "loss": 0.0081, "step": 12150 }, { "epoch": 3.09576364716869, "grad_norm": 0.9872891902923584, "learning_rate": 1.47093109524656e-05, "loss": 0.0096, "step": 12155 }, { "epoch": 3.0970370999235928, "grad_norm": 0.7917616963386536, "learning_rate": 1.4705388742207403e-05, "loss": 0.0063, "step": 12160 }, { "epoch": 3.0983105526784955, "grad_norm": 1.2962524890899658, "learning_rate": 1.4701465602020518e-05, "loss": 0.0085, "step": 12165 }, { "epoch": 3.0995840054333983, "grad_norm": 0.5972346067428589, "learning_rate": 1.469754153268027e-05, "loss": 0.0066, "step": 12170 }, { "epoch": 3.100857458188301, "grad_norm": 0.9476566314697266, "learning_rate": 1.4693616534962186e-05, "loss": 0.0136, "step": 12175 }, { "epoch": 3.102130910943204, "grad_norm": 0.6382269263267517, "learning_rate": 1.4689690609641957e-05, "loss": 0.0052, "step": 12180 }, { "epoch": 3.1034043636981066, "grad_norm": 0.836326003074646, "learning_rate": 1.4685763757495467e-05, "loss": 0.0083, "step": 12185 }, { "epoch": 3.1046778164530098, "grad_norm": 0.7254179120063782, "learning_rate": 1.468183597929879e-05, "loss": 0.008, "step": 12190 }, { "epoch": 3.1059512692079125, "grad_norm": 1.0496718883514404, "learning_rate": 1.4677907275828168e-05, "loss": 0.01, "step": 12195 }, { "epoch": 3.1072247219628153, "grad_norm": 0.6805468201637268, "learning_rate": 1.4673977647860032e-05, "loss": 0.0093, "step": 12200 }, { "epoch": 3.108498174717718, "grad_norm": 0.39972808957099915, "learning_rate": 1.4670047096171001e-05, "loss": 0.007, "step": 12205 }, { "epoch": 3.109771627472621, "grad_norm": 0.8143256902694702, "learning_rate": 1.4666115621537868e-05, "loss": 0.0088, "step": 12210 }, { "epoch": 3.1110450802275236, "grad_norm": 1.0755236148834229, "learning_rate": 1.4662183224737616e-05, "loss": 0.0076, "step": 12215 }, { "epoch": 3.1123185329824263, "grad_norm": 0.5712524652481079, "learning_rate": 1.4658249906547405e-05, "loss": 0.0078, "step": 12220 }, { "epoch": 3.113591985737329, "grad_norm": 1.8975365161895752, "learning_rate": 1.4654315667744582e-05, "loss": 0.0146, "step": 12225 }, { "epoch": 3.114865438492232, "grad_norm": 0.6268263459205627, "learning_rate": 1.4650380509106668e-05, "loss": 0.0086, "step": 12230 }, { "epoch": 3.1161388912471346, "grad_norm": 1.0670396089553833, "learning_rate": 1.4646444431411373e-05, "loss": 0.0086, "step": 12235 }, { "epoch": 3.1174123440020374, "grad_norm": 1.4347796440124512, "learning_rate": 1.4642507435436589e-05, "loss": 0.0098, "step": 12240 }, { "epoch": 3.11868579675694, "grad_norm": 0.5696189403533936, "learning_rate": 1.4638569521960382e-05, "loss": 0.0056, "step": 12245 }, { "epoch": 3.119959249511843, "grad_norm": 0.8066709041595459, "learning_rate": 1.4634630691761004e-05, "loss": 0.0046, "step": 12250 }, { "epoch": 3.121232702266746, "grad_norm": 1.0931763648986816, "learning_rate": 1.4630690945616892e-05, "loss": 0.0075, "step": 12255 }, { "epoch": 3.122506155021649, "grad_norm": 1.2297630310058594, "learning_rate": 1.4626750284306661e-05, "loss": 0.0099, "step": 12260 }, { "epoch": 3.1237796077765516, "grad_norm": 0.5826345682144165, "learning_rate": 1.4622808708609102e-05, "loss": 0.0054, "step": 12265 }, { "epoch": 3.1250530605314544, "grad_norm": 0.7905558943748474, "learning_rate": 1.4618866219303193e-05, "loss": 0.0058, "step": 12270 }, { "epoch": 3.126326513286357, "grad_norm": 2.1003975868225098, "learning_rate": 1.4614922817168093e-05, "loss": 0.0112, "step": 12275 }, { "epoch": 3.12759996604126, "grad_norm": 0.5898324847221375, "learning_rate": 1.4610978502983132e-05, "loss": 0.007, "step": 12280 }, { "epoch": 3.1288734187961627, "grad_norm": 0.736713171005249, "learning_rate": 1.4607033277527834e-05, "loss": 0.0065, "step": 12285 }, { "epoch": 3.1301468715510654, "grad_norm": 0.45376330614089966, "learning_rate": 1.4603087141581893e-05, "loss": 0.0074, "step": 12290 }, { "epoch": 3.131420324305968, "grad_norm": 1.2005382776260376, "learning_rate": 1.4599140095925184e-05, "loss": 0.0076, "step": 12295 }, { "epoch": 3.132693777060871, "grad_norm": 0.9545817375183105, "learning_rate": 1.4595192141337771e-05, "loss": 0.0065, "step": 12300 }, { "epoch": 3.1339672298157737, "grad_norm": 0.7922012209892273, "learning_rate": 1.4591243278599883e-05, "loss": 0.0106, "step": 12305 }, { "epoch": 3.1352406825706765, "grad_norm": 0.8656730055809021, "learning_rate": 1.458729350849194e-05, "loss": 0.0085, "step": 12310 }, { "epoch": 3.136514135325579, "grad_norm": 1.3355743885040283, "learning_rate": 1.4583342831794536e-05, "loss": 0.0161, "step": 12315 }, { "epoch": 3.1377875880804824, "grad_norm": 0.909976065158844, "learning_rate": 1.4579391249288446e-05, "loss": 0.0082, "step": 12320 }, { "epoch": 3.139061040835385, "grad_norm": 0.8129810690879822, "learning_rate": 1.4575438761754624e-05, "loss": 0.0087, "step": 12325 }, { "epoch": 3.140334493590288, "grad_norm": 0.8216841220855713, "learning_rate": 1.4571485369974201e-05, "loss": 0.0124, "step": 12330 }, { "epoch": 3.1416079463451907, "grad_norm": 1.1624656915664673, "learning_rate": 1.4567531074728488e-05, "loss": 0.0078, "step": 12335 }, { "epoch": 3.1428813991000935, "grad_norm": 0.9317910671234131, "learning_rate": 1.4563575876798976e-05, "loss": 0.0079, "step": 12340 }, { "epoch": 3.144154851854996, "grad_norm": 1.3101696968078613, "learning_rate": 1.4559619776967335e-05, "loss": 0.0096, "step": 12345 }, { "epoch": 3.145428304609899, "grad_norm": 1.1186078786849976, "learning_rate": 1.4555662776015407e-05, "loss": 0.0079, "step": 12350 }, { "epoch": 3.1467017573648017, "grad_norm": 0.4872641861438751, "learning_rate": 1.4551704874725217e-05, "loss": 0.007, "step": 12355 }, { "epoch": 3.1479752101197045, "grad_norm": 0.8824448585510254, "learning_rate": 1.454774607387897e-05, "loss": 0.008, "step": 12360 }, { "epoch": 3.1492486628746073, "grad_norm": 0.8980182409286499, "learning_rate": 1.4543786374259046e-05, "loss": 0.0101, "step": 12365 }, { "epoch": 3.15052211562951, "grad_norm": 0.9917596578598022, "learning_rate": 1.4539825776648e-05, "loss": 0.0104, "step": 12370 }, { "epoch": 3.1517955683844128, "grad_norm": 0.82027268409729, "learning_rate": 1.4535864281828567e-05, "loss": 0.0112, "step": 12375 }, { "epoch": 3.1530690211393155, "grad_norm": 1.1384437084197998, "learning_rate": 1.4531901890583661e-05, "loss": 0.0094, "step": 12380 }, { "epoch": 3.1543424738942187, "grad_norm": 1.0374454259872437, "learning_rate": 1.4527938603696376e-05, "loss": 0.0077, "step": 12385 }, { "epoch": 3.1556159266491215, "grad_norm": 0.9967702031135559, "learning_rate": 1.4523974421949968e-05, "loss": 0.0099, "step": 12390 }, { "epoch": 3.1568893794040243, "grad_norm": 1.0571627616882324, "learning_rate": 1.4520009346127892e-05, "loss": 0.014, "step": 12395 }, { "epoch": 3.158162832158927, "grad_norm": 0.3755408227443695, "learning_rate": 1.4516043377013764e-05, "loss": 0.0108, "step": 12400 }, { "epoch": 3.1594362849138298, "grad_norm": 0.7878175973892212, "learning_rate": 1.4512076515391375e-05, "loss": 0.0092, "step": 12405 }, { "epoch": 3.1607097376687325, "grad_norm": 0.8050683736801147, "learning_rate": 1.4508108762044707e-05, "loss": 0.0095, "step": 12410 }, { "epoch": 3.1619831904236353, "grad_norm": 0.12386477738618851, "learning_rate": 1.4504140117757905e-05, "loss": 0.005, "step": 12415 }, { "epoch": 3.163256643178538, "grad_norm": 0.9580883979797363, "learning_rate": 1.4500170583315292e-05, "loss": 0.0104, "step": 12420 }, { "epoch": 3.164530095933441, "grad_norm": 0.7121081352233887, "learning_rate": 1.4496200159501375e-05, "loss": 0.0049, "step": 12425 }, { "epoch": 3.1658035486883436, "grad_norm": 0.9167422652244568, "learning_rate": 1.4492228847100828e-05, "loss": 0.0057, "step": 12430 }, { "epoch": 3.1670770014432463, "grad_norm": 0.9809591174125671, "learning_rate": 1.4488256646898502e-05, "loss": 0.0092, "step": 12435 }, { "epoch": 3.168350454198149, "grad_norm": 0.9127336144447327, "learning_rate": 1.448428355967943e-05, "loss": 0.0087, "step": 12440 }, { "epoch": 3.169623906953052, "grad_norm": 1.9687410593032837, "learning_rate": 1.4480309586228813e-05, "loss": 0.0087, "step": 12445 }, { "epoch": 3.170897359707955, "grad_norm": 1.137405514717102, "learning_rate": 1.4476334727332027e-05, "loss": 0.0097, "step": 12450 }, { "epoch": 3.172170812462858, "grad_norm": 0.7707895040512085, "learning_rate": 1.4472358983774633e-05, "loss": 0.0087, "step": 12455 }, { "epoch": 3.1734442652177606, "grad_norm": 0.7467387318611145, "learning_rate": 1.446838235634235e-05, "loss": 0.0097, "step": 12460 }, { "epoch": 3.1747177179726633, "grad_norm": 1.0727615356445312, "learning_rate": 1.4464404845821088e-05, "loss": 0.0084, "step": 12465 }, { "epoch": 3.175991170727566, "grad_norm": 0.5754563212394714, "learning_rate": 1.4460426452996923e-05, "loss": 0.0085, "step": 12470 }, { "epoch": 3.177264623482469, "grad_norm": 1.1513805389404297, "learning_rate": 1.4456447178656106e-05, "loss": 0.0099, "step": 12475 }, { "epoch": 3.1785380762373716, "grad_norm": 0.7079944014549255, "learning_rate": 1.4452467023585062e-05, "loss": 0.0049, "step": 12480 }, { "epoch": 3.1798115289922744, "grad_norm": 1.5082696676254272, "learning_rate": 1.4448485988570396e-05, "loss": 0.0102, "step": 12485 }, { "epoch": 3.181084981747177, "grad_norm": 1.3402740955352783, "learning_rate": 1.444450407439888e-05, "loss": 0.0096, "step": 12490 }, { "epoch": 3.18235843450208, "grad_norm": 1.318315863609314, "learning_rate": 1.444052128185746e-05, "loss": 0.0077, "step": 12495 }, { "epoch": 3.1836318872569827, "grad_norm": 1.3832403421401978, "learning_rate": 1.4436537611733259e-05, "loss": 0.0085, "step": 12500 }, { "epoch": 3.1849053400118854, "grad_norm": 0.914902925491333, "learning_rate": 1.4432553064813573e-05, "loss": 0.0087, "step": 12505 }, { "epoch": 3.186178792766788, "grad_norm": 0.9284449219703674, "learning_rate": 1.442856764188587e-05, "loss": 0.0081, "step": 12510 }, { "epoch": 3.1874522455216914, "grad_norm": 0.9735705256462097, "learning_rate": 1.4424581343737787e-05, "loss": 0.0084, "step": 12515 }, { "epoch": 3.188725698276594, "grad_norm": 1.1670539379119873, "learning_rate": 1.4420594171157148e-05, "loss": 0.008, "step": 12520 }, { "epoch": 3.189999151031497, "grad_norm": 1.6751985549926758, "learning_rate": 1.4416606124931932e-05, "loss": 0.0088, "step": 12525 }, { "epoch": 3.1912726037863997, "grad_norm": 1.38275146484375, "learning_rate": 1.4412617205850299e-05, "loss": 0.0085, "step": 12530 }, { "epoch": 3.1925460565413024, "grad_norm": 1.2807719707489014, "learning_rate": 1.4408627414700587e-05, "loss": 0.0115, "step": 12535 }, { "epoch": 3.193819509296205, "grad_norm": 0.7962555289268494, "learning_rate": 1.4404636752271297e-05, "loss": 0.0082, "step": 12540 }, { "epoch": 3.195092962051108, "grad_norm": 1.4364590644836426, "learning_rate": 1.4400645219351103e-05, "loss": 0.008, "step": 12545 }, { "epoch": 3.1963664148060107, "grad_norm": 1.2946851253509521, "learning_rate": 1.4396652816728862e-05, "loss": 0.0079, "step": 12550 }, { "epoch": 3.1976398675609135, "grad_norm": 0.8049689531326294, "learning_rate": 1.4392659545193588e-05, "loss": 0.0092, "step": 12555 }, { "epoch": 3.198913320315816, "grad_norm": 1.1149545907974243, "learning_rate": 1.4388665405534475e-05, "loss": 0.0104, "step": 12560 }, { "epoch": 3.200186773070719, "grad_norm": 0.5706244707107544, "learning_rate": 1.4384670398540888e-05, "loss": 0.0078, "step": 12565 }, { "epoch": 3.2014602258256217, "grad_norm": 0.9623197317123413, "learning_rate": 1.4380674525002364e-05, "loss": 0.0094, "step": 12570 }, { "epoch": 3.2027336785805245, "grad_norm": 1.3036829233169556, "learning_rate": 1.4376677785708607e-05, "loss": 0.0075, "step": 12575 }, { "epoch": 3.2040071313354272, "grad_norm": 0.988131582736969, "learning_rate": 1.4372680181449496e-05, "loss": 0.0079, "step": 12580 }, { "epoch": 3.2052805840903305, "grad_norm": 0.9676885604858398, "learning_rate": 1.4368681713015082e-05, "loss": 0.0097, "step": 12585 }, { "epoch": 3.206554036845233, "grad_norm": 1.1378389596939087, "learning_rate": 1.4364682381195578e-05, "loss": 0.0082, "step": 12590 }, { "epoch": 3.207827489600136, "grad_norm": 1.4295519590377808, "learning_rate": 1.4360682186781384e-05, "loss": 0.0117, "step": 12595 }, { "epoch": 3.2091009423550387, "grad_norm": 0.8998960256576538, "learning_rate": 1.4356681130563053e-05, "loss": 0.0084, "step": 12600 }, { "epoch": 3.2103743951099415, "grad_norm": 1.8779507875442505, "learning_rate": 1.4352679213331317e-05, "loss": 0.0091, "step": 12605 }, { "epoch": 3.2116478478648443, "grad_norm": 0.7678120732307434, "learning_rate": 1.4348676435877083e-05, "loss": 0.0097, "step": 12610 }, { "epoch": 3.212921300619747, "grad_norm": 0.9281109571456909, "learning_rate": 1.4344672798991415e-05, "loss": 0.0059, "step": 12615 }, { "epoch": 3.2141947533746498, "grad_norm": 1.0870068073272705, "learning_rate": 1.4340668303465558e-05, "loss": 0.0101, "step": 12620 }, { "epoch": 3.2154682061295525, "grad_norm": 0.9851688146591187, "learning_rate": 1.433666295009092e-05, "loss": 0.0084, "step": 12625 }, { "epoch": 3.2167416588844553, "grad_norm": 0.8536729216575623, "learning_rate": 1.4332656739659086e-05, "loss": 0.0106, "step": 12630 }, { "epoch": 3.218015111639358, "grad_norm": 1.148978352546692, "learning_rate": 1.4328649672961801e-05, "loss": 0.0084, "step": 12635 }, { "epoch": 3.219288564394261, "grad_norm": 0.6506516933441162, "learning_rate": 1.4324641750790984e-05, "loss": 0.0089, "step": 12640 }, { "epoch": 3.2205620171491636, "grad_norm": 1.1925184726715088, "learning_rate": 1.4320632973938726e-05, "loss": 0.0061, "step": 12645 }, { "epoch": 3.2218354699040663, "grad_norm": 1.3313425779342651, "learning_rate": 1.4316623343197285e-05, "loss": 0.0133, "step": 12650 }, { "epoch": 3.2231089226589695, "grad_norm": 1.1479182243347168, "learning_rate": 1.4312612859359081e-05, "loss": 0.0087, "step": 12655 }, { "epoch": 3.2243823754138723, "grad_norm": 1.0411555767059326, "learning_rate": 1.4308601523216714e-05, "loss": 0.0127, "step": 12660 }, { "epoch": 3.225655828168775, "grad_norm": 0.763054370880127, "learning_rate": 1.4304589335562943e-05, "loss": 0.0066, "step": 12665 }, { "epoch": 3.226929280923678, "grad_norm": 0.7067250609397888, "learning_rate": 1.4300576297190697e-05, "loss": 0.0094, "step": 12670 }, { "epoch": 3.2282027336785806, "grad_norm": 1.4652273654937744, "learning_rate": 1.4296562408893082e-05, "loss": 0.0091, "step": 12675 }, { "epoch": 3.2294761864334833, "grad_norm": 0.8470927476882935, "learning_rate": 1.4292547671463362e-05, "loss": 0.0058, "step": 12680 }, { "epoch": 3.230749639188386, "grad_norm": 1.0243043899536133, "learning_rate": 1.4288532085694967e-05, "loss": 0.0109, "step": 12685 }, { "epoch": 3.232023091943289, "grad_norm": 0.8130828738212585, "learning_rate": 1.4284515652381506e-05, "loss": 0.0114, "step": 12690 }, { "epoch": 3.2332965446981916, "grad_norm": 1.3983365297317505, "learning_rate": 1.4280498372316747e-05, "loss": 0.0063, "step": 12695 }, { "epoch": 3.2345699974530944, "grad_norm": 1.3910094499588013, "learning_rate": 1.4276480246294626e-05, "loss": 0.0109, "step": 12700 }, { "epoch": 3.235843450207997, "grad_norm": 0.6071146130561829, "learning_rate": 1.4272461275109248e-05, "loss": 0.0075, "step": 12705 }, { "epoch": 3.2371169029629, "grad_norm": 1.0220168828964233, "learning_rate": 1.4268441459554887e-05, "loss": 0.0094, "step": 12710 }, { "epoch": 3.2383903557178026, "grad_norm": 0.7832404375076294, "learning_rate": 1.4264420800425977e-05, "loss": 0.0127, "step": 12715 }, { "epoch": 3.239663808472706, "grad_norm": 1.6874690055847168, "learning_rate": 1.4260399298517129e-05, "loss": 0.0104, "step": 12720 }, { "epoch": 3.2409372612276086, "grad_norm": 0.5608682036399841, "learning_rate": 1.425637695462311e-05, "loss": 0.0087, "step": 12725 }, { "epoch": 3.2422107139825114, "grad_norm": 0.7561659216880798, "learning_rate": 1.4252353769538857e-05, "loss": 0.0096, "step": 12730 }, { "epoch": 3.243484166737414, "grad_norm": 0.7976580858230591, "learning_rate": 1.4248329744059479e-05, "loss": 0.0084, "step": 12735 }, { "epoch": 3.244757619492317, "grad_norm": 0.8334284424781799, "learning_rate": 1.4244304878980248e-05, "loss": 0.0055, "step": 12740 }, { "epoch": 3.2460310722472197, "grad_norm": 0.6542688608169556, "learning_rate": 1.4240279175096593e-05, "loss": 0.0083, "step": 12745 }, { "epoch": 3.2473045250021224, "grad_norm": 1.1672405004501343, "learning_rate": 1.4236252633204121e-05, "loss": 0.0074, "step": 12750 }, { "epoch": 3.248577977757025, "grad_norm": 0.884590208530426, "learning_rate": 1.4232225254098599e-05, "loss": 0.0085, "step": 12755 }, { "epoch": 3.249851430511928, "grad_norm": 0.8862399458885193, "learning_rate": 1.4228197038575961e-05, "loss": 0.011, "step": 12760 }, { "epoch": 3.2511248832668307, "grad_norm": 1.0772398710250854, "learning_rate": 1.4224167987432302e-05, "loss": 0.0084, "step": 12765 }, { "epoch": 3.2523983360217334, "grad_norm": 0.9559034705162048, "learning_rate": 1.4220138101463892e-05, "loss": 0.008, "step": 12770 }, { "epoch": 3.253671788776636, "grad_norm": 0.5946658849716187, "learning_rate": 1.4216107381467154e-05, "loss": 0.0072, "step": 12775 }, { "epoch": 3.254945241531539, "grad_norm": 0.6116548776626587, "learning_rate": 1.4212075828238683e-05, "loss": 0.0074, "step": 12780 }, { "epoch": 3.256218694286442, "grad_norm": 0.4877958595752716, "learning_rate": 1.420804344257524e-05, "loss": 0.0128, "step": 12785 }, { "epoch": 3.257492147041345, "grad_norm": 0.9701492190361023, "learning_rate": 1.4204010225273746e-05, "loss": 0.0071, "step": 12790 }, { "epoch": 3.2587655997962477, "grad_norm": 0.9958541393280029, "learning_rate": 1.4199976177131283e-05, "loss": 0.0088, "step": 12795 }, { "epoch": 3.2600390525511505, "grad_norm": 1.0906776189804077, "learning_rate": 1.4195941298945111e-05, "loss": 0.0084, "step": 12800 }, { "epoch": 3.261312505306053, "grad_norm": 0.7296920418739319, "learning_rate": 1.419190559151264e-05, "loss": 0.0075, "step": 12805 }, { "epoch": 3.262585958060956, "grad_norm": 0.9404197335243225, "learning_rate": 1.4187869055631449e-05, "loss": 0.0078, "step": 12810 }, { "epoch": 3.2638594108158587, "grad_norm": 1.6696436405181885, "learning_rate": 1.4183831692099284e-05, "loss": 0.0087, "step": 12815 }, { "epoch": 3.2651328635707615, "grad_norm": 2.2259914875030518, "learning_rate": 1.4179793501714052e-05, "loss": 0.0082, "step": 12820 }, { "epoch": 3.2664063163256642, "grad_norm": 0.5714438557624817, "learning_rate": 1.4175754485273813e-05, "loss": 0.0094, "step": 12825 }, { "epoch": 3.267679769080567, "grad_norm": 0.7153294682502747, "learning_rate": 1.4171714643576814e-05, "loss": 0.0095, "step": 12830 }, { "epoch": 3.2689532218354698, "grad_norm": 1.1022802591323853, "learning_rate": 1.4167673977421446e-05, "loss": 0.0069, "step": 12835 }, { "epoch": 3.2702266745903725, "grad_norm": 1.1081335544586182, "learning_rate": 1.4163632487606263e-05, "loss": 0.0111, "step": 12840 }, { "epoch": 3.2715001273452753, "grad_norm": 1.1115998029708862, "learning_rate": 1.4159590174929993e-05, "loss": 0.013, "step": 12845 }, { "epoch": 3.2727735801001785, "grad_norm": 1.3195834159851074, "learning_rate": 1.4155547040191519e-05, "loss": 0.0098, "step": 12850 }, { "epoch": 3.2740470328550813, "grad_norm": 0.992899477481842, "learning_rate": 1.4151503084189888e-05, "loss": 0.0121, "step": 12855 }, { "epoch": 3.275320485609984, "grad_norm": 0.7567044496536255, "learning_rate": 1.4147458307724308e-05, "loss": 0.0084, "step": 12860 }, { "epoch": 3.2765939383648868, "grad_norm": 0.9151207208633423, "learning_rate": 1.4143412711594153e-05, "loss": 0.0107, "step": 12865 }, { "epoch": 3.2778673911197895, "grad_norm": 1.0128434896469116, "learning_rate": 1.4139366296598955e-05, "loss": 0.0097, "step": 12870 }, { "epoch": 3.2791408438746923, "grad_norm": 0.9327370524406433, "learning_rate": 1.4135319063538408e-05, "loss": 0.0133, "step": 12875 }, { "epoch": 3.280414296629595, "grad_norm": 1.460728645324707, "learning_rate": 1.4131271013212373e-05, "loss": 0.0089, "step": 12880 }, { "epoch": 3.281687749384498, "grad_norm": 0.7522826790809631, "learning_rate": 1.4127222146420864e-05, "loss": 0.007, "step": 12885 }, { "epoch": 3.2829612021394006, "grad_norm": 0.9324296712875366, "learning_rate": 1.4123172463964063e-05, "loss": 0.006, "step": 12890 }, { "epoch": 3.2842346548943033, "grad_norm": 1.0138051509857178, "learning_rate": 1.4119121966642311e-05, "loss": 0.0106, "step": 12895 }, { "epoch": 3.285508107649206, "grad_norm": 1.413987636566162, "learning_rate": 1.4115070655256112e-05, "loss": 0.0059, "step": 12900 }, { "epoch": 3.286781560404109, "grad_norm": 0.8273129463195801, "learning_rate": 1.4111018530606124e-05, "loss": 0.0122, "step": 12905 }, { "epoch": 3.2880550131590116, "grad_norm": 1.4478284120559692, "learning_rate": 1.4106965593493174e-05, "loss": 0.0121, "step": 12910 }, { "epoch": 3.289328465913915, "grad_norm": 2.4623632431030273, "learning_rate": 1.410291184471825e-05, "loss": 0.0084, "step": 12915 }, { "epoch": 3.2906019186688176, "grad_norm": 0.8420528173446655, "learning_rate": 1.4098857285082488e-05, "loss": 0.0085, "step": 12920 }, { "epoch": 3.2918753714237203, "grad_norm": 0.9810566902160645, "learning_rate": 1.40948019153872e-05, "loss": 0.0089, "step": 12925 }, { "epoch": 3.293148824178623, "grad_norm": 0.6936976313591003, "learning_rate": 1.4090745736433848e-05, "loss": 0.0103, "step": 12930 }, { "epoch": 3.294422276933526, "grad_norm": 0.563739001750946, "learning_rate": 1.4086688749024058e-05, "loss": 0.0095, "step": 12935 }, { "epoch": 3.2956957296884286, "grad_norm": 0.8645238280296326, "learning_rate": 1.4082630953959617e-05, "loss": 0.0074, "step": 12940 }, { "epoch": 3.2969691824433314, "grad_norm": 1.0323330163955688, "learning_rate": 1.4078572352042466e-05, "loss": 0.0101, "step": 12945 }, { "epoch": 3.298242635198234, "grad_norm": 6.630836486816406, "learning_rate": 1.4075324890113801e-05, "loss": 0.0361, "step": 12950 }, { "epoch": 3.299516087953137, "grad_norm": 0.9556183815002441, "learning_rate": 1.4071264837883182e-05, "loss": 0.0104, "step": 12955 }, { "epoch": 3.3007895407080396, "grad_norm": 2.409315347671509, "learning_rate": 1.4067203981046144e-05, "loss": 0.0084, "step": 12960 }, { "epoch": 3.3020629934629424, "grad_norm": 1.1236096620559692, "learning_rate": 1.4063954716799204e-05, "loss": 0.0099, "step": 12965 }, { "epoch": 3.303336446217845, "grad_norm": 3.5954792499542236, "learning_rate": 1.4059892413693148e-05, "loss": 0.0141, "step": 12970 }, { "epoch": 3.304609898972748, "grad_norm": 0.6263399124145508, "learning_rate": 1.4055829308228215e-05, "loss": 0.0129, "step": 12975 }, { "epoch": 3.305883351727651, "grad_norm": 1.304831624031067, "learning_rate": 1.4051765401207399e-05, "loss": 0.0097, "step": 12980 }, { "epoch": 3.307156804482554, "grad_norm": 0.9082533717155457, "learning_rate": 1.4047700693433845e-05, "loss": 0.0094, "step": 12985 }, { "epoch": 3.3084302572374567, "grad_norm": 1.7635918855667114, "learning_rate": 1.4043635185710874e-05, "loss": 0.0145, "step": 12990 }, { "epoch": 3.3097037099923594, "grad_norm": 1.868361234664917, "learning_rate": 1.403956887884195e-05, "loss": 0.0105, "step": 12995 }, { "epoch": 3.310977162747262, "grad_norm": 1.2272725105285645, "learning_rate": 1.4035501773630699e-05, "loss": 0.0122, "step": 13000 }, { "epoch": 3.312250615502165, "grad_norm": 1.0493272542953491, "learning_rate": 1.4031433870880907e-05, "loss": 0.0095, "step": 13005 }, { "epoch": 3.3135240682570677, "grad_norm": 1.6527961492538452, "learning_rate": 1.4027365171396512e-05, "loss": 0.0087, "step": 13010 }, { "epoch": 3.3147975210119704, "grad_norm": 1.1979049444198608, "learning_rate": 1.4023295675981618e-05, "loss": 0.0106, "step": 13015 }, { "epoch": 3.316070973766873, "grad_norm": 1.004393219947815, "learning_rate": 1.401922538544048e-05, "loss": 0.0126, "step": 13020 }, { "epoch": 3.317344426521776, "grad_norm": 0.8670983910560608, "learning_rate": 1.4015154300577514e-05, "loss": 0.0087, "step": 13025 }, { "epoch": 3.3186178792766787, "grad_norm": 1.792157530784607, "learning_rate": 1.4011082422197287e-05, "loss": 0.0157, "step": 13030 }, { "epoch": 3.3198913320315815, "grad_norm": 1.1015547513961792, "learning_rate": 1.4007009751104527e-05, "loss": 0.0103, "step": 13035 }, { "epoch": 3.3211647847864842, "grad_norm": 1.0951308012008667, "learning_rate": 1.4002936288104123e-05, "loss": 0.0079, "step": 13040 }, { "epoch": 3.3224382375413875, "grad_norm": 0.5278254747390747, "learning_rate": 1.3998862034001112e-05, "loss": 0.008, "step": 13045 }, { "epoch": 3.3237116902962898, "grad_norm": 0.9112992286682129, "learning_rate": 1.399478698960069e-05, "loss": 0.0091, "step": 13050 }, { "epoch": 3.324985143051193, "grad_norm": 1.0323207378387451, "learning_rate": 1.3990711155708215e-05, "loss": 0.0085, "step": 13055 }, { "epoch": 3.3262585958060957, "grad_norm": 1.286510944366455, "learning_rate": 1.3986634533129194e-05, "loss": 0.0093, "step": 13060 }, { "epoch": 3.3275320485609985, "grad_norm": 0.5610948204994202, "learning_rate": 1.3982557122669286e-05, "loss": 0.0094, "step": 13065 }, { "epoch": 3.3288055013159012, "grad_norm": 0.7205216884613037, "learning_rate": 1.3978478925134325e-05, "loss": 0.0086, "step": 13070 }, { "epoch": 3.330078954070804, "grad_norm": 1.6725906133651733, "learning_rate": 1.3974399941330276e-05, "loss": 0.011, "step": 13075 }, { "epoch": 3.3313524068257068, "grad_norm": 1.08562433719635, "learning_rate": 1.3970320172063277e-05, "loss": 0.0087, "step": 13080 }, { "epoch": 3.3326258595806095, "grad_norm": 0.7954168915748596, "learning_rate": 1.3966239618139614e-05, "loss": 0.0075, "step": 13085 }, { "epoch": 3.3338993123355123, "grad_norm": 0.617902934551239, "learning_rate": 1.396215828036573e-05, "loss": 0.0091, "step": 13090 }, { "epoch": 3.335172765090415, "grad_norm": 1.128908395767212, "learning_rate": 1.3958076159548219e-05, "loss": 0.0077, "step": 13095 }, { "epoch": 3.336446217845318, "grad_norm": 0.8558573722839355, "learning_rate": 1.3953993256493834e-05, "loss": 0.0099, "step": 13100 }, { "epoch": 3.3377196706002206, "grad_norm": 1.5348074436187744, "learning_rate": 1.3949909572009484e-05, "loss": 0.0114, "step": 13105 }, { "epoch": 3.3389931233551238, "grad_norm": 0.7545166015625, "learning_rate": 1.3945825106902227e-05, "loss": 0.0091, "step": 13110 }, { "epoch": 3.340266576110026, "grad_norm": 0.7150624394416809, "learning_rate": 1.394173986197928e-05, "loss": 0.0104, "step": 13115 }, { "epoch": 3.3415400288649293, "grad_norm": 1.7483181953430176, "learning_rate": 1.3937653838048012e-05, "loss": 0.0099, "step": 13120 }, { "epoch": 3.342813481619832, "grad_norm": 0.9431474208831787, "learning_rate": 1.3933567035915943e-05, "loss": 0.013, "step": 13125 }, { "epoch": 3.344086934374735, "grad_norm": 1.11593496799469, "learning_rate": 1.3929479456390756e-05, "loss": 0.0077, "step": 13130 }, { "epoch": 3.3453603871296376, "grad_norm": 1.570478916168213, "learning_rate": 1.3925391100280279e-05, "loss": 0.0117, "step": 13135 }, { "epoch": 3.3466338398845403, "grad_norm": 0.9195541739463806, "learning_rate": 1.3921301968392493e-05, "loss": 0.0091, "step": 13140 }, { "epoch": 3.347907292639443, "grad_norm": 0.8666048645973206, "learning_rate": 1.3917212061535542e-05, "loss": 0.0088, "step": 13145 }, { "epoch": 3.349180745394346, "grad_norm": 0.9350867867469788, "learning_rate": 1.3913121380517711e-05, "loss": 0.0109, "step": 13150 }, { "epoch": 3.3504541981492486, "grad_norm": 3.5957133769989014, "learning_rate": 1.3909029926147449e-05, "loss": 0.019, "step": 13155 }, { "epoch": 3.3517276509041514, "grad_norm": 2.197162628173828, "learning_rate": 1.3904937699233342e-05, "loss": 0.0168, "step": 13160 }, { "epoch": 3.353001103659054, "grad_norm": 0.7568861246109009, "learning_rate": 1.3900844700584154e-05, "loss": 0.0079, "step": 13165 }, { "epoch": 3.354274556413957, "grad_norm": 0.9339295029640198, "learning_rate": 1.3896750931008773e-05, "loss": 0.0098, "step": 13170 }, { "epoch": 3.3555480091688596, "grad_norm": 0.7870463728904724, "learning_rate": 1.389265639131626e-05, "loss": 0.0084, "step": 13175 }, { "epoch": 3.3568214619237624, "grad_norm": 0.8547097444534302, "learning_rate": 1.3888561082315823e-05, "loss": 0.008, "step": 13180 }, { "epoch": 3.3580949146786656, "grad_norm": 0.9921444058418274, "learning_rate": 1.3884465004816815e-05, "loss": 0.0127, "step": 13185 }, { "epoch": 3.3593683674335684, "grad_norm": 1.2102794647216797, "learning_rate": 1.388036815962875e-05, "loss": 0.007, "step": 13190 }, { "epoch": 3.360641820188471, "grad_norm": 0.5298036932945251, "learning_rate": 1.387627054756129e-05, "loss": 0.0094, "step": 13195 }, { "epoch": 3.361915272943374, "grad_norm": 0.8147972226142883, "learning_rate": 1.3872172169424246e-05, "loss": 0.0057, "step": 13200 }, { "epoch": 3.3631887256982766, "grad_norm": 0.638422429561615, "learning_rate": 1.3868073026027582e-05, "loss": 0.0099, "step": 13205 }, { "epoch": 3.3644621784531794, "grad_norm": 1.777901530265808, "learning_rate": 1.3863973118181417e-05, "loss": 0.0112, "step": 13210 }, { "epoch": 3.365735631208082, "grad_norm": 0.9581437110900879, "learning_rate": 1.3859872446696019e-05, "loss": 0.0111, "step": 13215 }, { "epoch": 3.367009083962985, "grad_norm": 0.8425670266151428, "learning_rate": 1.3855771012381802e-05, "loss": 0.0071, "step": 13220 }, { "epoch": 3.3682825367178877, "grad_norm": 0.7759663462638855, "learning_rate": 1.3851668816049337e-05, "loss": 0.0063, "step": 13225 }, { "epoch": 3.3695559894727904, "grad_norm": 0.6638793349266052, "learning_rate": 1.3847565858509344e-05, "loss": 0.0089, "step": 13230 }, { "epoch": 3.370829442227693, "grad_norm": 1.564316749572754, "learning_rate": 1.3843462140572692e-05, "loss": 0.0091, "step": 13235 }, { "epoch": 3.372102894982596, "grad_norm": 1.130937099456787, "learning_rate": 1.38393576630504e-05, "loss": 0.0138, "step": 13240 }, { "epoch": 3.3733763477374987, "grad_norm": 1.5405043363571167, "learning_rate": 1.3835252426753645e-05, "loss": 0.0108, "step": 13245 }, { "epoch": 3.374649800492402, "grad_norm": 0.5906837582588196, "learning_rate": 1.3831146432493737e-05, "loss": 0.0103, "step": 13250 }, { "epoch": 3.3759232532473047, "grad_norm": 0.6210595965385437, "learning_rate": 1.3827039681082152e-05, "loss": 0.0089, "step": 13255 }, { "epoch": 3.3771967060022074, "grad_norm": 1.0437562465667725, "learning_rate": 1.382293217333051e-05, "loss": 0.01, "step": 13260 }, { "epoch": 3.37847015875711, "grad_norm": 1.092686414718628, "learning_rate": 1.381882391005058e-05, "loss": 0.0075, "step": 13265 }, { "epoch": 3.379743611512013, "grad_norm": 0.6011712551116943, "learning_rate": 1.3814714892054273e-05, "loss": 0.0071, "step": 13270 }, { "epoch": 3.3810170642669157, "grad_norm": 1.1592607498168945, "learning_rate": 1.3810605120153666e-05, "loss": 0.0095, "step": 13275 }, { "epoch": 3.3822905170218185, "grad_norm": 0.5839558243751526, "learning_rate": 1.3806494595160973e-05, "loss": 0.008, "step": 13280 }, { "epoch": 3.3835639697767212, "grad_norm": 0.8893694281578064, "learning_rate": 1.3802383317888557e-05, "loss": 0.0092, "step": 13285 }, { "epoch": 3.384837422531624, "grad_norm": 0.43400847911834717, "learning_rate": 1.3798271289148937e-05, "loss": 0.0086, "step": 13290 }, { "epoch": 3.3861108752865268, "grad_norm": 1.2590241432189941, "learning_rate": 1.3794158509754771e-05, "loss": 0.0091, "step": 13295 }, { "epoch": 3.3873843280414295, "grad_norm": 1.1739109754562378, "learning_rate": 1.3790044980518869e-05, "loss": 0.01, "step": 13300 }, { "epoch": 3.3886577807963323, "grad_norm": 0.5348182320594788, "learning_rate": 1.3785930702254195e-05, "loss": 0.0085, "step": 13305 }, { "epoch": 3.389931233551235, "grad_norm": 1.5695610046386719, "learning_rate": 1.3781815675773853e-05, "loss": 0.0102, "step": 13310 }, { "epoch": 3.3912046863061382, "grad_norm": 1.2886196374893188, "learning_rate": 1.3777699901891098e-05, "loss": 0.0132, "step": 13315 }, { "epoch": 3.392478139061041, "grad_norm": 0.695736825466156, "learning_rate": 1.3773583381419336e-05, "loss": 0.0093, "step": 13320 }, { "epoch": 3.3937515918159438, "grad_norm": 0.8407790064811707, "learning_rate": 1.3769466115172115e-05, "loss": 0.0104, "step": 13325 }, { "epoch": 3.3950250445708465, "grad_norm": 1.0694512128829956, "learning_rate": 1.3765348103963128e-05, "loss": 0.0109, "step": 13330 }, { "epoch": 3.3962984973257493, "grad_norm": 1.1347581148147583, "learning_rate": 1.3761229348606229e-05, "loss": 0.0108, "step": 13335 }, { "epoch": 3.397571950080652, "grad_norm": 0.8780020475387573, "learning_rate": 1.3757109849915404e-05, "loss": 0.0103, "step": 13340 }, { "epoch": 3.398845402835555, "grad_norm": 0.37128135561943054, "learning_rate": 1.3752989608704791e-05, "loss": 0.0103, "step": 13345 }, { "epoch": 3.4001188555904576, "grad_norm": 1.1814141273498535, "learning_rate": 1.3748868625788682e-05, "loss": 0.0067, "step": 13350 }, { "epoch": 3.4013923083453603, "grad_norm": 0.8408476114273071, "learning_rate": 1.3744746901981504e-05, "loss": 0.0088, "step": 13355 }, { "epoch": 3.402665761100263, "grad_norm": 2.178257942199707, "learning_rate": 1.3740624438097837e-05, "loss": 0.0107, "step": 13360 }, { "epoch": 3.403939213855166, "grad_norm": 0.4720776677131653, "learning_rate": 1.3736501234952404e-05, "loss": 0.0063, "step": 13365 }, { "epoch": 3.4052126666100686, "grad_norm": 0.7370906472206116, "learning_rate": 1.3732377293360081e-05, "loss": 0.0063, "step": 13370 }, { "epoch": 3.4064861193649714, "grad_norm": 0.5578373074531555, "learning_rate": 1.3728252614135879e-05, "loss": 0.0092, "step": 13375 }, { "epoch": 3.4077595721198746, "grad_norm": 1.1673990488052368, "learning_rate": 1.3724127198094965e-05, "loss": 0.0085, "step": 13380 }, { "epoch": 3.4090330248747773, "grad_norm": 1.0913996696472168, "learning_rate": 1.3720001046052646e-05, "loss": 0.0114, "step": 13385 }, { "epoch": 3.41030647762968, "grad_norm": 1.0813415050506592, "learning_rate": 1.3715874158824376e-05, "loss": 0.0108, "step": 13390 }, { "epoch": 3.411579930384583, "grad_norm": 1.1715357303619385, "learning_rate": 1.3711746537225751e-05, "loss": 0.0078, "step": 13395 }, { "epoch": 3.4128533831394856, "grad_norm": 1.0048853158950806, "learning_rate": 1.3707618182072522e-05, "loss": 0.0084, "step": 13400 }, { "epoch": 3.4141268358943884, "grad_norm": 1.8618100881576538, "learning_rate": 1.3703489094180574e-05, "loss": 0.0079, "step": 13405 }, { "epoch": 3.415400288649291, "grad_norm": 0.7561237215995789, "learning_rate": 1.3699359274365938e-05, "loss": 0.0081, "step": 13410 }, { "epoch": 3.416673741404194, "grad_norm": 1.3355501890182495, "learning_rate": 1.3695228723444798e-05, "loss": 0.0076, "step": 13415 }, { "epoch": 3.4179471941590966, "grad_norm": 0.7274786829948425, "learning_rate": 1.3691097442233474e-05, "loss": 0.008, "step": 13420 }, { "epoch": 3.4192206469139994, "grad_norm": 0.6963048577308655, "learning_rate": 1.3686965431548435e-05, "loss": 0.0094, "step": 13425 }, { "epoch": 3.420494099668902, "grad_norm": 1.06816828250885, "learning_rate": 1.3682832692206292e-05, "loss": 0.0099, "step": 13430 }, { "epoch": 3.421767552423805, "grad_norm": 0.7290567755699158, "learning_rate": 1.3678699225023802e-05, "loss": 0.0079, "step": 13435 }, { "epoch": 3.4230410051787077, "grad_norm": 1.2746261358261108, "learning_rate": 1.3674565030817863e-05, "loss": 0.0097, "step": 13440 }, { "epoch": 3.424314457933611, "grad_norm": 0.9035087823867798, "learning_rate": 1.367043011040552e-05, "loss": 0.0084, "step": 13445 }, { "epoch": 3.4255879106885136, "grad_norm": 0.7565810680389404, "learning_rate": 1.3666294464603955e-05, "loss": 0.0077, "step": 13450 }, { "epoch": 3.4268613634434164, "grad_norm": 0.9630064964294434, "learning_rate": 1.3662158094230504e-05, "loss": 0.0088, "step": 13455 }, { "epoch": 3.428134816198319, "grad_norm": 1.1513639688491821, "learning_rate": 1.3658021000102638e-05, "loss": 0.0115, "step": 13460 }, { "epoch": 3.429408268953222, "grad_norm": 0.22320032119750977, "learning_rate": 1.365388318303797e-05, "loss": 0.0065, "step": 13465 }, { "epoch": 3.4306817217081247, "grad_norm": 1.88547682762146, "learning_rate": 1.3649744643854264e-05, "loss": 0.0101, "step": 13470 }, { "epoch": 3.4319551744630274, "grad_norm": 0.8783929347991943, "learning_rate": 1.364560538336942e-05, "loss": 0.0095, "step": 13475 }, { "epoch": 3.43322862721793, "grad_norm": 1.179524540901184, "learning_rate": 1.3641465402401481e-05, "loss": 0.0109, "step": 13480 }, { "epoch": 3.434502079972833, "grad_norm": 1.8061537742614746, "learning_rate": 1.3637324701768635e-05, "loss": 0.0133, "step": 13485 }, { "epoch": 3.4357755327277357, "grad_norm": 1.2372015714645386, "learning_rate": 1.3633183282289215e-05, "loss": 0.009, "step": 13490 }, { "epoch": 3.4370489854826385, "grad_norm": 0.6873633861541748, "learning_rate": 1.3629041144781685e-05, "loss": 0.0091, "step": 13495 }, { "epoch": 3.4383224382375412, "grad_norm": 1.07859206199646, "learning_rate": 1.3624898290064661e-05, "loss": 0.0104, "step": 13500 }, { "epoch": 3.439595890992444, "grad_norm": 1.1964683532714844, "learning_rate": 1.3620754718956901e-05, "loss": 0.0104, "step": 13505 }, { "epoch": 3.440869343747347, "grad_norm": 0.713410496711731, "learning_rate": 1.3616610432277298e-05, "loss": 0.0079, "step": 13510 }, { "epoch": 3.4421427965022495, "grad_norm": 0.9085888862609863, "learning_rate": 1.361246543084489e-05, "loss": 0.0095, "step": 13515 }, { "epoch": 3.4434162492571527, "grad_norm": 0.8952046036720276, "learning_rate": 1.3608319715478854e-05, "loss": 0.0102, "step": 13520 }, { "epoch": 3.4446897020120555, "grad_norm": 0.6662551760673523, "learning_rate": 1.3604173286998514e-05, "loss": 0.0065, "step": 13525 }, { "epoch": 3.4459631547669582, "grad_norm": 1.2443616390228271, "learning_rate": 1.3600026146223331e-05, "loss": 0.0126, "step": 13530 }, { "epoch": 3.447236607521861, "grad_norm": 0.9595752954483032, "learning_rate": 1.3595878293972904e-05, "loss": 0.01, "step": 13535 }, { "epoch": 3.4485100602767638, "grad_norm": 0.6445420980453491, "learning_rate": 1.3591729731066976e-05, "loss": 0.0075, "step": 13540 }, { "epoch": 3.4497835130316665, "grad_norm": 1.2048126459121704, "learning_rate": 1.3587580458325432e-05, "loss": 0.0142, "step": 13545 }, { "epoch": 3.4510569657865693, "grad_norm": 0.9119572043418884, "learning_rate": 1.3583430476568292e-05, "loss": 0.0103, "step": 13550 }, { "epoch": 3.452330418541472, "grad_norm": 0.9824792742729187, "learning_rate": 1.3579279786615722e-05, "loss": 0.0094, "step": 13555 }, { "epoch": 3.453603871296375, "grad_norm": 1.440169334411621, "learning_rate": 1.3575128389288026e-05, "loss": 0.0088, "step": 13560 }, { "epoch": 3.4548773240512776, "grad_norm": 1.304409384727478, "learning_rate": 1.3570976285405641e-05, "loss": 0.0156, "step": 13565 }, { "epoch": 3.4561507768061803, "grad_norm": 0.920400083065033, "learning_rate": 1.3566823475789159e-05, "loss": 0.0066, "step": 13570 }, { "epoch": 3.4574242295610835, "grad_norm": 1.0626486539840698, "learning_rate": 1.3562669961259296e-05, "loss": 0.01, "step": 13575 }, { "epoch": 3.458697682315986, "grad_norm": 1.1382886171340942, "learning_rate": 1.3558515742636912e-05, "loss": 0.0107, "step": 13580 }, { "epoch": 3.459971135070889, "grad_norm": 0.6963071227073669, "learning_rate": 1.3554360820743014e-05, "loss": 0.0119, "step": 13585 }, { "epoch": 3.461244587825792, "grad_norm": 1.7662941217422485, "learning_rate": 1.3550205196398736e-05, "loss": 0.0117, "step": 13590 }, { "epoch": 3.4625180405806946, "grad_norm": 0.9851407408714294, "learning_rate": 1.3546048870425356e-05, "loss": 0.0099, "step": 13595 }, { "epoch": 3.4637914933355973, "grad_norm": 0.8964735269546509, "learning_rate": 1.3541891843644297e-05, "loss": 0.009, "step": 13600 }, { "epoch": 3.4650649460905, "grad_norm": 1.3268988132476807, "learning_rate": 1.353773411687711e-05, "loss": 0.0076, "step": 13605 }, { "epoch": 3.466338398845403, "grad_norm": 0.8892123103141785, "learning_rate": 1.3533575690945489e-05, "loss": 0.0135, "step": 13610 }, { "epoch": 3.4676118516003056, "grad_norm": 1.053755521774292, "learning_rate": 1.3529416566671269e-05, "loss": 0.0099, "step": 13615 }, { "epoch": 3.4688853043552084, "grad_norm": 0.8502674102783203, "learning_rate": 1.3525256744876419e-05, "loss": 0.0076, "step": 13620 }, { "epoch": 3.470158757110111, "grad_norm": 1.26726233959198, "learning_rate": 1.352109622638304e-05, "loss": 0.0086, "step": 13625 }, { "epoch": 3.471432209865014, "grad_norm": 0.9822936654090881, "learning_rate": 1.3516935012013392e-05, "loss": 0.0112, "step": 13630 }, { "epoch": 3.4727056626199166, "grad_norm": 1.0541630983352661, "learning_rate": 1.3512773102589847e-05, "loss": 0.0129, "step": 13635 }, { "epoch": 3.4739791153748194, "grad_norm": 1.045680284500122, "learning_rate": 1.3508610498934929e-05, "loss": 0.0114, "step": 13640 }, { "epoch": 3.475252568129722, "grad_norm": 1.7344952821731567, "learning_rate": 1.3504447201871291e-05, "loss": 0.0115, "step": 13645 }, { "epoch": 3.4765260208846254, "grad_norm": 0.8467074036598206, "learning_rate": 1.3500283212221739e-05, "loss": 0.0094, "step": 13650 }, { "epoch": 3.477799473639528, "grad_norm": 1.3725371360778809, "learning_rate": 1.3496118530809195e-05, "loss": 0.0142, "step": 13655 }, { "epoch": 3.479072926394431, "grad_norm": 0.9725553393363953, "learning_rate": 1.3491953158456727e-05, "loss": 0.0089, "step": 13660 }, { "epoch": 3.4803463791493336, "grad_norm": 1.1722218990325928, "learning_rate": 1.3487787095987548e-05, "loss": 0.0079, "step": 13665 }, { "epoch": 3.4816198319042364, "grad_norm": 0.9052077531814575, "learning_rate": 1.3483620344224993e-05, "loss": 0.0061, "step": 13670 }, { "epoch": 3.482893284659139, "grad_norm": 0.44864198565483093, "learning_rate": 1.3479452903992537e-05, "loss": 0.0093, "step": 13675 }, { "epoch": 3.484166737414042, "grad_norm": 1.068482518196106, "learning_rate": 1.3475284776113803e-05, "loss": 0.0128, "step": 13680 }, { "epoch": 3.4854401901689447, "grad_norm": 1.0896199941635132, "learning_rate": 1.3471115961412536e-05, "loss": 0.008, "step": 13685 }, { "epoch": 3.4867136429238474, "grad_norm": 0.709388792514801, "learning_rate": 1.3466946460712615e-05, "loss": 0.0092, "step": 13690 }, { "epoch": 3.48798709567875, "grad_norm": 0.7642784714698792, "learning_rate": 1.346277627483807e-05, "loss": 0.0097, "step": 13695 }, { "epoch": 3.489260548433653, "grad_norm": 0.6806069612503052, "learning_rate": 1.3458605404613052e-05, "loss": 0.0095, "step": 13700 }, { "epoch": 3.4905340011885557, "grad_norm": 0.5345680117607117, "learning_rate": 1.3454433850861854e-05, "loss": 0.0065, "step": 13705 }, { "epoch": 3.4918074539434585, "grad_norm": 1.4732743501663208, "learning_rate": 1.3450261614408903e-05, "loss": 0.0109, "step": 13710 }, { "epoch": 3.4930809066983617, "grad_norm": 0.8274696469306946, "learning_rate": 1.3446088696078763e-05, "loss": 0.0111, "step": 13715 }, { "epoch": 3.4943543594532644, "grad_norm": 0.43045222759246826, "learning_rate": 1.3441915096696123e-05, "loss": 0.0084, "step": 13720 }, { "epoch": 3.495627812208167, "grad_norm": 0.6319058537483215, "learning_rate": 1.343774081708582e-05, "loss": 0.0087, "step": 13725 }, { "epoch": 3.49690126496307, "grad_norm": 0.5920105576515198, "learning_rate": 1.343356585807282e-05, "loss": 0.0089, "step": 13730 }, { "epoch": 3.4981747177179727, "grad_norm": 1.4330083131790161, "learning_rate": 1.3429390220482217e-05, "loss": 0.0081, "step": 13735 }, { "epoch": 3.4994481704728755, "grad_norm": 0.8496478199958801, "learning_rate": 1.3425213905139247e-05, "loss": 0.0122, "step": 13740 }, { "epoch": 3.5007216232277782, "grad_norm": 0.9803323745727539, "learning_rate": 1.342103691286928e-05, "loss": 0.0092, "step": 13745 }, { "epoch": 3.501995075982681, "grad_norm": 0.7847593426704407, "learning_rate": 1.3416859244497815e-05, "loss": 0.0063, "step": 13750 }, { "epoch": 3.5032685287375838, "grad_norm": 1.3546315431594849, "learning_rate": 1.341268090085049e-05, "loss": 0.0084, "step": 13755 }, { "epoch": 3.5045419814924865, "grad_norm": 0.7166176438331604, "learning_rate": 1.3408501882753067e-05, "loss": 0.0066, "step": 13760 }, { "epoch": 3.5058154342473893, "grad_norm": 1.2568159103393555, "learning_rate": 1.3404322191031454e-05, "loss": 0.0106, "step": 13765 }, { "epoch": 3.5070888870022925, "grad_norm": 0.8856304883956909, "learning_rate": 1.3400141826511686e-05, "loss": 0.01, "step": 13770 }, { "epoch": 3.508362339757195, "grad_norm": 0.6926121711730957, "learning_rate": 1.3395960790019926e-05, "loss": 0.008, "step": 13775 }, { "epoch": 3.509635792512098, "grad_norm": 1.2955313920974731, "learning_rate": 1.3391779082382477e-05, "loss": 0.0121, "step": 13780 }, { "epoch": 3.5109092452670003, "grad_norm": 1.331619381904602, "learning_rate": 1.3387596704425773e-05, "loss": 0.0107, "step": 13785 }, { "epoch": 3.5121826980219035, "grad_norm": 1.0107563734054565, "learning_rate": 1.338341365697638e-05, "loss": 0.0103, "step": 13790 }, { "epoch": 3.5134561507768063, "grad_norm": 1.2266169786453247, "learning_rate": 1.3379229940860996e-05, "loss": 0.0084, "step": 13795 }, { "epoch": 3.514729603531709, "grad_norm": 0.642680287361145, "learning_rate": 1.3375045556906448e-05, "loss": 0.0093, "step": 13800 }, { "epoch": 3.516003056286612, "grad_norm": 0.39588800072669983, "learning_rate": 1.3370860505939705e-05, "loss": 0.0108, "step": 13805 }, { "epoch": 3.5172765090415146, "grad_norm": 1.2133573293685913, "learning_rate": 1.3366674788787854e-05, "loss": 0.0084, "step": 13810 }, { "epoch": 3.5185499617964173, "grad_norm": 0.9067606329917908, "learning_rate": 1.3362488406278122e-05, "loss": 0.0088, "step": 13815 }, { "epoch": 3.51982341455132, "grad_norm": 2.1573104858398438, "learning_rate": 1.335830135923787e-05, "loss": 0.0091, "step": 13820 }, { "epoch": 3.521096867306223, "grad_norm": 0.8378525376319885, "learning_rate": 1.3354113648494586e-05, "loss": 0.0089, "step": 13825 }, { "epoch": 3.5223703200611256, "grad_norm": 1.1230543851852417, "learning_rate": 1.3349925274875885e-05, "loss": 0.0063, "step": 13830 }, { "epoch": 3.5236437728160284, "grad_norm": 1.4104139804840088, "learning_rate": 1.3345736239209523e-05, "loss": 0.0077, "step": 13835 }, { "epoch": 3.524917225570931, "grad_norm": 1.3299087285995483, "learning_rate": 1.334154654232338e-05, "loss": 0.0152, "step": 13840 }, { "epoch": 3.5261906783258343, "grad_norm": 1.0745011568069458, "learning_rate": 1.3337356185045462e-05, "loss": 0.0106, "step": 13845 }, { "epoch": 3.5274641310807366, "grad_norm": 0.9627993702888489, "learning_rate": 1.3333165168203924e-05, "loss": 0.0077, "step": 13850 }, { "epoch": 3.52873758383564, "grad_norm": 0.22850720584392548, "learning_rate": 1.3328973492627029e-05, "loss": 0.0077, "step": 13855 }, { "epoch": 3.5300110365905426, "grad_norm": 1.0009938478469849, "learning_rate": 1.3324781159143182e-05, "loss": 0.0111, "step": 13860 }, { "epoch": 3.5312844893454454, "grad_norm": 0.6334408521652222, "learning_rate": 1.3320588168580922e-05, "loss": 0.0081, "step": 13865 }, { "epoch": 3.532557942100348, "grad_norm": 0.6526767611503601, "learning_rate": 1.3316394521768904e-05, "loss": 0.0116, "step": 13870 }, { "epoch": 3.533831394855251, "grad_norm": 0.977640688419342, "learning_rate": 1.3312200219535925e-05, "loss": 0.0099, "step": 13875 }, { "epoch": 3.5351048476101536, "grad_norm": 2.1175060272216797, "learning_rate": 1.3308005262710908e-05, "loss": 0.0109, "step": 13880 }, { "epoch": 3.5363783003650564, "grad_norm": 1.0649337768554688, "learning_rate": 1.3303809652122905e-05, "loss": 0.0095, "step": 13885 }, { "epoch": 3.537651753119959, "grad_norm": 1.0738050937652588, "learning_rate": 1.3299613388601094e-05, "loss": 0.0108, "step": 13890 }, { "epoch": 3.538925205874862, "grad_norm": 0.9581064581871033, "learning_rate": 1.3295416472974789e-05, "loss": 0.0107, "step": 13895 }, { "epoch": 3.5401986586297647, "grad_norm": 0.8618133068084717, "learning_rate": 1.3291218906073427e-05, "loss": 0.0065, "step": 13900 }, { "epoch": 3.5414721113846674, "grad_norm": 1.4013538360595703, "learning_rate": 1.3287020688726574e-05, "loss": 0.0075, "step": 13905 }, { "epoch": 3.5427455641395706, "grad_norm": 0.7984133362770081, "learning_rate": 1.3282821821763925e-05, "loss": 0.0092, "step": 13910 }, { "epoch": 3.544019016894473, "grad_norm": 0.6969540119171143, "learning_rate": 1.327862230601531e-05, "loss": 0.0095, "step": 13915 }, { "epoch": 3.545292469649376, "grad_norm": 0.3136398494243622, "learning_rate": 1.3274422142310676e-05, "loss": 0.0129, "step": 13920 }, { "epoch": 3.546565922404279, "grad_norm": 1.0050917863845825, "learning_rate": 1.327022133148011e-05, "loss": 0.0129, "step": 13925 }, { "epoch": 3.5478393751591817, "grad_norm": 1.697398066520691, "learning_rate": 1.3266019874353812e-05, "loss": 0.0087, "step": 13930 }, { "epoch": 3.5491128279140844, "grad_norm": 0.5727803707122803, "learning_rate": 1.3261817771762129e-05, "loss": 0.0122, "step": 13935 }, { "epoch": 3.550386280668987, "grad_norm": 1.0166378021240234, "learning_rate": 1.3257615024535513e-05, "loss": 0.01, "step": 13940 }, { "epoch": 3.55165973342389, "grad_norm": 0.8266984224319458, "learning_rate": 1.3253411633504567e-05, "loss": 0.0098, "step": 13945 }, { "epoch": 3.5529331861787927, "grad_norm": 0.9469349384307861, "learning_rate": 1.3249207599500002e-05, "loss": 0.0129, "step": 13950 }, { "epoch": 3.5542066389336955, "grad_norm": 1.0084362030029297, "learning_rate": 1.3245002923352665e-05, "loss": 0.0062, "step": 13955 }, { "epoch": 3.5554800916885982, "grad_norm": 0.8867306113243103, "learning_rate": 1.3240797605893532e-05, "loss": 0.0112, "step": 13960 }, { "epoch": 3.556753544443501, "grad_norm": 1.2668287754058838, "learning_rate": 1.3236591647953701e-05, "loss": 0.0089, "step": 13965 }, { "epoch": 3.5580269971984038, "grad_norm": 1.2807425260543823, "learning_rate": 1.323238505036439e-05, "loss": 0.0084, "step": 13970 }, { "epoch": 3.559300449953307, "grad_norm": 1.3571391105651855, "learning_rate": 1.3228177813956963e-05, "loss": 0.0105, "step": 13975 }, { "epoch": 3.5605739027082093, "grad_norm": 1.21644127368927, "learning_rate": 1.3223969939562895e-05, "loss": 0.0097, "step": 13980 }, { "epoch": 3.5618473554631125, "grad_norm": 1.0587431192398071, "learning_rate": 1.3219761428013787e-05, "loss": 0.0072, "step": 13985 }, { "epoch": 3.5631208082180152, "grad_norm": 0.8100307583808899, "learning_rate": 1.3215552280141373e-05, "loss": 0.0082, "step": 13990 }, { "epoch": 3.564394260972918, "grad_norm": 1.5567682981491089, "learning_rate": 1.3211342496777509e-05, "loss": 0.0151, "step": 13995 }, { "epoch": 3.5656677137278208, "grad_norm": 1.0338928699493408, "learning_rate": 1.3207132078754171e-05, "loss": 0.0095, "step": 14000 }, { "epoch": 3.5669411664827235, "grad_norm": 0.6664203405380249, "learning_rate": 1.3202921026903475e-05, "loss": 0.0077, "step": 14005 }, { "epoch": 3.5682146192376263, "grad_norm": 1.624936819076538, "learning_rate": 1.3198709342057651e-05, "loss": 0.0126, "step": 14010 }, { "epoch": 3.569488071992529, "grad_norm": 1.476464033126831, "learning_rate": 1.3194497025049052e-05, "loss": 0.013, "step": 14015 }, { "epoch": 3.570761524747432, "grad_norm": 0.2657676637172699, "learning_rate": 1.3190284076710168e-05, "loss": 0.0113, "step": 14020 }, { "epoch": 3.5720349775023346, "grad_norm": 0.8722571134567261, "learning_rate": 1.3186070497873602e-05, "loss": 0.0086, "step": 14025 }, { "epoch": 3.5733084302572373, "grad_norm": 0.2473885715007782, "learning_rate": 1.3181856289372086e-05, "loss": 0.0078, "step": 14030 }, { "epoch": 3.57458188301214, "grad_norm": 1.2542897462844849, "learning_rate": 1.3177641452038476e-05, "loss": 0.0098, "step": 14035 }, { "epoch": 3.5758553357670433, "grad_norm": 0.7918240427970886, "learning_rate": 1.3173425986705756e-05, "loss": 0.0129, "step": 14040 }, { "epoch": 3.5771287885219456, "grad_norm": 0.6455349922180176, "learning_rate": 1.316920989420703e-05, "loss": 0.0072, "step": 14045 }, { "epoch": 3.578402241276849, "grad_norm": 1.312945008277893, "learning_rate": 1.3164993175375522e-05, "loss": 0.0075, "step": 14050 }, { "epoch": 3.5796756940317516, "grad_norm": 1.1266032457351685, "learning_rate": 1.316077583104459e-05, "loss": 0.0096, "step": 14055 }, { "epoch": 3.5809491467866543, "grad_norm": 0.5513628721237183, "learning_rate": 1.3156557862047707e-05, "loss": 0.0076, "step": 14060 }, { "epoch": 3.582222599541557, "grad_norm": 0.4831436574459076, "learning_rate": 1.3152339269218476e-05, "loss": 0.0087, "step": 14065 }, { "epoch": 3.58349605229646, "grad_norm": 0.8577108979225159, "learning_rate": 1.3148120053390616e-05, "loss": 0.0098, "step": 14070 }, { "epoch": 3.5847695050513626, "grad_norm": 1.393559217453003, "learning_rate": 1.3143900215397976e-05, "loss": 0.0113, "step": 14075 }, { "epoch": 3.5860429578062654, "grad_norm": 1.1251647472381592, "learning_rate": 1.3139679756074526e-05, "loss": 0.0077, "step": 14080 }, { "epoch": 3.587316410561168, "grad_norm": 1.001242995262146, "learning_rate": 1.3135458676254354e-05, "loss": 0.0094, "step": 14085 }, { "epoch": 3.588589863316071, "grad_norm": 0.5485708117485046, "learning_rate": 1.3131236976771673e-05, "loss": 0.0113, "step": 14090 }, { "epoch": 3.5898633160709736, "grad_norm": 1.1553834676742554, "learning_rate": 1.3127014658460828e-05, "loss": 0.0081, "step": 14095 }, { "epoch": 3.5911367688258764, "grad_norm": 0.6093717217445374, "learning_rate": 1.3122791722156272e-05, "loss": 0.0075, "step": 14100 }, { "epoch": 3.5924102215807796, "grad_norm": 0.8266305327415466, "learning_rate": 1.3118568168692587e-05, "loss": 0.0106, "step": 14105 }, { "epoch": 3.593683674335682, "grad_norm": 1.3430334329605103, "learning_rate": 1.3114343998904478e-05, "loss": 0.0088, "step": 14110 }, { "epoch": 3.594957127090585, "grad_norm": 0.7599483132362366, "learning_rate": 1.3110119213626766e-05, "loss": 0.0066, "step": 14115 }, { "epoch": 3.596230579845488, "grad_norm": 0.8295024037361145, "learning_rate": 1.3105893813694403e-05, "loss": 0.0073, "step": 14120 }, { "epoch": 3.5975040326003906, "grad_norm": 1.026872992515564, "learning_rate": 1.3101667799942454e-05, "loss": 0.0082, "step": 14125 }, { "epoch": 3.5987774853552934, "grad_norm": 0.6829109191894531, "learning_rate": 1.3097441173206113e-05, "loss": 0.0118, "step": 14130 }, { "epoch": 3.600050938110196, "grad_norm": 1.259023904800415, "learning_rate": 1.3093213934320686e-05, "loss": 0.011, "step": 14135 }, { "epoch": 3.601324390865099, "grad_norm": 0.937795877456665, "learning_rate": 1.3088986084121603e-05, "loss": 0.0124, "step": 14140 }, { "epoch": 3.6025978436200017, "grad_norm": 1.0760573148727417, "learning_rate": 1.3084757623444421e-05, "loss": 0.0119, "step": 14145 }, { "epoch": 3.6038712963749044, "grad_norm": 0.9593319892883301, "learning_rate": 1.3080528553124815e-05, "loss": 0.0096, "step": 14150 }, { "epoch": 3.605144749129807, "grad_norm": 1.0845811367034912, "learning_rate": 1.307629887399857e-05, "loss": 0.0076, "step": 14155 }, { "epoch": 3.60641820188471, "grad_norm": 0.6846482753753662, "learning_rate": 1.3072068586901605e-05, "loss": 0.0084, "step": 14160 }, { "epoch": 3.6076916546396127, "grad_norm": 0.7921370267868042, "learning_rate": 1.3067837692669958e-05, "loss": 0.0076, "step": 14165 }, { "epoch": 3.608965107394516, "grad_norm": 1.449751377105713, "learning_rate": 1.3063606192139778e-05, "loss": 0.0098, "step": 14170 }, { "epoch": 3.6102385601494182, "grad_norm": 0.9268667101860046, "learning_rate": 1.3059374086147338e-05, "loss": 0.0112, "step": 14175 }, { "epoch": 3.6115120129043214, "grad_norm": 1.4511202573776245, "learning_rate": 1.3055141375529035e-05, "loss": 0.0082, "step": 14180 }, { "epoch": 3.612785465659224, "grad_norm": 1.2646132707595825, "learning_rate": 1.3050908061121382e-05, "loss": 0.0104, "step": 14185 }, { "epoch": 3.614058918414127, "grad_norm": 1.383104681968689, "learning_rate": 1.3046674143761007e-05, "loss": 0.0106, "step": 14190 }, { "epoch": 3.6153323711690297, "grad_norm": 1.274628758430481, "learning_rate": 1.3042439624284667e-05, "loss": 0.0099, "step": 14195 }, { "epoch": 3.6166058239239325, "grad_norm": 1.4651081562042236, "learning_rate": 1.3038204503529231e-05, "loss": 0.0104, "step": 14200 }, { "epoch": 3.6178792766788352, "grad_norm": 0.9807472825050354, "learning_rate": 1.3033968782331683e-05, "loss": 0.0143, "step": 14205 }, { "epoch": 3.619152729433738, "grad_norm": 0.6622213125228882, "learning_rate": 1.3029732461529138e-05, "loss": 0.0086, "step": 14210 }, { "epoch": 3.6204261821886408, "grad_norm": 0.8344578742980957, "learning_rate": 1.302549554195882e-05, "loss": 0.0119, "step": 14215 }, { "epoch": 3.6216996349435435, "grad_norm": 1.5765355825424194, "learning_rate": 1.3021258024458075e-05, "loss": 0.0136, "step": 14220 }, { "epoch": 3.6229730876984463, "grad_norm": 0.8524581789970398, "learning_rate": 1.3017019909864365e-05, "loss": 0.0094, "step": 14225 }, { "epoch": 3.624246540453349, "grad_norm": 1.0883424282073975, "learning_rate": 1.301278119901527e-05, "loss": 0.0117, "step": 14230 }, { "epoch": 3.6255199932082522, "grad_norm": 0.853168249130249, "learning_rate": 1.3008541892748489e-05, "loss": 0.0117, "step": 14235 }, { "epoch": 3.6267934459631546, "grad_norm": 0.8196349143981934, "learning_rate": 1.300430199190184e-05, "loss": 0.0068, "step": 14240 }, { "epoch": 3.6280668987180578, "grad_norm": 1.0404845476150513, "learning_rate": 1.3000061497313256e-05, "loss": 0.0093, "step": 14245 }, { "epoch": 3.62934035147296, "grad_norm": 1.1911367177963257, "learning_rate": 1.299582040982079e-05, "loss": 0.0089, "step": 14250 }, { "epoch": 3.6306138042278633, "grad_norm": 0.8934445381164551, "learning_rate": 1.2991578730262609e-05, "loss": 0.0071, "step": 14255 }, { "epoch": 3.631887256982766, "grad_norm": 1.6931452751159668, "learning_rate": 1.2987336459476999e-05, "loss": 0.0109, "step": 14260 }, { "epoch": 3.633160709737669, "grad_norm": 1.0386099815368652, "learning_rate": 1.298309359830236e-05, "loss": 0.0108, "step": 14265 }, { "epoch": 3.6344341624925716, "grad_norm": 1.3334249258041382, "learning_rate": 1.2978850147577216e-05, "loss": 0.0101, "step": 14270 }, { "epoch": 3.6357076152474743, "grad_norm": 0.9760417342185974, "learning_rate": 1.29746061081402e-05, "loss": 0.0074, "step": 14275 }, { "epoch": 3.636981068002377, "grad_norm": 0.39582952857017517, "learning_rate": 1.2970361480830062e-05, "loss": 0.0105, "step": 14280 }, { "epoch": 3.63825452075728, "grad_norm": 1.4576365947723389, "learning_rate": 1.2966116266485674e-05, "loss": 0.009, "step": 14285 }, { "epoch": 3.6395279735121826, "grad_norm": 1.0270376205444336, "learning_rate": 1.2961870465946017e-05, "loss": 0.0096, "step": 14290 }, { "epoch": 3.6408014262670854, "grad_norm": 1.8194416761398315, "learning_rate": 1.2957624080050192e-05, "loss": 0.0088, "step": 14295 }, { "epoch": 3.642074879021988, "grad_norm": 0.7815544009208679, "learning_rate": 1.2953377109637418e-05, "loss": 0.011, "step": 14300 }, { "epoch": 3.643348331776891, "grad_norm": 0.26067066192626953, "learning_rate": 1.294912955554702e-05, "loss": 0.0068, "step": 14305 }, { "epoch": 3.644621784531794, "grad_norm": 1.1374951601028442, "learning_rate": 1.294488141861845e-05, "loss": 0.0104, "step": 14310 }, { "epoch": 3.6458952372866964, "grad_norm": 0.8360746502876282, "learning_rate": 1.2940632699691265e-05, "loss": 0.0085, "step": 14315 }, { "epoch": 3.6471686900415996, "grad_norm": 0.6368467211723328, "learning_rate": 1.2936383399605146e-05, "loss": 0.0072, "step": 14320 }, { "epoch": 3.6484421427965024, "grad_norm": 1.5100271701812744, "learning_rate": 1.2932133519199883e-05, "loss": 0.009, "step": 14325 }, { "epoch": 3.649715595551405, "grad_norm": 1.2619763612747192, "learning_rate": 1.292788305931538e-05, "loss": 0.0121, "step": 14330 }, { "epoch": 3.650989048306308, "grad_norm": 0.5857354402542114, "learning_rate": 1.2923632020791664e-05, "loss": 0.0095, "step": 14335 }, { "epoch": 3.6522625010612106, "grad_norm": 0.6267552971839905, "learning_rate": 1.2919380404468865e-05, "loss": 0.0091, "step": 14340 }, { "epoch": 3.6535359538161134, "grad_norm": 0.8376909494400024, "learning_rate": 1.2915128211187231e-05, "loss": 0.0115, "step": 14345 }, { "epoch": 3.654809406571016, "grad_norm": 0.7061530351638794, "learning_rate": 1.291087544178713e-05, "loss": 0.0104, "step": 14350 }, { "epoch": 3.656082859325919, "grad_norm": 1.0681978464126587, "learning_rate": 1.2906622097109039e-05, "loss": 0.009, "step": 14355 }, { "epoch": 3.6573563120808217, "grad_norm": 1.1264004707336426, "learning_rate": 1.2902368177993543e-05, "loss": 0.0061, "step": 14360 }, { "epoch": 3.6586297648357244, "grad_norm": 0.9992992281913757, "learning_rate": 1.2898113685281355e-05, "loss": 0.0132, "step": 14365 }, { "epoch": 3.659903217590627, "grad_norm": 1.2774240970611572, "learning_rate": 1.2893858619813287e-05, "loss": 0.0103, "step": 14370 }, { "epoch": 3.6611766703455304, "grad_norm": 1.1542669534683228, "learning_rate": 1.2889602982430271e-05, "loss": 0.0082, "step": 14375 }, { "epoch": 3.6624501231004327, "grad_norm": 0.824631929397583, "learning_rate": 1.2885346773973353e-05, "loss": 0.0097, "step": 14380 }, { "epoch": 3.663723575855336, "grad_norm": 0.9373425245285034, "learning_rate": 1.2881089995283689e-05, "loss": 0.0099, "step": 14385 }, { "epoch": 3.6649970286102387, "grad_norm": 0.5834643840789795, "learning_rate": 1.2876832647202546e-05, "loss": 0.0077, "step": 14390 }, { "epoch": 3.6662704813651414, "grad_norm": 1.3998403549194336, "learning_rate": 1.2872574730571313e-05, "loss": 0.009, "step": 14395 }, { "epoch": 3.667543934120044, "grad_norm": 1.492091417312622, "learning_rate": 1.2868316246231474e-05, "loss": 0.011, "step": 14400 }, { "epoch": 3.668817386874947, "grad_norm": 0.939755916595459, "learning_rate": 1.2864057195024644e-05, "loss": 0.0157, "step": 14405 }, { "epoch": 3.6700908396298497, "grad_norm": 0.5230185985565186, "learning_rate": 1.2859797577792541e-05, "loss": 0.0073, "step": 14410 }, { "epoch": 3.6713642923847525, "grad_norm": 0.2776152789592743, "learning_rate": 1.2855537395376994e-05, "loss": 0.0094, "step": 14415 }, { "epoch": 3.6726377451396552, "grad_norm": 0.5826010704040527, "learning_rate": 1.2851276648619941e-05, "loss": 0.0105, "step": 14420 }, { "epoch": 3.673911197894558, "grad_norm": 0.9552101492881775, "learning_rate": 1.2847015338363447e-05, "loss": 0.0085, "step": 14425 }, { "epoch": 3.6751846506494608, "grad_norm": 0.8267037868499756, "learning_rate": 1.2842753465449666e-05, "loss": 0.01, "step": 14430 }, { "epoch": 3.6764581034043635, "grad_norm": 0.8518446683883667, "learning_rate": 1.2838491030720882e-05, "loss": 0.0115, "step": 14435 }, { "epoch": 3.6777315561592667, "grad_norm": 0.7736194729804993, "learning_rate": 1.2834228035019474e-05, "loss": 0.0073, "step": 14440 }, { "epoch": 3.679005008914169, "grad_norm": 0.9222553372383118, "learning_rate": 1.2829964479187949e-05, "loss": 0.0119, "step": 14445 }, { "epoch": 3.6802784616690722, "grad_norm": 0.5230563282966614, "learning_rate": 1.2825700364068911e-05, "loss": 0.0071, "step": 14450 }, { "epoch": 3.681551914423975, "grad_norm": 1.7023801803588867, "learning_rate": 1.2821435690505082e-05, "loss": 0.0111, "step": 14455 }, { "epoch": 3.6828253671788778, "grad_norm": 0.735134482383728, "learning_rate": 1.2817170459339292e-05, "loss": 0.0087, "step": 14460 }, { "epoch": 3.6840988199337805, "grad_norm": 0.803031861782074, "learning_rate": 1.2812904671414479e-05, "loss": 0.0139, "step": 14465 }, { "epoch": 3.6853722726886833, "grad_norm": 1.168974757194519, "learning_rate": 1.2808638327573692e-05, "loss": 0.0087, "step": 14470 }, { "epoch": 3.686645725443586, "grad_norm": 0.8790977597236633, "learning_rate": 1.2804371428660096e-05, "loss": 0.0094, "step": 14475 }, { "epoch": 3.687919178198489, "grad_norm": 1.1773262023925781, "learning_rate": 1.2800103975516957e-05, "loss": 0.0129, "step": 14480 }, { "epoch": 3.6891926309533916, "grad_norm": 0.7665451169013977, "learning_rate": 1.2795835968987653e-05, "loss": 0.0097, "step": 14485 }, { "epoch": 3.6904660837082943, "grad_norm": 0.8831143379211426, "learning_rate": 1.2791567409915677e-05, "loss": 0.0101, "step": 14490 }, { "epoch": 3.691739536463197, "grad_norm": 0.8839424848556519, "learning_rate": 1.2787298299144625e-05, "loss": 0.0074, "step": 14495 }, { "epoch": 3.6930129892181, "grad_norm": 1.4124877452850342, "learning_rate": 1.2783028637518202e-05, "loss": 0.0095, "step": 14500 }, { "epoch": 3.694286441973003, "grad_norm": 1.5518747568130493, "learning_rate": 1.2778758425880226e-05, "loss": 0.0135, "step": 14505 }, { "epoch": 3.6955598947279054, "grad_norm": 0.694604754447937, "learning_rate": 1.2774487665074623e-05, "loss": 0.0127, "step": 14510 }, { "epoch": 3.6968333474828086, "grad_norm": 0.7031749486923218, "learning_rate": 1.2770216355945417e-05, "loss": 0.0121, "step": 14515 }, { "epoch": 3.6981068002377113, "grad_norm": 1.041620135307312, "learning_rate": 1.276594449933676e-05, "loss": 0.0111, "step": 14520 }, { "epoch": 3.699380252992614, "grad_norm": 1.3295981884002686, "learning_rate": 1.2761672096092896e-05, "loss": 0.0105, "step": 14525 }, { "epoch": 3.700653705747517, "grad_norm": 0.7711631655693054, "learning_rate": 1.275739914705818e-05, "loss": 0.0093, "step": 14530 }, { "epoch": 3.7019271585024196, "grad_norm": 1.2514746189117432, "learning_rate": 1.2753125653077085e-05, "loss": 0.0133, "step": 14535 }, { "epoch": 3.7032006112573224, "grad_norm": 0.796187162399292, "learning_rate": 1.274885161499418e-05, "loss": 0.0099, "step": 14540 }, { "epoch": 3.704474064012225, "grad_norm": 0.4559337794780731, "learning_rate": 1.2744577033654146e-05, "loss": 0.0102, "step": 14545 }, { "epoch": 3.705747516767128, "grad_norm": 1.024570107460022, "learning_rate": 1.2740301909901769e-05, "loss": 0.0143, "step": 14550 }, { "epoch": 3.7070209695220306, "grad_norm": 0.8128252625465393, "learning_rate": 1.2736026244581945e-05, "loss": 0.0103, "step": 14555 }, { "epoch": 3.7082944222769334, "grad_norm": 0.6302767395973206, "learning_rate": 1.2731750038539678e-05, "loss": 0.0095, "step": 14560 }, { "epoch": 3.709567875031836, "grad_norm": 1.5137410163879395, "learning_rate": 1.2727473292620075e-05, "loss": 0.0089, "step": 14565 }, { "epoch": 3.7108413277867394, "grad_norm": 1.106123447418213, "learning_rate": 1.2723196007668355e-05, "loss": 0.0078, "step": 14570 }, { "epoch": 3.7121147805416417, "grad_norm": 0.9720634818077087, "learning_rate": 1.2718918184529838e-05, "loss": 0.0138, "step": 14575 }, { "epoch": 3.713388233296545, "grad_norm": 1.0629976987838745, "learning_rate": 1.271463982404995e-05, "loss": 0.0099, "step": 14580 }, { "epoch": 3.7146616860514476, "grad_norm": 1.0092781782150269, "learning_rate": 1.2710360927074232e-05, "loss": 0.0121, "step": 14585 }, { "epoch": 3.7159351388063504, "grad_norm": 1.0594531297683716, "learning_rate": 1.2706081494448319e-05, "loss": 0.0121, "step": 14590 }, { "epoch": 3.717208591561253, "grad_norm": 0.6209545731544495, "learning_rate": 1.270180152701796e-05, "loss": 0.0069, "step": 14595 }, { "epoch": 3.718482044316156, "grad_norm": 1.534438133239746, "learning_rate": 1.2697521025629009e-05, "loss": 0.0098, "step": 14600 }, { "epoch": 3.7197554970710587, "grad_norm": 1.3323651552200317, "learning_rate": 1.2693239991127423e-05, "loss": 0.0122, "step": 14605 }, { "epoch": 3.7210289498259614, "grad_norm": 1.51106595993042, "learning_rate": 1.2688958424359263e-05, "loss": 0.0119, "step": 14610 }, { "epoch": 3.722302402580864, "grad_norm": 1.5645209550857544, "learning_rate": 1.2684676326170702e-05, "loss": 0.0072, "step": 14615 }, { "epoch": 3.723575855335767, "grad_norm": 1.3117308616638184, "learning_rate": 1.2680393697408008e-05, "loss": 0.0125, "step": 14620 }, { "epoch": 3.7248493080906697, "grad_norm": 1.1632907390594482, "learning_rate": 1.2676110538917565e-05, "loss": 0.0097, "step": 14625 }, { "epoch": 3.7261227608455725, "grad_norm": 1.1528693437576294, "learning_rate": 1.2671826851545851e-05, "loss": 0.0093, "step": 14630 }, { "epoch": 3.7273962136004757, "grad_norm": 0.7622177600860596, "learning_rate": 1.2667542636139458e-05, "loss": 0.0095, "step": 14635 }, { "epoch": 3.728669666355378, "grad_norm": 0.9285953640937805, "learning_rate": 1.2663257893545075e-05, "loss": 0.0144, "step": 14640 }, { "epoch": 3.729943119110281, "grad_norm": 0.7228437662124634, "learning_rate": 1.26589726246095e-05, "loss": 0.0096, "step": 14645 }, { "epoch": 3.731216571865184, "grad_norm": 2.053893566131592, "learning_rate": 1.2654686830179632e-05, "loss": 0.0133, "step": 14650 }, { "epoch": 3.7324900246200867, "grad_norm": 1.0396186113357544, "learning_rate": 1.2650400511102474e-05, "loss": 0.0095, "step": 14655 }, { "epoch": 3.7337634773749895, "grad_norm": 0.6820263266563416, "learning_rate": 1.2646113668225137e-05, "loss": 0.011, "step": 14660 }, { "epoch": 3.7350369301298922, "grad_norm": 0.8677361011505127, "learning_rate": 1.264182630239483e-05, "loss": 0.011, "step": 14665 }, { "epoch": 3.736310382884795, "grad_norm": 1.3964922428131104, "learning_rate": 1.2637538414458867e-05, "loss": 0.0099, "step": 14670 }, { "epoch": 3.7375838356396978, "grad_norm": 0.7884085774421692, "learning_rate": 1.2633250005264665e-05, "loss": 0.0058, "step": 14675 }, { "epoch": 3.7388572883946005, "grad_norm": 1.0765094757080078, "learning_rate": 1.262896107565975e-05, "loss": 0.0108, "step": 14680 }, { "epoch": 3.7401307411495033, "grad_norm": 0.5159112811088562, "learning_rate": 1.2624671626491743e-05, "loss": 0.0114, "step": 14685 }, { "epoch": 3.741404193904406, "grad_norm": 0.7661750316619873, "learning_rate": 1.2620381658608362e-05, "loss": 0.0084, "step": 14690 }, { "epoch": 3.742677646659309, "grad_norm": 1.1747015714645386, "learning_rate": 1.261609117285745e-05, "loss": 0.014, "step": 14695 }, { "epoch": 3.743951099414212, "grad_norm": 1.328312635421753, "learning_rate": 1.261180017008693e-05, "loss": 0.0091, "step": 14700 }, { "epoch": 3.7452245521691143, "grad_norm": 1.7272108793258667, "learning_rate": 1.2607508651144836e-05, "loss": 0.0093, "step": 14705 }, { "epoch": 3.7464980049240175, "grad_norm": 1.1113674640655518, "learning_rate": 1.2603216616879302e-05, "loss": 0.0085, "step": 14710 }, { "epoch": 3.74777145767892, "grad_norm": 0.7642978429794312, "learning_rate": 1.2598924068138572e-05, "loss": 0.0095, "step": 14715 }, { "epoch": 3.749044910433823, "grad_norm": 1.103806972503662, "learning_rate": 1.2594631005770977e-05, "loss": 0.0104, "step": 14720 }, { "epoch": 3.750318363188726, "grad_norm": 0.9074177742004395, "learning_rate": 1.2590337430624959e-05, "loss": 0.0089, "step": 14725 }, { "epoch": 3.7515918159436286, "grad_norm": 0.7107997536659241, "learning_rate": 1.2586043343549064e-05, "loss": 0.0076, "step": 14730 }, { "epoch": 3.7528652686985313, "grad_norm": 0.7854945063591003, "learning_rate": 1.2581748745391931e-05, "loss": 0.0095, "step": 14735 }, { "epoch": 3.754138721453434, "grad_norm": 0.9334495067596436, "learning_rate": 1.2577453637002306e-05, "loss": 0.0107, "step": 14740 }, { "epoch": 3.755412174208337, "grad_norm": 0.5993887186050415, "learning_rate": 1.257315801922903e-05, "loss": 0.0071, "step": 14745 }, { "epoch": 3.7566856269632396, "grad_norm": 1.0465421676635742, "learning_rate": 1.2568861892921056e-05, "loss": 0.0055, "step": 14750 }, { "epoch": 3.7579590797181424, "grad_norm": 0.6155872344970703, "learning_rate": 1.2564565258927424e-05, "loss": 0.0097, "step": 14755 }, { "epoch": 3.759232532473045, "grad_norm": 1.384759783744812, "learning_rate": 1.2560268118097279e-05, "loss": 0.0095, "step": 14760 }, { "epoch": 3.760505985227948, "grad_norm": 0.5090533494949341, "learning_rate": 1.255597047127987e-05, "loss": 0.009, "step": 14765 }, { "epoch": 3.7617794379828506, "grad_norm": 1.8642643690109253, "learning_rate": 1.2551672319324544e-05, "loss": 0.0097, "step": 14770 }, { "epoch": 3.763052890737754, "grad_norm": 0.9508642554283142, "learning_rate": 1.2547373663080748e-05, "loss": 0.0074, "step": 14775 }, { "epoch": 3.764326343492656, "grad_norm": 0.9936817288398743, "learning_rate": 1.2543074503398022e-05, "loss": 0.0088, "step": 14780 }, { "epoch": 3.7655997962475594, "grad_norm": 1.0516715049743652, "learning_rate": 1.253877484112602e-05, "loss": 0.0118, "step": 14785 }, { "epoch": 3.766873249002462, "grad_norm": 1.3034694194793701, "learning_rate": 1.2534474677114483e-05, "loss": 0.0123, "step": 14790 }, { "epoch": 3.768146701757365, "grad_norm": 0.8412666320800781, "learning_rate": 1.2530174012213254e-05, "loss": 0.0126, "step": 14795 }, { "epoch": 3.7694201545122676, "grad_norm": 1.22767174243927, "learning_rate": 1.2525872847272273e-05, "loss": 0.0124, "step": 14800 }, { "epoch": 3.7706936072671704, "grad_norm": 0.5669569373130798, "learning_rate": 1.2521571183141587e-05, "loss": 0.0107, "step": 14805 }, { "epoch": 3.771967060022073, "grad_norm": 1.358109474182129, "learning_rate": 1.2517269020671336e-05, "loss": 0.013, "step": 14810 }, { "epoch": 3.773240512776976, "grad_norm": 1.0943630933761597, "learning_rate": 1.2512966360711753e-05, "loss": 0.0103, "step": 14815 }, { "epoch": 3.7745139655318787, "grad_norm": 1.066354513168335, "learning_rate": 1.2508663204113185e-05, "loss": 0.0108, "step": 14820 }, { "epoch": 3.7757874182867814, "grad_norm": 0.7926653027534485, "learning_rate": 1.250435955172606e-05, "loss": 0.0113, "step": 14825 }, { "epoch": 3.777060871041684, "grad_norm": 0.7815549373626709, "learning_rate": 1.2500055404400908e-05, "loss": 0.0091, "step": 14830 }, { "epoch": 3.778334323796587, "grad_norm": 0.7625598907470703, "learning_rate": 1.2495750762988373e-05, "loss": 0.0085, "step": 14835 }, { "epoch": 3.77960777655149, "grad_norm": 0.7876824140548706, "learning_rate": 1.2491445628339171e-05, "loss": 0.0085, "step": 14840 }, { "epoch": 3.7808812293063925, "grad_norm": 1.216631531715393, "learning_rate": 1.2487140001304131e-05, "loss": 0.0118, "step": 14845 }, { "epoch": 3.7821546820612957, "grad_norm": 1.1648880243301392, "learning_rate": 1.2482833882734184e-05, "loss": 0.0107, "step": 14850 }, { "epoch": 3.7834281348161984, "grad_norm": 0.15553544461727142, "learning_rate": 1.2478527273480345e-05, "loss": 0.0072, "step": 14855 }, { "epoch": 3.784701587571101, "grad_norm": 0.8038197159767151, "learning_rate": 1.247422017439373e-05, "loss": 0.0093, "step": 14860 }, { "epoch": 3.785975040326004, "grad_norm": 1.4340451955795288, "learning_rate": 1.2469912586325554e-05, "loss": 0.011, "step": 14865 }, { "epoch": 3.7872484930809067, "grad_norm": 1.0204864740371704, "learning_rate": 1.2465604510127135e-05, "loss": 0.0057, "step": 14870 }, { "epoch": 3.7885219458358095, "grad_norm": 0.7368050217628479, "learning_rate": 1.246129594664987e-05, "loss": 0.0131, "step": 14875 }, { "epoch": 3.7897953985907122, "grad_norm": 0.9837214946746826, "learning_rate": 1.245698689674527e-05, "loss": 0.0061, "step": 14880 }, { "epoch": 3.791068851345615, "grad_norm": 0.576133668422699, "learning_rate": 1.2452677361264933e-05, "loss": 0.0074, "step": 14885 }, { "epoch": 3.7923423041005178, "grad_norm": 1.0762399435043335, "learning_rate": 1.2448367341060555e-05, "loss": 0.009, "step": 14890 }, { "epoch": 3.7936157568554205, "grad_norm": 0.7303717136383057, "learning_rate": 1.244405683698393e-05, "loss": 0.0104, "step": 14895 }, { "epoch": 3.7948892096103233, "grad_norm": 0.6960472464561462, "learning_rate": 1.2439745849886942e-05, "loss": 0.0089, "step": 14900 }, { "epoch": 3.7961626623652265, "grad_norm": 1.4142742156982422, "learning_rate": 1.2435434380621572e-05, "loss": 0.0099, "step": 14905 }, { "epoch": 3.797436115120129, "grad_norm": 1.1923203468322754, "learning_rate": 1.2431122430039903e-05, "loss": 0.008, "step": 14910 }, { "epoch": 3.798709567875032, "grad_norm": 0.9982702136039734, "learning_rate": 1.242680999899411e-05, "loss": 0.014, "step": 14915 }, { "epoch": 3.7999830206299348, "grad_norm": 1.234907865524292, "learning_rate": 1.2422497088336456e-05, "loss": 0.0093, "step": 14920 }, { "epoch": 3.8012564733848375, "grad_norm": 1.0996378660202026, "learning_rate": 1.2418183698919303e-05, "loss": 0.013, "step": 14925 }, { "epoch": 3.8025299261397403, "grad_norm": 1.0088120698928833, "learning_rate": 1.2413869831595116e-05, "loss": 0.0105, "step": 14930 }, { "epoch": 3.803803378894643, "grad_norm": 1.210534930229187, "learning_rate": 1.240955548721644e-05, "loss": 0.0082, "step": 14935 }, { "epoch": 3.805076831649546, "grad_norm": 0.7403919100761414, "learning_rate": 1.2405240666635925e-05, "loss": 0.0076, "step": 14940 }, { "epoch": 3.8063502844044486, "grad_norm": 0.9308886528015137, "learning_rate": 1.2400925370706309e-05, "loss": 0.0077, "step": 14945 }, { "epoch": 3.8076237371593513, "grad_norm": 1.123984932899475, "learning_rate": 1.2396609600280429e-05, "loss": 0.0126, "step": 14950 }, { "epoch": 3.808897189914254, "grad_norm": 0.8887700438499451, "learning_rate": 1.2392293356211211e-05, "loss": 0.0083, "step": 14955 }, { "epoch": 3.810170642669157, "grad_norm": 1.011159896850586, "learning_rate": 1.238797663935168e-05, "loss": 0.0083, "step": 14960 }, { "epoch": 3.8114440954240596, "grad_norm": 0.914474368095398, "learning_rate": 1.2383659450554949e-05, "loss": 0.0077, "step": 14965 }, { "epoch": 3.812717548178963, "grad_norm": 0.994381308555603, "learning_rate": 1.2379341790674223e-05, "loss": 0.0086, "step": 14970 }, { "epoch": 3.813991000933865, "grad_norm": 1.048954725265503, "learning_rate": 1.237502366056281e-05, "loss": 0.0074, "step": 14975 }, { "epoch": 3.8152644536887683, "grad_norm": 0.9556573033332825, "learning_rate": 1.2370705061074101e-05, "loss": 0.0082, "step": 14980 }, { "epoch": 3.816537906443671, "grad_norm": 1.7932813167572021, "learning_rate": 1.2366385993061585e-05, "loss": 0.0113, "step": 14985 }, { "epoch": 3.817811359198574, "grad_norm": 1.5103017091751099, "learning_rate": 1.236206645737884e-05, "loss": 0.0099, "step": 14990 }, { "epoch": 3.8190848119534766, "grad_norm": 1.8824267387390137, "learning_rate": 1.2357746454879542e-05, "loss": 0.0063, "step": 14995 }, { "epoch": 3.8203582647083794, "grad_norm": 0.5124837160110474, "learning_rate": 1.235342598641745e-05, "loss": 0.0081, "step": 15000 }, { "epoch": 3.821631717463282, "grad_norm": 1.2594901323318481, "learning_rate": 1.2349105052846423e-05, "loss": 0.0106, "step": 15005 }, { "epoch": 3.822905170218185, "grad_norm": 1.43930184841156, "learning_rate": 1.2344783655020413e-05, "loss": 0.0074, "step": 15010 }, { "epoch": 3.8241786229730876, "grad_norm": 0.7158402800559998, "learning_rate": 1.2340461793793455e-05, "loss": 0.0106, "step": 15015 }, { "epoch": 3.8254520757279904, "grad_norm": 1.8599834442138672, "learning_rate": 1.2336139470019684e-05, "loss": 0.0101, "step": 15020 }, { "epoch": 3.826725528482893, "grad_norm": 1.0543519258499146, "learning_rate": 1.2331816684553324e-05, "loss": 0.0082, "step": 15025 }, { "epoch": 3.827998981237796, "grad_norm": 1.1882330179214478, "learning_rate": 1.2327493438248688e-05, "loss": 0.0077, "step": 15030 }, { "epoch": 3.829272433992699, "grad_norm": 1.0167628526687622, "learning_rate": 1.232316973196018e-05, "loss": 0.0109, "step": 15035 }, { "epoch": 3.8305458867476014, "grad_norm": 0.9691566228866577, "learning_rate": 1.2318845566542301e-05, "loss": 0.0109, "step": 15040 }, { "epoch": 3.8318193395025046, "grad_norm": 0.5891013741493225, "learning_rate": 1.2314520942849635e-05, "loss": 0.0072, "step": 15045 }, { "epoch": 3.8330927922574074, "grad_norm": 1.1323624849319458, "learning_rate": 1.2310195861736862e-05, "loss": 0.015, "step": 15050 }, { "epoch": 3.83436624501231, "grad_norm": 0.5750037431716919, "learning_rate": 1.2305870324058746e-05, "loss": 0.0083, "step": 15055 }, { "epoch": 3.835639697767213, "grad_norm": 1.5591157674789429, "learning_rate": 1.2301544330670149e-05, "loss": 0.0095, "step": 15060 }, { "epoch": 3.8369131505221157, "grad_norm": 0.5088854432106018, "learning_rate": 1.2297217882426022e-05, "loss": 0.0116, "step": 15065 }, { "epoch": 3.8381866032770184, "grad_norm": 0.6184219121932983, "learning_rate": 1.22928909801814e-05, "loss": 0.0111, "step": 15070 }, { "epoch": 3.839460056031921, "grad_norm": 0.6392411589622498, "learning_rate": 1.228856362479141e-05, "loss": 0.0065, "step": 15075 }, { "epoch": 3.840733508786824, "grad_norm": 0.6248442530632019, "learning_rate": 1.2284235817111272e-05, "loss": 0.0091, "step": 15080 }, { "epoch": 3.8420069615417267, "grad_norm": 1.3482927083969116, "learning_rate": 1.2279907557996293e-05, "loss": 0.0111, "step": 15085 }, { "epoch": 3.8432804142966295, "grad_norm": 1.0780364274978638, "learning_rate": 1.227557884830187e-05, "loss": 0.0126, "step": 15090 }, { "epoch": 3.8445538670515322, "grad_norm": 0.78641676902771, "learning_rate": 1.2271249688883484e-05, "loss": 0.0077, "step": 15095 }, { "epoch": 3.8458273198064354, "grad_norm": 0.8162963390350342, "learning_rate": 1.2266920080596715e-05, "loss": 0.0111, "step": 15100 }, { "epoch": 3.8471007725613378, "grad_norm": 1.1012094020843506, "learning_rate": 1.2262590024297226e-05, "loss": 0.011, "step": 15105 }, { "epoch": 3.848374225316241, "grad_norm": 0.9266193509101868, "learning_rate": 1.2258259520840761e-05, "loss": 0.0082, "step": 15110 }, { "epoch": 3.8496476780711437, "grad_norm": 0.6634398102760315, "learning_rate": 1.2253928571083167e-05, "loss": 0.0136, "step": 15115 }, { "epoch": 3.8509211308260465, "grad_norm": 1.3311835527420044, "learning_rate": 1.224959717588037e-05, "loss": 0.0112, "step": 15120 }, { "epoch": 3.8521945835809492, "grad_norm": 0.8387280702590942, "learning_rate": 1.2245265336088386e-05, "loss": 0.0088, "step": 15125 }, { "epoch": 3.853468036335852, "grad_norm": 0.5780900716781616, "learning_rate": 1.2240933052563323e-05, "loss": 0.0107, "step": 15130 }, { "epoch": 3.8547414890907548, "grad_norm": 1.1797239780426025, "learning_rate": 1.2236600326161364e-05, "loss": 0.009, "step": 15135 }, { "epoch": 3.8560149418456575, "grad_norm": 0.7145696878433228, "learning_rate": 1.2232267157738793e-05, "loss": 0.0116, "step": 15140 }, { "epoch": 3.8572883946005603, "grad_norm": 1.1685832738876343, "learning_rate": 1.222793354815198e-05, "loss": 0.0096, "step": 15145 }, { "epoch": 3.858561847355463, "grad_norm": 1.526401162147522, "learning_rate": 1.2223599498257375e-05, "loss": 0.0112, "step": 15150 }, { "epoch": 3.859835300110366, "grad_norm": 1.7263734340667725, "learning_rate": 1.2219265008911516e-05, "loss": 0.0156, "step": 15155 }, { "epoch": 3.8611087528652686, "grad_norm": 1.1409715414047241, "learning_rate": 1.2214930080971036e-05, "loss": 0.0067, "step": 15160 }, { "epoch": 3.8623822056201718, "grad_norm": 0.937142550945282, "learning_rate": 1.2210594715292649e-05, "loss": 0.0106, "step": 15165 }, { "epoch": 3.863655658375074, "grad_norm": 0.7456691861152649, "learning_rate": 1.2206258912733153e-05, "loss": 0.0079, "step": 15170 }, { "epoch": 3.8649291111299773, "grad_norm": 1.185478925704956, "learning_rate": 1.2201922674149436e-05, "loss": 0.0159, "step": 15175 }, { "epoch": 3.8662025638848796, "grad_norm": 1.0567303895950317, "learning_rate": 1.2197586000398471e-05, "loss": 0.011, "step": 15180 }, { "epoch": 3.867476016639783, "grad_norm": 1.2992957830429077, "learning_rate": 1.219324889233732e-05, "loss": 0.0102, "step": 15185 }, { "epoch": 3.8687494693946856, "grad_norm": 1.2150541543960571, "learning_rate": 1.2188911350823123e-05, "loss": 0.0093, "step": 15190 }, { "epoch": 3.8700229221495883, "grad_norm": 0.20505444705486298, "learning_rate": 1.2184573376713114e-05, "loss": 0.0061, "step": 15195 }, { "epoch": 3.871296374904491, "grad_norm": 1.0431228876113892, "learning_rate": 1.2180234970864611e-05, "loss": 0.0089, "step": 15200 }, { "epoch": 3.872569827659394, "grad_norm": 0.7058334350585938, "learning_rate": 1.2175896134135013e-05, "loss": 0.0075, "step": 15205 }, { "epoch": 3.8738432804142966, "grad_norm": 1.9901412725448608, "learning_rate": 1.2171556867381806e-05, "loss": 0.011, "step": 15210 }, { "epoch": 3.8751167331691994, "grad_norm": 0.6610703468322754, "learning_rate": 1.2167217171462566e-05, "loss": 0.0051, "step": 15215 }, { "epoch": 3.876390185924102, "grad_norm": 0.458645224571228, "learning_rate": 1.2162877047234945e-05, "loss": 0.0117, "step": 15220 }, { "epoch": 3.877663638679005, "grad_norm": 1.0339856147766113, "learning_rate": 1.2158536495556687e-05, "loss": 0.0101, "step": 15225 }, { "epoch": 3.8789370914339076, "grad_norm": 0.8789392113685608, "learning_rate": 1.2154195517285619e-05, "loss": 0.0069, "step": 15230 }, { "epoch": 3.8802105441888104, "grad_norm": 0.23543202877044678, "learning_rate": 1.2149854113279646e-05, "loss": 0.0093, "step": 15235 }, { "epoch": 3.8814839969437136, "grad_norm": 0.7335676550865173, "learning_rate": 1.2145512284396767e-05, "loss": 0.0112, "step": 15240 }, { "epoch": 3.882757449698616, "grad_norm": 1.0875858068466187, "learning_rate": 1.2141170031495058e-05, "loss": 0.0089, "step": 15245 }, { "epoch": 3.884030902453519, "grad_norm": 1.462996244430542, "learning_rate": 1.2136827355432681e-05, "loss": 0.0144, "step": 15250 }, { "epoch": 3.885304355208422, "grad_norm": 0.8684794306755066, "learning_rate": 1.2132484257067883e-05, "loss": 0.0114, "step": 15255 }, { "epoch": 3.8865778079633246, "grad_norm": 0.6741424202919006, "learning_rate": 1.2128140737258991e-05, "loss": 0.0079, "step": 15260 }, { "epoch": 3.8878512607182274, "grad_norm": 0.8387752771377563, "learning_rate": 1.2123796796864415e-05, "loss": 0.0125, "step": 15265 }, { "epoch": 3.88912471347313, "grad_norm": 1.1244114637374878, "learning_rate": 1.2119452436742658e-05, "loss": 0.0105, "step": 15270 }, { "epoch": 3.890398166228033, "grad_norm": 1.645620346069336, "learning_rate": 1.2115107657752294e-05, "loss": 0.0123, "step": 15275 }, { "epoch": 3.8916716189829357, "grad_norm": 1.2751924991607666, "learning_rate": 1.2110762460751981e-05, "loss": 0.01, "step": 15280 }, { "epoch": 3.8929450717378384, "grad_norm": 1.2506728172302246, "learning_rate": 1.2106416846600468e-05, "loss": 0.0102, "step": 15285 }, { "epoch": 3.894218524492741, "grad_norm": 0.965923011302948, "learning_rate": 1.2102070816156582e-05, "loss": 0.007, "step": 15290 }, { "epoch": 3.895491977247644, "grad_norm": 1.5226809978485107, "learning_rate": 1.2097724370279225e-05, "loss": 0.0109, "step": 15295 }, { "epoch": 3.8967654300025467, "grad_norm": 2.0153937339782715, "learning_rate": 1.2093377509827393e-05, "loss": 0.0106, "step": 15300 }, { "epoch": 3.89803888275745, "grad_norm": 0.9631196856498718, "learning_rate": 1.2089030235660156e-05, "loss": 0.0081, "step": 15305 }, { "epoch": 3.8993123355123522, "grad_norm": 1.113203525543213, "learning_rate": 1.2084682548636671e-05, "loss": 0.0088, "step": 15310 }, { "epoch": 3.9005857882672554, "grad_norm": 1.0349888801574707, "learning_rate": 1.2080334449616169e-05, "loss": 0.0079, "step": 15315 }, { "epoch": 3.901859241022158, "grad_norm": 0.9034736752510071, "learning_rate": 1.2075985939457972e-05, "loss": 0.0093, "step": 15320 }, { "epoch": 3.903132693777061, "grad_norm": 0.9186230897903442, "learning_rate": 1.2071637019021478e-05, "loss": 0.0105, "step": 15325 }, { "epoch": 3.9044061465319637, "grad_norm": 1.0112330913543701, "learning_rate": 1.2067287689166163e-05, "loss": 0.0102, "step": 15330 }, { "epoch": 3.9056795992868665, "grad_norm": 1.0559908151626587, "learning_rate": 1.2062937950751592e-05, "loss": 0.0102, "step": 15335 }, { "epoch": 3.9069530520417692, "grad_norm": 0.6606760621070862, "learning_rate": 1.2058587804637408e-05, "loss": 0.0092, "step": 15340 }, { "epoch": 3.908226504796672, "grad_norm": 0.8026177883148193, "learning_rate": 1.2054237251683323e-05, "loss": 0.0103, "step": 15345 }, { "epoch": 3.9094999575515748, "grad_norm": 1.0816574096679688, "learning_rate": 1.2049886292749148e-05, "loss": 0.0086, "step": 15350 }, { "epoch": 3.9107734103064775, "grad_norm": 1.1244573593139648, "learning_rate": 1.2045534928694767e-05, "loss": 0.0133, "step": 15355 }, { "epoch": 3.9120468630613803, "grad_norm": 0.8788244128227234, "learning_rate": 1.2041183160380135e-05, "loss": 0.0078, "step": 15360 }, { "epoch": 3.913320315816283, "grad_norm": 0.7627731561660767, "learning_rate": 1.2036830988665298e-05, "loss": 0.0094, "step": 15365 }, { "epoch": 3.9145937685711862, "grad_norm": 1.1131881475448608, "learning_rate": 1.203247841441038e-05, "loss": 0.0078, "step": 15370 }, { "epoch": 3.9158672213260886, "grad_norm": 0.786156952381134, "learning_rate": 1.202812543847558e-05, "loss": 0.0118, "step": 15375 }, { "epoch": 3.9171406740809918, "grad_norm": 0.8840194940567017, "learning_rate": 1.2023772061721182e-05, "loss": 0.0078, "step": 15380 }, { "epoch": 3.9184141268358945, "grad_norm": 0.5897212028503418, "learning_rate": 1.2019418285007542e-05, "loss": 0.0116, "step": 15385 }, { "epoch": 3.9196875795907973, "grad_norm": 0.5992445349693298, "learning_rate": 1.2015064109195104e-05, "loss": 0.0109, "step": 15390 }, { "epoch": 3.9209610323457, "grad_norm": 0.9074527621269226, "learning_rate": 1.2010709535144387e-05, "loss": 0.0059, "step": 15395 }, { "epoch": 3.922234485100603, "grad_norm": 0.8278557062149048, "learning_rate": 1.2006354563715983e-05, "loss": 0.0098, "step": 15400 }, { "epoch": 3.9235079378555056, "grad_norm": 1.0304239988327026, "learning_rate": 1.2001999195770568e-05, "loss": 0.0108, "step": 15405 }, { "epoch": 3.9247813906104083, "grad_norm": 1.5481866598129272, "learning_rate": 1.19976434321689e-05, "loss": 0.0099, "step": 15410 }, { "epoch": 3.926054843365311, "grad_norm": 1.0246025323867798, "learning_rate": 1.1993287273771808e-05, "loss": 0.0115, "step": 15415 }, { "epoch": 3.927328296120214, "grad_norm": 0.7398747205734253, "learning_rate": 1.1988930721440201e-05, "loss": 0.01, "step": 15420 }, { "epoch": 3.9286017488751166, "grad_norm": 0.7802883386611938, "learning_rate": 1.1984573776035074e-05, "loss": 0.0092, "step": 15425 }, { "epoch": 3.9298752016300194, "grad_norm": 0.9574548602104187, "learning_rate": 1.1980216438417485e-05, "loss": 0.008, "step": 15430 }, { "epoch": 3.9311486543849226, "grad_norm": 0.9867544174194336, "learning_rate": 1.197585870944858e-05, "loss": 0.0064, "step": 15435 }, { "epoch": 3.932422107139825, "grad_norm": 0.939161479473114, "learning_rate": 1.1971500589989575e-05, "loss": 0.0082, "step": 15440 }, { "epoch": 3.933695559894728, "grad_norm": 1.2096532583236694, "learning_rate": 1.1967142080901776e-05, "loss": 0.0105, "step": 15445 }, { "epoch": 3.934969012649631, "grad_norm": 0.5611027479171753, "learning_rate": 1.1962783183046556e-05, "loss": 0.0091, "step": 15450 }, { "epoch": 3.9362424654045336, "grad_norm": 0.8797750473022461, "learning_rate": 1.1958423897285359e-05, "loss": 0.0079, "step": 15455 }, { "epoch": 3.9375159181594364, "grad_norm": 0.6911653876304626, "learning_rate": 1.195406422447972e-05, "loss": 0.0095, "step": 15460 }, { "epoch": 3.938789370914339, "grad_norm": 0.4293379783630371, "learning_rate": 1.1949704165491243e-05, "loss": 0.0087, "step": 15465 }, { "epoch": 3.940062823669242, "grad_norm": 1.0076873302459717, "learning_rate": 1.1945343721181603e-05, "loss": 0.0086, "step": 15470 }, { "epoch": 3.9413362764241446, "grad_norm": 1.1579921245574951, "learning_rate": 1.1940982892412567e-05, "loss": 0.0121, "step": 15475 }, { "epoch": 3.9426097291790474, "grad_norm": 1.18046236038208, "learning_rate": 1.1936621680045963e-05, "loss": 0.01, "step": 15480 }, { "epoch": 3.94388318193395, "grad_norm": 0.9057881236076355, "learning_rate": 1.1932260084943696e-05, "loss": 0.0133, "step": 15485 }, { "epoch": 3.945156634688853, "grad_norm": 1.0182338953018188, "learning_rate": 1.1927898107967759e-05, "loss": 0.0082, "step": 15490 }, { "epoch": 3.9464300874437557, "grad_norm": 1.0406734943389893, "learning_rate": 1.1923535749980206e-05, "loss": 0.0099, "step": 15495 }, { "epoch": 3.947703540198659, "grad_norm": 1.728353500366211, "learning_rate": 1.1919173011843173e-05, "loss": 0.0117, "step": 15500 }, { "epoch": 3.948976992953561, "grad_norm": 0.7475186586380005, "learning_rate": 1.191480989441887e-05, "loss": 0.0094, "step": 15505 }, { "epoch": 3.9502504457084644, "grad_norm": 0.8183074593544006, "learning_rate": 1.1910446398569586e-05, "loss": 0.006, "step": 15510 }, { "epoch": 3.951523898463367, "grad_norm": 0.6048024296760559, "learning_rate": 1.1906082525157676e-05, "loss": 0.0116, "step": 15515 }, { "epoch": 3.95279735121827, "grad_norm": 1.2666473388671875, "learning_rate": 1.1901718275045582e-05, "loss": 0.0142, "step": 15520 }, { "epoch": 3.9540708039731727, "grad_norm": 0.8360289931297302, "learning_rate": 1.1897353649095807e-05, "loss": 0.009, "step": 15525 }, { "epoch": 3.9553442567280754, "grad_norm": 1.187110424041748, "learning_rate": 1.1892988648170936e-05, "loss": 0.0138, "step": 15530 }, { "epoch": 3.956617709482978, "grad_norm": 0.938075602054596, "learning_rate": 1.1888623273133625e-05, "loss": 0.0117, "step": 15535 }, { "epoch": 3.957891162237881, "grad_norm": 0.36454957723617554, "learning_rate": 1.1884257524846611e-05, "loss": 0.0143, "step": 15540 }, { "epoch": 3.9591646149927837, "grad_norm": 0.9001792073249817, "learning_rate": 1.1879891404172694e-05, "loss": 0.0082, "step": 15545 }, { "epoch": 3.9604380677476865, "grad_norm": 0.9280193448066711, "learning_rate": 1.1875524911974759e-05, "loss": 0.0095, "step": 15550 }, { "epoch": 3.9617115205025892, "grad_norm": 0.7405206561088562, "learning_rate": 1.1871158049115752e-05, "loss": 0.0057, "step": 15555 }, { "epoch": 3.962984973257492, "grad_norm": 1.4901012182235718, "learning_rate": 1.1866790816458702e-05, "loss": 0.016, "step": 15560 }, { "epoch": 3.964258426012395, "grad_norm": 1.2829246520996094, "learning_rate": 1.1862423214866703e-05, "loss": 0.0131, "step": 15565 }, { "epoch": 3.9655318787672975, "grad_norm": 0.9037299156188965, "learning_rate": 1.1858055245202932e-05, "loss": 0.0089, "step": 15570 }, { "epoch": 3.9668053315222007, "grad_norm": 1.33984375, "learning_rate": 1.1853686908330634e-05, "loss": 0.0098, "step": 15575 }, { "epoch": 3.9680787842771035, "grad_norm": 0.5806562900543213, "learning_rate": 1.184931820511312e-05, "loss": 0.0081, "step": 15580 }, { "epoch": 3.9693522370320062, "grad_norm": 0.3300263285636902, "learning_rate": 1.1844949136413789e-05, "loss": 0.0081, "step": 15585 }, { "epoch": 3.970625689786909, "grad_norm": 0.2553764581680298, "learning_rate": 1.1840579703096092e-05, "loss": 0.0078, "step": 15590 }, { "epoch": 3.9718991425418118, "grad_norm": 0.6262523531913757, "learning_rate": 1.1836209906023566e-05, "loss": 0.0071, "step": 15595 }, { "epoch": 3.9731725952967145, "grad_norm": 1.0032844543457031, "learning_rate": 1.183183974605982e-05, "loss": 0.0097, "step": 15600 }, { "epoch": 3.9744460480516173, "grad_norm": 0.7871229648590088, "learning_rate": 1.1827469224068531e-05, "loss": 0.0108, "step": 15605 }, { "epoch": 3.97571950080652, "grad_norm": 0.6893402934074402, "learning_rate": 1.1823098340913441e-05, "loss": 0.0082, "step": 15610 }, { "epoch": 3.976992953561423, "grad_norm": 1.3205156326293945, "learning_rate": 1.1818727097458377e-05, "loss": 0.0128, "step": 15615 }, { "epoch": 3.9782664063163256, "grad_norm": 0.5035563707351685, "learning_rate": 1.1814355494567229e-05, "loss": 0.0138, "step": 15620 }, { "epoch": 3.9795398590712283, "grad_norm": 0.9967522025108337, "learning_rate": 1.1809983533103957e-05, "loss": 0.0114, "step": 15625 }, { "epoch": 3.9808133118261315, "grad_norm": 1.2173388004302979, "learning_rate": 1.18056112139326e-05, "loss": 0.0092, "step": 15630 }, { "epoch": 3.982086764581034, "grad_norm": 1.108522891998291, "learning_rate": 1.1801238537917254e-05, "loss": 0.0118, "step": 15635 }, { "epoch": 3.983360217335937, "grad_norm": 1.33229660987854, "learning_rate": 1.1796865505922096e-05, "loss": 0.0097, "step": 15640 }, { "epoch": 3.9846336700908394, "grad_norm": 1.074447512626648, "learning_rate": 1.1792492118811376e-05, "loss": 0.0094, "step": 15645 }, { "epoch": 3.9859071228457426, "grad_norm": 0.9515808820724487, "learning_rate": 1.1788118377449405e-05, "loss": 0.009, "step": 15650 }, { "epoch": 3.9871805756006453, "grad_norm": 1.090500831604004, "learning_rate": 1.1783744282700564e-05, "loss": 0.008, "step": 15655 }, { "epoch": 3.988454028355548, "grad_norm": 1.1324771642684937, "learning_rate": 1.1779369835429315e-05, "loss": 0.0105, "step": 15660 }, { "epoch": 3.989727481110451, "grad_norm": 0.8284988403320312, "learning_rate": 1.177499503650018e-05, "loss": 0.0103, "step": 15665 }, { "epoch": 3.9910009338653536, "grad_norm": 0.6195374131202698, "learning_rate": 1.1770619886777755e-05, "loss": 0.0072, "step": 15670 }, { "epoch": 3.9922743866202564, "grad_norm": 1.6949445009231567, "learning_rate": 1.1766244387126694e-05, "loss": 0.01, "step": 15675 }, { "epoch": 3.993547839375159, "grad_norm": 1.2973432540893555, "learning_rate": 1.1761868538411742e-05, "loss": 0.0118, "step": 15680 }, { "epoch": 3.994821292130062, "grad_norm": 0.6253224015235901, "learning_rate": 1.1757492341497696e-05, "loss": 0.0091, "step": 15685 }, { "epoch": 3.9960947448849646, "grad_norm": 0.7087976932525635, "learning_rate": 1.1753115797249423e-05, "loss": 0.012, "step": 15690 }, { "epoch": 3.997368197639868, "grad_norm": 0.7621458768844604, "learning_rate": 1.1748738906531862e-05, "loss": 0.0093, "step": 15695 }, { "epoch": 3.99864165039477, "grad_norm": 1.2397352457046509, "learning_rate": 1.1744361670210028e-05, "loss": 0.0089, "step": 15700 }, { "epoch": 3.9999151031496734, "grad_norm": 1.2494279146194458, "learning_rate": 1.1739984089148988e-05, "loss": 0.0097, "step": 15705 }, { "epoch": 4.001188555904576, "grad_norm": 0.4836680591106415, "learning_rate": 1.1735606164213891e-05, "loss": 0.0043, "step": 15710 }, { "epoch": 4.002462008659479, "grad_norm": 1.0730623006820679, "learning_rate": 1.1731227896269945e-05, "loss": 0.0063, "step": 15715 }, { "epoch": 4.003735461414381, "grad_norm": 0.663447380065918, "learning_rate": 1.1726849286182433e-05, "loss": 0.0055, "step": 15720 }, { "epoch": 4.005008914169284, "grad_norm": 1.4824830293655396, "learning_rate": 1.1722470334816703e-05, "loss": 0.007, "step": 15725 }, { "epoch": 4.006282366924187, "grad_norm": 0.2845683693885803, "learning_rate": 1.1718091043038167e-05, "loss": 0.0072, "step": 15730 }, { "epoch": 4.00755581967909, "grad_norm": 0.9379552602767944, "learning_rate": 1.1713711411712305e-05, "loss": 0.006, "step": 15735 }, { "epoch": 4.008829272433993, "grad_norm": 0.6295620799064636, "learning_rate": 1.170933144170467e-05, "loss": 0.0035, "step": 15740 }, { "epoch": 4.010102725188895, "grad_norm": 0.545703113079071, "learning_rate": 1.1704951133880877e-05, "loss": 0.0063, "step": 15745 }, { "epoch": 4.011376177943799, "grad_norm": 0.3170238435268402, "learning_rate": 1.1700570489106608e-05, "loss": 0.0065, "step": 15750 }, { "epoch": 4.012649630698701, "grad_norm": 1.565608024597168, "learning_rate": 1.1696189508247615e-05, "loss": 0.0113, "step": 15755 }, { "epoch": 4.013923083453604, "grad_norm": 0.718679666519165, "learning_rate": 1.169180819216971e-05, "loss": 0.0046, "step": 15760 }, { "epoch": 4.0151965362085065, "grad_norm": 0.21685358881950378, "learning_rate": 1.1687426541738775e-05, "loss": 0.0039, "step": 15765 }, { "epoch": 4.01646998896341, "grad_norm": 0.43362537026405334, "learning_rate": 1.1683044557820763e-05, "loss": 0.0058, "step": 15770 }, { "epoch": 4.017743441718312, "grad_norm": 0.695081889629364, "learning_rate": 1.1678662241281682e-05, "loss": 0.0055, "step": 15775 }, { "epoch": 4.019016894473215, "grad_norm": 0.29282981157302856, "learning_rate": 1.1674279592987616e-05, "loss": 0.0065, "step": 15780 }, { "epoch": 4.0202903472281175, "grad_norm": 0.3453017473220825, "learning_rate": 1.166989661380471e-05, "loss": 0.0045, "step": 15785 }, { "epoch": 4.021563799983021, "grad_norm": 0.7767567038536072, "learning_rate": 1.1665513304599171e-05, "loss": 0.0052, "step": 15790 }, { "epoch": 4.022837252737923, "grad_norm": 0.3807118833065033, "learning_rate": 1.1661129666237278e-05, "loss": 0.0033, "step": 15795 }, { "epoch": 4.024110705492826, "grad_norm": 0.7202064990997314, "learning_rate": 1.1656745699585373e-05, "loss": 0.0048, "step": 15800 }, { "epoch": 4.025384158247729, "grad_norm": 1.099284052848816, "learning_rate": 1.165236140550986e-05, "loss": 0.0043, "step": 15805 }, { "epoch": 4.026657611002632, "grad_norm": 1.0879433155059814, "learning_rate": 1.1647976784877206e-05, "loss": 0.0061, "step": 15810 }, { "epoch": 4.027931063757535, "grad_norm": 0.6141963601112366, "learning_rate": 1.1643591838553955e-05, "loss": 0.003, "step": 15815 }, { "epoch": 4.029204516512437, "grad_norm": 1.07760488986969, "learning_rate": 1.16392065674067e-05, "loss": 0.0049, "step": 15820 }, { "epoch": 4.0304779692673405, "grad_norm": 0.7590822577476501, "learning_rate": 1.1634820972302106e-05, "loss": 0.0053, "step": 15825 }, { "epoch": 4.031751422022243, "grad_norm": 0.3328470289707184, "learning_rate": 1.1630435054106898e-05, "loss": 0.004, "step": 15830 }, { "epoch": 4.033024874777146, "grad_norm": 0.7368077635765076, "learning_rate": 1.1626048813687874e-05, "loss": 0.0074, "step": 15835 }, { "epoch": 4.034298327532048, "grad_norm": 0.555335283279419, "learning_rate": 1.1621662251911885e-05, "loss": 0.0037, "step": 15840 }, { "epoch": 4.0355717802869515, "grad_norm": 0.7955542206764221, "learning_rate": 1.1617275369645849e-05, "loss": 0.0067, "step": 15845 }, { "epoch": 4.036845233041854, "grad_norm": 0.5096911191940308, "learning_rate": 1.1612888167756751e-05, "loss": 0.0057, "step": 15850 }, { "epoch": 4.038118685796757, "grad_norm": 0.6959207653999329, "learning_rate": 1.1608500647111634e-05, "loss": 0.0056, "step": 15855 }, { "epoch": 4.039392138551659, "grad_norm": 0.6333912014961243, "learning_rate": 1.1604112808577603e-05, "loss": 0.0051, "step": 15860 }, { "epoch": 4.0406655913065626, "grad_norm": 0.5577733516693115, "learning_rate": 1.1599724653021837e-05, "loss": 0.0047, "step": 15865 }, { "epoch": 4.041939044061466, "grad_norm": 1.0991721153259277, "learning_rate": 1.1595336181311565e-05, "loss": 0.0052, "step": 15870 }, { "epoch": 4.043212496816368, "grad_norm": 0.8036783337593079, "learning_rate": 1.1590947394314082e-05, "loss": 0.007, "step": 15875 }, { "epoch": 4.044485949571271, "grad_norm": 0.8121493458747864, "learning_rate": 1.158655829289675e-05, "loss": 0.0049, "step": 15880 }, { "epoch": 4.045759402326174, "grad_norm": 0.629112720489502, "learning_rate": 1.158216887792699e-05, "loss": 0.0053, "step": 15885 }, { "epoch": 4.047032855081077, "grad_norm": 0.5684329867362976, "learning_rate": 1.1577779150272277e-05, "loss": 0.005, "step": 15890 }, { "epoch": 4.048306307835979, "grad_norm": 1.2170076370239258, "learning_rate": 1.1573389110800166e-05, "loss": 0.0076, "step": 15895 }, { "epoch": 4.049579760590882, "grad_norm": 0.4734116792678833, "learning_rate": 1.1568998760378256e-05, "loss": 0.0058, "step": 15900 }, { "epoch": 4.050853213345785, "grad_norm": 0.5363080501556396, "learning_rate": 1.1564608099874215e-05, "loss": 0.0041, "step": 15905 }, { "epoch": 4.052126666100688, "grad_norm": 0.5058848261833191, "learning_rate": 1.1560217130155775e-05, "loss": 0.0063, "step": 15910 }, { "epoch": 4.05340011885559, "grad_norm": 0.6829907298088074, "learning_rate": 1.1555825852090725e-05, "loss": 0.0048, "step": 15915 }, { "epoch": 4.054673571610493, "grad_norm": 0.7274470925331116, "learning_rate": 1.1551434266546912e-05, "loss": 0.0042, "step": 15920 }, { "epoch": 4.055947024365396, "grad_norm": 0.47396841645240784, "learning_rate": 1.1547042374392254e-05, "loss": 0.0049, "step": 15925 }, { "epoch": 4.057220477120299, "grad_norm": 0.4355199933052063, "learning_rate": 1.1542650176494719e-05, "loss": 0.0071, "step": 15930 }, { "epoch": 4.058493929875201, "grad_norm": 0.8047902584075928, "learning_rate": 1.1538257673722337e-05, "loss": 0.0061, "step": 15935 }, { "epoch": 4.059767382630104, "grad_norm": 0.9217956066131592, "learning_rate": 1.1533864866943207e-05, "loss": 0.0061, "step": 15940 }, { "epoch": 4.061040835385008, "grad_norm": 0.17359007894992828, "learning_rate": 1.152947175702548e-05, "loss": 0.0049, "step": 15945 }, { "epoch": 4.06231428813991, "grad_norm": 0.7850171327590942, "learning_rate": 1.1525078344837368e-05, "loss": 0.0054, "step": 15950 }, { "epoch": 4.063587740894813, "grad_norm": 0.9272680878639221, "learning_rate": 1.152068463124714e-05, "loss": 0.0056, "step": 15955 }, { "epoch": 4.064861193649715, "grad_norm": 0.2317131608724594, "learning_rate": 1.1516290617123134e-05, "loss": 0.0036, "step": 15960 }, { "epoch": 4.066134646404619, "grad_norm": 0.7591544389724731, "learning_rate": 1.151189630333374e-05, "loss": 0.0041, "step": 15965 }, { "epoch": 4.067408099159521, "grad_norm": 0.9532989859580994, "learning_rate": 1.1507501690747405e-05, "loss": 0.0066, "step": 15970 }, { "epoch": 4.068681551914424, "grad_norm": 0.4763377904891968, "learning_rate": 1.1503106780232644e-05, "loss": 0.0051, "step": 15975 }, { "epoch": 4.0699550046693265, "grad_norm": 0.5873628258705139, "learning_rate": 1.1498711572658024e-05, "loss": 0.004, "step": 15980 }, { "epoch": 4.07122845742423, "grad_norm": 0.28763842582702637, "learning_rate": 1.1494316068892171e-05, "loss": 0.0055, "step": 15985 }, { "epoch": 4.072501910179132, "grad_norm": 0.5899890065193176, "learning_rate": 1.1489920269803771e-05, "loss": 0.0041, "step": 15990 }, { "epoch": 4.073775362934035, "grad_norm": 0.6143194437026978, "learning_rate": 1.148552417626157e-05, "loss": 0.0055, "step": 15995 }, { "epoch": 4.0750488156889375, "grad_norm": 0.5877683162689209, "learning_rate": 1.1481127789134369e-05, "loss": 0.0037, "step": 16000 }, { "epoch": 4.076322268443841, "grad_norm": 1.4437123537063599, "learning_rate": 1.1476731109291031e-05, "loss": 0.005, "step": 16005 }, { "epoch": 4.077595721198744, "grad_norm": 0.5017604827880859, "learning_rate": 1.1472334137600472e-05, "loss": 0.0053, "step": 16010 }, { "epoch": 4.078869173953646, "grad_norm": 0.8441069722175598, "learning_rate": 1.1467936874931669e-05, "loss": 0.0051, "step": 16015 }, { "epoch": 4.080142626708549, "grad_norm": 0.8119441866874695, "learning_rate": 1.1463539322153653e-05, "loss": 0.0047, "step": 16020 }, { "epoch": 4.081416079463452, "grad_norm": 0.3559557795524597, "learning_rate": 1.1459141480135517e-05, "loss": 0.0081, "step": 16025 }, { "epoch": 4.082689532218355, "grad_norm": 0.712375819683075, "learning_rate": 1.145474334974641e-05, "loss": 0.0039, "step": 16030 }, { "epoch": 4.083962984973257, "grad_norm": 1.1960538625717163, "learning_rate": 1.1450344931855538e-05, "loss": 0.0067, "step": 16035 }, { "epoch": 4.0852364377281605, "grad_norm": 0.5962866544723511, "learning_rate": 1.1445946227332161e-05, "loss": 0.0029, "step": 16040 }, { "epoch": 4.086509890483063, "grad_norm": 0.9283566474914551, "learning_rate": 1.1441547237045593e-05, "loss": 0.0039, "step": 16045 }, { "epoch": 4.087783343237966, "grad_norm": 0.7596489787101746, "learning_rate": 1.1437147961865219e-05, "loss": 0.0026, "step": 16050 }, { "epoch": 4.089056795992868, "grad_norm": 0.5445420742034912, "learning_rate": 1.143274840266046e-05, "loss": 0.0048, "step": 16055 }, { "epoch": 4.0903302487477715, "grad_norm": 0.802463710308075, "learning_rate": 1.142834856030081e-05, "loss": 0.0054, "step": 16060 }, { "epoch": 4.091603701502674, "grad_norm": 0.9837080836296082, "learning_rate": 1.1423948435655812e-05, "loss": 0.0058, "step": 16065 }, { "epoch": 4.092877154257577, "grad_norm": 0.4768978953361511, "learning_rate": 1.1419548029595063e-05, "loss": 0.0059, "step": 16070 }, { "epoch": 4.09415060701248, "grad_norm": 0.9157548546791077, "learning_rate": 1.141514734298822e-05, "loss": 0.0054, "step": 16075 }, { "epoch": 4.0954240597673826, "grad_norm": 0.2413116842508316, "learning_rate": 1.141074637670499e-05, "loss": 0.0045, "step": 16080 }, { "epoch": 4.096697512522286, "grad_norm": 0.35965695977211, "learning_rate": 1.1406345131615138e-05, "loss": 0.0032, "step": 16085 }, { "epoch": 4.097970965277188, "grad_norm": 1.1045454740524292, "learning_rate": 1.1401943608588491e-05, "loss": 0.006, "step": 16090 }, { "epoch": 4.099244418032091, "grad_norm": 0.22624191641807556, "learning_rate": 1.1397541808494916e-05, "loss": 0.0036, "step": 16095 }, { "epoch": 4.100517870786994, "grad_norm": 0.9397381544113159, "learning_rate": 1.1393139732204351e-05, "loss": 0.0042, "step": 16100 }, { "epoch": 4.101791323541897, "grad_norm": 0.7618440985679626, "learning_rate": 1.1388737380586778e-05, "loss": 0.0046, "step": 16105 }, { "epoch": 4.103064776296799, "grad_norm": 0.6746475100517273, "learning_rate": 1.1384334754512235e-05, "loss": 0.007, "step": 16110 }, { "epoch": 4.104338229051702, "grad_norm": 0.5564061999320984, "learning_rate": 1.1379931854850815e-05, "loss": 0.0041, "step": 16115 }, { "epoch": 4.105611681806605, "grad_norm": 0.5545084476470947, "learning_rate": 1.137552868247267e-05, "loss": 0.0031, "step": 16120 }, { "epoch": 4.106885134561508, "grad_norm": 0.7943898439407349, "learning_rate": 1.1371125238247993e-05, "loss": 0.006, "step": 16125 }, { "epoch": 4.10815858731641, "grad_norm": 0.7114565968513489, "learning_rate": 1.1366721523047051e-05, "loss": 0.0062, "step": 16130 }, { "epoch": 4.109432040071313, "grad_norm": 0.9906663298606873, "learning_rate": 1.1362317537740143e-05, "loss": 0.0077, "step": 16135 }, { "epoch": 4.110705492826217, "grad_norm": 1.1509487628936768, "learning_rate": 1.1357913283197635e-05, "loss": 0.0075, "step": 16140 }, { "epoch": 4.111978945581119, "grad_norm": 1.3867734670639038, "learning_rate": 1.1353508760289942e-05, "loss": 0.0048, "step": 16145 }, { "epoch": 4.113252398336022, "grad_norm": 1.0083580017089844, "learning_rate": 1.1349103969887534e-05, "loss": 0.0089, "step": 16150 }, { "epoch": 4.114525851090924, "grad_norm": 0.3190726041793823, "learning_rate": 1.1344698912860927e-05, "loss": 0.0039, "step": 16155 }, { "epoch": 4.115799303845828, "grad_norm": 0.6255837082862854, "learning_rate": 1.1340293590080705e-05, "loss": 0.0063, "step": 16160 }, { "epoch": 4.11707275660073, "grad_norm": 1.0845547914505005, "learning_rate": 1.1335888002417483e-05, "loss": 0.0041, "step": 16165 }, { "epoch": 4.118346209355633, "grad_norm": 0.6207665205001831, "learning_rate": 1.1331482150741946e-05, "loss": 0.0047, "step": 16170 }, { "epoch": 4.119619662110535, "grad_norm": 0.8950415253639221, "learning_rate": 1.1327076035924825e-05, "loss": 0.0042, "step": 16175 }, { "epoch": 4.120893114865439, "grad_norm": 0.8714682459831238, "learning_rate": 1.13226696588369e-05, "loss": 0.005, "step": 16180 }, { "epoch": 4.122166567620341, "grad_norm": 1.0460741519927979, "learning_rate": 1.1318263020349008e-05, "loss": 0.0064, "step": 16185 }, { "epoch": 4.123440020375244, "grad_norm": 1.0295305252075195, "learning_rate": 1.1313856121332037e-05, "loss": 0.0047, "step": 16190 }, { "epoch": 4.1247134731301465, "grad_norm": 0.5545336604118347, "learning_rate": 1.1309448962656924e-05, "loss": 0.003, "step": 16195 }, { "epoch": 4.12598692588505, "grad_norm": 0.709620475769043, "learning_rate": 1.1305041545194657e-05, "loss": 0.0045, "step": 16200 }, { "epoch": 4.127260378639953, "grad_norm": 1.0103449821472168, "learning_rate": 1.1300633869816275e-05, "loss": 0.0055, "step": 16205 }, { "epoch": 4.128533831394855, "grad_norm": 0.23975321650505066, "learning_rate": 1.1296225937392872e-05, "loss": 0.0054, "step": 16210 }, { "epoch": 4.129807284149758, "grad_norm": 1.282401204109192, "learning_rate": 1.1291817748795593e-05, "loss": 0.009, "step": 16215 }, { "epoch": 4.131080736904661, "grad_norm": 0.6111401915550232, "learning_rate": 1.1287409304895623e-05, "loss": 0.0043, "step": 16220 }, { "epoch": 4.132354189659564, "grad_norm": 0.39164209365844727, "learning_rate": 1.1283000606564212e-05, "loss": 0.0052, "step": 16225 }, { "epoch": 4.133627642414466, "grad_norm": 0.3132180869579315, "learning_rate": 1.1278591654672652e-05, "loss": 0.0044, "step": 16230 }, { "epoch": 4.134901095169369, "grad_norm": 0.9888606667518616, "learning_rate": 1.1274182450092285e-05, "loss": 0.0075, "step": 16235 }, { "epoch": 4.136174547924272, "grad_norm": 1.131169319152832, "learning_rate": 1.1269772993694508e-05, "loss": 0.0062, "step": 16240 }, { "epoch": 4.137448000679175, "grad_norm": 1.7064012289047241, "learning_rate": 1.1265363286350761e-05, "loss": 0.0047, "step": 16245 }, { "epoch": 4.138721453434077, "grad_norm": 1.0814855098724365, "learning_rate": 1.1260953328932535e-05, "loss": 0.0069, "step": 16250 }, { "epoch": 4.1399949061889805, "grad_norm": 0.6315822005271912, "learning_rate": 1.125654312231138e-05, "loss": 0.003, "step": 16255 }, { "epoch": 4.141268358943883, "grad_norm": 0.56443852186203, "learning_rate": 1.1252132667358883e-05, "loss": 0.0065, "step": 16260 }, { "epoch": 4.142541811698786, "grad_norm": 2.284452438354492, "learning_rate": 1.1247721964946683e-05, "loss": 0.0052, "step": 16265 }, { "epoch": 4.143815264453689, "grad_norm": 1.0498689413070679, "learning_rate": 1.1243311015946472e-05, "loss": 0.0052, "step": 16270 }, { "epoch": 4.1450887172085915, "grad_norm": 0.7218289971351624, "learning_rate": 1.1238899821229992e-05, "loss": 0.0057, "step": 16275 }, { "epoch": 4.146362169963495, "grad_norm": 0.6625533699989319, "learning_rate": 1.123448838166902e-05, "loss": 0.0042, "step": 16280 }, { "epoch": 4.147635622718397, "grad_norm": 0.9295772314071655, "learning_rate": 1.1230076698135401e-05, "loss": 0.0044, "step": 16285 }, { "epoch": 4.1489090754733, "grad_norm": 0.7182146906852722, "learning_rate": 1.1225664771501015e-05, "loss": 0.0062, "step": 16290 }, { "epoch": 4.1501825282282025, "grad_norm": 0.7567660212516785, "learning_rate": 1.1221252602637793e-05, "loss": 0.0045, "step": 16295 }, { "epoch": 4.151455980983106, "grad_norm": 0.8191352486610413, "learning_rate": 1.1216840192417715e-05, "loss": 0.0053, "step": 16300 }, { "epoch": 4.152729433738008, "grad_norm": 0.5816457867622375, "learning_rate": 1.121242754171281e-05, "loss": 0.0043, "step": 16305 }, { "epoch": 4.154002886492911, "grad_norm": 0.7194662690162659, "learning_rate": 1.1208014651395147e-05, "loss": 0.0056, "step": 16310 }, { "epoch": 4.155276339247814, "grad_norm": 0.34138432145118713, "learning_rate": 1.1203601522336855e-05, "loss": 0.0044, "step": 16315 }, { "epoch": 4.156549792002717, "grad_norm": 0.8730624914169312, "learning_rate": 1.1199188155410102e-05, "loss": 0.0056, "step": 16320 }, { "epoch": 4.157823244757619, "grad_norm": 0.6569394469261169, "learning_rate": 1.11947745514871e-05, "loss": 0.005, "step": 16325 }, { "epoch": 4.159096697512522, "grad_norm": 0.16067667305469513, "learning_rate": 1.1190360711440114e-05, "loss": 0.0048, "step": 16330 }, { "epoch": 4.1603701502674255, "grad_norm": 0.4265356957912445, "learning_rate": 1.1185946636141455e-05, "loss": 0.0039, "step": 16335 }, { "epoch": 4.161643603022328, "grad_norm": 0.6476314067840576, "learning_rate": 1.118153232646348e-05, "loss": 0.0068, "step": 16340 }, { "epoch": 4.162917055777231, "grad_norm": 0.7480344176292419, "learning_rate": 1.117711778327859e-05, "loss": 0.0059, "step": 16345 }, { "epoch": 4.164190508532133, "grad_norm": 0.6284814476966858, "learning_rate": 1.1172703007459232e-05, "loss": 0.0093, "step": 16350 }, { "epoch": 4.1654639612870366, "grad_norm": 1.1717978715896606, "learning_rate": 1.1168287999877902e-05, "loss": 0.008, "step": 16355 }, { "epoch": 4.166737414041939, "grad_norm": 2.267528772354126, "learning_rate": 1.1163872761407144e-05, "loss": 0.012, "step": 16360 }, { "epoch": 4.168010866796842, "grad_norm": 1.0906522274017334, "learning_rate": 1.1159457292919542e-05, "loss": 0.0069, "step": 16365 }, { "epoch": 4.169284319551744, "grad_norm": 0.9390041828155518, "learning_rate": 1.1155041595287723e-05, "loss": 0.0027, "step": 16370 }, { "epoch": 4.170557772306648, "grad_norm": 0.7149091362953186, "learning_rate": 1.115062566938437e-05, "loss": 0.0072, "step": 16375 }, { "epoch": 4.17183122506155, "grad_norm": 0.5192981958389282, "learning_rate": 1.1146209516082202e-05, "loss": 0.0049, "step": 16380 }, { "epoch": 4.173104677816453, "grad_norm": 0.7600677609443665, "learning_rate": 1.1141793136253987e-05, "loss": 0.004, "step": 16385 }, { "epoch": 4.174378130571355, "grad_norm": 1.0514642000198364, "learning_rate": 1.1137376530772534e-05, "loss": 0.0076, "step": 16390 }, { "epoch": 4.175651583326259, "grad_norm": 1.6564249992370605, "learning_rate": 1.1132959700510704e-05, "loss": 0.0053, "step": 16395 }, { "epoch": 4.176925036081162, "grad_norm": 1.0555363893508911, "learning_rate": 1.1128542646341396e-05, "loss": 0.0059, "step": 16400 }, { "epoch": 4.178198488836064, "grad_norm": 1.5214128494262695, "learning_rate": 1.112412536913755e-05, "loss": 0.0067, "step": 16405 }, { "epoch": 4.179471941590967, "grad_norm": 0.6630090475082397, "learning_rate": 1.1119707869772162e-05, "loss": 0.0099, "step": 16410 }, { "epoch": 4.18074539434587, "grad_norm": 0.6615993976593018, "learning_rate": 1.1115290149118263e-05, "loss": 0.0055, "step": 16415 }, { "epoch": 4.182018847100773, "grad_norm": 0.7771114706993103, "learning_rate": 1.1110872208048926e-05, "loss": 0.0043, "step": 16420 }, { "epoch": 4.183292299855675, "grad_norm": 0.7260045409202576, "learning_rate": 1.110645404743728e-05, "loss": 0.0084, "step": 16425 }, { "epoch": 4.184565752610578, "grad_norm": 0.684233546257019, "learning_rate": 1.1102035668156477e-05, "loss": 0.006, "step": 16430 }, { "epoch": 4.185839205365481, "grad_norm": 0.32596564292907715, "learning_rate": 1.1097617071079729e-05, "loss": 0.0042, "step": 16435 }, { "epoch": 4.187112658120384, "grad_norm": 0.8805355429649353, "learning_rate": 1.1093198257080292e-05, "loss": 0.0064, "step": 16440 }, { "epoch": 4.188386110875286, "grad_norm": 1.0825587511062622, "learning_rate": 1.108877922703145e-05, "loss": 0.0087, "step": 16445 }, { "epoch": 4.189659563630189, "grad_norm": 0.7549059987068176, "learning_rate": 1.1084359981806538e-05, "loss": 0.0058, "step": 16450 }, { "epoch": 4.190933016385092, "grad_norm": 0.4698837995529175, "learning_rate": 1.1079940522278943e-05, "loss": 0.0048, "step": 16455 }, { "epoch": 4.192206469139995, "grad_norm": 0.5047542452812195, "learning_rate": 1.1075520849322078e-05, "loss": 0.0061, "step": 16460 }, { "epoch": 4.193479921894898, "grad_norm": 0.4125248193740845, "learning_rate": 1.1071100963809409e-05, "loss": 0.0039, "step": 16465 }, { "epoch": 4.1947533746498005, "grad_norm": 0.30635568499565125, "learning_rate": 1.1066680866614439e-05, "loss": 0.0068, "step": 16470 }, { "epoch": 4.196026827404704, "grad_norm": 0.27394139766693115, "learning_rate": 1.1062260558610714e-05, "loss": 0.0058, "step": 16475 }, { "epoch": 4.197300280159606, "grad_norm": 0.8875402808189392, "learning_rate": 1.105784004067182e-05, "loss": 0.0044, "step": 16480 }, { "epoch": 4.198573732914509, "grad_norm": 0.471368670463562, "learning_rate": 1.105341931367139e-05, "loss": 0.0076, "step": 16485 }, { "epoch": 4.1998471856694115, "grad_norm": 0.9313099980354309, "learning_rate": 1.1048998378483096e-05, "loss": 0.0086, "step": 16490 }, { "epoch": 4.201120638424315, "grad_norm": 0.6669067740440369, "learning_rate": 1.1044577235980646e-05, "loss": 0.0068, "step": 16495 }, { "epoch": 4.202394091179217, "grad_norm": 0.44039520621299744, "learning_rate": 1.1040155887037793e-05, "loss": 0.0046, "step": 16500 }, { "epoch": 4.20366754393412, "grad_norm": 0.5614885687828064, "learning_rate": 1.1035734332528336e-05, "loss": 0.0046, "step": 16505 }, { "epoch": 4.2049409966890225, "grad_norm": 0.8062819838523865, "learning_rate": 1.1031312573326102e-05, "loss": 0.0049, "step": 16510 }, { "epoch": 4.206214449443926, "grad_norm": 0.5043853521347046, "learning_rate": 1.1026890610304972e-05, "loss": 0.0042, "step": 16515 }, { "epoch": 4.207487902198828, "grad_norm": 0.9265994429588318, "learning_rate": 1.1022468444338858e-05, "loss": 0.0046, "step": 16520 }, { "epoch": 4.208761354953731, "grad_norm": 0.2988245487213135, "learning_rate": 1.1018046076301718e-05, "loss": 0.0054, "step": 16525 }, { "epoch": 4.210034807708634, "grad_norm": 1.0099698305130005, "learning_rate": 1.1013623507067541e-05, "loss": 0.006, "step": 16530 }, { "epoch": 4.211308260463537, "grad_norm": 0.4934808909893036, "learning_rate": 1.1009200737510369e-05, "loss": 0.004, "step": 16535 }, { "epoch": 4.21258171321844, "grad_norm": 0.5845344662666321, "learning_rate": 1.1004777768504274e-05, "loss": 0.0061, "step": 16540 }, { "epoch": 4.213855165973342, "grad_norm": 0.7763372659683228, "learning_rate": 1.1000354600923367e-05, "loss": 0.0043, "step": 16545 }, { "epoch": 4.2151286187282455, "grad_norm": 0.8929555416107178, "learning_rate": 1.0995931235641804e-05, "loss": 0.0052, "step": 16550 }, { "epoch": 4.216402071483148, "grad_norm": 0.6587688326835632, "learning_rate": 1.0991507673533776e-05, "loss": 0.005, "step": 16555 }, { "epoch": 4.217675524238051, "grad_norm": 0.6300079822540283, "learning_rate": 1.0987083915473515e-05, "loss": 0.006, "step": 16560 }, { "epoch": 4.218948976992953, "grad_norm": 0.33419355750083923, "learning_rate": 1.0982659962335294e-05, "loss": 0.004, "step": 16565 }, { "epoch": 4.2202224297478566, "grad_norm": 0.9712586402893066, "learning_rate": 1.0978235814993416e-05, "loss": 0.0046, "step": 16570 }, { "epoch": 4.221495882502759, "grad_norm": 0.9327349066734314, "learning_rate": 1.097381147432223e-05, "loss": 0.0067, "step": 16575 }, { "epoch": 4.222769335257662, "grad_norm": 0.7624064683914185, "learning_rate": 1.0969386941196121e-05, "loss": 0.0071, "step": 16580 }, { "epoch": 4.224042788012564, "grad_norm": 0.8137874007225037, "learning_rate": 1.0964962216489512e-05, "loss": 0.0044, "step": 16585 }, { "epoch": 4.225316240767468, "grad_norm": 0.9307061433792114, "learning_rate": 1.0960537301076864e-05, "loss": 0.0093, "step": 16590 }, { "epoch": 4.22658969352237, "grad_norm": 0.42331430315971375, "learning_rate": 1.0956112195832674e-05, "loss": 0.0061, "step": 16595 }, { "epoch": 4.227863146277273, "grad_norm": 0.3296581506729126, "learning_rate": 1.095168690163148e-05, "loss": 0.0074, "step": 16600 }, { "epoch": 4.229136599032176, "grad_norm": 1.3226042985916138, "learning_rate": 1.0947261419347856e-05, "loss": 0.0049, "step": 16605 }, { "epoch": 4.230410051787079, "grad_norm": 0.6534680724143982, "learning_rate": 1.094283574985641e-05, "loss": 0.0064, "step": 16610 }, { "epoch": 4.231683504541982, "grad_norm": 0.843971312046051, "learning_rate": 1.0938409894031793e-05, "loss": 0.0086, "step": 16615 }, { "epoch": 4.232956957296884, "grad_norm": 0.2931232154369354, "learning_rate": 1.093398385274869e-05, "loss": 0.0048, "step": 16620 }, { "epoch": 4.234230410051787, "grad_norm": 0.3688090741634369, "learning_rate": 1.0929557626881815e-05, "loss": 0.0055, "step": 16625 }, { "epoch": 4.23550386280669, "grad_norm": 0.7171391248703003, "learning_rate": 1.0925131217305931e-05, "loss": 0.0076, "step": 16630 }, { "epoch": 4.236777315561593, "grad_norm": 0.49024462699890137, "learning_rate": 1.0920704624895832e-05, "loss": 0.0069, "step": 16635 }, { "epoch": 4.238050768316495, "grad_norm": 1.1470108032226562, "learning_rate": 1.0916277850526346e-05, "loss": 0.0077, "step": 16640 }, { "epoch": 4.239324221071398, "grad_norm": 0.8234087228775024, "learning_rate": 1.0911850895072344e-05, "loss": 0.0069, "step": 16645 }, { "epoch": 4.240597673826301, "grad_norm": 0.6786779165267944, "learning_rate": 1.0907423759408722e-05, "loss": 0.0059, "step": 16650 }, { "epoch": 4.241871126581204, "grad_norm": 0.8179436922073364, "learning_rate": 1.0902996444410421e-05, "loss": 0.0048, "step": 16655 }, { "epoch": 4.243144579336106, "grad_norm": 0.43881821632385254, "learning_rate": 1.0898568950952408e-05, "loss": 0.0047, "step": 16660 }, { "epoch": 4.244418032091009, "grad_norm": 0.7304882407188416, "learning_rate": 1.0894141279909698e-05, "loss": 0.0099, "step": 16665 }, { "epoch": 4.245691484845913, "grad_norm": 0.6620092391967773, "learning_rate": 1.088971343215733e-05, "loss": 0.0061, "step": 16670 }, { "epoch": 4.246964937600815, "grad_norm": 0.5133092403411865, "learning_rate": 1.0885285408570385e-05, "loss": 0.0046, "step": 16675 }, { "epoch": 4.248238390355718, "grad_norm": 0.6622143387794495, "learning_rate": 1.0880857210023973e-05, "loss": 0.0058, "step": 16680 }, { "epoch": 4.2495118431106205, "grad_norm": 0.5150163769721985, "learning_rate": 1.0876428837393243e-05, "loss": 0.0066, "step": 16685 }, { "epoch": 4.250785295865524, "grad_norm": 0.41901424527168274, "learning_rate": 1.0872000291553377e-05, "loss": 0.0092, "step": 16690 }, { "epoch": 4.252058748620426, "grad_norm": 0.5836777687072754, "learning_rate": 1.0867571573379592e-05, "loss": 0.0087, "step": 16695 }, { "epoch": 4.253332201375329, "grad_norm": 0.7247949838638306, "learning_rate": 1.0863142683747134e-05, "loss": 0.0036, "step": 16700 }, { "epoch": 4.2546056541302315, "grad_norm": 1.7808277606964111, "learning_rate": 1.0858713623531292e-05, "loss": 0.0059, "step": 16705 }, { "epoch": 4.255879106885135, "grad_norm": 0.838411808013916, "learning_rate": 1.085428439360738e-05, "loss": 0.008, "step": 16710 }, { "epoch": 4.257152559640037, "grad_norm": 0.5133708715438843, "learning_rate": 1.0849854994850752e-05, "loss": 0.0053, "step": 16715 }, { "epoch": 4.25842601239494, "grad_norm": 0.9196904301643372, "learning_rate": 1.084542542813679e-05, "loss": 0.0051, "step": 16720 }, { "epoch": 4.2596994651498425, "grad_norm": 0.6484999656677246, "learning_rate": 1.0840995694340915e-05, "loss": 0.0047, "step": 16725 }, { "epoch": 4.260972917904746, "grad_norm": 0.6887391209602356, "learning_rate": 1.0836565794338577e-05, "loss": 0.0057, "step": 16730 }, { "epoch": 4.262246370659648, "grad_norm": 0.8571422100067139, "learning_rate": 1.0832135729005254e-05, "loss": 0.0057, "step": 16735 }, { "epoch": 4.263519823414551, "grad_norm": 1.6372003555297852, "learning_rate": 1.082770549921647e-05, "loss": 0.005, "step": 16740 }, { "epoch": 4.2647932761694545, "grad_norm": 0.5061640739440918, "learning_rate": 1.0823275105847773e-05, "loss": 0.0056, "step": 16745 }, { "epoch": 4.266066728924357, "grad_norm": 0.5543606281280518, "learning_rate": 1.0818844549774738e-05, "loss": 0.0064, "step": 16750 }, { "epoch": 4.26734018167926, "grad_norm": 0.5984180569648743, "learning_rate": 1.0814413831872987e-05, "loss": 0.0055, "step": 16755 }, { "epoch": 4.268613634434162, "grad_norm": 0.6494947075843811, "learning_rate": 1.0809982953018159e-05, "loss": 0.0063, "step": 16760 }, { "epoch": 4.2698870871890655, "grad_norm": 0.12944689393043518, "learning_rate": 1.0805551914085932e-05, "loss": 0.0059, "step": 16765 }, { "epoch": 4.271160539943968, "grad_norm": 0.6556417942047119, "learning_rate": 1.0801120715952016e-05, "loss": 0.0049, "step": 16770 }, { "epoch": 4.272433992698871, "grad_norm": 0.26069775223731995, "learning_rate": 1.0796689359492154e-05, "loss": 0.0061, "step": 16775 }, { "epoch": 4.273707445453773, "grad_norm": 1.2414602041244507, "learning_rate": 1.0792257845582111e-05, "loss": 0.0057, "step": 16780 }, { "epoch": 4.2749808982086765, "grad_norm": 1.110196590423584, "learning_rate": 1.0787826175097695e-05, "loss": 0.0075, "step": 16785 }, { "epoch": 4.276254350963579, "grad_norm": 1.2559504508972168, "learning_rate": 1.0783394348914737e-05, "loss": 0.0089, "step": 16790 }, { "epoch": 4.277527803718482, "grad_norm": 0.6843595504760742, "learning_rate": 1.07789623679091e-05, "loss": 0.0084, "step": 16795 }, { "epoch": 4.278801256473384, "grad_norm": 0.5788720846176147, "learning_rate": 1.0774530232956685e-05, "loss": 0.0062, "step": 16800 }, { "epoch": 4.280074709228288, "grad_norm": 0.8395846486091614, "learning_rate": 1.0770097944933412e-05, "loss": 0.0068, "step": 16805 }, { "epoch": 4.281348161983191, "grad_norm": 0.9121881127357483, "learning_rate": 1.076566550471524e-05, "loss": 0.0047, "step": 16810 }, { "epoch": 4.282621614738093, "grad_norm": 0.46629416942596436, "learning_rate": 1.076123291317815e-05, "loss": 0.0057, "step": 16815 }, { "epoch": 4.283895067492996, "grad_norm": 0.4100147485733032, "learning_rate": 1.0756800171198162e-05, "loss": 0.0057, "step": 16820 }, { "epoch": 4.285168520247899, "grad_norm": 0.8167598247528076, "learning_rate": 1.0752367279651321e-05, "loss": 0.007, "step": 16825 }, { "epoch": 4.286441973002802, "grad_norm": 1.6513429880142212, "learning_rate": 1.07479342394137e-05, "loss": 0.008, "step": 16830 }, { "epoch": 4.287715425757704, "grad_norm": 0.6828609704971313, "learning_rate": 1.0743501051361405e-05, "loss": 0.0082, "step": 16835 }, { "epoch": 4.288988878512607, "grad_norm": 0.7919784784317017, "learning_rate": 1.0739067716370566e-05, "loss": 0.0053, "step": 16840 }, { "epoch": 4.29026233126751, "grad_norm": 1.0791614055633545, "learning_rate": 1.0734634235317349e-05, "loss": 0.0046, "step": 16845 }, { "epoch": 4.291535784022413, "grad_norm": 0.9705979228019714, "learning_rate": 1.0730200609077946e-05, "loss": 0.0052, "step": 16850 }, { "epoch": 4.292809236777315, "grad_norm": 0.1449928730726242, "learning_rate": 1.0725766838528576e-05, "loss": 0.0061, "step": 16855 }, { "epoch": 4.294082689532218, "grad_norm": 0.7192208766937256, "learning_rate": 1.0721332924545482e-05, "loss": 0.0051, "step": 16860 }, { "epoch": 4.295356142287121, "grad_norm": 0.35109421610832214, "learning_rate": 1.0716898868004953e-05, "loss": 0.0051, "step": 16865 }, { "epoch": 4.296629595042024, "grad_norm": 0.8589048385620117, "learning_rate": 1.0712464669783284e-05, "loss": 0.0058, "step": 16870 }, { "epoch": 4.297903047796927, "grad_norm": 0.32494688034057617, "learning_rate": 1.0708030330756812e-05, "loss": 0.0058, "step": 16875 }, { "epoch": 4.299176500551829, "grad_norm": 0.6167275309562683, "learning_rate": 1.0703595851801899e-05, "loss": 0.005, "step": 16880 }, { "epoch": 4.300449953306733, "grad_norm": 0.5973005294799805, "learning_rate": 1.069916123379493e-05, "loss": 0.0066, "step": 16885 }, { "epoch": 4.301723406061635, "grad_norm": 0.7088512182235718, "learning_rate": 1.0694726477612323e-05, "loss": 0.0061, "step": 16890 }, { "epoch": 4.302996858816538, "grad_norm": 0.6618602275848389, "learning_rate": 1.0690291584130523e-05, "loss": 0.0063, "step": 16895 }, { "epoch": 4.3042703115714405, "grad_norm": 1.1269259452819824, "learning_rate": 1.0685856554226002e-05, "loss": 0.0087, "step": 16900 }, { "epoch": 4.305543764326344, "grad_norm": 0.41089189052581787, "learning_rate": 1.0681421388775251e-05, "loss": 0.0054, "step": 16905 }, { "epoch": 4.306817217081246, "grad_norm": 0.9561560153961182, "learning_rate": 1.06769860886548e-05, "loss": 0.0058, "step": 16910 }, { "epoch": 4.308090669836149, "grad_norm": 0.3296566605567932, "learning_rate": 1.0672550654741196e-05, "loss": 0.0049, "step": 16915 }, { "epoch": 4.3093641225910515, "grad_norm": 1.0554085969924927, "learning_rate": 1.0668115087911017e-05, "loss": 0.0057, "step": 16920 }, { "epoch": 4.310637575345955, "grad_norm": 0.5378391742706299, "learning_rate": 1.0663679389040872e-05, "loss": 0.0048, "step": 16925 }, { "epoch": 4.311911028100857, "grad_norm": 1.1633596420288086, "learning_rate": 1.0659243559007387e-05, "loss": 0.0069, "step": 16930 }, { "epoch": 4.31318448085576, "grad_norm": 0.5857096314430237, "learning_rate": 1.0654807598687213e-05, "loss": 0.0057, "step": 16935 }, { "epoch": 4.314457933610663, "grad_norm": 0.9663562774658203, "learning_rate": 1.065037150895704e-05, "loss": 0.0087, "step": 16940 }, { "epoch": 4.315731386365566, "grad_norm": 0.9111732244491577, "learning_rate": 1.0645935290693569e-05, "loss": 0.0042, "step": 16945 }, { "epoch": 4.317004839120469, "grad_norm": 0.46711596846580505, "learning_rate": 1.0641498944773533e-05, "loss": 0.0053, "step": 16950 }, { "epoch": 4.318278291875371, "grad_norm": 1.4857699871063232, "learning_rate": 1.0637062472073694e-05, "loss": 0.0057, "step": 16955 }, { "epoch": 4.3195517446302745, "grad_norm": 0.9419054388999939, "learning_rate": 1.063262587347083e-05, "loss": 0.0066, "step": 16960 }, { "epoch": 4.320825197385177, "grad_norm": 0.5715884566307068, "learning_rate": 1.062818914984175e-05, "loss": 0.0044, "step": 16965 }, { "epoch": 4.32209865014008, "grad_norm": 0.5788368582725525, "learning_rate": 1.0623752302063284e-05, "loss": 0.0059, "step": 16970 }, { "epoch": 4.323372102894982, "grad_norm": 0.5747083425521851, "learning_rate": 1.0619315331012291e-05, "loss": 0.0036, "step": 16975 }, { "epoch": 4.3246455556498855, "grad_norm": 0.4800707995891571, "learning_rate": 1.0614878237565656e-05, "loss": 0.005, "step": 16980 }, { "epoch": 4.325919008404788, "grad_norm": 0.5441935658454895, "learning_rate": 1.061044102260028e-05, "loss": 0.004, "step": 16985 }, { "epoch": 4.327192461159691, "grad_norm": 0.672944188117981, "learning_rate": 1.0606003686993087e-05, "loss": 0.0051, "step": 16990 }, { "epoch": 4.328465913914593, "grad_norm": 0.6026894450187683, "learning_rate": 1.0601566231621041e-05, "loss": 0.0033, "step": 16995 }, { "epoch": 4.3297393666694965, "grad_norm": 1.8723206520080566, "learning_rate": 1.0597128657361114e-05, "loss": 0.0089, "step": 17000 }, { "epoch": 4.3310128194244, "grad_norm": 0.884835958480835, "learning_rate": 1.0592690965090304e-05, "loss": 0.0077, "step": 17005 }, { "epoch": 4.332286272179302, "grad_norm": 0.6162419319152832, "learning_rate": 1.0588253155685638e-05, "loss": 0.0038, "step": 17010 }, { "epoch": 4.333559724934205, "grad_norm": 0.6963617205619812, "learning_rate": 1.0583815230024161e-05, "loss": 0.0078, "step": 17015 }, { "epoch": 4.334833177689108, "grad_norm": 0.7981426119804382, "learning_rate": 1.0579377188982945e-05, "loss": 0.0047, "step": 17020 }, { "epoch": 4.336106630444011, "grad_norm": 0.47012826800346375, "learning_rate": 1.057493903343908e-05, "loss": 0.0044, "step": 17025 }, { "epoch": 4.337380083198913, "grad_norm": 0.8030365109443665, "learning_rate": 1.0570500764269681e-05, "loss": 0.0053, "step": 17030 }, { "epoch": 4.338653535953816, "grad_norm": 0.4633045792579651, "learning_rate": 1.0566062382351888e-05, "loss": 0.0039, "step": 17035 }, { "epoch": 4.339926988708719, "grad_norm": 0.5494313836097717, "learning_rate": 1.0561623888562864e-05, "loss": 0.0076, "step": 17040 }, { "epoch": 4.341200441463622, "grad_norm": 0.9261872172355652, "learning_rate": 1.055718528377978e-05, "loss": 0.0098, "step": 17045 }, { "epoch": 4.342473894218524, "grad_norm": 0.3892466723918915, "learning_rate": 1.0552746568879852e-05, "loss": 0.0061, "step": 17050 }, { "epoch": 4.343747346973427, "grad_norm": 0.938925564289093, "learning_rate": 1.0548307744740303e-05, "loss": 0.0063, "step": 17055 }, { "epoch": 4.34502079972833, "grad_norm": 0.8908360600471497, "learning_rate": 1.0543868812238372e-05, "loss": 0.0062, "step": 17060 }, { "epoch": 4.346294252483233, "grad_norm": 0.5287014245986938, "learning_rate": 1.0539429772251338e-05, "loss": 0.0057, "step": 17065 }, { "epoch": 4.347567705238136, "grad_norm": 1.3550745248794556, "learning_rate": 1.0534990625656488e-05, "loss": 0.0063, "step": 17070 }, { "epoch": 4.348841157993038, "grad_norm": 0.6549002528190613, "learning_rate": 1.0530551373331128e-05, "loss": 0.0065, "step": 17075 }, { "epoch": 4.350114610747942, "grad_norm": 0.5858554244041443, "learning_rate": 1.0526112016152598e-05, "loss": 0.0062, "step": 17080 }, { "epoch": 4.351388063502844, "grad_norm": 0.6438503861427307, "learning_rate": 1.0521672554998245e-05, "loss": 0.0039, "step": 17085 }, { "epoch": 4.352661516257747, "grad_norm": 0.7154511213302612, "learning_rate": 1.0517232990745445e-05, "loss": 0.0033, "step": 17090 }, { "epoch": 4.353934969012649, "grad_norm": 0.8019555807113647, "learning_rate": 1.0512793324271591e-05, "loss": 0.0047, "step": 17095 }, { "epoch": 4.355208421767553, "grad_norm": 1.1900546550750732, "learning_rate": 1.0508353556454096e-05, "loss": 0.0069, "step": 17100 }, { "epoch": 4.356481874522455, "grad_norm": 0.4685034155845642, "learning_rate": 1.0503913688170397e-05, "loss": 0.0039, "step": 17105 }, { "epoch": 4.357755327277358, "grad_norm": 0.42544466257095337, "learning_rate": 1.0499473720297941e-05, "loss": 0.0038, "step": 17110 }, { "epoch": 4.3590287800322605, "grad_norm": 1.1427966356277466, "learning_rate": 1.049503365371421e-05, "loss": 0.0083, "step": 17115 }, { "epoch": 4.360302232787164, "grad_norm": 1.9608160257339478, "learning_rate": 1.0490593489296691e-05, "loss": 0.0067, "step": 17120 }, { "epoch": 4.361575685542066, "grad_norm": 0.7399283647537231, "learning_rate": 1.0486153227922898e-05, "loss": 0.0091, "step": 17125 }, { "epoch": 4.362849138296969, "grad_norm": 0.5590591430664062, "learning_rate": 1.048171287047036e-05, "loss": 0.0048, "step": 17130 }, { "epoch": 4.364122591051872, "grad_norm": 0.5152609944343567, "learning_rate": 1.0477272417816634e-05, "loss": 0.0048, "step": 17135 }, { "epoch": 4.365396043806775, "grad_norm": 0.8653866052627563, "learning_rate": 1.047283187083928e-05, "loss": 0.0057, "step": 17140 }, { "epoch": 4.366669496561678, "grad_norm": 0.5121146440505981, "learning_rate": 1.0468391230415893e-05, "loss": 0.005, "step": 17145 }, { "epoch": 4.36794294931658, "grad_norm": 1.1802862882614136, "learning_rate": 1.0463950497424081e-05, "loss": 0.0053, "step": 17150 }, { "epoch": 4.369216402071483, "grad_norm": 0.7864201068878174, "learning_rate": 1.0459509672741456e-05, "loss": 0.0084, "step": 17155 }, { "epoch": 4.370489854826386, "grad_norm": 0.8749565482139587, "learning_rate": 1.0455068757245676e-05, "loss": 0.0051, "step": 17160 }, { "epoch": 4.371763307581289, "grad_norm": 1.1958425045013428, "learning_rate": 1.0450627751814396e-05, "loss": 0.0061, "step": 17165 }, { "epoch": 4.373036760336191, "grad_norm": 0.6967389583587646, "learning_rate": 1.044618665732529e-05, "loss": 0.0047, "step": 17170 }, { "epoch": 4.3743102130910945, "grad_norm": 0.6787299513816833, "learning_rate": 1.0441745474656059e-05, "loss": 0.0037, "step": 17175 }, { "epoch": 4.375583665845997, "grad_norm": 1.368159294128418, "learning_rate": 1.0437304204684416e-05, "loss": 0.0102, "step": 17180 }, { "epoch": 4.3768571186009, "grad_norm": 0.7869994044303894, "learning_rate": 1.0432862848288088e-05, "loss": 0.0058, "step": 17185 }, { "epoch": 4.378130571355802, "grad_norm": 0.6087598204612732, "learning_rate": 1.0428421406344828e-05, "loss": 0.0054, "step": 17190 }, { "epoch": 4.3794040241107055, "grad_norm": 0.7064946293830872, "learning_rate": 1.04239798797324e-05, "loss": 0.0045, "step": 17195 }, { "epoch": 4.380677476865609, "grad_norm": 0.4525607228279114, "learning_rate": 1.0419538269328582e-05, "loss": 0.0054, "step": 17200 }, { "epoch": 4.381950929620511, "grad_norm": 0.9917742013931274, "learning_rate": 1.0415096576011175e-05, "loss": 0.0046, "step": 17205 }, { "epoch": 4.383224382375414, "grad_norm": 0.640160858631134, "learning_rate": 1.0410654800657991e-05, "loss": 0.006, "step": 17210 }, { "epoch": 4.3844978351303165, "grad_norm": 0.6543559432029724, "learning_rate": 1.0406212944146864e-05, "loss": 0.007, "step": 17215 }, { "epoch": 4.38577128788522, "grad_norm": 0.09996291995048523, "learning_rate": 1.0401771007355637e-05, "loss": 0.008, "step": 17220 }, { "epoch": 4.387044740640122, "grad_norm": 0.8246923089027405, "learning_rate": 1.0397328991162174e-05, "loss": 0.0057, "step": 17225 }, { "epoch": 4.388318193395025, "grad_norm": 0.22447988390922546, "learning_rate": 1.0392886896444353e-05, "loss": 0.0049, "step": 17230 }, { "epoch": 4.389591646149928, "grad_norm": 0.6081185936927795, "learning_rate": 1.038844472408007e-05, "loss": 0.0057, "step": 17235 }, { "epoch": 4.390865098904831, "grad_norm": 0.8134514093399048, "learning_rate": 1.0384002474947233e-05, "loss": 0.0055, "step": 17240 }, { "epoch": 4.392138551659733, "grad_norm": 0.8192667365074158, "learning_rate": 1.0379560149923765e-05, "loss": 0.0061, "step": 17245 }, { "epoch": 4.393412004414636, "grad_norm": 0.9726032614707947, "learning_rate": 1.0375117749887602e-05, "loss": 0.0049, "step": 17250 }, { "epoch": 4.394685457169539, "grad_norm": 0.6363368630409241, "learning_rate": 1.0370675275716704e-05, "loss": 0.0053, "step": 17255 }, { "epoch": 4.395958909924442, "grad_norm": 0.5769186019897461, "learning_rate": 1.0366232728289038e-05, "loss": 0.0073, "step": 17260 }, { "epoch": 4.397232362679345, "grad_norm": 0.7635168433189392, "learning_rate": 1.0361790108482585e-05, "loss": 0.0045, "step": 17265 }, { "epoch": 4.398505815434247, "grad_norm": 0.663733184337616, "learning_rate": 1.0357347417175348e-05, "loss": 0.0046, "step": 17270 }, { "epoch": 4.3997792681891505, "grad_norm": 0.861575186252594, "learning_rate": 1.0352904655245335e-05, "loss": 0.0072, "step": 17275 }, { "epoch": 4.401052720944053, "grad_norm": 1.0584522485733032, "learning_rate": 1.0348461823570568e-05, "loss": 0.0056, "step": 17280 }, { "epoch": 4.402326173698956, "grad_norm": 0.5731120109558105, "learning_rate": 1.0344018923029091e-05, "loss": 0.0069, "step": 17285 }, { "epoch": 4.403599626453858, "grad_norm": 1.0510727167129517, "learning_rate": 1.0339575954498956e-05, "loss": 0.0056, "step": 17290 }, { "epoch": 4.404873079208762, "grad_norm": 0.7894735336303711, "learning_rate": 1.0335132918858232e-05, "loss": 0.0039, "step": 17295 }, { "epoch": 4.406146531963664, "grad_norm": 0.7101461887359619, "learning_rate": 1.0330689816984993e-05, "loss": 0.0043, "step": 17300 }, { "epoch": 4.407419984718567, "grad_norm": 1.046265721321106, "learning_rate": 1.0326246649757336e-05, "loss": 0.0063, "step": 17305 }, { "epoch": 4.408693437473469, "grad_norm": 0.2029806673526764, "learning_rate": 1.0321803418053367e-05, "loss": 0.0033, "step": 17310 }, { "epoch": 4.409966890228373, "grad_norm": 1.3857253789901733, "learning_rate": 1.0317360122751201e-05, "loss": 0.0049, "step": 17315 }, { "epoch": 4.411240342983275, "grad_norm": 0.4394567608833313, "learning_rate": 1.0312916764728968e-05, "loss": 0.0056, "step": 17320 }, { "epoch": 4.412513795738178, "grad_norm": 0.8636107444763184, "learning_rate": 1.030847334486482e-05, "loss": 0.0062, "step": 17325 }, { "epoch": 4.413787248493081, "grad_norm": Infinity, "learning_rate": 1.0304918565037437e-05, "loss": 0.0054, "step": 17330 }, { "epoch": 4.415060701247984, "grad_norm": 0.8610852956771851, "learning_rate": 1.0300475036070787e-05, "loss": 0.0044, "step": 17335 }, { "epoch": 4.416334154002887, "grad_norm": 0.8410759568214417, "learning_rate": 1.0296031447721086e-05, "loss": 0.0033, "step": 17340 }, { "epoch": 4.417607606757789, "grad_norm": 0.9189033508300781, "learning_rate": 1.0291587800866519e-05, "loss": 0.0072, "step": 17345 }, { "epoch": 4.418881059512692, "grad_norm": 0.39646977186203003, "learning_rate": 1.0287144096385286e-05, "loss": 0.0037, "step": 17350 }, { "epoch": 4.420154512267595, "grad_norm": 0.7712379693984985, "learning_rate": 1.02827003351556e-05, "loss": 0.0039, "step": 17355 }, { "epoch": 4.421427965022498, "grad_norm": 0.8849602341651917, "learning_rate": 1.0278256518055685e-05, "loss": 0.0049, "step": 17360 }, { "epoch": 4.4227014177774, "grad_norm": 0.2486889362335205, "learning_rate": 1.0273812645963773e-05, "loss": 0.0036, "step": 17365 }, { "epoch": 4.423974870532303, "grad_norm": 0.9636090397834778, "learning_rate": 1.026936871975811e-05, "loss": 0.0047, "step": 17370 }, { "epoch": 4.425248323287206, "grad_norm": 0.5422865152359009, "learning_rate": 1.0264924740316948e-05, "loss": 0.0032, "step": 17375 }, { "epoch": 4.426521776042109, "grad_norm": 1.250295639038086, "learning_rate": 1.026048070851856e-05, "loss": 0.0044, "step": 17380 }, { "epoch": 4.427795228797011, "grad_norm": 0.33960285782814026, "learning_rate": 1.0256036625241222e-05, "loss": 0.0057, "step": 17385 }, { "epoch": 4.4290686815519145, "grad_norm": 1.2678180932998657, "learning_rate": 1.0251592491363216e-05, "loss": 0.0073, "step": 17390 }, { "epoch": 4.430342134306818, "grad_norm": 0.8740023374557495, "learning_rate": 1.024714830776284e-05, "loss": 0.0041, "step": 17395 }, { "epoch": 4.43161558706172, "grad_norm": 0.7846441268920898, "learning_rate": 1.0242704075318402e-05, "loss": 0.0046, "step": 17400 }, { "epoch": 4.432889039816623, "grad_norm": 0.5071787238121033, "learning_rate": 1.0238259794908218e-05, "loss": 0.0042, "step": 17405 }, { "epoch": 4.4341624925715255, "grad_norm": 0.6311402320861816, "learning_rate": 1.0233815467410615e-05, "loss": 0.0049, "step": 17410 }, { "epoch": 4.435435945326429, "grad_norm": 0.6769793033599854, "learning_rate": 1.022937109370392e-05, "loss": 0.0076, "step": 17415 }, { "epoch": 4.436709398081331, "grad_norm": 0.7116311192512512, "learning_rate": 1.0224926674666491e-05, "loss": 0.004, "step": 17420 }, { "epoch": 4.437982850836234, "grad_norm": 0.269428551197052, "learning_rate": 1.022048221117667e-05, "loss": 0.0053, "step": 17425 }, { "epoch": 4.4392563035911365, "grad_norm": 0.9223889708518982, "learning_rate": 1.0216037704112822e-05, "loss": 0.0056, "step": 17430 }, { "epoch": 4.44052975634604, "grad_norm": 0.6606734991073608, "learning_rate": 1.0211593154353321e-05, "loss": 0.0041, "step": 17435 }, { "epoch": 4.441803209100942, "grad_norm": 0.7744891047477722, "learning_rate": 1.0207148562776541e-05, "loss": 0.0055, "step": 17440 }, { "epoch": 4.443076661855845, "grad_norm": 0.6593675017356873, "learning_rate": 1.020270393026087e-05, "loss": 0.0036, "step": 17445 }, { "epoch": 4.444350114610748, "grad_norm": 0.7667275667190552, "learning_rate": 1.0198259257684707e-05, "loss": 0.0039, "step": 17450 }, { "epoch": 4.445623567365651, "grad_norm": 1.290239691734314, "learning_rate": 1.019381454592645e-05, "loss": 0.0072, "step": 17455 }, { "epoch": 4.446897020120554, "grad_norm": 0.8610133528709412, "learning_rate": 1.0189369795864512e-05, "loss": 0.006, "step": 17460 }, { "epoch": 4.448170472875456, "grad_norm": 0.8351715803146362, "learning_rate": 1.0184925008377315e-05, "loss": 0.0047, "step": 17465 }, { "epoch": 4.4494439256303595, "grad_norm": 0.866409957408905, "learning_rate": 1.018048018434328e-05, "loss": 0.0047, "step": 17470 }, { "epoch": 4.450717378385262, "grad_norm": 1.2325917482376099, "learning_rate": 1.0176035324640843e-05, "loss": 0.0061, "step": 17475 }, { "epoch": 4.451990831140165, "grad_norm": 1.2955094575881958, "learning_rate": 1.0171590430148444e-05, "loss": 0.0049, "step": 17480 }, { "epoch": 4.453264283895067, "grad_norm": 0.43204572796821594, "learning_rate": 1.0167145501744528e-05, "loss": 0.0038, "step": 17485 }, { "epoch": 4.4545377366499705, "grad_norm": 1.884368896484375, "learning_rate": 1.016270054030755e-05, "loss": 0.0065, "step": 17490 }, { "epoch": 4.455811189404873, "grad_norm": 0.617802083492279, "learning_rate": 1.0158255546715972e-05, "loss": 0.007, "step": 17495 }, { "epoch": 4.457084642159776, "grad_norm": 1.0880860090255737, "learning_rate": 1.0153810521848257e-05, "loss": 0.0076, "step": 17500 }, { "epoch": 4.458358094914678, "grad_norm": 0.7185360789299011, "learning_rate": 1.0149365466582884e-05, "loss": 0.0064, "step": 17505 }, { "epoch": 4.459631547669582, "grad_norm": 0.41881051659584045, "learning_rate": 1.0144920381798322e-05, "loss": 0.0037, "step": 17510 }, { "epoch": 4.460905000424484, "grad_norm": 0.5612937808036804, "learning_rate": 1.0140475268373064e-05, "loss": 0.0066, "step": 17515 }, { "epoch": 4.462178453179387, "grad_norm": 0.68364018201828, "learning_rate": 1.0136030127185598e-05, "loss": 0.0044, "step": 17520 }, { "epoch": 4.46345190593429, "grad_norm": 0.9366385340690613, "learning_rate": 1.013158495911442e-05, "loss": 0.0067, "step": 17525 }, { "epoch": 4.464725358689193, "grad_norm": 0.22432859241962433, "learning_rate": 1.012713976503803e-05, "loss": 0.0043, "step": 17530 }, { "epoch": 4.465998811444096, "grad_norm": 0.5364655256271362, "learning_rate": 1.0122694545834935e-05, "loss": 0.0039, "step": 17535 }, { "epoch": 4.467272264198998, "grad_norm": 0.7506240010261536, "learning_rate": 1.0118249302383645e-05, "loss": 0.0043, "step": 17540 }, { "epoch": 4.468545716953901, "grad_norm": 1.1343148946762085, "learning_rate": 1.011380403556268e-05, "loss": 0.0051, "step": 17545 }, { "epoch": 4.469819169708804, "grad_norm": 0.8549547791481018, "learning_rate": 1.0109358746250557e-05, "loss": 0.006, "step": 17550 }, { "epoch": 4.471092622463707, "grad_norm": 0.5923743844032288, "learning_rate": 1.0104913435325798e-05, "loss": 0.0045, "step": 17555 }, { "epoch": 4.472366075218609, "grad_norm": 0.5780715942382812, "learning_rate": 1.0100468103666941e-05, "loss": 0.0043, "step": 17560 }, { "epoch": 4.473639527973512, "grad_norm": 0.19007687270641327, "learning_rate": 1.0096022752152514e-05, "loss": 0.0064, "step": 17565 }, { "epoch": 4.474912980728415, "grad_norm": 0.726978600025177, "learning_rate": 1.0091577381661053e-05, "loss": 0.0053, "step": 17570 }, { "epoch": 4.476186433483318, "grad_norm": 0.8384236693382263, "learning_rate": 1.0087131993071102e-05, "loss": 0.004, "step": 17575 }, { "epoch": 4.47745988623822, "grad_norm": 0.9517943263053894, "learning_rate": 1.0082686587261208e-05, "loss": 0.0049, "step": 17580 }, { "epoch": 4.478733338993123, "grad_norm": 0.585834801197052, "learning_rate": 1.0078241165109911e-05, "loss": 0.0046, "step": 17585 }, { "epoch": 4.480006791748026, "grad_norm": 0.8502383828163147, "learning_rate": 1.0073795727495773e-05, "loss": 0.0081, "step": 17590 }, { "epoch": 4.481280244502929, "grad_norm": 5.886558532714844, "learning_rate": 1.0069350275297338e-05, "loss": 0.0114, "step": 17595 }, { "epoch": 4.482553697257832, "grad_norm": 0.6780590415000916, "learning_rate": 1.006490480939317e-05, "loss": 0.0071, "step": 17600 }, { "epoch": 4.4838271500127345, "grad_norm": 1.1562469005584717, "learning_rate": 1.0060459330661828e-05, "loss": 0.0079, "step": 17605 }, { "epoch": 4.485100602767638, "grad_norm": 1.1034189462661743, "learning_rate": 1.0056013839981873e-05, "loss": 0.008, "step": 17610 }, { "epoch": 4.48637405552254, "grad_norm": 0.5274009108543396, "learning_rate": 1.005156833823187e-05, "loss": 0.0039, "step": 17615 }, { "epoch": 4.487647508277443, "grad_norm": 0.6253779530525208, "learning_rate": 1.0047122826290385e-05, "loss": 0.006, "step": 17620 }, { "epoch": 4.4889209610323455, "grad_norm": 1.48215913772583, "learning_rate": 1.0042677305035992e-05, "loss": 0.0108, "step": 17625 }, { "epoch": 4.490194413787249, "grad_norm": 0.7593750953674316, "learning_rate": 1.0038231775347258e-05, "loss": 0.0066, "step": 17630 }, { "epoch": 4.491467866542151, "grad_norm": 1.0564550161361694, "learning_rate": 1.0033786238102754e-05, "loss": 0.005, "step": 17635 }, { "epoch": 4.492741319297054, "grad_norm": 1.1997300386428833, "learning_rate": 1.0029340694181058e-05, "loss": 0.008, "step": 17640 }, { "epoch": 4.4940147720519565, "grad_norm": 0.7016804218292236, "learning_rate": 1.0024895144460745e-05, "loss": 0.0058, "step": 17645 }, { "epoch": 4.49528822480686, "grad_norm": 0.2049701064825058, "learning_rate": 1.0020449589820387e-05, "loss": 0.0047, "step": 17650 }, { "epoch": 4.496561677561762, "grad_norm": 0.670078456401825, "learning_rate": 1.0016004031138567e-05, "loss": 0.0057, "step": 17655 }, { "epoch": 4.497835130316665, "grad_norm": 0.8959988951683044, "learning_rate": 1.001155846929386e-05, "loss": 0.0059, "step": 17660 }, { "epoch": 4.499108583071568, "grad_norm": 0.5827723145484924, "learning_rate": 1.0007112905164846e-05, "loss": 0.0041, "step": 17665 }, { "epoch": 4.500382035826471, "grad_norm": 0.7068720459938049, "learning_rate": 1.0002667339630106e-05, "loss": 0.0058, "step": 17670 }, { "epoch": 4.501655488581374, "grad_norm": 0.8250002861022949, "learning_rate": 9.998221773568217e-06, "loss": 0.0076, "step": 17675 }, { "epoch": 4.502928941336276, "grad_norm": 0.7309395670890808, "learning_rate": 9.99377620785776e-06, "loss": 0.0062, "step": 17680 }, { "epoch": 4.5042023940911795, "grad_norm": 1.3907703161239624, "learning_rate": 9.989330643377314e-06, "loss": 0.0064, "step": 17685 }, { "epoch": 4.505475846846082, "grad_norm": 0.4289229214191437, "learning_rate": 9.98488508100546e-06, "loss": 0.0056, "step": 17690 }, { "epoch": 4.506749299600985, "grad_norm": 0.7844918966293335, "learning_rate": 9.980439521620777e-06, "loss": 0.0047, "step": 17695 }, { "epoch": 4.508022752355887, "grad_norm": 0.8574080467224121, "learning_rate": 9.97599396610184e-06, "loss": 0.0033, "step": 17700 }, { "epoch": 4.5092962051107905, "grad_norm": 0.5087451338768005, "learning_rate": 9.971548415327231e-06, "loss": 0.0049, "step": 17705 }, { "epoch": 4.510569657865693, "grad_norm": 0.2964964509010315, "learning_rate": 9.967102870175526e-06, "loss": 0.0054, "step": 17710 }, { "epoch": 4.511843110620596, "grad_norm": 0.48773130774497986, "learning_rate": 9.9626573315253e-06, "loss": 0.0054, "step": 17715 }, { "epoch": 4.513116563375499, "grad_norm": 0.6709003448486328, "learning_rate": 9.958211800255123e-06, "loss": 0.0064, "step": 17720 }, { "epoch": 4.514390016130402, "grad_norm": 1.3069348335266113, "learning_rate": 9.953766277243577e-06, "loss": 0.0058, "step": 17725 }, { "epoch": 4.515663468885304, "grad_norm": 0.9126465320587158, "learning_rate": 9.949320763369226e-06, "loss": 0.0051, "step": 17730 }, { "epoch": 4.516936921640207, "grad_norm": 0.829868733882904, "learning_rate": 9.94487525951064e-06, "loss": 0.0077, "step": 17735 }, { "epoch": 4.51821037439511, "grad_norm": 0.9689640998840332, "learning_rate": 9.940429766546394e-06, "loss": 0.0084, "step": 17740 }, { "epoch": 4.519483827150013, "grad_norm": 1.0371562242507935, "learning_rate": 9.935984285355041e-06, "loss": 0.0054, "step": 17745 }, { "epoch": 4.520757279904916, "grad_norm": 0.791419506072998, "learning_rate": 9.931538816815152e-06, "loss": 0.0058, "step": 17750 }, { "epoch": 4.522030732659818, "grad_norm": 1.1301616430282593, "learning_rate": 9.92709336180529e-06, "loss": 0.0066, "step": 17755 }, { "epoch": 4.523304185414721, "grad_norm": 0.5890202522277832, "learning_rate": 9.922647921204005e-06, "loss": 0.0066, "step": 17760 }, { "epoch": 4.524577638169624, "grad_norm": 0.8806196451187134, "learning_rate": 9.918202495889855e-06, "loss": 0.0061, "step": 17765 }, { "epoch": 4.525851090924527, "grad_norm": 0.493190199136734, "learning_rate": 9.913757086741397e-06, "loss": 0.0051, "step": 17770 }, { "epoch": 4.527124543679429, "grad_norm": 1.420963168144226, "learning_rate": 9.909311694637173e-06, "loss": 0.0064, "step": 17775 }, { "epoch": 4.528397996434332, "grad_norm": 0.7028756737709045, "learning_rate": 9.904866320455731e-06, "loss": 0.0063, "step": 17780 }, { "epoch": 4.529671449189235, "grad_norm": 0.5855804085731506, "learning_rate": 9.900420965075617e-06, "loss": 0.006, "step": 17785 }, { "epoch": 4.530944901944138, "grad_norm": 0.8163647055625916, "learning_rate": 9.89597562937536e-06, "loss": 0.0057, "step": 17790 }, { "epoch": 4.53221835469904, "grad_norm": 0.8535287380218506, "learning_rate": 9.8915303142335e-06, "loss": 0.0058, "step": 17795 }, { "epoch": 4.533491807453943, "grad_norm": 0.4178983271121979, "learning_rate": 9.88708502052857e-06, "loss": 0.0061, "step": 17800 }, { "epoch": 4.534765260208847, "grad_norm": 0.7511168122291565, "learning_rate": 9.882639749139087e-06, "loss": 0.0063, "step": 17805 }, { "epoch": 4.536038712963749, "grad_norm": 0.7543849945068359, "learning_rate": 9.878194500943586e-06, "loss": 0.0051, "step": 17810 }, { "epoch": 4.537312165718652, "grad_norm": 0.7885438203811646, "learning_rate": 9.87374927682057e-06, "loss": 0.0058, "step": 17815 }, { "epoch": 4.5385856184735545, "grad_norm": 0.8049888610839844, "learning_rate": 9.869304077648555e-06, "loss": 0.0063, "step": 17820 }, { "epoch": 4.539859071228458, "grad_norm": 1.0104082822799683, "learning_rate": 9.864858904306059e-06, "loss": 0.0041, "step": 17825 }, { "epoch": 4.54113252398336, "grad_norm": 0.6877580285072327, "learning_rate": 9.860413757671568e-06, "loss": 0.0053, "step": 17830 }, { "epoch": 4.542405976738263, "grad_norm": 0.28189247846603394, "learning_rate": 9.855968638623588e-06, "loss": 0.0031, "step": 17835 }, { "epoch": 4.5436794294931655, "grad_norm": 0.7504107356071472, "learning_rate": 9.851523548040612e-06, "loss": 0.0058, "step": 17840 }, { "epoch": 4.544952882248069, "grad_norm": 0.4146064817905426, "learning_rate": 9.84707848680112e-06, "loss": 0.0048, "step": 17845 }, { "epoch": 4.546226335002971, "grad_norm": 1.0541120767593384, "learning_rate": 9.842633455783594e-06, "loss": 0.0044, "step": 17850 }, { "epoch": 4.547499787757874, "grad_norm": 1.0084527730941772, "learning_rate": 9.838188455866512e-06, "loss": 0.0065, "step": 17855 }, { "epoch": 4.5487732405127765, "grad_norm": 1.2036375999450684, "learning_rate": 9.833743487928337e-06, "loss": 0.0047, "step": 17860 }, { "epoch": 4.55004669326768, "grad_norm": 1.0152968168258667, "learning_rate": 9.829298552847531e-06, "loss": 0.0079, "step": 17865 }, { "epoch": 4.551320146022583, "grad_norm": 0.8202946782112122, "learning_rate": 9.824853651502555e-06, "loss": 0.0062, "step": 17870 }, { "epoch": 4.552593598777485, "grad_norm": 2.482287645339966, "learning_rate": 9.820408784771848e-06, "loss": 0.0054, "step": 17875 }, { "epoch": 4.5538670515323885, "grad_norm": 0.8778037428855896, "learning_rate": 9.815963953533857e-06, "loss": 0.0049, "step": 17880 }, { "epoch": 4.555140504287291, "grad_norm": 0.8850799798965454, "learning_rate": 9.81151915866702e-06, "loss": 0.009, "step": 17885 }, { "epoch": 4.556413957042194, "grad_norm": 1.2090966701507568, "learning_rate": 9.807074401049757e-06, "loss": 0.0052, "step": 17890 }, { "epoch": 4.557687409797096, "grad_norm": 0.3343554139137268, "learning_rate": 9.80262968156049e-06, "loss": 0.0052, "step": 17895 }, { "epoch": 4.5589608625519995, "grad_norm": 0.45143449306488037, "learning_rate": 9.798185001077637e-06, "loss": 0.005, "step": 17900 }, { "epoch": 4.560234315306902, "grad_norm": 0.8237206935882568, "learning_rate": 9.793740360479594e-06, "loss": 0.0056, "step": 17905 }, { "epoch": 4.561507768061805, "grad_norm": 1.1084703207015991, "learning_rate": 9.789295760644764e-06, "loss": 0.0063, "step": 17910 }, { "epoch": 4.562781220816707, "grad_norm": 1.083728551864624, "learning_rate": 9.784851202451538e-06, "loss": 0.0062, "step": 17915 }, { "epoch": 4.5640546735716105, "grad_norm": 0.4030836522579193, "learning_rate": 9.780406686778288e-06, "loss": 0.0043, "step": 17920 }, { "epoch": 4.565328126326513, "grad_norm": 1.014193058013916, "learning_rate": 9.775962214503391e-06, "loss": 0.0071, "step": 17925 }, { "epoch": 4.566601579081416, "grad_norm": 0.7073419690132141, "learning_rate": 9.771517786505217e-06, "loss": 0.0052, "step": 17930 }, { "epoch": 4.567875031836319, "grad_norm": 0.47614774107933044, "learning_rate": 9.767073403662109e-06, "loss": 0.0051, "step": 17935 }, { "epoch": 4.569148484591222, "grad_norm": 0.4906972646713257, "learning_rate": 9.762629066852417e-06, "loss": 0.0056, "step": 17940 }, { "epoch": 4.570421937346125, "grad_norm": 0.9884282946586609, "learning_rate": 9.758184776954487e-06, "loss": 0.0092, "step": 17945 }, { "epoch": 4.571695390101027, "grad_norm": 0.9355340600013733, "learning_rate": 9.75374053484663e-06, "loss": 0.0102, "step": 17950 }, { "epoch": 4.57296884285593, "grad_norm": 0.502196192741394, "learning_rate": 9.749296341407176e-06, "loss": 0.0058, "step": 17955 }, { "epoch": 4.574242295610833, "grad_norm": 0.7050665616989136, "learning_rate": 9.744852197514435e-06, "loss": 0.0049, "step": 17960 }, { "epoch": 4.575515748365736, "grad_norm": 0.47344323992729187, "learning_rate": 9.740408104046697e-06, "loss": 0.0063, "step": 17965 }, { "epoch": 4.576789201120638, "grad_norm": 0.692112147808075, "learning_rate": 9.735964061882255e-06, "loss": 0.003, "step": 17970 }, { "epoch": 4.578062653875541, "grad_norm": 0.8193139433860779, "learning_rate": 9.73152007189939e-06, "loss": 0.0049, "step": 17975 }, { "epoch": 4.579336106630444, "grad_norm": 0.8714790940284729, "learning_rate": 9.727076134976367e-06, "loss": 0.0053, "step": 17980 }, { "epoch": 4.580609559385347, "grad_norm": 1.0841941833496094, "learning_rate": 9.722632251991445e-06, "loss": 0.0071, "step": 17985 }, { "epoch": 4.581883012140249, "grad_norm": 0.5958244800567627, "learning_rate": 9.718188423822871e-06, "loss": 0.0069, "step": 17990 }, { "epoch": 4.583156464895152, "grad_norm": 0.35519567131996155, "learning_rate": 9.713744651348879e-06, "loss": 0.005, "step": 17995 }, { "epoch": 4.584429917650056, "grad_norm": 0.6373310685157776, "learning_rate": 9.709300935447698e-06, "loss": 0.0076, "step": 18000 }, { "epoch": 4.585703370404958, "grad_norm": 0.6564034819602966, "learning_rate": 9.704857276997543e-06, "loss": 0.0034, "step": 18005 }, { "epoch": 4.586976823159861, "grad_norm": 0.631106972694397, "learning_rate": 9.70041367687661e-06, "loss": 0.0042, "step": 18010 }, { "epoch": 4.588250275914763, "grad_norm": 0.7309592962265015, "learning_rate": 9.695970135963099e-06, "loss": 0.0054, "step": 18015 }, { "epoch": 4.589523728669667, "grad_norm": 1.1423170566558838, "learning_rate": 9.691526655135183e-06, "loss": 0.0066, "step": 18020 }, { "epoch": 4.590797181424569, "grad_norm": 0.825774610042572, "learning_rate": 9.687083235271034e-06, "loss": 0.0074, "step": 18025 }, { "epoch": 4.592070634179472, "grad_norm": 0.5734112858772278, "learning_rate": 9.682639877248802e-06, "loss": 0.0029, "step": 18030 }, { "epoch": 4.5933440869343745, "grad_norm": 1.0985138416290283, "learning_rate": 9.678196581946637e-06, "loss": 0.0058, "step": 18035 }, { "epoch": 4.594617539689278, "grad_norm": 1.2016229629516602, "learning_rate": 9.673753350242667e-06, "loss": 0.0074, "step": 18040 }, { "epoch": 4.59589099244418, "grad_norm": 0.8775098919868469, "learning_rate": 9.669310183015009e-06, "loss": 0.0053, "step": 18045 }, { "epoch": 4.597164445199083, "grad_norm": 1.1592673063278198, "learning_rate": 9.664867081141771e-06, "loss": 0.0095, "step": 18050 }, { "epoch": 4.5984378979539855, "grad_norm": 1.072088599205017, "learning_rate": 9.660424045501045e-06, "loss": 0.0039, "step": 18055 }, { "epoch": 4.599711350708889, "grad_norm": 1.1916617155075073, "learning_rate": 9.655981076970912e-06, "loss": 0.0067, "step": 18060 }, { "epoch": 4.600984803463792, "grad_norm": 0.27536430954933167, "learning_rate": 9.651538176429436e-06, "loss": 0.0058, "step": 18065 }, { "epoch": 4.602258256218694, "grad_norm": 0.4689675271511078, "learning_rate": 9.647095344754669e-06, "loss": 0.0055, "step": 18070 }, { "epoch": 4.603531708973597, "grad_norm": 0.5726991295814514, "learning_rate": 9.642652582824654e-06, "loss": 0.0064, "step": 18075 }, { "epoch": 4.6048051617285, "grad_norm": 0.966454029083252, "learning_rate": 9.638209891517417e-06, "loss": 0.0062, "step": 18080 }, { "epoch": 4.606078614483403, "grad_norm": 0.7102112770080566, "learning_rate": 9.633767271710963e-06, "loss": 0.0048, "step": 18085 }, { "epoch": 4.607352067238305, "grad_norm": 0.9734674692153931, "learning_rate": 9.629324724283298e-06, "loss": 0.0073, "step": 18090 }, { "epoch": 4.6086255199932085, "grad_norm": 0.6768167018890381, "learning_rate": 9.624882250112401e-06, "loss": 0.0052, "step": 18095 }, { "epoch": 4.609898972748111, "grad_norm": 0.4753047525882721, "learning_rate": 9.62043985007624e-06, "loss": 0.0053, "step": 18100 }, { "epoch": 4.611172425503014, "grad_norm": 0.9499591588973999, "learning_rate": 9.615997525052774e-06, "loss": 0.0083, "step": 18105 }, { "epoch": 4.612445878257916, "grad_norm": 1.5561702251434326, "learning_rate": 9.611555275919932e-06, "loss": 0.0069, "step": 18110 }, { "epoch": 4.6137193310128195, "grad_norm": 0.8869100213050842, "learning_rate": 9.607113103555648e-06, "loss": 0.0045, "step": 18115 }, { "epoch": 4.614992783767722, "grad_norm": 0.23135831952095032, "learning_rate": 9.602671008837831e-06, "loss": 0.0043, "step": 18120 }, { "epoch": 4.616266236522625, "grad_norm": 0.48720911145210266, "learning_rate": 9.598228992644368e-06, "loss": 0.004, "step": 18125 }, { "epoch": 4.617539689277528, "grad_norm": 0.9152600765228271, "learning_rate": 9.593787055853138e-06, "loss": 0.0063, "step": 18130 }, { "epoch": 4.6188131420324305, "grad_norm": 0.7599828243255615, "learning_rate": 9.589345199342012e-06, "loss": 0.0074, "step": 18135 }, { "epoch": 4.620086594787334, "grad_norm": 0.6738988161087036, "learning_rate": 9.584903423988829e-06, "loss": 0.0058, "step": 18140 }, { "epoch": 4.621360047542236, "grad_norm": 0.8981545567512512, "learning_rate": 9.58046173067142e-06, "loss": 0.0052, "step": 18145 }, { "epoch": 4.622633500297139, "grad_norm": 0.8308353424072266, "learning_rate": 9.576020120267604e-06, "loss": 0.0066, "step": 18150 }, { "epoch": 4.623906953052042, "grad_norm": 0.9629809260368347, "learning_rate": 9.571578593655174e-06, "loss": 0.0071, "step": 18155 }, { "epoch": 4.625180405806945, "grad_norm": 0.8004541993141174, "learning_rate": 9.567137151711912e-06, "loss": 0.0056, "step": 18160 }, { "epoch": 4.626453858561847, "grad_norm": 1.2717218399047852, "learning_rate": 9.56269579531559e-06, "loss": 0.0056, "step": 18165 }, { "epoch": 4.62772731131675, "grad_norm": 0.8238574266433716, "learning_rate": 9.558254525343944e-06, "loss": 0.0066, "step": 18170 }, { "epoch": 4.629000764071653, "grad_norm": 1.0162572860717773, "learning_rate": 9.553813342674711e-06, "loss": 0.0065, "step": 18175 }, { "epoch": 4.630274216826556, "grad_norm": 1.1049237251281738, "learning_rate": 9.54937224818561e-06, "loss": 0.0059, "step": 18180 }, { "epoch": 4.631547669581458, "grad_norm": 0.38400211930274963, "learning_rate": 9.544931242754326e-06, "loss": 0.0034, "step": 18185 }, { "epoch": 4.632821122336361, "grad_norm": 0.4351978898048401, "learning_rate": 9.540490327258542e-06, "loss": 0.0072, "step": 18190 }, { "epoch": 4.6340945750912645, "grad_norm": 0.627040684223175, "learning_rate": 9.536049502575926e-06, "loss": 0.0058, "step": 18195 }, { "epoch": 4.635368027846167, "grad_norm": 0.889549732208252, "learning_rate": 9.531608769584109e-06, "loss": 0.0074, "step": 18200 }, { "epoch": 4.63664148060107, "grad_norm": 0.6865336298942566, "learning_rate": 9.52716812916072e-06, "loss": 0.0054, "step": 18205 }, { "epoch": 4.637914933355972, "grad_norm": 1.0708343982696533, "learning_rate": 9.522727582183371e-06, "loss": 0.0052, "step": 18210 }, { "epoch": 4.639188386110876, "grad_norm": 0.6623796224594116, "learning_rate": 9.518287129529641e-06, "loss": 0.0041, "step": 18215 }, { "epoch": 4.640461838865778, "grad_norm": 1.4213701486587524, "learning_rate": 9.513846772077104e-06, "loss": 0.0065, "step": 18220 }, { "epoch": 4.641735291620681, "grad_norm": 0.6336188912391663, "learning_rate": 9.509406510703314e-06, "loss": 0.0042, "step": 18225 }, { "epoch": 4.643008744375583, "grad_norm": 0.474365770816803, "learning_rate": 9.504966346285793e-06, "loss": 0.0083, "step": 18230 }, { "epoch": 4.644282197130487, "grad_norm": 0.5174270868301392, "learning_rate": 9.500526279702059e-06, "loss": 0.0025, "step": 18235 }, { "epoch": 4.645555649885389, "grad_norm": 0.8341809511184692, "learning_rate": 9.496086311829608e-06, "loss": 0.0054, "step": 18240 }, { "epoch": 4.646829102640292, "grad_norm": 0.816162645816803, "learning_rate": 9.491646443545906e-06, "loss": 0.0059, "step": 18245 }, { "epoch": 4.6481025553951945, "grad_norm": 0.5047352910041809, "learning_rate": 9.48720667572841e-06, "loss": 0.0041, "step": 18250 }, { "epoch": 4.649376008150098, "grad_norm": 0.4504780173301697, "learning_rate": 9.482767009254558e-06, "loss": 0.0036, "step": 18255 }, { "epoch": 4.650649460905001, "grad_norm": 1.1958783864974976, "learning_rate": 9.478327445001757e-06, "loss": 0.0057, "step": 18260 }, { "epoch": 4.651922913659903, "grad_norm": 0.8299809098243713, "learning_rate": 9.473887983847403e-06, "loss": 0.0068, "step": 18265 }, { "epoch": 4.653196366414806, "grad_norm": 1.36953604221344, "learning_rate": 9.469448626668875e-06, "loss": 0.0037, "step": 18270 }, { "epoch": 4.654469819169709, "grad_norm": 0.44758039712905884, "learning_rate": 9.465009374343515e-06, "loss": 0.0056, "step": 18275 }, { "epoch": 4.655743271924612, "grad_norm": 0.8320930600166321, "learning_rate": 9.460570227748662e-06, "loss": 0.005, "step": 18280 }, { "epoch": 4.657016724679514, "grad_norm": 0.8233997821807861, "learning_rate": 9.45613118776163e-06, "loss": 0.0082, "step": 18285 }, { "epoch": 4.658290177434417, "grad_norm": 1.2646152973175049, "learning_rate": 9.451692255259702e-06, "loss": 0.0051, "step": 18290 }, { "epoch": 4.65956363018932, "grad_norm": 0.4580678641796112, "learning_rate": 9.44725343112015e-06, "loss": 0.0035, "step": 18295 }, { "epoch": 4.660837082944223, "grad_norm": 1.050774097442627, "learning_rate": 9.442814716220222e-06, "loss": 0.0078, "step": 18300 }, { "epoch": 4.662110535699125, "grad_norm": 1.1728655099868774, "learning_rate": 9.43837611143714e-06, "loss": 0.0086, "step": 18305 }, { "epoch": 4.6633839884540285, "grad_norm": 0.7124359607696533, "learning_rate": 9.433937617648113e-06, "loss": 0.0058, "step": 18310 }, { "epoch": 4.664657441208931, "grad_norm": 0.4193384647369385, "learning_rate": 9.42949923573032e-06, "loss": 0.0063, "step": 18315 }, { "epoch": 4.665930893963834, "grad_norm": 0.424331396818161, "learning_rate": 9.425060966560922e-06, "loss": 0.005, "step": 18320 }, { "epoch": 4.667204346718737, "grad_norm": 0.757716953754425, "learning_rate": 9.42062281101706e-06, "loss": 0.0067, "step": 18325 }, { "epoch": 4.6684777994736395, "grad_norm": 0.5854200720787048, "learning_rate": 9.416184769975842e-06, "loss": 0.0058, "step": 18330 }, { "epoch": 4.669751252228543, "grad_norm": 0.4986412227153778, "learning_rate": 9.411746844314365e-06, "loss": 0.0048, "step": 18335 }, { "epoch": 4.671024704983445, "grad_norm": 0.3520723879337311, "learning_rate": 9.407309034909699e-06, "loss": 0.0046, "step": 18340 }, { "epoch": 4.672298157738348, "grad_norm": 1.020983338356018, "learning_rate": 9.40287134263889e-06, "loss": 0.0075, "step": 18345 }, { "epoch": 4.6735716104932505, "grad_norm": 0.530399739742279, "learning_rate": 9.398433768378962e-06, "loss": 0.0044, "step": 18350 }, { "epoch": 4.674845063248154, "grad_norm": 1.1409435272216797, "learning_rate": 9.393996313006915e-06, "loss": 0.0038, "step": 18355 }, { "epoch": 4.676118516003056, "grad_norm": 0.5126278400421143, "learning_rate": 9.389558977399725e-06, "loss": 0.0041, "step": 18360 }, { "epoch": 4.677391968757959, "grad_norm": 1.0401815176010132, "learning_rate": 9.385121762434348e-06, "loss": 0.0056, "step": 18365 }, { "epoch": 4.678665421512862, "grad_norm": 0.5827401876449585, "learning_rate": 9.38068466898771e-06, "loss": 0.0049, "step": 18370 }, { "epoch": 4.679938874267765, "grad_norm": 1.7547188997268677, "learning_rate": 9.376247697936719e-06, "loss": 0.0069, "step": 18375 }, { "epoch": 4.681212327022667, "grad_norm": 0.5671047568321228, "learning_rate": 9.371810850158255e-06, "loss": 0.0036, "step": 18380 }, { "epoch": 4.68248577977757, "grad_norm": 0.4020199775695801, "learning_rate": 9.367374126529173e-06, "loss": 0.0063, "step": 18385 }, { "epoch": 4.6837592325324735, "grad_norm": 0.3980535566806793, "learning_rate": 9.36293752792631e-06, "loss": 0.0053, "step": 18390 }, { "epoch": 4.685032685287376, "grad_norm": 1.7736719846725464, "learning_rate": 9.358501055226468e-06, "loss": 0.0071, "step": 18395 }, { "epoch": 4.686306138042278, "grad_norm": 1.0397891998291016, "learning_rate": 9.354064709306433e-06, "loss": 0.0063, "step": 18400 }, { "epoch": 4.687579590797181, "grad_norm": 0.7321202754974365, "learning_rate": 9.349628491042963e-06, "loss": 0.0042, "step": 18405 }, { "epoch": 4.6888530435520845, "grad_norm": 0.8563195466995239, "learning_rate": 9.345192401312787e-06, "loss": 0.0055, "step": 18410 }, { "epoch": 4.690126496306987, "grad_norm": 0.8927724957466125, "learning_rate": 9.340756440992618e-06, "loss": 0.0056, "step": 18415 }, { "epoch": 4.69139994906189, "grad_norm": 0.631597638130188, "learning_rate": 9.336320610959131e-06, "loss": 0.0064, "step": 18420 }, { "epoch": 4.692673401816792, "grad_norm": 0.8046750426292419, "learning_rate": 9.331884912088982e-06, "loss": 0.0081, "step": 18425 }, { "epoch": 4.693946854571696, "grad_norm": 0.4012606143951416, "learning_rate": 9.327449345258809e-06, "loss": 0.006, "step": 18430 }, { "epoch": 4.695220307326598, "grad_norm": 1.3517762422561646, "learning_rate": 9.323013911345204e-06, "loss": 0.0054, "step": 18435 }, { "epoch": 4.696493760081501, "grad_norm": 0.3733052909374237, "learning_rate": 9.31857861122475e-06, "loss": 0.0053, "step": 18440 }, { "epoch": 4.697767212836403, "grad_norm": 0.832855761051178, "learning_rate": 9.314143445774005e-06, "loss": 0.0102, "step": 18445 }, { "epoch": 4.699040665591307, "grad_norm": 0.86599200963974, "learning_rate": 9.309708415869479e-06, "loss": 0.0051, "step": 18450 }, { "epoch": 4.70031411834621, "grad_norm": 0.7411115765571594, "learning_rate": 9.305273522387678e-06, "loss": 0.006, "step": 18455 }, { "epoch": 4.701587571101112, "grad_norm": 0.8412322402000427, "learning_rate": 9.300838766205075e-06, "loss": 0.0035, "step": 18460 }, { "epoch": 4.7028610238560145, "grad_norm": 1.2517958879470825, "learning_rate": 9.296404148198105e-06, "loss": 0.007, "step": 18465 }, { "epoch": 4.704134476610918, "grad_norm": 0.7698042988777161, "learning_rate": 9.29196966924319e-06, "loss": 0.0055, "step": 18470 }, { "epoch": 4.705407929365821, "grad_norm": 1.1909687519073486, "learning_rate": 9.28753533021672e-06, "loss": 0.0067, "step": 18475 }, { "epoch": 4.706681382120723, "grad_norm": 1.015060544013977, "learning_rate": 9.28310113199505e-06, "loss": 0.0082, "step": 18480 }, { "epoch": 4.707954834875626, "grad_norm": 0.4344119429588318, "learning_rate": 9.278667075454518e-06, "loss": 0.0064, "step": 18485 }, { "epoch": 4.709228287630529, "grad_norm": 0.7190651893615723, "learning_rate": 9.274233161471431e-06, "loss": 0.0062, "step": 18490 }, { "epoch": 4.710501740385432, "grad_norm": 0.7910693883895874, "learning_rate": 9.269799390922057e-06, "loss": 0.0044, "step": 18495 }, { "epoch": 4.711775193140334, "grad_norm": 0.7466403841972351, "learning_rate": 9.265365764682653e-06, "loss": 0.0033, "step": 18500 }, { "epoch": 4.713048645895237, "grad_norm": 1.0367419719696045, "learning_rate": 9.260932283629439e-06, "loss": 0.0054, "step": 18505 }, { "epoch": 4.71432209865014, "grad_norm": 0.6440672874450684, "learning_rate": 9.2564989486386e-06, "loss": 0.0073, "step": 18510 }, { "epoch": 4.715595551405043, "grad_norm": 0.8639602661132812, "learning_rate": 9.252065760586301e-06, "loss": 0.0054, "step": 18515 }, { "epoch": 4.716869004159946, "grad_norm": 0.7194103002548218, "learning_rate": 9.247632720348682e-06, "loss": 0.0051, "step": 18520 }, { "epoch": 4.7181424569148485, "grad_norm": 0.6197789907455444, "learning_rate": 9.24319982880184e-06, "loss": 0.0037, "step": 18525 }, { "epoch": 4.719415909669751, "grad_norm": 0.45425036549568176, "learning_rate": 9.23876708682185e-06, "loss": 0.0043, "step": 18530 }, { "epoch": 4.720689362424654, "grad_norm": 0.6820579171180725, "learning_rate": 9.234334495284764e-06, "loss": 0.0064, "step": 18535 }, { "epoch": 4.721962815179557, "grad_norm": 0.7160296440124512, "learning_rate": 9.22990205506659e-06, "loss": 0.0041, "step": 18540 }, { "epoch": 4.7232362679344595, "grad_norm": 0.6028857231140137, "learning_rate": 9.225469767043315e-06, "loss": 0.0053, "step": 18545 }, { "epoch": 4.724509720689363, "grad_norm": 0.9168111085891724, "learning_rate": 9.221037632090901e-06, "loss": 0.0076, "step": 18550 }, { "epoch": 4.725783173444265, "grad_norm": 1.3487461805343628, "learning_rate": 9.216605651085266e-06, "loss": 0.0056, "step": 18555 }, { "epoch": 4.727056626199168, "grad_norm": 0.8915379047393799, "learning_rate": 9.212173824902306e-06, "loss": 0.0069, "step": 18560 }, { "epoch": 4.7283300789540705, "grad_norm": 0.6966531872749329, "learning_rate": 9.207742154417894e-06, "loss": 0.0067, "step": 18565 }, { "epoch": 4.729603531708974, "grad_norm": 0.23015697300434113, "learning_rate": 9.20331064050785e-06, "loss": 0.0064, "step": 18570 }, { "epoch": 4.730876984463876, "grad_norm": 1.4849978685379028, "learning_rate": 9.198879284047988e-06, "loss": 0.0077, "step": 18575 }, { "epoch": 4.732150437218779, "grad_norm": 0.5101234912872314, "learning_rate": 9.194448085914072e-06, "loss": 0.0043, "step": 18580 }, { "epoch": 4.7334238899736825, "grad_norm": 0.47954294085502625, "learning_rate": 9.190017046981844e-06, "loss": 0.0044, "step": 18585 }, { "epoch": 4.734697342728585, "grad_norm": 0.914310872554779, "learning_rate": 9.185586168127018e-06, "loss": 0.0055, "step": 18590 }, { "epoch": 4.735970795483487, "grad_norm": 0.3251239061355591, "learning_rate": 9.181155450225264e-06, "loss": 0.0051, "step": 18595 }, { "epoch": 4.73724424823839, "grad_norm": 1.3372344970703125, "learning_rate": 9.17672489415223e-06, "loss": 0.0056, "step": 18600 }, { "epoch": 4.7385177009932935, "grad_norm": 0.8981145620346069, "learning_rate": 9.172294500783534e-06, "loss": 0.0048, "step": 18605 }, { "epoch": 4.739791153748196, "grad_norm": 0.4858753979206085, "learning_rate": 9.167864270994748e-06, "loss": 0.0065, "step": 18610 }, { "epoch": 4.741064606503099, "grad_norm": 0.6611778140068054, "learning_rate": 9.163434205661427e-06, "loss": 0.0048, "step": 18615 }, { "epoch": 4.742338059258001, "grad_norm": 1.2212005853652954, "learning_rate": 9.15900430565909e-06, "loss": 0.005, "step": 18620 }, { "epoch": 4.7436115120129045, "grad_norm": 0.7884944081306458, "learning_rate": 9.154574571863213e-06, "loss": 0.0051, "step": 18625 }, { "epoch": 4.744884964767807, "grad_norm": 0.4319799542427063, "learning_rate": 9.150145005149251e-06, "loss": 0.0056, "step": 18630 }, { "epoch": 4.74615841752271, "grad_norm": 0.46557191014289856, "learning_rate": 9.145715606392624e-06, "loss": 0.0047, "step": 18635 }, { "epoch": 4.747431870277612, "grad_norm": 0.43475037813186646, "learning_rate": 9.141286376468711e-06, "loss": 0.0057, "step": 18640 }, { "epoch": 4.748705323032516, "grad_norm": 0.40603822469711304, "learning_rate": 9.13685731625287e-06, "loss": 0.006, "step": 18645 }, { "epoch": 4.749978775787419, "grad_norm": 0.9345946907997131, "learning_rate": 9.132428426620413e-06, "loss": 0.0057, "step": 18650 }, { "epoch": 4.751252228542321, "grad_norm": 0.7379676103591919, "learning_rate": 9.127999708446626e-06, "loss": 0.0059, "step": 18655 }, { "epoch": 4.752525681297223, "grad_norm": 1.3119747638702393, "learning_rate": 9.12357116260676e-06, "loss": 0.0094, "step": 18660 }, { "epoch": 4.753799134052127, "grad_norm": 0.9813382625579834, "learning_rate": 9.11914278997603e-06, "loss": 0.005, "step": 18665 }, { "epoch": 4.75507258680703, "grad_norm": 0.7429441213607788, "learning_rate": 9.114714591429618e-06, "loss": 0.0048, "step": 18670 }, { "epoch": 4.756346039561932, "grad_norm": 1.127251386642456, "learning_rate": 9.110286567842671e-06, "loss": 0.0061, "step": 18675 }, { "epoch": 4.757619492316835, "grad_norm": 0.6731266975402832, "learning_rate": 9.105858720090305e-06, "loss": 0.0061, "step": 18680 }, { "epoch": 4.758892945071738, "grad_norm": 1.8574097156524658, "learning_rate": 9.101431049047596e-06, "loss": 0.0061, "step": 18685 }, { "epoch": 4.760166397826641, "grad_norm": 0.3232722878456116, "learning_rate": 9.097003555589584e-06, "loss": 0.0041, "step": 18690 }, { "epoch": 4.761439850581543, "grad_norm": 1.0551073551177979, "learning_rate": 9.092576240591282e-06, "loss": 0.0076, "step": 18695 }, { "epoch": 4.762713303336446, "grad_norm": 0.573894202709198, "learning_rate": 9.08814910492766e-06, "loss": 0.004, "step": 18700 }, { "epoch": 4.763986756091349, "grad_norm": 0.32874050736427307, "learning_rate": 9.083722149473652e-06, "loss": 0.0037, "step": 18705 }, { "epoch": 4.765260208846252, "grad_norm": 1.0706171989440918, "learning_rate": 9.07929537510417e-06, "loss": 0.0065, "step": 18710 }, { "epoch": 4.766533661601154, "grad_norm": 0.33127543330192566, "learning_rate": 9.07486878269407e-06, "loss": 0.0064, "step": 18715 }, { "epoch": 4.767807114356057, "grad_norm": 0.5711273550987244, "learning_rate": 9.070442373118186e-06, "loss": 0.0044, "step": 18720 }, { "epoch": 4.76908056711096, "grad_norm": 0.8299146294593811, "learning_rate": 9.066016147251316e-06, "loss": 0.0053, "step": 18725 }, { "epoch": 4.770354019865863, "grad_norm": 0.5418506860733032, "learning_rate": 9.061590105968208e-06, "loss": 0.0047, "step": 18730 }, { "epoch": 4.771627472620766, "grad_norm": 0.6000421643257141, "learning_rate": 9.05716425014359e-06, "loss": 0.0073, "step": 18735 }, { "epoch": 4.7729009253756685, "grad_norm": 0.490444153547287, "learning_rate": 9.052738580652149e-06, "loss": 0.0049, "step": 18740 }, { "epoch": 4.774174378130572, "grad_norm": 0.7409296631813049, "learning_rate": 9.048313098368521e-06, "loss": 0.0058, "step": 18745 }, { "epoch": 4.775447830885474, "grad_norm": 0.8337826132774353, "learning_rate": 9.043887804167326e-06, "loss": 0.0056, "step": 18750 }, { "epoch": 4.776721283640377, "grad_norm": 0.8561012148857117, "learning_rate": 9.039462698923141e-06, "loss": 0.0078, "step": 18755 }, { "epoch": 4.7779947363952795, "grad_norm": 0.9102628827095032, "learning_rate": 9.035037783510491e-06, "loss": 0.0095, "step": 18760 }, { "epoch": 4.779268189150183, "grad_norm": 0.6012089848518372, "learning_rate": 9.03061305880388e-06, "loss": 0.0043, "step": 18765 }, { "epoch": 4.780541641905085, "grad_norm": 1.214664101600647, "learning_rate": 9.026188525677774e-06, "loss": 0.005, "step": 18770 }, { "epoch": 4.781815094659988, "grad_norm": 0.5727747678756714, "learning_rate": 9.021764185006585e-06, "loss": 0.0058, "step": 18775 }, { "epoch": 4.7830885474148905, "grad_norm": 0.38617458939552307, "learning_rate": 9.017340037664706e-06, "loss": 0.0038, "step": 18780 }, { "epoch": 4.784362000169794, "grad_norm": 0.3732163608074188, "learning_rate": 9.012916084526487e-06, "loss": 0.0031, "step": 18785 }, { "epoch": 4.785635452924696, "grad_norm": 0.39586853981018066, "learning_rate": 9.008492326466225e-06, "loss": 0.0081, "step": 18790 }, { "epoch": 4.786908905679599, "grad_norm": 0.6505070924758911, "learning_rate": 9.004068764358196e-06, "loss": 0.0076, "step": 18795 }, { "epoch": 4.7881823584345025, "grad_norm": 0.6309881210327148, "learning_rate": 8.999645399076638e-06, "loss": 0.0038, "step": 18800 }, { "epoch": 4.789455811189405, "grad_norm": 0.16638270020484924, "learning_rate": 8.995222231495729e-06, "loss": 0.0036, "step": 18805 }, { "epoch": 4.790729263944308, "grad_norm": 1.7881662845611572, "learning_rate": 8.990799262489631e-06, "loss": 0.0072, "step": 18810 }, { "epoch": 4.79200271669921, "grad_norm": 0.5213753581047058, "learning_rate": 8.986376492932462e-06, "loss": 0.0053, "step": 18815 }, { "epoch": 4.7932761694541135, "grad_norm": 0.45610693097114563, "learning_rate": 8.981953923698285e-06, "loss": 0.0033, "step": 18820 }, { "epoch": 4.794549622209016, "grad_norm": 0.5420287251472473, "learning_rate": 8.977531555661142e-06, "loss": 0.0073, "step": 18825 }, { "epoch": 4.795823074963919, "grad_norm": 0.5709807872772217, "learning_rate": 8.973109389695031e-06, "loss": 0.0042, "step": 18830 }, { "epoch": 4.797096527718821, "grad_norm": 0.24412578344345093, "learning_rate": 8.9686874266739e-06, "loss": 0.0051, "step": 18835 }, { "epoch": 4.7983699804737245, "grad_norm": 0.8492004871368408, "learning_rate": 8.96426566747167e-06, "loss": 0.0046, "step": 18840 }, { "epoch": 4.799643433228627, "grad_norm": 0.6240329146385193, "learning_rate": 8.95984411296221e-06, "loss": 0.0054, "step": 18845 }, { "epoch": 4.80091688598353, "grad_norm": 0.8746883273124695, "learning_rate": 8.955422764019357e-06, "loss": 0.005, "step": 18850 }, { "epoch": 4.802190338738432, "grad_norm": 0.4503156840801239, "learning_rate": 8.951001621516909e-06, "loss": 0.0039, "step": 18855 }, { "epoch": 4.803463791493336, "grad_norm": 0.7162415385246277, "learning_rate": 8.946580686328614e-06, "loss": 0.0058, "step": 18860 }, { "epoch": 4.804737244248239, "grad_norm": 0.6577386260032654, "learning_rate": 8.942159959328181e-06, "loss": 0.0077, "step": 18865 }, { "epoch": 4.806010697003141, "grad_norm": 0.8503921627998352, "learning_rate": 8.937739441389293e-06, "loss": 0.0075, "step": 18870 }, { "epoch": 4.807284149758044, "grad_norm": 0.4839950501918793, "learning_rate": 8.933319133385565e-06, "loss": 0.0039, "step": 18875 }, { "epoch": 4.808557602512947, "grad_norm": 0.770347535610199, "learning_rate": 8.928899036190593e-06, "loss": 0.0041, "step": 18880 }, { "epoch": 4.80983105526785, "grad_norm": 0.3919973373413086, "learning_rate": 8.924479150677925e-06, "loss": 0.0044, "step": 18885 }, { "epoch": 4.811104508022752, "grad_norm": 0.6721681952476501, "learning_rate": 8.920059477721059e-06, "loss": 0.0055, "step": 18890 }, { "epoch": 4.812377960777655, "grad_norm": 0.565629243850708, "learning_rate": 8.91564001819346e-06, "loss": 0.0055, "step": 18895 }, { "epoch": 4.813651413532558, "grad_norm": 1.3450812101364136, "learning_rate": 8.911220772968556e-06, "loss": 0.0063, "step": 18900 }, { "epoch": 4.814924866287461, "grad_norm": 0.507063090801239, "learning_rate": 8.906801742919713e-06, "loss": 0.0057, "step": 18905 }, { "epoch": 4.816198319042363, "grad_norm": 0.733797013759613, "learning_rate": 8.90238292892027e-06, "loss": 0.0044, "step": 18910 }, { "epoch": 4.817471771797266, "grad_norm": 0.5687713623046875, "learning_rate": 8.897964331843528e-06, "loss": 0.0078, "step": 18915 }, { "epoch": 4.818745224552169, "grad_norm": 0.3932921290397644, "learning_rate": 8.893545952562726e-06, "loss": 0.0063, "step": 18920 }, { "epoch": 4.820018677307072, "grad_norm": 0.9003282189369202, "learning_rate": 8.889127791951074e-06, "loss": 0.0065, "step": 18925 }, { "epoch": 4.821292130061975, "grad_norm": 0.5524948835372925, "learning_rate": 8.88470985088174e-06, "loss": 0.0054, "step": 18930 }, { "epoch": 4.822565582816877, "grad_norm": 1.1096917390823364, "learning_rate": 8.88029213022784e-06, "loss": 0.0056, "step": 18935 }, { "epoch": 4.823839035571781, "grad_norm": 0.35641559958457947, "learning_rate": 8.87587463086245e-06, "loss": 0.0052, "step": 18940 }, { "epoch": 4.825112488326683, "grad_norm": 0.9246786236763, "learning_rate": 8.87145735365861e-06, "loss": 0.0045, "step": 18945 }, { "epoch": 4.826385941081586, "grad_norm": 0.6719244718551636, "learning_rate": 8.8670402994893e-06, "loss": 0.0087, "step": 18950 }, { "epoch": 4.8276593938364885, "grad_norm": 0.8063743710517883, "learning_rate": 8.862623469227469e-06, "loss": 0.0078, "step": 18955 }, { "epoch": 4.828932846591392, "grad_norm": 0.7415965795516968, "learning_rate": 8.858206863746018e-06, "loss": 0.007, "step": 18960 }, { "epoch": 4.830206299346294, "grad_norm": 0.6104704141616821, "learning_rate": 8.853790483917801e-06, "loss": 0.0044, "step": 18965 }, { "epoch": 4.831479752101197, "grad_norm": 0.47671762108802795, "learning_rate": 8.849374330615634e-06, "loss": 0.0062, "step": 18970 }, { "epoch": 4.8327532048560995, "grad_norm": 1.0456620454788208, "learning_rate": 8.84495840471228e-06, "loss": 0.0053, "step": 18975 }, { "epoch": 4.834026657611003, "grad_norm": 0.7729334235191345, "learning_rate": 8.840542707080463e-06, "loss": 0.0054, "step": 18980 }, { "epoch": 4.835300110365905, "grad_norm": 0.8417448401451111, "learning_rate": 8.836127238592858e-06, "loss": 0.0081, "step": 18985 }, { "epoch": 4.836573563120808, "grad_norm": 0.5780398845672607, "learning_rate": 8.8317120001221e-06, "loss": 0.0066, "step": 18990 }, { "epoch": 4.837847015875711, "grad_norm": 1.0266679525375366, "learning_rate": 8.827296992540772e-06, "loss": 0.0069, "step": 18995 }, { "epoch": 4.839120468630614, "grad_norm": 1.0861473083496094, "learning_rate": 8.822882216721412e-06, "loss": 0.0055, "step": 19000 }, { "epoch": 4.840393921385517, "grad_norm": 1.7790552377700806, "learning_rate": 8.818467673536522e-06, "loss": 0.0048, "step": 19005 }, { "epoch": 4.841667374140419, "grad_norm": 0.423627644777298, "learning_rate": 8.814053363858546e-06, "loss": 0.0044, "step": 19010 }, { "epoch": 4.8429408268953225, "grad_norm": 0.24204035103321075, "learning_rate": 8.809639288559888e-06, "loss": 0.004, "step": 19015 }, { "epoch": 4.844214279650225, "grad_norm": 0.7361630201339722, "learning_rate": 8.805225448512902e-06, "loss": 0.0069, "step": 19020 }, { "epoch": 4.845487732405128, "grad_norm": 0.9684880971908569, "learning_rate": 8.800811844589902e-06, "loss": 0.0073, "step": 19025 }, { "epoch": 4.84676118516003, "grad_norm": 1.1611204147338867, "learning_rate": 8.796398477663145e-06, "loss": 0.0056, "step": 19030 }, { "epoch": 4.8480346379149335, "grad_norm": 0.9564541578292847, "learning_rate": 8.791985348604855e-06, "loss": 0.0056, "step": 19035 }, { "epoch": 4.849308090669836, "grad_norm": 0.4357541501522064, "learning_rate": 8.787572458287193e-06, "loss": 0.0039, "step": 19040 }, { "epoch": 4.850581543424739, "grad_norm": 1.2272560596466064, "learning_rate": 8.783159807582285e-06, "loss": 0.0052, "step": 19045 }, { "epoch": 4.851854996179641, "grad_norm": 0.9376307129859924, "learning_rate": 8.77874739736221e-06, "loss": 0.0087, "step": 19050 }, { "epoch": 4.8531284489345445, "grad_norm": 0.43057382106781006, "learning_rate": 8.774335228498987e-06, "loss": 0.0043, "step": 19055 }, { "epoch": 4.854401901689448, "grad_norm": 0.4638229012489319, "learning_rate": 8.769923301864599e-06, "loss": 0.0062, "step": 19060 }, { "epoch": 4.85567535444435, "grad_norm": 0.7730168700218201, "learning_rate": 8.765511618330983e-06, "loss": 0.0078, "step": 19065 }, { "epoch": 4.856948807199253, "grad_norm": 0.6534221172332764, "learning_rate": 8.761100178770012e-06, "loss": 0.0055, "step": 19070 }, { "epoch": 4.858222259954156, "grad_norm": 0.24080237746238708, "learning_rate": 8.756688984053528e-06, "loss": 0.0028, "step": 19075 }, { "epoch": 4.859495712709059, "grad_norm": 1.013139009475708, "learning_rate": 8.75227803505332e-06, "loss": 0.0037, "step": 19080 }, { "epoch": 4.860769165463961, "grad_norm": 0.6178326606750488, "learning_rate": 8.74786733264112e-06, "loss": 0.0051, "step": 19085 }, { "epoch": 4.862042618218864, "grad_norm": 0.7558813691139221, "learning_rate": 8.743456877688625e-06, "loss": 0.0037, "step": 19090 }, { "epoch": 4.863316070973767, "grad_norm": 1.0440564155578613, "learning_rate": 8.739046671067466e-06, "loss": 0.006, "step": 19095 }, { "epoch": 4.86458952372867, "grad_norm": 0.3361533284187317, "learning_rate": 8.734636713649242e-06, "loss": 0.0047, "step": 19100 }, { "epoch": 4.865862976483572, "grad_norm": 0.9496538639068604, "learning_rate": 8.730227006305498e-06, "loss": 0.0042, "step": 19105 }, { "epoch": 4.867136429238475, "grad_norm": 0.26116862893104553, "learning_rate": 8.725817549907718e-06, "loss": 0.0043, "step": 19110 }, { "epoch": 4.868409881993378, "grad_norm": 0.5578219294548035, "learning_rate": 8.72140834532735e-06, "loss": 0.0055, "step": 19115 }, { "epoch": 4.869683334748281, "grad_norm": 0.8758561015129089, "learning_rate": 8.716999393435793e-06, "loss": 0.0062, "step": 19120 }, { "epoch": 4.870956787503184, "grad_norm": 1.2161798477172852, "learning_rate": 8.71259069510438e-06, "loss": 0.004, "step": 19125 }, { "epoch": 4.872230240258086, "grad_norm": 0.9980555176734924, "learning_rate": 8.708182251204412e-06, "loss": 0.005, "step": 19130 }, { "epoch": 4.87350369301299, "grad_norm": 0.4460764229297638, "learning_rate": 8.703774062607133e-06, "loss": 0.0045, "step": 19135 }, { "epoch": 4.874777145767892, "grad_norm": 0.1863548308610916, "learning_rate": 8.699366130183728e-06, "loss": 0.0047, "step": 19140 }, { "epoch": 4.876050598522795, "grad_norm": 0.9224615693092346, "learning_rate": 8.694958454805346e-06, "loss": 0.0062, "step": 19145 }, { "epoch": 4.877324051277697, "grad_norm": 2.090118169784546, "learning_rate": 8.690551037343081e-06, "loss": 0.0101, "step": 19150 }, { "epoch": 4.878597504032601, "grad_norm": 0.6063019037246704, "learning_rate": 8.686143878667965e-06, "loss": 0.0047, "step": 19155 }, { "epoch": 4.879870956787503, "grad_norm": 0.3666181266307831, "learning_rate": 8.681736979650992e-06, "loss": 0.0051, "step": 19160 }, { "epoch": 4.881144409542406, "grad_norm": 0.8646928668022156, "learning_rate": 8.677330341163104e-06, "loss": 0.006, "step": 19165 }, { "epoch": 4.8824178622973085, "grad_norm": 0.5392720103263855, "learning_rate": 8.67292396407518e-06, "loss": 0.0032, "step": 19170 }, { "epoch": 4.883691315052212, "grad_norm": 0.6404744386672974, "learning_rate": 8.668517849258055e-06, "loss": 0.0043, "step": 19175 }, { "epoch": 4.884964767807114, "grad_norm": 0.8535940647125244, "learning_rate": 8.664111997582522e-06, "loss": 0.0045, "step": 19180 }, { "epoch": 4.886238220562017, "grad_norm": 0.7793566584587097, "learning_rate": 8.6597064099193e-06, "loss": 0.0026, "step": 19185 }, { "epoch": 4.88751167331692, "grad_norm": 0.6686957478523254, "learning_rate": 8.655301087139071e-06, "loss": 0.0078, "step": 19190 }, { "epoch": 4.888785126071823, "grad_norm": 0.87309730052948, "learning_rate": 8.650896030112471e-06, "loss": 0.0057, "step": 19195 }, { "epoch": 4.890058578826726, "grad_norm": 0.5455039739608765, "learning_rate": 8.64649123971006e-06, "loss": 0.0043, "step": 19200 }, { "epoch": 4.891332031581628, "grad_norm": 1.2053109407424927, "learning_rate": 8.642086716802367e-06, "loss": 0.0076, "step": 19205 }, { "epoch": 4.892605484336531, "grad_norm": 0.5246099829673767, "learning_rate": 8.637682462259862e-06, "loss": 0.0045, "step": 19210 }, { "epoch": 4.893878937091434, "grad_norm": 0.8819009065628052, "learning_rate": 8.633278476952954e-06, "loss": 0.0069, "step": 19215 }, { "epoch": 4.895152389846337, "grad_norm": 0.8786417841911316, "learning_rate": 8.628874761752007e-06, "loss": 0.0048, "step": 19220 }, { "epoch": 4.896425842601239, "grad_norm": 0.7961286902427673, "learning_rate": 8.624471317527337e-06, "loss": 0.0065, "step": 19225 }, { "epoch": 4.8976992953561425, "grad_norm": 0.4050956964492798, "learning_rate": 8.620068145149187e-06, "loss": 0.0058, "step": 19230 }, { "epoch": 4.898972748111045, "grad_norm": 0.6990489959716797, "learning_rate": 8.615665245487767e-06, "loss": 0.0063, "step": 19235 }, { "epoch": 4.900246200865948, "grad_norm": 1.1271523237228394, "learning_rate": 8.611262619413227e-06, "loss": 0.008, "step": 19240 }, { "epoch": 4.90151965362085, "grad_norm": 0.3306136131286621, "learning_rate": 8.60686026779565e-06, "loss": 0.0091, "step": 19245 }, { "epoch": 4.9027931063757535, "grad_norm": 0.6111834049224854, "learning_rate": 8.602458191505084e-06, "loss": 0.0043, "step": 19250 }, { "epoch": 4.904066559130657, "grad_norm": 0.48193982243537903, "learning_rate": 8.598056391411514e-06, "loss": 0.005, "step": 19255 }, { "epoch": 4.905340011885559, "grad_norm": 0.31391775608062744, "learning_rate": 8.593654868384864e-06, "loss": 0.005, "step": 19260 }, { "epoch": 4.906613464640462, "grad_norm": 1.5441232919692993, "learning_rate": 8.589253623295012e-06, "loss": 0.0067, "step": 19265 }, { "epoch": 4.9078869173953645, "grad_norm": 0.8484218716621399, "learning_rate": 8.584852657011787e-06, "loss": 0.0072, "step": 19270 }, { "epoch": 4.909160370150268, "grad_norm": 0.9356661438941956, "learning_rate": 8.580451970404939e-06, "loss": 0.0049, "step": 19275 }, { "epoch": 4.91043382290517, "grad_norm": 0.45666974782943726, "learning_rate": 8.576051564344192e-06, "loss": 0.0046, "step": 19280 }, { "epoch": 4.911707275660073, "grad_norm": 1.416309118270874, "learning_rate": 8.571651439699193e-06, "loss": 0.0069, "step": 19285 }, { "epoch": 4.912980728414976, "grad_norm": 0.8342096209526062, "learning_rate": 8.567251597339541e-06, "loss": 0.0055, "step": 19290 }, { "epoch": 4.914254181169879, "grad_norm": 0.7138675451278687, "learning_rate": 8.562852038134786e-06, "loss": 0.0045, "step": 19295 }, { "epoch": 4.915527633924781, "grad_norm": 1.1458652019500732, "learning_rate": 8.558452762954409e-06, "loss": 0.0046, "step": 19300 }, { "epoch": 4.916801086679684, "grad_norm": 1.6620802879333496, "learning_rate": 8.554053772667844e-06, "loss": 0.006, "step": 19305 }, { "epoch": 4.918074539434587, "grad_norm": 1.0481863021850586, "learning_rate": 8.549655068144464e-06, "loss": 0.0057, "step": 19310 }, { "epoch": 4.91934799218949, "grad_norm": 0.9081844687461853, "learning_rate": 8.545256650253591e-06, "loss": 0.0037, "step": 19315 }, { "epoch": 4.920621444944393, "grad_norm": 0.5926789045333862, "learning_rate": 8.540858519864486e-06, "loss": 0.0052, "step": 19320 }, { "epoch": 4.921894897699295, "grad_norm": 0.9078114032745361, "learning_rate": 8.536460677846348e-06, "loss": 0.0046, "step": 19325 }, { "epoch": 4.923168350454198, "grad_norm": 0.6159272193908691, "learning_rate": 8.532063125068334e-06, "loss": 0.007, "step": 19330 }, { "epoch": 4.924441803209101, "grad_norm": 1.0959004163742065, "learning_rate": 8.527665862399532e-06, "loss": 0.006, "step": 19335 }, { "epoch": 4.925715255964004, "grad_norm": 0.6226354837417603, "learning_rate": 8.523268890708972e-06, "loss": 0.0058, "step": 19340 }, { "epoch": 4.926988708718906, "grad_norm": 1.897741436958313, "learning_rate": 8.518872210865634e-06, "loss": 0.0078, "step": 19345 }, { "epoch": 4.92826216147381, "grad_norm": 1.039716362953186, "learning_rate": 8.514475823738431e-06, "loss": 0.007, "step": 19350 }, { "epoch": 4.929535614228712, "grad_norm": 0.450259268283844, "learning_rate": 8.510079730196232e-06, "loss": 0.0043, "step": 19355 }, { "epoch": 4.930809066983615, "grad_norm": 0.3958800137042999, "learning_rate": 8.505683931107832e-06, "loss": 0.0079, "step": 19360 }, { "epoch": 4.932082519738517, "grad_norm": 0.7310514450073242, "learning_rate": 8.501288427341978e-06, "loss": 0.0069, "step": 19365 }, { "epoch": 4.933355972493421, "grad_norm": 0.7873147130012512, "learning_rate": 8.496893219767358e-06, "loss": 0.0076, "step": 19370 }, { "epoch": 4.934629425248323, "grad_norm": 1.0940778255462646, "learning_rate": 8.492498309252598e-06, "loss": 0.0045, "step": 19375 }, { "epoch": 4.935902878003226, "grad_norm": 0.6187626123428345, "learning_rate": 8.488103696666263e-06, "loss": 0.0052, "step": 19380 }, { "epoch": 4.937176330758129, "grad_norm": 0.6975421905517578, "learning_rate": 8.48370938287687e-06, "loss": 0.0048, "step": 19385 }, { "epoch": 4.938449783513032, "grad_norm": 1.491508960723877, "learning_rate": 8.479315368752862e-06, "loss": 0.0074, "step": 19390 }, { "epoch": 4.939723236267934, "grad_norm": 0.15616056323051453, "learning_rate": 8.474921655162636e-06, "loss": 0.0069, "step": 19395 }, { "epoch": 4.940996689022837, "grad_norm": 0.5308710932731628, "learning_rate": 8.470528242974524e-06, "loss": 0.008, "step": 19400 }, { "epoch": 4.94227014177774, "grad_norm": 0.8643108010292053, "learning_rate": 8.466135133056795e-06, "loss": 0.006, "step": 19405 }, { "epoch": 4.943543594532643, "grad_norm": 0.3085150718688965, "learning_rate": 8.461742326277663e-06, "loss": 0.006, "step": 19410 }, { "epoch": 4.944817047287546, "grad_norm": 0.4941287636756897, "learning_rate": 8.457349823505286e-06, "loss": 0.0066, "step": 19415 }, { "epoch": 4.946090500042448, "grad_norm": 0.5172063708305359, "learning_rate": 8.452957625607748e-06, "loss": 0.0054, "step": 19420 }, { "epoch": 4.947363952797351, "grad_norm": 0.6109001636505127, "learning_rate": 8.448565733453086e-06, "loss": 0.0072, "step": 19425 }, { "epoch": 4.948637405552254, "grad_norm": 0.8221765160560608, "learning_rate": 8.444174147909278e-06, "loss": 0.0067, "step": 19430 }, { "epoch": 4.949910858307157, "grad_norm": 0.3433816134929657, "learning_rate": 8.439782869844226e-06, "loss": 0.0038, "step": 19435 }, { "epoch": 4.951184311062059, "grad_norm": 0.7267446517944336, "learning_rate": 8.435391900125785e-06, "loss": 0.0036, "step": 19440 }, { "epoch": 4.9524577638169625, "grad_norm": 0.38086065649986267, "learning_rate": 8.431001239621749e-06, "loss": 0.0063, "step": 19445 }, { "epoch": 4.953731216571866, "grad_norm": 1.1619257926940918, "learning_rate": 8.426610889199837e-06, "loss": 0.0057, "step": 19450 }, { "epoch": 4.955004669326768, "grad_norm": 1.0508780479431152, "learning_rate": 8.422220849727722e-06, "loss": 0.0063, "step": 19455 }, { "epoch": 4.95627812208167, "grad_norm": 0.20768117904663086, "learning_rate": 8.417831122073015e-06, "loss": 0.0039, "step": 19460 }, { "epoch": 4.9575515748365735, "grad_norm": 0.8026828169822693, "learning_rate": 8.413441707103252e-06, "loss": 0.0042, "step": 19465 }, { "epoch": 4.958825027591477, "grad_norm": 0.7829314470291138, "learning_rate": 8.409052605685918e-06, "loss": 0.0056, "step": 19470 }, { "epoch": 4.960098480346379, "grad_norm": 0.6936241388320923, "learning_rate": 8.40466381868844e-06, "loss": 0.0049, "step": 19475 }, { "epoch": 4.961371933101282, "grad_norm": 1.0598609447479248, "learning_rate": 8.400275346978166e-06, "loss": 0.0099, "step": 19480 }, { "epoch": 4.9626453858561845, "grad_norm": 0.9833427667617798, "learning_rate": 8.395887191422397e-06, "loss": 0.0054, "step": 19485 }, { "epoch": 4.963918838611088, "grad_norm": 0.47374361753463745, "learning_rate": 8.391499352888373e-06, "loss": 0.0046, "step": 19490 }, { "epoch": 4.96519229136599, "grad_norm": 0.775769829750061, "learning_rate": 8.387111832243254e-06, "loss": 0.0059, "step": 19495 }, { "epoch": 4.966465744120893, "grad_norm": 1.0704797506332397, "learning_rate": 8.382724630354153e-06, "loss": 0.0047, "step": 19500 }, { "epoch": 4.967739196875796, "grad_norm": 1.2427483797073364, "learning_rate": 8.37833774808812e-06, "loss": 0.0086, "step": 19505 }, { "epoch": 4.969012649630699, "grad_norm": 0.8498367071151733, "learning_rate": 8.373951186312128e-06, "loss": 0.0068, "step": 19510 }, { "epoch": 4.970286102385602, "grad_norm": 0.9294739365577698, "learning_rate": 8.369564945893102e-06, "loss": 0.0055, "step": 19515 }, { "epoch": 4.971559555140504, "grad_norm": 0.2828501760959625, "learning_rate": 8.3651790276979e-06, "loss": 0.0047, "step": 19520 }, { "epoch": 4.972833007895407, "grad_norm": 1.3960188627243042, "learning_rate": 8.360793432593305e-06, "loss": 0.0064, "step": 19525 }, { "epoch": 4.97410646065031, "grad_norm": 0.31330251693725586, "learning_rate": 8.356408161446045e-06, "loss": 0.0048, "step": 19530 }, { "epoch": 4.975379913405213, "grad_norm": 0.6189050674438477, "learning_rate": 8.352023215122796e-06, "loss": 0.0069, "step": 19535 }, { "epoch": 4.976653366160115, "grad_norm": 0.8833821415901184, "learning_rate": 8.347638594490144e-06, "loss": 0.005, "step": 19540 }, { "epoch": 4.9779268189150185, "grad_norm": 0.6478185057640076, "learning_rate": 8.343254300414629e-06, "loss": 0.0072, "step": 19545 }, { "epoch": 4.979200271669921, "grad_norm": 1.0025408267974854, "learning_rate": 8.338870333762725e-06, "loss": 0.0109, "step": 19550 }, { "epoch": 4.980473724424824, "grad_norm": 0.7625808715820312, "learning_rate": 8.334486695400832e-06, "loss": 0.0072, "step": 19555 }, { "epoch": 4.981747177179726, "grad_norm": 1.1581014394760132, "learning_rate": 8.330103386195292e-06, "loss": 0.0044, "step": 19560 }, { "epoch": 4.98302062993463, "grad_norm": 0.836337685585022, "learning_rate": 8.325720407012388e-06, "loss": 0.0057, "step": 19565 }, { "epoch": 4.984294082689532, "grad_norm": 0.67647385597229, "learning_rate": 8.32133775871832e-06, "loss": 0.004, "step": 19570 }, { "epoch": 4.985567535444435, "grad_norm": 1.0735526084899902, "learning_rate": 8.316955442179239e-06, "loss": 0.0065, "step": 19575 }, { "epoch": 4.986840988199338, "grad_norm": 0.31847280263900757, "learning_rate": 8.312573458261228e-06, "loss": 0.0057, "step": 19580 }, { "epoch": 4.988114440954241, "grad_norm": 0.7520204186439514, "learning_rate": 8.308191807830292e-06, "loss": 0.0064, "step": 19585 }, { "epoch": 4.989387893709143, "grad_norm": 0.6525266170501709, "learning_rate": 8.303810491752388e-06, "loss": 0.0056, "step": 19590 }, { "epoch": 4.990661346464046, "grad_norm": 0.46080976724624634, "learning_rate": 8.299429510893396e-06, "loss": 0.0048, "step": 19595 }, { "epoch": 4.991934799218949, "grad_norm": 0.7300014495849609, "learning_rate": 8.295048866119126e-06, "loss": 0.0057, "step": 19600 }, { "epoch": 4.993208251973852, "grad_norm": 1.0892860889434814, "learning_rate": 8.290668558295336e-06, "loss": 0.0068, "step": 19605 }, { "epoch": 4.994481704728755, "grad_norm": 0.7689711451530457, "learning_rate": 8.286288588287698e-06, "loss": 0.0057, "step": 19610 }, { "epoch": 4.995755157483657, "grad_norm": 0.6802260875701904, "learning_rate": 8.281908956961837e-06, "loss": 0.0067, "step": 19615 }, { "epoch": 4.99702861023856, "grad_norm": 0.24406471848487854, "learning_rate": 8.277529665183302e-06, "loss": 0.0044, "step": 19620 }, { "epoch": 4.998302062993463, "grad_norm": 1.1258419752120972, "learning_rate": 8.273150713817569e-06, "loss": 0.0052, "step": 19625 }, { "epoch": 4.999575515748366, "grad_norm": 0.6722444295883179, "learning_rate": 8.268772103730058e-06, "loss": 0.0053, "step": 19630 }, { "epoch": 5.000848968503268, "grad_norm": 0.14982540905475616, "learning_rate": 8.264393835786114e-06, "loss": 0.0046, "step": 19635 }, { "epoch": 5.002122421258171, "grad_norm": 0.385877788066864, "learning_rate": 8.260015910851015e-06, "loss": 0.0032, "step": 19640 }, { "epoch": 5.003395874013074, "grad_norm": 0.5826315879821777, "learning_rate": 8.255638329789976e-06, "loss": 0.0037, "step": 19645 }, { "epoch": 5.004669326767977, "grad_norm": 0.44455140829086304, "learning_rate": 8.25126109346814e-06, "loss": 0.0055, "step": 19650 }, { "epoch": 5.00594277952288, "grad_norm": 0.5339987277984619, "learning_rate": 8.24688420275058e-06, "loss": 0.003, "step": 19655 }, { "epoch": 5.0072162322777825, "grad_norm": 0.2588989734649658, "learning_rate": 8.242507658502307e-06, "loss": 0.0033, "step": 19660 }, { "epoch": 5.008489685032686, "grad_norm": 0.3296302258968353, "learning_rate": 8.23813146158826e-06, "loss": 0.0024, "step": 19665 }, { "epoch": 5.009763137787588, "grad_norm": 2.7880165576934814, "learning_rate": 8.233755612873307e-06, "loss": 0.0039, "step": 19670 }, { "epoch": 5.011036590542491, "grad_norm": 0.5227532386779785, "learning_rate": 8.22938011322225e-06, "loss": 0.0023, "step": 19675 }, { "epoch": 5.0123100432973935, "grad_norm": 0.9406496286392212, "learning_rate": 8.225004963499823e-06, "loss": 0.0033, "step": 19680 }, { "epoch": 5.013583496052297, "grad_norm": 0.8025301098823547, "learning_rate": 8.220630164570686e-06, "loss": 0.0041, "step": 19685 }, { "epoch": 5.014856948807199, "grad_norm": 0.6833208203315735, "learning_rate": 8.216255717299436e-06, "loss": 0.0038, "step": 19690 }, { "epoch": 5.016130401562102, "grad_norm": 1.0287631750106812, "learning_rate": 8.2118816225506e-06, "loss": 0.0039, "step": 19695 }, { "epoch": 5.0174038543170045, "grad_norm": 1.0248745679855347, "learning_rate": 8.207507881188627e-06, "loss": 0.0027, "step": 19700 }, { "epoch": 5.018677307071908, "grad_norm": 0.3430224359035492, "learning_rate": 8.203134494077905e-06, "loss": 0.002, "step": 19705 }, { "epoch": 5.01995075982681, "grad_norm": 0.4159339964389801, "learning_rate": 8.198761462082751e-06, "loss": 0.002, "step": 19710 }, { "epoch": 5.021224212581713, "grad_norm": 0.2171134501695633, "learning_rate": 8.194388786067405e-06, "loss": 0.0029, "step": 19715 }, { "epoch": 5.022497665336616, "grad_norm": 0.6691083312034607, "learning_rate": 8.190016466896043e-06, "loss": 0.0049, "step": 19720 }, { "epoch": 5.023771118091519, "grad_norm": 0.19762229919433594, "learning_rate": 8.185644505432775e-06, "loss": 0.0031, "step": 19725 }, { "epoch": 5.025044570846422, "grad_norm": 0.6102502346038818, "learning_rate": 8.181272902541624e-06, "loss": 0.0033, "step": 19730 }, { "epoch": 5.026318023601324, "grad_norm": 0.4447349011898041, "learning_rate": 8.17690165908656e-06, "loss": 0.0016, "step": 19735 }, { "epoch": 5.0275914763562275, "grad_norm": 0.2677255868911743, "learning_rate": 8.172530775931476e-06, "loss": 0.0019, "step": 19740 }, { "epoch": 5.02886492911113, "grad_norm": 0.06219276785850525, "learning_rate": 8.168160253940181e-06, "loss": 0.0025, "step": 19745 }, { "epoch": 5.030138381866033, "grad_norm": 0.4424509108066559, "learning_rate": 8.163790093976434e-06, "loss": 0.0025, "step": 19750 }, { "epoch": 5.031411834620935, "grad_norm": 0.22273294627666473, "learning_rate": 8.159420296903913e-06, "loss": 0.0037, "step": 19755 }, { "epoch": 5.0326852873758385, "grad_norm": 0.18457911908626556, "learning_rate": 8.155050863586216e-06, "loss": 0.0038, "step": 19760 }, { "epoch": 5.033958740130741, "grad_norm": 0.7466756701469421, "learning_rate": 8.15068179488688e-06, "loss": 0.0017, "step": 19765 }, { "epoch": 5.035232192885644, "grad_norm": 0.40982845425605774, "learning_rate": 8.14631309166937e-06, "loss": 0.0028, "step": 19770 }, { "epoch": 5.036505645640546, "grad_norm": 0.4076124429702759, "learning_rate": 8.14194475479707e-06, "loss": 0.0024, "step": 19775 }, { "epoch": 5.03777909839545, "grad_norm": 0.34124746918678284, "learning_rate": 8.137576785133298e-06, "loss": 0.0015, "step": 19780 }, { "epoch": 5.039052551150352, "grad_norm": 0.20915080606937408, "learning_rate": 8.133209183541304e-06, "loss": 0.0035, "step": 19785 }, { "epoch": 5.040326003905255, "grad_norm": 0.3049793839454651, "learning_rate": 8.128841950884251e-06, "loss": 0.0021, "step": 19790 }, { "epoch": 5.041599456660158, "grad_norm": 0.4947112798690796, "learning_rate": 8.124475088025243e-06, "loss": 0.0022, "step": 19795 }, { "epoch": 5.042872909415061, "grad_norm": 0.586924135684967, "learning_rate": 8.120108595827308e-06, "loss": 0.0024, "step": 19800 }, { "epoch": 5.044146362169964, "grad_norm": 2.044553756713867, "learning_rate": 8.11574247515339e-06, "loss": 0.0031, "step": 19805 }, { "epoch": 5.045419814924866, "grad_norm": 0.5418916940689087, "learning_rate": 8.111376726866373e-06, "loss": 0.0028, "step": 19810 }, { "epoch": 5.046693267679769, "grad_norm": 0.21314524114131927, "learning_rate": 8.107011351829069e-06, "loss": 0.0033, "step": 19815 }, { "epoch": 5.047966720434672, "grad_norm": 0.623217761516571, "learning_rate": 8.102646350904195e-06, "loss": 0.0027, "step": 19820 }, { "epoch": 5.049240173189575, "grad_norm": 0.8570582866668701, "learning_rate": 8.098281724954418e-06, "loss": 0.0025, "step": 19825 }, { "epoch": 5.050513625944477, "grad_norm": 0.2572759985923767, "learning_rate": 8.093917474842325e-06, "loss": 0.0018, "step": 19830 }, { "epoch": 5.05178707869938, "grad_norm": 1.1528593301773071, "learning_rate": 8.089553601430417e-06, "loss": 0.0034, "step": 19835 }, { "epoch": 5.053060531454283, "grad_norm": 0.3856850862503052, "learning_rate": 8.08519010558113e-06, "loss": 0.0037, "step": 19840 }, { "epoch": 5.054333984209186, "grad_norm": 0.7569184899330139, "learning_rate": 8.08082698815683e-06, "loss": 0.0033, "step": 19845 }, { "epoch": 5.055607436964088, "grad_norm": 0.795228123664856, "learning_rate": 8.076464250019797e-06, "loss": 0.0051, "step": 19850 }, { "epoch": 5.056880889718991, "grad_norm": 0.2822414040565491, "learning_rate": 8.072101892032246e-06, "loss": 0.0031, "step": 19855 }, { "epoch": 5.058154342473895, "grad_norm": 0.4598134756088257, "learning_rate": 8.067739915056306e-06, "loss": 0.003, "step": 19860 }, { "epoch": 5.059427795228797, "grad_norm": 0.6338938474655151, "learning_rate": 8.063378319954039e-06, "loss": 0.0022, "step": 19865 }, { "epoch": 5.0607012479837, "grad_norm": 0.38573265075683594, "learning_rate": 8.059017107587437e-06, "loss": 0.0016, "step": 19870 }, { "epoch": 5.0619747007386025, "grad_norm": 1.1744576692581177, "learning_rate": 8.054656278818398e-06, "loss": 0.0024, "step": 19875 }, { "epoch": 5.063248153493506, "grad_norm": 0.49674972891807556, "learning_rate": 8.05029583450876e-06, "loss": 0.0029, "step": 19880 }, { "epoch": 5.064521606248408, "grad_norm": 0.6483536958694458, "learning_rate": 8.045935775520284e-06, "loss": 0.0028, "step": 19885 }, { "epoch": 5.065795059003311, "grad_norm": 0.5233161449432373, "learning_rate": 8.041576102714644e-06, "loss": 0.002, "step": 19890 }, { "epoch": 5.0670685117582135, "grad_norm": 0.2163977175951004, "learning_rate": 8.037216816953447e-06, "loss": 0.0026, "step": 19895 }, { "epoch": 5.068341964513117, "grad_norm": 0.37298014760017395, "learning_rate": 8.032857919098227e-06, "loss": 0.0014, "step": 19900 }, { "epoch": 5.069615417268019, "grad_norm": 0.7416238188743591, "learning_rate": 8.028499410010427e-06, "loss": 0.0038, "step": 19905 }, { "epoch": 5.070888870022922, "grad_norm": 0.4367815852165222, "learning_rate": 8.024141290551424e-06, "loss": 0.0023, "step": 19910 }, { "epoch": 5.0721623227778245, "grad_norm": 0.21685075759887695, "learning_rate": 8.01978356158252e-06, "loss": 0.0031, "step": 19915 }, { "epoch": 5.073435775532728, "grad_norm": 0.2350308746099472, "learning_rate": 8.015426223964931e-06, "loss": 0.0028, "step": 19920 }, { "epoch": 5.074709228287631, "grad_norm": 0.44592007994651794, "learning_rate": 8.011069278559799e-06, "loss": 0.0028, "step": 19925 }, { "epoch": 5.075982681042533, "grad_norm": 0.3862541913986206, "learning_rate": 8.006712726228195e-06, "loss": 0.0034, "step": 19930 }, { "epoch": 5.0772561337974365, "grad_norm": 0.5197098255157471, "learning_rate": 8.002356567831104e-06, "loss": 0.003, "step": 19935 }, { "epoch": 5.078529586552339, "grad_norm": 0.20244671404361725, "learning_rate": 7.998000804229435e-06, "loss": 0.0027, "step": 19940 }, { "epoch": 5.079803039307242, "grad_norm": 0.1176300123333931, "learning_rate": 7.993645436284024e-06, "loss": 0.0027, "step": 19945 }, { "epoch": 5.081076492062144, "grad_norm": 0.7615808844566345, "learning_rate": 7.989290464855618e-06, "loss": 0.0026, "step": 19950 }, { "epoch": 5.0823499448170475, "grad_norm": 0.3572143614292145, "learning_rate": 7.984935890804897e-06, "loss": 0.0022, "step": 19955 }, { "epoch": 5.08362339757195, "grad_norm": 0.6449772119522095, "learning_rate": 7.98058171499246e-06, "loss": 0.0025, "step": 19960 }, { "epoch": 5.084896850326853, "grad_norm": 0.5581414103507996, "learning_rate": 7.976227938278822e-06, "loss": 0.0042, "step": 19965 }, { "epoch": 5.086170303081755, "grad_norm": 0.3773881196975708, "learning_rate": 7.971874561524422e-06, "loss": 0.002, "step": 19970 }, { "epoch": 5.0874437558366585, "grad_norm": 0.25892993807792664, "learning_rate": 7.967521585589623e-06, "loss": 0.0029, "step": 19975 }, { "epoch": 5.088717208591561, "grad_norm": 0.3612572252750397, "learning_rate": 7.963169011334706e-06, "loss": 0.0041, "step": 19980 }, { "epoch": 5.089990661346464, "grad_norm": 0.3980580270290375, "learning_rate": 7.958816839619869e-06, "loss": 0.0019, "step": 19985 }, { "epoch": 5.091264114101367, "grad_norm": 0.3350491225719452, "learning_rate": 7.954465071305237e-06, "loss": 0.0033, "step": 19990 }, { "epoch": 5.09253756685627, "grad_norm": 0.756868839263916, "learning_rate": 7.950113707250853e-06, "loss": 0.0027, "step": 19995 }, { "epoch": 5.093811019611173, "grad_norm": 0.7031562924385071, "learning_rate": 7.945762748316677e-06, "loss": 0.0022, "step": 20000 }, { "epoch": 5.095084472366075, "grad_norm": 0.26510995626449585, "learning_rate": 7.941412195362597e-06, "loss": 0.0022, "step": 20005 }, { "epoch": 5.096357925120978, "grad_norm": 0.5323432087898254, "learning_rate": 7.93706204924841e-06, "loss": 0.0044, "step": 20010 }, { "epoch": 5.097631377875881, "grad_norm": 0.809979259967804, "learning_rate": 7.932712310833836e-06, "loss": 0.0053, "step": 20015 }, { "epoch": 5.098904830630784, "grad_norm": 0.20923548936843872, "learning_rate": 7.928362980978527e-06, "loss": 0.0014, "step": 20020 }, { "epoch": 5.100178283385686, "grad_norm": 0.7895302176475525, "learning_rate": 7.92401406054203e-06, "loss": 0.0032, "step": 20025 }, { "epoch": 5.101451736140589, "grad_norm": 0.839188814163208, "learning_rate": 7.919665550383832e-06, "loss": 0.003, "step": 20030 }, { "epoch": 5.102725188895492, "grad_norm": 0.35717156529426575, "learning_rate": 7.915317451363334e-06, "loss": 0.0016, "step": 20035 }, { "epoch": 5.103998641650395, "grad_norm": 0.3014047145843506, "learning_rate": 7.910969764339846e-06, "loss": 0.002, "step": 20040 }, { "epoch": 5.105272094405297, "grad_norm": 0.5506333112716675, "learning_rate": 7.906622490172608e-06, "loss": 0.0021, "step": 20045 }, { "epoch": 5.1065455471602, "grad_norm": 1.0095256567001343, "learning_rate": 7.902275629720779e-06, "loss": 0.0036, "step": 20050 }, { "epoch": 5.107818999915104, "grad_norm": 0.5970160961151123, "learning_rate": 7.897929183843423e-06, "loss": 0.0032, "step": 20055 }, { "epoch": 5.109092452670006, "grad_norm": 0.30037155747413635, "learning_rate": 7.89358315339953e-06, "loss": 0.0032, "step": 20060 }, { "epoch": 5.110365905424909, "grad_norm": 0.43355339765548706, "learning_rate": 7.889237539248022e-06, "loss": 0.0025, "step": 20065 }, { "epoch": 5.111639358179811, "grad_norm": 0.18290674686431885, "learning_rate": 7.884892342247709e-06, "loss": 0.0022, "step": 20070 }, { "epoch": 5.112912810934715, "grad_norm": 0.5700591206550598, "learning_rate": 7.880547563257342e-06, "loss": 0.0027, "step": 20075 }, { "epoch": 5.114186263689617, "grad_norm": 0.7800885438919067, "learning_rate": 7.876203203135588e-06, "loss": 0.004, "step": 20080 }, { "epoch": 5.11545971644452, "grad_norm": 0.6187506914138794, "learning_rate": 7.871859262741012e-06, "loss": 0.003, "step": 20085 }, { "epoch": 5.1167331691994224, "grad_norm": 0.3440718352794647, "learning_rate": 7.867515742932118e-06, "loss": 0.003, "step": 20090 }, { "epoch": 5.118006621954326, "grad_norm": 0.32029950618743896, "learning_rate": 7.863172644567322e-06, "loss": 0.0017, "step": 20095 }, { "epoch": 5.119280074709228, "grad_norm": 3.276602029800415, "learning_rate": 7.858829968504944e-06, "loss": 0.0027, "step": 20100 }, { "epoch": 5.120553527464131, "grad_norm": 0.22193360328674316, "learning_rate": 7.854487715603234e-06, "loss": 0.0018, "step": 20105 }, { "epoch": 5.1218269802190335, "grad_norm": 0.3590860962867737, "learning_rate": 7.850145886720357e-06, "loss": 0.0028, "step": 20110 }, { "epoch": 5.123100432973937, "grad_norm": 0.9511917233467102, "learning_rate": 7.845804482714384e-06, "loss": 0.0019, "step": 20115 }, { "epoch": 5.12437388572884, "grad_norm": 0.6718182563781738, "learning_rate": 7.841463504443316e-06, "loss": 0.0016, "step": 20120 }, { "epoch": 5.125647338483742, "grad_norm": 0.5191982984542847, "learning_rate": 7.837122952765056e-06, "loss": 0.0035, "step": 20125 }, { "epoch": 5.126920791238645, "grad_norm": 0.3757810890674591, "learning_rate": 7.832782828537437e-06, "loss": 0.0022, "step": 20130 }, { "epoch": 5.128194243993548, "grad_norm": 0.1581554412841797, "learning_rate": 7.828443132618197e-06, "loss": 0.003, "step": 20135 }, { "epoch": 5.129467696748451, "grad_norm": 0.2834019064903259, "learning_rate": 7.824103865864992e-06, "loss": 0.002, "step": 20140 }, { "epoch": 5.130741149503353, "grad_norm": 0.46005311608314514, "learning_rate": 7.81976502913539e-06, "loss": 0.0021, "step": 20145 }, { "epoch": 5.1320146022582565, "grad_norm": 0.4196214973926544, "learning_rate": 7.81542662328689e-06, "loss": 0.0025, "step": 20150 }, { "epoch": 5.133288055013159, "grad_norm": 0.10621929913759232, "learning_rate": 7.811088649176882e-06, "loss": 0.0027, "step": 20155 }, { "epoch": 5.134561507768062, "grad_norm": 0.7079717516899109, "learning_rate": 7.806751107662684e-06, "loss": 0.0031, "step": 20160 }, { "epoch": 5.135834960522964, "grad_norm": 0.12423907965421677, "learning_rate": 7.802413999601534e-06, "loss": 0.0016, "step": 20165 }, { "epoch": 5.1371084132778675, "grad_norm": 0.9502784609794617, "learning_rate": 7.798077325850567e-06, "loss": 0.0028, "step": 20170 }, { "epoch": 5.13838186603277, "grad_norm": 1.0524744987487793, "learning_rate": 7.793741087266849e-06, "loss": 0.0038, "step": 20175 }, { "epoch": 5.139655318787673, "grad_norm": 0.9764232635498047, "learning_rate": 7.789405284707354e-06, "loss": 0.0047, "step": 20180 }, { "epoch": 5.140928771542576, "grad_norm": 0.10613352805376053, "learning_rate": 7.785069919028965e-06, "loss": 0.0015, "step": 20185 }, { "epoch": 5.1422022242974785, "grad_norm": 0.4512292146682739, "learning_rate": 7.780734991088484e-06, "loss": 0.003, "step": 20190 }, { "epoch": 5.143475677052382, "grad_norm": 0.9448532462120056, "learning_rate": 7.77640050174263e-06, "loss": 0.0045, "step": 20195 }, { "epoch": 5.144749129807284, "grad_norm": 0.4385889768600464, "learning_rate": 7.772066451848023e-06, "loss": 0.003, "step": 20200 }, { "epoch": 5.146022582562187, "grad_norm": 0.23495475947856903, "learning_rate": 7.767732842261207e-06, "loss": 0.0019, "step": 20205 }, { "epoch": 5.14729603531709, "grad_norm": 0.39559829235076904, "learning_rate": 7.763399673838641e-06, "loss": 0.0042, "step": 20210 }, { "epoch": 5.148569488071993, "grad_norm": 0.5156699419021606, "learning_rate": 7.759066947436682e-06, "loss": 0.0035, "step": 20215 }, { "epoch": 5.149842940826895, "grad_norm": 0.8544238805770874, "learning_rate": 7.754734663911616e-06, "loss": 0.0019, "step": 20220 }, { "epoch": 5.151116393581798, "grad_norm": 0.42707717418670654, "learning_rate": 7.750402824119634e-06, "loss": 0.0048, "step": 20225 }, { "epoch": 5.152389846336701, "grad_norm": 0.4971156120300293, "learning_rate": 7.746071428916836e-06, "loss": 0.0032, "step": 20230 }, { "epoch": 5.153663299091604, "grad_norm": 0.7276656627655029, "learning_rate": 7.74174047915924e-06, "loss": 0.0043, "step": 20235 }, { "epoch": 5.154936751846506, "grad_norm": 0.724097728729248, "learning_rate": 7.73740997570278e-06, "loss": 0.0032, "step": 20240 }, { "epoch": 5.156210204601409, "grad_norm": 0.4712446928024292, "learning_rate": 7.733079919403288e-06, "loss": 0.004, "step": 20245 }, { "epoch": 5.1574836573563125, "grad_norm": 0.09068871289491653, "learning_rate": 7.728750311116519e-06, "loss": 0.0044, "step": 20250 }, { "epoch": 5.158757110111215, "grad_norm": 0.4491003155708313, "learning_rate": 7.724421151698137e-06, "loss": 0.0027, "step": 20255 }, { "epoch": 5.160030562866118, "grad_norm": 0.45875638723373413, "learning_rate": 7.72009244200371e-06, "loss": 0.0026, "step": 20260 }, { "epoch": 5.16130401562102, "grad_norm": 0.4753594696521759, "learning_rate": 7.715764182888731e-06, "loss": 0.0024, "step": 20265 }, { "epoch": 5.162577468375924, "grad_norm": 0.09336699545383453, "learning_rate": 7.711436375208594e-06, "loss": 0.0022, "step": 20270 }, { "epoch": 5.163850921130826, "grad_norm": 0.28485435247421265, "learning_rate": 7.707109019818603e-06, "loss": 0.002, "step": 20275 }, { "epoch": 5.165124373885729, "grad_norm": 0.5951541662216187, "learning_rate": 7.70278211757398e-06, "loss": 0.0032, "step": 20280 }, { "epoch": 5.166397826640631, "grad_norm": 0.5903919339179993, "learning_rate": 7.698455669329853e-06, "loss": 0.0026, "step": 20285 }, { "epoch": 5.167671279395535, "grad_norm": 0.4931006133556366, "learning_rate": 7.694129675941258e-06, "loss": 0.0019, "step": 20290 }, { "epoch": 5.168944732150437, "grad_norm": 0.49702882766723633, "learning_rate": 7.689804138263141e-06, "loss": 0.0019, "step": 20295 }, { "epoch": 5.17021818490534, "grad_norm": 0.3168067932128906, "learning_rate": 7.685479057150367e-06, "loss": 0.0033, "step": 20300 }, { "epoch": 5.1714916376602424, "grad_norm": 0.4896777868270874, "learning_rate": 7.681154433457702e-06, "loss": 0.0025, "step": 20305 }, { "epoch": 5.172765090415146, "grad_norm": 0.6289421916007996, "learning_rate": 7.676830268039822e-06, "loss": 0.0032, "step": 20310 }, { "epoch": 5.174038543170048, "grad_norm": 0.7775374054908752, "learning_rate": 7.672506561751315e-06, "loss": 0.0037, "step": 20315 }, { "epoch": 5.175311995924951, "grad_norm": 0.35645005106925964, "learning_rate": 7.66818331544668e-06, "loss": 0.0032, "step": 20320 }, { "epoch": 5.176585448679854, "grad_norm": 0.4820006489753723, "learning_rate": 7.663860529980318e-06, "loss": 0.0025, "step": 20325 }, { "epoch": 5.177858901434757, "grad_norm": 0.6219130158424377, "learning_rate": 7.65953820620655e-06, "loss": 0.0026, "step": 20330 }, { "epoch": 5.17913235418966, "grad_norm": 0.03894239664077759, "learning_rate": 7.65521634497959e-06, "loss": 0.0016, "step": 20335 }, { "epoch": 5.180405806944562, "grad_norm": 0.6442372798919678, "learning_rate": 7.650894947153578e-06, "loss": 0.0036, "step": 20340 }, { "epoch": 5.181679259699465, "grad_norm": 1.1182337999343872, "learning_rate": 7.646574013582554e-06, "loss": 0.0038, "step": 20345 }, { "epoch": 5.182952712454368, "grad_norm": 0.34259510040283203, "learning_rate": 7.642253545120462e-06, "loss": 0.0021, "step": 20350 }, { "epoch": 5.184226165209271, "grad_norm": 0.5995506644248962, "learning_rate": 7.63793354262116e-06, "loss": 0.0026, "step": 20355 }, { "epoch": 5.185499617964173, "grad_norm": 0.6226041316986084, "learning_rate": 7.633614006938419e-06, "loss": 0.0034, "step": 20360 }, { "epoch": 5.1867730707190765, "grad_norm": 1.000793695449829, "learning_rate": 7.629294938925899e-06, "loss": 0.0049, "step": 20365 }, { "epoch": 5.188046523473979, "grad_norm": 0.5700103044509888, "learning_rate": 7.624976339437195e-06, "loss": 0.0039, "step": 20370 }, { "epoch": 5.189319976228882, "grad_norm": 0.27965471148490906, "learning_rate": 7.62065820932578e-06, "loss": 0.0024, "step": 20375 }, { "epoch": 5.190593428983784, "grad_norm": 0.22028842568397522, "learning_rate": 7.616340549445055e-06, "loss": 0.0033, "step": 20380 }, { "epoch": 5.1918668817386875, "grad_norm": 0.6566908359527588, "learning_rate": 7.612023360648325e-06, "loss": 0.0025, "step": 20385 }, { "epoch": 5.193140334493591, "grad_norm": 0.8452298045158386, "learning_rate": 7.6077066437887915e-06, "loss": 0.003, "step": 20390 }, { "epoch": 5.194413787248493, "grad_norm": 0.3737993538379669, "learning_rate": 7.603390399719573e-06, "loss": 0.0035, "step": 20395 }, { "epoch": 5.195687240003396, "grad_norm": 0.5847491025924683, "learning_rate": 7.599074629293696e-06, "loss": 0.0039, "step": 20400 }, { "epoch": 5.1969606927582985, "grad_norm": 0.3133038580417633, "learning_rate": 7.594759333364079e-06, "loss": 0.004, "step": 20405 }, { "epoch": 5.198234145513202, "grad_norm": 0.8196382522583008, "learning_rate": 7.590444512783561e-06, "loss": 0.003, "step": 20410 }, { "epoch": 5.199507598268104, "grad_norm": 0.2520490884780884, "learning_rate": 7.58613016840489e-06, "loss": 0.002, "step": 20415 }, { "epoch": 5.200781051023007, "grad_norm": 0.7067747712135315, "learning_rate": 7.581816301080699e-06, "loss": 0.0026, "step": 20420 }, { "epoch": 5.20205450377791, "grad_norm": 1.5630871057510376, "learning_rate": 7.577502911663547e-06, "loss": 0.0039, "step": 20425 }, { "epoch": 5.203327956532813, "grad_norm": 0.10002617537975311, "learning_rate": 7.573190001005895e-06, "loss": 0.0024, "step": 20430 }, { "epoch": 5.204601409287715, "grad_norm": 0.38920095562934875, "learning_rate": 7.568877569960098e-06, "loss": 0.0018, "step": 20435 }, { "epoch": 5.205874862042618, "grad_norm": 1.5428541898727417, "learning_rate": 7.564565619378429e-06, "loss": 0.0063, "step": 20440 }, { "epoch": 5.207148314797521, "grad_norm": 0.22930781543254852, "learning_rate": 7.560254150113065e-06, "loss": 0.0019, "step": 20445 }, { "epoch": 5.208421767552424, "grad_norm": 0.9985011219978333, "learning_rate": 7.555943163016074e-06, "loss": 0.0058, "step": 20450 }, { "epoch": 5.209695220307327, "grad_norm": 0.6743447780609131, "learning_rate": 7.551632658939446e-06, "loss": 0.0033, "step": 20455 }, { "epoch": 5.210968673062229, "grad_norm": 0.509039580821991, "learning_rate": 7.54732263873507e-06, "loss": 0.0025, "step": 20460 }, { "epoch": 5.2122421258171325, "grad_norm": 0.6553120017051697, "learning_rate": 7.543013103254733e-06, "loss": 0.0027, "step": 20465 }, { "epoch": 5.213515578572035, "grad_norm": 0.09523307532072067, "learning_rate": 7.538704053350132e-06, "loss": 0.0024, "step": 20470 }, { "epoch": 5.214789031326938, "grad_norm": 0.20191937685012817, "learning_rate": 7.534395489872871e-06, "loss": 0.0035, "step": 20475 }, { "epoch": 5.21606248408184, "grad_norm": 0.5096250772476196, "learning_rate": 7.5300874136744475e-06, "loss": 0.0045, "step": 20480 }, { "epoch": 5.217335936836744, "grad_norm": 0.46914416551589966, "learning_rate": 7.525779825606272e-06, "loss": 0.0022, "step": 20485 }, { "epoch": 5.218609389591646, "grad_norm": 0.4076698422431946, "learning_rate": 7.521472726519661e-06, "loss": 0.0021, "step": 20490 }, { "epoch": 5.219882842346549, "grad_norm": 0.5902090072631836, "learning_rate": 7.517166117265818e-06, "loss": 0.0024, "step": 20495 }, { "epoch": 5.221156295101451, "grad_norm": 0.7333032488822937, "learning_rate": 7.512859998695867e-06, "loss": 0.0026, "step": 20500 }, { "epoch": 5.222429747856355, "grad_norm": 1.0365698337554932, "learning_rate": 7.508554371660834e-06, "loss": 0.004, "step": 20505 }, { "epoch": 5.223703200611257, "grad_norm": 0.19088202714920044, "learning_rate": 7.504249237011631e-06, "loss": 0.0023, "step": 20510 }, { "epoch": 5.22497665336616, "grad_norm": 0.28679582476615906, "learning_rate": 7.499944595599091e-06, "loss": 0.0035, "step": 20515 }, { "epoch": 5.226250106121063, "grad_norm": 0.2657165229320526, "learning_rate": 7.495640448273947e-06, "loss": 0.0051, "step": 20520 }, { "epoch": 5.227523558875966, "grad_norm": 0.548611581325531, "learning_rate": 7.4913367958868184e-06, "loss": 0.0023, "step": 20525 }, { "epoch": 5.228797011630869, "grad_norm": 0.32965636253356934, "learning_rate": 7.4870336392882455e-06, "loss": 0.0028, "step": 20530 }, { "epoch": 5.230070464385771, "grad_norm": 0.5886591672897339, "learning_rate": 7.482730979328669e-06, "loss": 0.0034, "step": 20535 }, { "epoch": 5.231343917140674, "grad_norm": 0.4397311806678772, "learning_rate": 7.478428816858415e-06, "loss": 0.0028, "step": 20540 }, { "epoch": 5.232617369895577, "grad_norm": 0.7192957997322083, "learning_rate": 7.474127152727728e-06, "loss": 0.002, "step": 20545 }, { "epoch": 5.23389082265048, "grad_norm": 0.6807990670204163, "learning_rate": 7.469825987786752e-06, "loss": 0.0032, "step": 20550 }, { "epoch": 5.235164275405382, "grad_norm": 0.31558001041412354, "learning_rate": 7.4655253228855205e-06, "loss": 0.0025, "step": 20555 }, { "epoch": 5.236437728160285, "grad_norm": 0.4457665979862213, "learning_rate": 7.461225158873981e-06, "loss": 0.002, "step": 20560 }, { "epoch": 5.237711180915188, "grad_norm": 0.26438120007514954, "learning_rate": 7.4569254966019786e-06, "loss": 0.0034, "step": 20565 }, { "epoch": 5.238984633670091, "grad_norm": 0.46119338274002075, "learning_rate": 7.4526263369192555e-06, "loss": 0.0041, "step": 20570 }, { "epoch": 5.240258086424993, "grad_norm": 0.09929046779870987, "learning_rate": 7.448327680675458e-06, "loss": 0.0027, "step": 20575 }, { "epoch": 5.2415315391798964, "grad_norm": 0.7182751893997192, "learning_rate": 7.444029528720134e-06, "loss": 0.0029, "step": 20580 }, { "epoch": 5.242804991934799, "grad_norm": 0.5844861268997192, "learning_rate": 7.439731881902723e-06, "loss": 0.0042, "step": 20585 }, { "epoch": 5.244078444689702, "grad_norm": 0.4961995780467987, "learning_rate": 7.43543474107258e-06, "loss": 0.0028, "step": 20590 }, { "epoch": 5.245351897444605, "grad_norm": 0.08467314392328262, "learning_rate": 7.431138107078948e-06, "loss": 0.0027, "step": 20595 }, { "epoch": 5.2466253501995075, "grad_norm": 0.6939778327941895, "learning_rate": 7.426841980770971e-06, "loss": 0.004, "step": 20600 }, { "epoch": 5.247898802954411, "grad_norm": 0.3110222816467285, "learning_rate": 7.4225463629976955e-06, "loss": 0.0014, "step": 20605 }, { "epoch": 5.249172255709313, "grad_norm": 0.7213952541351318, "learning_rate": 7.418251254608071e-06, "loss": 0.0026, "step": 20610 }, { "epoch": 5.250445708464216, "grad_norm": 0.304592102766037, "learning_rate": 7.413956656450939e-06, "loss": 0.0035, "step": 20615 }, { "epoch": 5.2517191612191185, "grad_norm": 0.1106928288936615, "learning_rate": 7.409662569375045e-06, "loss": 0.0044, "step": 20620 }, { "epoch": 5.252992613974022, "grad_norm": 0.108757384121418, "learning_rate": 7.405368994229026e-06, "loss": 0.0021, "step": 20625 }, { "epoch": 5.254266066728924, "grad_norm": 0.46975451707839966, "learning_rate": 7.401075931861432e-06, "loss": 0.0035, "step": 20630 }, { "epoch": 5.255539519483827, "grad_norm": 0.4089536964893341, "learning_rate": 7.3967833831207e-06, "loss": 0.0023, "step": 20635 }, { "epoch": 5.25681297223873, "grad_norm": 0.30438700318336487, "learning_rate": 7.392491348855169e-06, "loss": 0.0042, "step": 20640 }, { "epoch": 5.258086424993633, "grad_norm": 1.562390685081482, "learning_rate": 7.388199829913073e-06, "loss": 0.0035, "step": 20645 }, { "epoch": 5.259359877748535, "grad_norm": 0.513270378112793, "learning_rate": 7.383908827142553e-06, "loss": 0.0032, "step": 20650 }, { "epoch": 5.260633330503438, "grad_norm": 0.36506587266921997, "learning_rate": 7.379618341391639e-06, "loss": 0.0049, "step": 20655 }, { "epoch": 5.2619067832583415, "grad_norm": 1.3669956922531128, "learning_rate": 7.375328373508261e-06, "loss": 0.0031, "step": 20660 }, { "epoch": 5.263180236013244, "grad_norm": 0.33434703946113586, "learning_rate": 7.371038924340253e-06, "loss": 0.0026, "step": 20665 }, { "epoch": 5.264453688768147, "grad_norm": 0.3888457417488098, "learning_rate": 7.366749994735336e-06, "loss": 0.0023, "step": 20670 }, { "epoch": 5.265727141523049, "grad_norm": 0.46078163385391235, "learning_rate": 7.362461585541134e-06, "loss": 0.0033, "step": 20675 }, { "epoch": 5.2670005942779525, "grad_norm": 1.210520625114441, "learning_rate": 7.358173697605175e-06, "loss": 0.0046, "step": 20680 }, { "epoch": 5.268274047032855, "grad_norm": 0.5272203087806702, "learning_rate": 7.353886331774866e-06, "loss": 0.0028, "step": 20685 }, { "epoch": 5.269547499787758, "grad_norm": 0.06809825450181961, "learning_rate": 7.349599488897526e-06, "loss": 0.0031, "step": 20690 }, { "epoch": 5.27082095254266, "grad_norm": 0.613793671131134, "learning_rate": 7.345313169820373e-06, "loss": 0.0031, "step": 20695 }, { "epoch": 5.272094405297564, "grad_norm": 0.8528215885162354, "learning_rate": 7.341027375390503e-06, "loss": 0.0033, "step": 20700 }, { "epoch": 5.273367858052466, "grad_norm": 0.6276558041572571, "learning_rate": 7.336742106454926e-06, "loss": 0.0024, "step": 20705 }, { "epoch": 5.274641310807369, "grad_norm": 0.2302681803703308, "learning_rate": 7.332457363860546e-06, "loss": 0.0018, "step": 20710 }, { "epoch": 5.275914763562271, "grad_norm": 0.9787541627883911, "learning_rate": 7.328173148454151e-06, "loss": 0.0038, "step": 20715 }, { "epoch": 5.277188216317175, "grad_norm": 0.5283450484275818, "learning_rate": 7.323889461082436e-06, "loss": 0.0025, "step": 20720 }, { "epoch": 5.278461669072078, "grad_norm": 0.15689204633235931, "learning_rate": 7.319606302591995e-06, "loss": 0.0015, "step": 20725 }, { "epoch": 5.27973512182698, "grad_norm": 1.7285405397415161, "learning_rate": 7.315323673829302e-06, "loss": 0.0039, "step": 20730 }, { "epoch": 5.281008574581883, "grad_norm": 1.1882882118225098, "learning_rate": 7.311041575640739e-06, "loss": 0.0025, "step": 20735 }, { "epoch": 5.282282027336786, "grad_norm": 0.4682348072528839, "learning_rate": 7.306760008872582e-06, "loss": 0.0025, "step": 20740 }, { "epoch": 5.283555480091689, "grad_norm": 0.5590042471885681, "learning_rate": 7.302478974370994e-06, "loss": 0.0035, "step": 20745 }, { "epoch": 5.284828932846591, "grad_norm": 0.8249054551124573, "learning_rate": 7.298198472982041e-06, "loss": 0.0024, "step": 20750 }, { "epoch": 5.286102385601494, "grad_norm": 0.8697770833969116, "learning_rate": 7.293918505551685e-06, "loss": 0.0035, "step": 20755 }, { "epoch": 5.287375838356397, "grad_norm": 0.526585578918457, "learning_rate": 7.289639072925771e-06, "loss": 0.0021, "step": 20760 }, { "epoch": 5.2886492911113, "grad_norm": 0.04164482653141022, "learning_rate": 7.285360175950051e-06, "loss": 0.0042, "step": 20765 }, { "epoch": 5.289922743866202, "grad_norm": 0.25878047943115234, "learning_rate": 7.281081815470167e-06, "loss": 0.0026, "step": 20770 }, { "epoch": 5.291196196621105, "grad_norm": 0.33469000458717346, "learning_rate": 7.276803992331647e-06, "loss": 0.0016, "step": 20775 }, { "epoch": 5.292469649376008, "grad_norm": 0.6266815662384033, "learning_rate": 7.2725267073799254e-06, "loss": 0.0022, "step": 20780 }, { "epoch": 5.293743102130911, "grad_norm": 0.6644147038459778, "learning_rate": 7.2682499614603256e-06, "loss": 0.0024, "step": 20785 }, { "epoch": 5.295016554885814, "grad_norm": 0.5258261561393738, "learning_rate": 7.2639737554180565e-06, "loss": 0.0024, "step": 20790 }, { "epoch": 5.2962900076407164, "grad_norm": 0.4171348810195923, "learning_rate": 7.259698090098233e-06, "loss": 0.0039, "step": 20795 }, { "epoch": 5.29756346039562, "grad_norm": 0.7976241111755371, "learning_rate": 7.255422966345859e-06, "loss": 0.0033, "step": 20800 }, { "epoch": 5.298836913150522, "grad_norm": 0.9859825372695923, "learning_rate": 7.251148385005822e-06, "loss": 0.004, "step": 20805 }, { "epoch": 5.300110365905425, "grad_norm": 0.25924739241600037, "learning_rate": 7.246874346922915e-06, "loss": 0.0031, "step": 20810 }, { "epoch": 5.3013838186603275, "grad_norm": 0.506069004535675, "learning_rate": 7.242600852941823e-06, "loss": 0.0026, "step": 20815 }, { "epoch": 5.302657271415231, "grad_norm": 0.11972880363464355, "learning_rate": 7.238327903907108e-06, "loss": 0.0017, "step": 20820 }, { "epoch": 5.303930724170133, "grad_norm": 0.5516262054443359, "learning_rate": 7.234055500663242e-06, "loss": 0.0032, "step": 20825 }, { "epoch": 5.305204176925036, "grad_norm": 0.18035466969013214, "learning_rate": 7.229783644054586e-06, "loss": 0.0018, "step": 20830 }, { "epoch": 5.3064776296799385, "grad_norm": 0.3217453062534332, "learning_rate": 7.2255123349253815e-06, "loss": 0.003, "step": 20835 }, { "epoch": 5.307751082434842, "grad_norm": 0.5107089281082153, "learning_rate": 7.221241574119774e-06, "loss": 0.0022, "step": 20840 }, { "epoch": 5.309024535189744, "grad_norm": 0.20209024846553802, "learning_rate": 7.216971362481801e-06, "loss": 0.0017, "step": 20845 }, { "epoch": 5.310297987944647, "grad_norm": 0.63416588306427, "learning_rate": 7.212701700855376e-06, "loss": 0.0026, "step": 20850 }, { "epoch": 5.3115714406995505, "grad_norm": 0.11666174978017807, "learning_rate": 7.208432590084322e-06, "loss": 0.0019, "step": 20855 }, { "epoch": 5.312844893454453, "grad_norm": 0.2870018184185028, "learning_rate": 7.204164031012348e-06, "loss": 0.002, "step": 20860 }, { "epoch": 5.314118346209356, "grad_norm": 0.11138475686311722, "learning_rate": 7.199896024483045e-06, "loss": 0.0017, "step": 20865 }, { "epoch": 5.315391798964258, "grad_norm": 0.6841147541999817, "learning_rate": 7.195628571339906e-06, "loss": 0.0023, "step": 20870 }, { "epoch": 5.3166652517191615, "grad_norm": 0.2549835443496704, "learning_rate": 7.191361672426311e-06, "loss": 0.0032, "step": 20875 }, { "epoch": 5.317938704474064, "grad_norm": 0.2989613711833954, "learning_rate": 7.187095328585525e-06, "loss": 0.0012, "step": 20880 }, { "epoch": 5.319212157228967, "grad_norm": 0.14187417924404144, "learning_rate": 7.182829540660714e-06, "loss": 0.0022, "step": 20885 }, { "epoch": 5.320485609983869, "grad_norm": 0.4222973883152008, "learning_rate": 7.178564309494922e-06, "loss": 0.0032, "step": 20890 }, { "epoch": 5.3217590627387725, "grad_norm": 0.13003283739089966, "learning_rate": 7.1742996359310905e-06, "loss": 0.0014, "step": 20895 }, { "epoch": 5.323032515493675, "grad_norm": 0.6822298169136047, "learning_rate": 7.1700355208120565e-06, "loss": 0.0029, "step": 20900 }, { "epoch": 5.324305968248578, "grad_norm": 1.6059211492538452, "learning_rate": 7.165771964980529e-06, "loss": 0.0036, "step": 20905 }, { "epoch": 5.32557942100348, "grad_norm": 0.5228056907653809, "learning_rate": 7.1615089692791225e-06, "loss": 0.0021, "step": 20910 }, { "epoch": 5.326852873758384, "grad_norm": 0.33596566319465637, "learning_rate": 7.15724653455034e-06, "loss": 0.0022, "step": 20915 }, { "epoch": 5.328126326513287, "grad_norm": 1.237182855606079, "learning_rate": 7.152984661636557e-06, "loss": 0.0037, "step": 20920 }, { "epoch": 5.329399779268189, "grad_norm": 0.703192949295044, "learning_rate": 7.14872335138006e-06, "loss": 0.0041, "step": 20925 }, { "epoch": 5.330673232023092, "grad_norm": 0.49636727571487427, "learning_rate": 7.14446260462301e-06, "loss": 0.0025, "step": 20930 }, { "epoch": 5.331946684777995, "grad_norm": 0.5974221229553223, "learning_rate": 7.140202422207462e-06, "loss": 0.0025, "step": 20935 }, { "epoch": 5.333220137532898, "grad_norm": 0.3131871223449707, "learning_rate": 7.135942804975358e-06, "loss": 0.0029, "step": 20940 }, { "epoch": 5.3344935902878, "grad_norm": 0.13892924785614014, "learning_rate": 7.131683753768529e-06, "loss": 0.0018, "step": 20945 }, { "epoch": 5.335767043042703, "grad_norm": 0.13568054139614105, "learning_rate": 7.127425269428692e-06, "loss": 0.0022, "step": 20950 }, { "epoch": 5.337040495797606, "grad_norm": 0.6252028942108154, "learning_rate": 7.123167352797455e-06, "loss": 0.0031, "step": 20955 }, { "epoch": 5.338313948552509, "grad_norm": 0.6191480755805969, "learning_rate": 7.1189100047163154e-06, "loss": 0.0043, "step": 20960 }, { "epoch": 5.339587401307411, "grad_norm": 0.9635404348373413, "learning_rate": 7.114653226026651e-06, "loss": 0.0046, "step": 20965 }, { "epoch": 5.340860854062314, "grad_norm": 0.3778388798236847, "learning_rate": 7.11039701756973e-06, "loss": 0.0017, "step": 20970 }, { "epoch": 5.342134306817217, "grad_norm": 0.10453817993402481, "learning_rate": 7.106141380186715e-06, "loss": 0.0032, "step": 20975 }, { "epoch": 5.34340775957212, "grad_norm": 0.15689325332641602, "learning_rate": 7.101886314718649e-06, "loss": 0.0024, "step": 20980 }, { "epoch": 5.344681212327023, "grad_norm": 0.3840726613998413, "learning_rate": 7.097631822006457e-06, "loss": 0.0037, "step": 20985 }, { "epoch": 5.345954665081925, "grad_norm": 0.6600927114486694, "learning_rate": 7.093377902890966e-06, "loss": 0.0027, "step": 20990 }, { "epoch": 5.347228117836829, "grad_norm": 0.6924810409545898, "learning_rate": 7.089124558212872e-06, "loss": 0.0021, "step": 20995 }, { "epoch": 5.348501570591731, "grad_norm": 0.4875492453575134, "learning_rate": 7.084871788812771e-06, "loss": 0.0034, "step": 21000 }, { "epoch": 5.349775023346634, "grad_norm": 0.39714834094047546, "learning_rate": 7.080619595531141e-06, "loss": 0.0028, "step": 21005 }, { "epoch": 5.3510484761015364, "grad_norm": 0.7863556146621704, "learning_rate": 7.076367979208339e-06, "loss": 0.0036, "step": 21010 }, { "epoch": 5.35232192885644, "grad_norm": 0.8887459635734558, "learning_rate": 7.07211694068462e-06, "loss": 0.0029, "step": 21015 }, { "epoch": 5.353595381611342, "grad_norm": 0.07260327786207199, "learning_rate": 7.067866480800122e-06, "loss": 0.0041, "step": 21020 }, { "epoch": 5.354868834366245, "grad_norm": 0.9814907908439636, "learning_rate": 7.063616600394857e-06, "loss": 0.0027, "step": 21025 }, { "epoch": 5.3561422871211475, "grad_norm": 0.5816420912742615, "learning_rate": 7.059367300308736e-06, "loss": 0.0012, "step": 21030 }, { "epoch": 5.357415739876051, "grad_norm": 0.36540254950523376, "learning_rate": 7.0551185813815544e-06, "loss": 0.0014, "step": 21035 }, { "epoch": 5.358689192630953, "grad_norm": 0.8340040445327759, "learning_rate": 7.050870444452983e-06, "loss": 0.0062, "step": 21040 }, { "epoch": 5.359962645385856, "grad_norm": 0.5585072040557861, "learning_rate": 7.0466228903625846e-06, "loss": 0.0028, "step": 21045 }, { "epoch": 5.361236098140759, "grad_norm": 0.6810255646705627, "learning_rate": 7.042375919949811e-06, "loss": 0.0039, "step": 21050 }, { "epoch": 5.362509550895662, "grad_norm": 0.17817123234272003, "learning_rate": 7.038129534053986e-06, "loss": 0.0026, "step": 21055 }, { "epoch": 5.363783003650565, "grad_norm": 0.8452842831611633, "learning_rate": 7.0338837335143284e-06, "loss": 0.0035, "step": 21060 }, { "epoch": 5.365056456405467, "grad_norm": 0.7042186856269836, "learning_rate": 7.029638519169941e-06, "loss": 0.0032, "step": 21065 }, { "epoch": 5.3663299091603704, "grad_norm": 0.04073395952582359, "learning_rate": 7.025393891859804e-06, "loss": 0.002, "step": 21070 }, { "epoch": 5.367603361915273, "grad_norm": 0.5468994975090027, "learning_rate": 7.021149852422786e-06, "loss": 0.0017, "step": 21075 }, { "epoch": 5.368876814670176, "grad_norm": 0.19781561195850372, "learning_rate": 7.016906401697643e-06, "loss": 0.0026, "step": 21080 }, { "epoch": 5.370150267425078, "grad_norm": 1.0836985111236572, "learning_rate": 7.012663540523005e-06, "loss": 0.0038, "step": 21085 }, { "epoch": 5.3714237201799815, "grad_norm": 0.6473721265792847, "learning_rate": 7.0084212697373935e-06, "loss": 0.0027, "step": 21090 }, { "epoch": 5.372697172934884, "grad_norm": 0.9430547952651978, "learning_rate": 7.004179590179214e-06, "loss": 0.0048, "step": 21095 }, { "epoch": 5.373970625689787, "grad_norm": 0.2742058038711548, "learning_rate": 6.999938502686745e-06, "loss": 0.0029, "step": 21100 }, { "epoch": 5.375244078444689, "grad_norm": 0.48284032940864563, "learning_rate": 6.99569800809816e-06, "loss": 0.0029, "step": 21105 }, { "epoch": 5.3765175311995925, "grad_norm": 0.5710683465003967, "learning_rate": 6.991458107251514e-06, "loss": 0.0027, "step": 21110 }, { "epoch": 5.377790983954496, "grad_norm": 0.7257452011108398, "learning_rate": 6.987218800984733e-06, "loss": 0.0053, "step": 21115 }, { "epoch": 5.379064436709398, "grad_norm": 0.5902617573738098, "learning_rate": 6.982980090135636e-06, "loss": 0.003, "step": 21120 }, { "epoch": 5.380337889464301, "grad_norm": 0.19077089428901672, "learning_rate": 6.978741975541928e-06, "loss": 0.0035, "step": 21125 }, { "epoch": 5.381611342219204, "grad_norm": 0.28940582275390625, "learning_rate": 6.97450445804118e-06, "loss": 0.0036, "step": 21130 }, { "epoch": 5.382884794974107, "grad_norm": 0.38709667325019836, "learning_rate": 6.970267538470865e-06, "loss": 0.0034, "step": 21135 }, { "epoch": 5.384158247729009, "grad_norm": 0.8393255472183228, "learning_rate": 6.96603121766832e-06, "loss": 0.003, "step": 21140 }, { "epoch": 5.385431700483912, "grad_norm": 0.48092222213745117, "learning_rate": 6.961795496470774e-06, "loss": 0.0041, "step": 21145 }, { "epoch": 5.386705153238815, "grad_norm": 0.5348889231681824, "learning_rate": 6.957560375715338e-06, "loss": 0.003, "step": 21150 }, { "epoch": 5.387978605993718, "grad_norm": 0.11859580874443054, "learning_rate": 6.953325856238996e-06, "loss": 0.0042, "step": 21155 }, { "epoch": 5.38925205874862, "grad_norm": 1.292683720588684, "learning_rate": 6.949091938878622e-06, "loss": 0.0042, "step": 21160 }, { "epoch": 5.390525511503523, "grad_norm": 0.1519664078950882, "learning_rate": 6.944858624470969e-06, "loss": 0.0037, "step": 21165 }, { "epoch": 5.391798964258426, "grad_norm": 0.7192227244377136, "learning_rate": 6.940625913852665e-06, "loss": 0.0053, "step": 21170 }, { "epoch": 5.393072417013329, "grad_norm": 0.38265976309776306, "learning_rate": 6.936393807860226e-06, "loss": 0.0034, "step": 21175 }, { "epoch": 5.394345869768232, "grad_norm": 0.2567736506462097, "learning_rate": 6.932162307330048e-06, "loss": 0.0028, "step": 21180 }, { "epoch": 5.395619322523134, "grad_norm": 0.1025993824005127, "learning_rate": 6.927931413098397e-06, "loss": 0.0038, "step": 21185 }, { "epoch": 5.396892775278038, "grad_norm": 0.3209450840950012, "learning_rate": 6.923701126001432e-06, "loss": 0.0027, "step": 21190 }, { "epoch": 5.39816622803294, "grad_norm": 0.6084628701210022, "learning_rate": 6.919471446875192e-06, "loss": 0.0025, "step": 21195 }, { "epoch": 5.399439680787843, "grad_norm": 0.16622765362262726, "learning_rate": 6.915242376555582e-06, "loss": 0.0026, "step": 21200 }, { "epoch": 5.400713133542745, "grad_norm": 0.5184718370437622, "learning_rate": 6.911013915878399e-06, "loss": 0.0025, "step": 21205 }, { "epoch": 5.401986586297649, "grad_norm": 0.791837751865387, "learning_rate": 6.90678606567932e-06, "loss": 0.0057, "step": 21210 }, { "epoch": 5.403260039052551, "grad_norm": 0.5634294748306274, "learning_rate": 6.90255882679389e-06, "loss": 0.0035, "step": 21215 }, { "epoch": 5.404533491807454, "grad_norm": 0.6668570637702942, "learning_rate": 6.8983322000575445e-06, "loss": 0.0024, "step": 21220 }, { "epoch": 5.405806944562356, "grad_norm": 1.184950590133667, "learning_rate": 6.8941061863056e-06, "loss": 0.0029, "step": 21225 }, { "epoch": 5.40708039731726, "grad_norm": 1.0581700801849365, "learning_rate": 6.889880786373235e-06, "loss": 0.0036, "step": 21230 }, { "epoch": 5.408353850072162, "grad_norm": 0.1251513957977295, "learning_rate": 6.885656001095527e-06, "loss": 0.002, "step": 21235 }, { "epoch": 5.409627302827065, "grad_norm": 0.3532602787017822, "learning_rate": 6.881431831307417e-06, "loss": 0.0027, "step": 21240 }, { "epoch": 5.410900755581968, "grad_norm": 0.22340130805969238, "learning_rate": 6.877208277843731e-06, "loss": 0.0012, "step": 21245 }, { "epoch": 5.412174208336871, "grad_norm": 0.42094820737838745, "learning_rate": 6.872985341539176e-06, "loss": 0.0032, "step": 21250 }, { "epoch": 5.413447661091774, "grad_norm": 0.6764249801635742, "learning_rate": 6.868763023228328e-06, "loss": 0.0029, "step": 21255 }, { "epoch": 5.414721113846676, "grad_norm": 0.26209697127342224, "learning_rate": 6.864541323745648e-06, "loss": 0.0034, "step": 21260 }, { "epoch": 5.415994566601579, "grad_norm": 0.6804068684577942, "learning_rate": 6.8603202439254776e-06, "loss": 0.0029, "step": 21265 }, { "epoch": 5.417268019356482, "grad_norm": 0.6199625134468079, "learning_rate": 6.8560997846020265e-06, "loss": 0.0045, "step": 21270 }, { "epoch": 5.418541472111385, "grad_norm": 0.12288092076778412, "learning_rate": 6.851879946609386e-06, "loss": 0.003, "step": 21275 }, { "epoch": 5.419814924866287, "grad_norm": 0.1461779922246933, "learning_rate": 6.847660730781527e-06, "loss": 0.0023, "step": 21280 }, { "epoch": 5.4210883776211904, "grad_norm": 0.07417410612106323, "learning_rate": 6.843442137952294e-06, "loss": 0.003, "step": 21285 }, { "epoch": 5.422361830376093, "grad_norm": 0.4884578585624695, "learning_rate": 6.839224168955414e-06, "loss": 0.0036, "step": 21290 }, { "epoch": 5.423635283130996, "grad_norm": 0.8275075554847717, "learning_rate": 6.835006824624479e-06, "loss": 0.0022, "step": 21295 }, { "epoch": 5.424908735885898, "grad_norm": 0.45498690009117126, "learning_rate": 6.8307901057929735e-06, "loss": 0.0015, "step": 21300 }, { "epoch": 5.4261821886408015, "grad_norm": 0.9405742287635803, "learning_rate": 6.826574013294247e-06, "loss": 0.0041, "step": 21305 }, { "epoch": 5.427455641395705, "grad_norm": 0.5288239121437073, "learning_rate": 6.822358547961525e-06, "loss": 0.003, "step": 21310 }, { "epoch": 5.428729094150607, "grad_norm": 0.526627242565155, "learning_rate": 6.818143710627918e-06, "loss": 0.0028, "step": 21315 }, { "epoch": 5.43000254690551, "grad_norm": 0.7387075424194336, "learning_rate": 6.813929502126402e-06, "loss": 0.003, "step": 21320 }, { "epoch": 5.4312759996604125, "grad_norm": 0.7922075390815735, "learning_rate": 6.809715923289832e-06, "loss": 0.0023, "step": 21325 }, { "epoch": 5.432549452415316, "grad_norm": 1.864189863204956, "learning_rate": 6.805502974950951e-06, "loss": 0.0033, "step": 21330 }, { "epoch": 5.433822905170218, "grad_norm": 0.3927688002586365, "learning_rate": 6.801290657942352e-06, "loss": 0.0031, "step": 21335 }, { "epoch": 5.435096357925121, "grad_norm": 0.49387526512145996, "learning_rate": 6.797078973096525e-06, "loss": 0.0027, "step": 21340 }, { "epoch": 5.436369810680024, "grad_norm": 0.2906287908554077, "learning_rate": 6.792867921245832e-06, "loss": 0.0031, "step": 21345 }, { "epoch": 5.437643263434927, "grad_norm": 0.9265438914299011, "learning_rate": 6.788657503222496e-06, "loss": 0.0029, "step": 21350 }, { "epoch": 5.438916716189829, "grad_norm": 0.7535140514373779, "learning_rate": 6.784447719858629e-06, "loss": 0.0023, "step": 21355 }, { "epoch": 5.440190168944732, "grad_norm": 0.3220116198062897, "learning_rate": 6.780238571986216e-06, "loss": 0.0028, "step": 21360 }, { "epoch": 5.441463621699635, "grad_norm": 0.7439151406288147, "learning_rate": 6.7760300604371065e-06, "loss": 0.0044, "step": 21365 }, { "epoch": 5.442737074454538, "grad_norm": 0.2977350056171417, "learning_rate": 6.771822186043035e-06, "loss": 0.0038, "step": 21370 }, { "epoch": 5.44401052720944, "grad_norm": 0.4256550967693329, "learning_rate": 6.76761494963561e-06, "loss": 0.001, "step": 21375 }, { "epoch": 5.445283979964343, "grad_norm": 0.1097925677895546, "learning_rate": 6.763408352046303e-06, "loss": 0.0022, "step": 21380 }, { "epoch": 5.4465574327192465, "grad_norm": 0.29964014887809753, "learning_rate": 6.759202394106467e-06, "loss": 0.002, "step": 21385 }, { "epoch": 5.447830885474149, "grad_norm": 0.3024725317955017, "learning_rate": 6.754997076647337e-06, "loss": 0.0038, "step": 21390 }, { "epoch": 5.449104338229052, "grad_norm": 0.30443334579467773, "learning_rate": 6.7507924004999995e-06, "loss": 0.0034, "step": 21395 }, { "epoch": 5.450377790983954, "grad_norm": 0.15972203016281128, "learning_rate": 6.746588366495438e-06, "loss": 0.0028, "step": 21400 }, { "epoch": 5.451651243738858, "grad_norm": 0.24780219793319702, "learning_rate": 6.7423849754644886e-06, "loss": 0.0019, "step": 21405 }, { "epoch": 5.45292469649376, "grad_norm": 0.8022862076759338, "learning_rate": 6.738182228237875e-06, "loss": 0.0034, "step": 21410 }, { "epoch": 5.454198149248663, "grad_norm": 0.3520021140575409, "learning_rate": 6.733980125646191e-06, "loss": 0.0047, "step": 21415 }, { "epoch": 5.455471602003565, "grad_norm": 0.7236614227294922, "learning_rate": 6.730618908268063e-06, "loss": 0.0045, "step": 21420 }, { "epoch": 5.456745054758469, "grad_norm": 0.7262017726898193, "learning_rate": 6.726417968111932e-06, "loss": 0.0046, "step": 21425 }, { "epoch": 5.458018507513371, "grad_norm": 0.3138579726219177, "learning_rate": 6.722217674915701e-06, "loss": 0.0034, "step": 21430 }, { "epoch": 5.459291960268274, "grad_norm": 1.6064454317092896, "learning_rate": 6.718018029509478e-06, "loss": 0.0037, "step": 21435 }, { "epoch": 5.460565413023176, "grad_norm": 0.3039226233959198, "learning_rate": 6.713819032723251e-06, "loss": 0.0031, "step": 21440 }, { "epoch": 5.46183886577808, "grad_norm": 0.3756534457206726, "learning_rate": 6.709620685386857e-06, "loss": 0.0027, "step": 21445 }, { "epoch": 5.463112318532982, "grad_norm": 0.9640763401985168, "learning_rate": 6.7054229883300245e-06, "loss": 0.0042, "step": 21450 }, { "epoch": 5.464385771287885, "grad_norm": 0.5331872701644897, "learning_rate": 6.701225942382351e-06, "loss": 0.0047, "step": 21455 }, { "epoch": 5.465659224042788, "grad_norm": 0.3828013837337494, "learning_rate": 6.69702954837329e-06, "loss": 0.0025, "step": 21460 }, { "epoch": 5.466932676797691, "grad_norm": 0.45942309498786926, "learning_rate": 6.692833807132186e-06, "loss": 0.0026, "step": 21465 }, { "epoch": 5.468206129552594, "grad_norm": 0.5757685899734497, "learning_rate": 6.688638719488248e-06, "loss": 0.0048, "step": 21470 }, { "epoch": 5.469479582307496, "grad_norm": 1.122910976409912, "learning_rate": 6.684444286270541e-06, "loss": 0.0032, "step": 21475 }, { "epoch": 5.470753035062399, "grad_norm": 0.39285215735435486, "learning_rate": 6.680250508308023e-06, "loss": 0.0042, "step": 21480 }, { "epoch": 5.472026487817302, "grad_norm": 0.37109261751174927, "learning_rate": 6.676057386429515e-06, "loss": 0.0029, "step": 21485 }, { "epoch": 5.473299940572205, "grad_norm": 0.1847795844078064, "learning_rate": 6.671864921463699e-06, "loss": 0.0022, "step": 21490 }, { "epoch": 5.474573393327107, "grad_norm": 0.24738368391990662, "learning_rate": 6.667673114239136e-06, "loss": 0.0029, "step": 21495 }, { "epoch": 5.47584684608201, "grad_norm": 0.19718316197395325, "learning_rate": 6.663481965584262e-06, "loss": 0.0024, "step": 21500 }, { "epoch": 5.477120298836913, "grad_norm": 0.4621720016002655, "learning_rate": 6.6592914763273654e-06, "loss": 0.0035, "step": 21505 }, { "epoch": 5.478393751591816, "grad_norm": 0.29889845848083496, "learning_rate": 6.6551016472966205e-06, "loss": 0.0047, "step": 21510 }, { "epoch": 5.479667204346718, "grad_norm": 0.37782782316207886, "learning_rate": 6.6509124793200685e-06, "loss": 0.0028, "step": 21515 }, { "epoch": 5.4809406571016215, "grad_norm": 1.230869174003601, "learning_rate": 6.6467239732256105e-06, "loss": 0.0022, "step": 21520 }, { "epoch": 5.482214109856525, "grad_norm": 0.477373868227005, "learning_rate": 6.642536129841026e-06, "loss": 0.0016, "step": 21525 }, { "epoch": 5.483487562611427, "grad_norm": 1.249334454536438, "learning_rate": 6.638348949993967e-06, "loss": 0.0039, "step": 21530 }, { "epoch": 5.48476101536633, "grad_norm": 0.2126093953847885, "learning_rate": 6.634162434511939e-06, "loss": 0.0024, "step": 21535 }, { "epoch": 5.4860344681212325, "grad_norm": 0.6883522868156433, "learning_rate": 6.629976584222328e-06, "loss": 0.0027, "step": 21540 }, { "epoch": 5.487307920876136, "grad_norm": 0.49628719687461853, "learning_rate": 6.625791399952394e-06, "loss": 0.0032, "step": 21545 }, { "epoch": 5.488581373631038, "grad_norm": 0.5220573544502258, "learning_rate": 6.6216068825292444e-06, "loss": 0.008, "step": 21550 }, { "epoch": 5.489854826385941, "grad_norm": 0.84958416223526, "learning_rate": 6.617423032779875e-06, "loss": 0.0032, "step": 21555 }, { "epoch": 5.491128279140844, "grad_norm": 0.12169300764799118, "learning_rate": 6.613239851531147e-06, "loss": 0.0019, "step": 21560 }, { "epoch": 5.492401731895747, "grad_norm": 0.6499847173690796, "learning_rate": 6.609057339609774e-06, "loss": 0.0033, "step": 21565 }, { "epoch": 5.493675184650649, "grad_norm": 0.5208379626274109, "learning_rate": 6.604875497842354e-06, "loss": 0.0035, "step": 21570 }, { "epoch": 5.494948637405552, "grad_norm": 0.4198550581932068, "learning_rate": 6.6006943270553515e-06, "loss": 0.002, "step": 21575 }, { "epoch": 5.496222090160455, "grad_norm": 0.7446340918540955, "learning_rate": 6.596513828075084e-06, "loss": 0.0033, "step": 21580 }, { "epoch": 5.497495542915358, "grad_norm": 0.9470084309577942, "learning_rate": 6.592334001727752e-06, "loss": 0.0062, "step": 21585 }, { "epoch": 5.498768995670261, "grad_norm": 0.4842085540294647, "learning_rate": 6.5881548488394185e-06, "loss": 0.0033, "step": 21590 }, { "epoch": 5.500042448425163, "grad_norm": 0.5744979381561279, "learning_rate": 6.583976370236005e-06, "loss": 0.0039, "step": 21595 }, { "epoch": 5.5013159011800665, "grad_norm": 0.27640947699546814, "learning_rate": 6.579798566743314e-06, "loss": 0.004, "step": 21600 }, { "epoch": 5.502589353934969, "grad_norm": 0.6387393474578857, "learning_rate": 6.575621439187003e-06, "loss": 0.003, "step": 21605 }, { "epoch": 5.503862806689872, "grad_norm": 0.7247270941734314, "learning_rate": 6.571444988392599e-06, "loss": 0.0041, "step": 21610 }, { "epoch": 5.505136259444774, "grad_norm": 0.832729697227478, "learning_rate": 6.567269215185501e-06, "loss": 0.0036, "step": 21615 }, { "epoch": 5.506409712199678, "grad_norm": 0.1594662219285965, "learning_rate": 6.563094120390967e-06, "loss": 0.0026, "step": 21620 }, { "epoch": 5.50768316495458, "grad_norm": 0.335679829120636, "learning_rate": 6.558919704834121e-06, "loss": 0.0035, "step": 21625 }, { "epoch": 5.508956617709483, "grad_norm": 0.0771254375576973, "learning_rate": 6.5547459693399595e-06, "loss": 0.0015, "step": 21630 }, { "epoch": 5.510230070464385, "grad_norm": 0.634194016456604, "learning_rate": 6.5505729147333385e-06, "loss": 0.0029, "step": 21635 }, { "epoch": 5.511503523219289, "grad_norm": 0.09421141445636749, "learning_rate": 6.546400541838981e-06, "loss": 0.0033, "step": 21640 }, { "epoch": 5.512776975974191, "grad_norm": 0.7022083401679993, "learning_rate": 6.542228851481476e-06, "loss": 0.0035, "step": 21645 }, { "epoch": 5.514050428729094, "grad_norm": 0.09125350415706635, "learning_rate": 6.538057844485273e-06, "loss": 0.004, "step": 21650 }, { "epoch": 5.515323881483997, "grad_norm": 0.36087745428085327, "learning_rate": 6.533887521674697e-06, "loss": 0.0021, "step": 21655 }, { "epoch": 5.5165973342389, "grad_norm": 0.5736220479011536, "learning_rate": 6.529717883873927e-06, "loss": 0.0025, "step": 21660 }, { "epoch": 5.517870786993803, "grad_norm": 0.570241391658783, "learning_rate": 6.525548931907012e-06, "loss": 0.0035, "step": 21665 }, { "epoch": 5.519144239748705, "grad_norm": 0.8338043093681335, "learning_rate": 6.521380666597865e-06, "loss": 0.0064, "step": 21670 }, { "epoch": 5.520417692503608, "grad_norm": 0.3595825135707855, "learning_rate": 6.517213088770265e-06, "loss": 0.0026, "step": 21675 }, { "epoch": 5.521691145258511, "grad_norm": 0.28427013754844666, "learning_rate": 6.5130461992478475e-06, "loss": 0.0028, "step": 21680 }, { "epoch": 5.522964598013414, "grad_norm": 0.15180912613868713, "learning_rate": 6.5088799988541185e-06, "loss": 0.0019, "step": 21685 }, { "epoch": 5.524238050768316, "grad_norm": 0.8417797088623047, "learning_rate": 6.504714488412449e-06, "loss": 0.0036, "step": 21690 }, { "epoch": 5.525511503523219, "grad_norm": 0.3379879891872406, "learning_rate": 6.5005496687460726e-06, "loss": 0.0015, "step": 21695 }, { "epoch": 5.526784956278122, "grad_norm": 1.1630394458770752, "learning_rate": 6.4963855406780785e-06, "loss": 0.0018, "step": 21700 }, { "epoch": 5.528058409033025, "grad_norm": 0.44716382026672363, "learning_rate": 6.492222105031433e-06, "loss": 0.0029, "step": 21705 }, { "epoch": 5.529331861787927, "grad_norm": 1.1758767366409302, "learning_rate": 6.488059362628954e-06, "loss": 0.0032, "step": 21710 }, { "epoch": 5.53060531454283, "grad_norm": 0.34687376022338867, "learning_rate": 6.483897314293326e-06, "loss": 0.0026, "step": 21715 }, { "epoch": 5.531878767297734, "grad_norm": 0.5727887749671936, "learning_rate": 6.479735960847104e-06, "loss": 0.0017, "step": 21720 }, { "epoch": 5.533152220052636, "grad_norm": 0.46425575017929077, "learning_rate": 6.475575303112689e-06, "loss": 0.0035, "step": 21725 }, { "epoch": 5.534425672807539, "grad_norm": 0.09985712170600891, "learning_rate": 6.471415341912357e-06, "loss": 0.0025, "step": 21730 }, { "epoch": 5.5356991255624415, "grad_norm": 0.2869757413864136, "learning_rate": 6.467256078068251e-06, "loss": 0.0037, "step": 21735 }, { "epoch": 5.536972578317345, "grad_norm": 0.24646759033203125, "learning_rate": 6.463097512402359e-06, "loss": 0.0029, "step": 21740 }, { "epoch": 5.538246031072247, "grad_norm": 0.7986907958984375, "learning_rate": 6.458939645736544e-06, "loss": 0.0021, "step": 21745 }, { "epoch": 5.53951948382715, "grad_norm": 0.6662119626998901, "learning_rate": 6.454782478892532e-06, "loss": 0.0027, "step": 21750 }, { "epoch": 5.5407929365820525, "grad_norm": 0.14976593852043152, "learning_rate": 6.4506260126918985e-06, "loss": 0.003, "step": 21755 }, { "epoch": 5.542066389336956, "grad_norm": 0.5263438820838928, "learning_rate": 6.446470247956094e-06, "loss": 0.0018, "step": 21760 }, { "epoch": 5.543339842091858, "grad_norm": 0.3878294825553894, "learning_rate": 6.4423151855064245e-06, "loss": 0.002, "step": 21765 }, { "epoch": 5.544613294846761, "grad_norm": 0.7019902467727661, "learning_rate": 6.438160826164053e-06, "loss": 0.0016, "step": 21770 }, { "epoch": 5.545886747601664, "grad_norm": 0.10731667280197144, "learning_rate": 6.4340071707500105e-06, "loss": 0.0022, "step": 21775 }, { "epoch": 5.547160200356567, "grad_norm": 0.06740577518939972, "learning_rate": 6.4298542200851925e-06, "loss": 0.0035, "step": 21780 }, { "epoch": 5.54843365311147, "grad_norm": 1.1152821779251099, "learning_rate": 6.425701974990337e-06, "loss": 0.0064, "step": 21785 }, { "epoch": 5.549707105866372, "grad_norm": 0.31861382722854614, "learning_rate": 6.4215504362860615e-06, "loss": 0.0024, "step": 21790 }, { "epoch": 5.5509805586212755, "grad_norm": 0.08265945315361023, "learning_rate": 6.41739960479284e-06, "loss": 0.0033, "step": 21795 }, { "epoch": 5.552254011376178, "grad_norm": 1.1171863079071045, "learning_rate": 6.413249481330995e-06, "loss": 0.003, "step": 21800 }, { "epoch": 5.553527464131081, "grad_norm": 0.4076113998889923, "learning_rate": 6.409100066720724e-06, "loss": 0.0041, "step": 21805 }, { "epoch": 5.554800916885983, "grad_norm": 0.5113330483436584, "learning_rate": 6.404951361782081e-06, "loss": 0.0024, "step": 21810 }, { "epoch": 5.5560743696408865, "grad_norm": 0.4670911133289337, "learning_rate": 6.400803367334969e-06, "loss": 0.003, "step": 21815 }, { "epoch": 5.557347822395789, "grad_norm": 0.34192344546318054, "learning_rate": 6.3966560841991625e-06, "loss": 0.0037, "step": 21820 }, { "epoch": 5.558621275150692, "grad_norm": 0.39389488101005554, "learning_rate": 6.3925095131942936e-06, "loss": 0.0027, "step": 21825 }, { "epoch": 5.559894727905594, "grad_norm": 0.5144122838973999, "learning_rate": 6.3883636551398465e-06, "loss": 0.0035, "step": 21830 }, { "epoch": 5.561168180660498, "grad_norm": 0.6061851978302002, "learning_rate": 6.384218510855173e-06, "loss": 0.0035, "step": 21835 }, { "epoch": 5.5624416334154, "grad_norm": 0.43815720081329346, "learning_rate": 6.380074081159484e-06, "loss": 0.0037, "step": 21840 }, { "epoch": 5.563715086170303, "grad_norm": 0.5446003675460815, "learning_rate": 6.375930366871836e-06, "loss": 0.0032, "step": 21845 }, { "epoch": 5.564988538925206, "grad_norm": 0.2878566086292267, "learning_rate": 6.37178736881116e-06, "loss": 0.0027, "step": 21850 }, { "epoch": 5.566261991680109, "grad_norm": 0.507777988910675, "learning_rate": 6.368473486596267e-06, "loss": 0.0041, "step": 21855 }, { "epoch": 5.567535444435012, "grad_norm": 0.601956844329834, "learning_rate": 6.364331779807375e-06, "loss": 0.003, "step": 21860 }, { "epoch": 5.568808897189914, "grad_norm": 0.5061746835708618, "learning_rate": 6.360190791537695e-06, "loss": 0.0033, "step": 21865 }, { "epoch": 5.570082349944817, "grad_norm": 0.12463623285293579, "learning_rate": 6.356050522605602e-06, "loss": 0.0033, "step": 21870 }, { "epoch": 5.57135580269972, "grad_norm": 1.1844565868377686, "learning_rate": 6.351910973829346e-06, "loss": 0.0048, "step": 21875 }, { "epoch": 5.572629255454623, "grad_norm": 0.7846466898918152, "learning_rate": 6.347772146027031e-06, "loss": 0.0031, "step": 21880 }, { "epoch": 5.573902708209525, "grad_norm": 0.4298258423805237, "learning_rate": 6.343634040016608e-06, "loss": 0.0022, "step": 21885 }, { "epoch": 5.575176160964428, "grad_norm": 0.5436177253723145, "learning_rate": 6.339496656615898e-06, "loss": 0.0022, "step": 21890 }, { "epoch": 5.576449613719331, "grad_norm": 0.3923104703426361, "learning_rate": 6.335359996642578e-06, "loss": 0.0023, "step": 21895 }, { "epoch": 5.577723066474234, "grad_norm": 0.07138541340827942, "learning_rate": 6.33122406091417e-06, "loss": 0.0026, "step": 21900 }, { "epoch": 5.578996519229136, "grad_norm": 0.5043923258781433, "learning_rate": 6.32708885024807e-06, "loss": 0.0035, "step": 21905 }, { "epoch": 5.580269971984039, "grad_norm": 0.5185493230819702, "learning_rate": 6.322954365461513e-06, "loss": 0.0023, "step": 21910 }, { "epoch": 5.581543424738943, "grad_norm": 1.0033891201019287, "learning_rate": 6.318820607371606e-06, "loss": 0.0032, "step": 21915 }, { "epoch": 5.582816877493845, "grad_norm": 1.2428393363952637, "learning_rate": 6.3146875767953086e-06, "loss": 0.0034, "step": 21920 }, { "epoch": 5.584090330248748, "grad_norm": 0.7586703300476074, "learning_rate": 6.310555274549426e-06, "loss": 0.0032, "step": 21925 }, { "epoch": 5.58536378300365, "grad_norm": 0.9683393836021423, "learning_rate": 6.306423701450631e-06, "loss": 0.0022, "step": 21930 }, { "epoch": 5.586637235758554, "grad_norm": 0.30418840050697327, "learning_rate": 6.302292858315453e-06, "loss": 0.0023, "step": 21935 }, { "epoch": 5.587910688513456, "grad_norm": 0.33467888832092285, "learning_rate": 6.298162745960266e-06, "loss": 0.0027, "step": 21940 }, { "epoch": 5.589184141268359, "grad_norm": 0.4677860140800476, "learning_rate": 6.294033365201308e-06, "loss": 0.0022, "step": 21945 }, { "epoch": 5.5904575940232615, "grad_norm": 0.24625267088413239, "learning_rate": 6.289904716854678e-06, "loss": 0.0011, "step": 21950 }, { "epoch": 5.591731046778165, "grad_norm": 0.2611978054046631, "learning_rate": 6.285776801736312e-06, "loss": 0.0036, "step": 21955 }, { "epoch": 5.593004499533067, "grad_norm": 0.5273603796958923, "learning_rate": 6.281649620662019e-06, "loss": 0.0032, "step": 21960 }, { "epoch": 5.59427795228797, "grad_norm": 0.1618194580078125, "learning_rate": 6.27752317444746e-06, "loss": 0.0027, "step": 21965 }, { "epoch": 5.5955514050428725, "grad_norm": 0.37924522161483765, "learning_rate": 6.273397463908136e-06, "loss": 0.0025, "step": 21970 }, { "epoch": 5.596824857797776, "grad_norm": 0.4137652814388275, "learning_rate": 6.2692724898594215e-06, "loss": 0.0022, "step": 21975 }, { "epoch": 5.598098310552679, "grad_norm": 0.7542482018470764, "learning_rate": 6.265148253116539e-06, "loss": 0.0031, "step": 21980 }, { "epoch": 5.599371763307581, "grad_norm": 0.4325544238090515, "learning_rate": 6.261024754494556e-06, "loss": 0.0024, "step": 21985 }, { "epoch": 5.600645216062484, "grad_norm": 0.4993095397949219, "learning_rate": 6.256901994808406e-06, "loss": 0.0033, "step": 21990 }, { "epoch": 5.601918668817387, "grad_norm": 0.5090665221214294, "learning_rate": 6.252779974872878e-06, "loss": 0.0032, "step": 21995 }, { "epoch": 5.60319212157229, "grad_norm": 0.6864000558853149, "learning_rate": 6.248658695502599e-06, "loss": 0.0031, "step": 22000 }, { "epoch": 5.604465574327192, "grad_norm": 0.49655991792678833, "learning_rate": 6.244538157512066e-06, "loss": 0.0018, "step": 22005 }, { "epoch": 5.6057390270820955, "grad_norm": 0.12438489496707916, "learning_rate": 6.240418361715622e-06, "loss": 0.0034, "step": 22010 }, { "epoch": 5.607012479836998, "grad_norm": 0.7739245295524597, "learning_rate": 6.2362993089274625e-06, "loss": 0.0038, "step": 22015 }, { "epoch": 5.608285932591901, "grad_norm": 0.3873724639415741, "learning_rate": 6.232180999961644e-06, "loss": 0.0031, "step": 22020 }, { "epoch": 5.609559385346803, "grad_norm": 0.5380098223686218, "learning_rate": 6.2280634356320655e-06, "loss": 0.0033, "step": 22025 }, { "epoch": 5.6108328381017065, "grad_norm": 0.5285215973854065, "learning_rate": 6.223946616752483e-06, "loss": 0.0028, "step": 22030 }, { "epoch": 5.612106290856609, "grad_norm": 1.6790344715118408, "learning_rate": 6.2198305441365095e-06, "loss": 0.0025, "step": 22035 }, { "epoch": 5.613379743611512, "grad_norm": 0.36099550127983093, "learning_rate": 6.215715218597604e-06, "loss": 0.002, "step": 22040 }, { "epoch": 5.614653196366415, "grad_norm": 0.2600169777870178, "learning_rate": 6.2116006409490815e-06, "loss": 0.0021, "step": 22045 }, { "epoch": 5.615926649121318, "grad_norm": 0.3055913746356964, "learning_rate": 6.207486812004107e-06, "loss": 0.0026, "step": 22050 }, { "epoch": 5.617200101876221, "grad_norm": 0.43027806282043457, "learning_rate": 6.203373732575702e-06, "loss": 0.0041, "step": 22055 }, { "epoch": 5.618473554631123, "grad_norm": 0.5941896438598633, "learning_rate": 6.1992614034767355e-06, "loss": 0.0027, "step": 22060 }, { "epoch": 5.619747007386026, "grad_norm": 0.8906679153442383, "learning_rate": 6.195149825519926e-06, "loss": 0.0044, "step": 22065 }, { "epoch": 5.621020460140929, "grad_norm": 0.30780884623527527, "learning_rate": 6.191038999517853e-06, "loss": 0.0017, "step": 22070 }, { "epoch": 5.622293912895832, "grad_norm": 0.29927659034729004, "learning_rate": 6.186928926282938e-06, "loss": 0.0019, "step": 22075 }, { "epoch": 5.623567365650734, "grad_norm": 1.2551146745681763, "learning_rate": 6.1828196066274546e-06, "loss": 0.0055, "step": 22080 }, { "epoch": 5.624840818405637, "grad_norm": 0.027907894924283028, "learning_rate": 6.1787110413635384e-06, "loss": 0.0028, "step": 22085 }, { "epoch": 5.62611427116054, "grad_norm": 0.9906622767448425, "learning_rate": 6.174603231303157e-06, "loss": 0.0057, "step": 22090 }, { "epoch": 5.627387723915443, "grad_norm": 0.16606485843658447, "learning_rate": 6.170496177258145e-06, "loss": 0.0023, "step": 22095 }, { "epoch": 5.628661176670345, "grad_norm": 0.4882150888442993, "learning_rate": 6.166389880040185e-06, "loss": 0.0035, "step": 22100 }, { "epoch": 5.629934629425248, "grad_norm": 0.2567068636417389, "learning_rate": 6.162284340460798e-06, "loss": 0.0032, "step": 22105 }, { "epoch": 5.631208082180152, "grad_norm": 0.3010910153388977, "learning_rate": 6.15817955933137e-06, "loss": 0.0026, "step": 22110 }, { "epoch": 5.632481534935054, "grad_norm": 0.45934826135635376, "learning_rate": 6.154075537463135e-06, "loss": 0.002, "step": 22115 }, { "epoch": 5.633754987689957, "grad_norm": 0.46973636746406555, "learning_rate": 6.1499722756671645e-06, "loss": 0.0024, "step": 22120 }, { "epoch": 5.635028440444859, "grad_norm": 0.2864607274532318, "learning_rate": 6.145869774754393e-06, "loss": 0.0031, "step": 22125 }, { "epoch": 5.636301893199763, "grad_norm": 0.31215623021125793, "learning_rate": 6.141768035535602e-06, "loss": 0.0029, "step": 22130 }, { "epoch": 5.637575345954665, "grad_norm": 0.6988843679428101, "learning_rate": 6.137667058821417e-06, "loss": 0.0037, "step": 22135 }, { "epoch": 5.638848798709568, "grad_norm": 0.2389957159757614, "learning_rate": 6.133566845422316e-06, "loss": 0.0011, "step": 22140 }, { "epoch": 5.64012225146447, "grad_norm": 0.363300621509552, "learning_rate": 6.1294673961486325e-06, "loss": 0.0022, "step": 22145 }, { "epoch": 5.641395704219374, "grad_norm": 0.6418066024780273, "learning_rate": 6.1253687118105355e-06, "loss": 0.0037, "step": 22150 }, { "epoch": 5.642669156974276, "grad_norm": 0.30630627274513245, "learning_rate": 6.121270793218054e-06, "loss": 0.0036, "step": 22155 }, { "epoch": 5.643942609729179, "grad_norm": 0.7411510348320007, "learning_rate": 6.117173641181064e-06, "loss": 0.004, "step": 22160 }, { "epoch": 5.6452160624840815, "grad_norm": 1.8505488634109497, "learning_rate": 6.113077256509283e-06, "loss": 0.0056, "step": 22165 }, { "epoch": 5.646489515238985, "grad_norm": 0.4859052300453186, "learning_rate": 6.108981640012289e-06, "loss": 0.0023, "step": 22170 }, { "epoch": 5.647762967993888, "grad_norm": 1.2731387615203857, "learning_rate": 6.104886792499491e-06, "loss": 0.0032, "step": 22175 }, { "epoch": 5.64903642074879, "grad_norm": 0.2974112629890442, "learning_rate": 6.100792714780164e-06, "loss": 0.0017, "step": 22180 }, { "epoch": 5.6503098735036925, "grad_norm": 0.5417484641075134, "learning_rate": 6.096699407663423e-06, "loss": 0.0035, "step": 22185 }, { "epoch": 5.651583326258596, "grad_norm": 0.6481115818023682, "learning_rate": 6.092606871958225e-06, "loss": 0.0031, "step": 22190 }, { "epoch": 5.652856779013499, "grad_norm": 0.803661048412323, "learning_rate": 6.0885151084733826e-06, "loss": 0.0023, "step": 22195 }, { "epoch": 5.654130231768401, "grad_norm": 0.3974982798099518, "learning_rate": 6.08442411801756e-06, "loss": 0.0014, "step": 22200 }, { "epoch": 5.655403684523304, "grad_norm": 0.3219428062438965, "learning_rate": 6.080333901399252e-06, "loss": 0.0016, "step": 22205 }, { "epoch": 5.656677137278207, "grad_norm": 0.7589671015739441, "learning_rate": 6.076244459426812e-06, "loss": 0.0033, "step": 22210 }, { "epoch": 5.65795059003311, "grad_norm": 0.46975865960121155, "learning_rate": 6.0721557929084486e-06, "loss": 0.0028, "step": 22215 }, { "epoch": 5.659224042788012, "grad_norm": 0.34348317980766296, "learning_rate": 6.068067902652195e-06, "loss": 0.0027, "step": 22220 }, { "epoch": 5.6604974955429155, "grad_norm": 0.22951093316078186, "learning_rate": 6.063980789465949e-06, "loss": 0.003, "step": 22225 }, { "epoch": 5.661770948297818, "grad_norm": 0.3370257318019867, "learning_rate": 6.059894454157451e-06, "loss": 0.0021, "step": 22230 }, { "epoch": 5.663044401052721, "grad_norm": 0.8014697432518005, "learning_rate": 6.055808897534281e-06, "loss": 0.0029, "step": 22235 }, { "epoch": 5.664317853807624, "grad_norm": 0.40591326355934143, "learning_rate": 6.051724120403872e-06, "loss": 0.0027, "step": 22240 }, { "epoch": 5.6655913065625265, "grad_norm": 0.740569531917572, "learning_rate": 6.047640123573506e-06, "loss": 0.0042, "step": 22245 }, { "epoch": 5.666864759317429, "grad_norm": 0.8913518786430359, "learning_rate": 6.043556907850296e-06, "loss": 0.0023, "step": 22250 }, { "epoch": 5.668138212072332, "grad_norm": 0.2405955195426941, "learning_rate": 6.039474474041216e-06, "loss": 0.0029, "step": 22255 }, { "epoch": 5.669411664827235, "grad_norm": 11.765457153320312, "learning_rate": 6.035392822953083e-06, "loss": 0.1109, "step": 22260 }, { "epoch": 5.670685117582138, "grad_norm": 0.3261817991733551, "learning_rate": 6.031311955392547e-06, "loss": 0.0023, "step": 22265 }, { "epoch": 5.671958570337041, "grad_norm": 0.2967856228351593, "learning_rate": 6.027231872166117e-06, "loss": 0.0024, "step": 22270 }, { "epoch": 5.673232023091943, "grad_norm": 0.39093276858329773, "learning_rate": 6.0231525740801474e-06, "loss": 0.0036, "step": 22275 }, { "epoch": 5.674505475846846, "grad_norm": 0.49442240595817566, "learning_rate": 6.019074061940823e-06, "loss": 0.0026, "step": 22280 }, { "epoch": 5.675778928601749, "grad_norm": 0.3933052718639374, "learning_rate": 6.014996336554186e-06, "loss": 0.0028, "step": 22285 }, { "epoch": 5.677052381356652, "grad_norm": 0.43981873989105225, "learning_rate": 6.010919398726125e-06, "loss": 0.0022, "step": 22290 }, { "epoch": 5.678325834111554, "grad_norm": 0.6024956107139587, "learning_rate": 6.0068432492623595e-06, "loss": 0.0033, "step": 22295 }, { "epoch": 5.679599286866457, "grad_norm": 0.4347195029258728, "learning_rate": 6.0027678889684635e-06, "loss": 0.0017, "step": 22300 }, { "epoch": 5.6808727396213605, "grad_norm": 0.5382107496261597, "learning_rate": 5.9986933186498585e-06, "loss": 0.0057, "step": 22305 }, { "epoch": 5.682146192376263, "grad_norm": 0.3565225899219513, "learning_rate": 5.9946195391117965e-06, "loss": 0.0021, "step": 22310 }, { "epoch": 5.683419645131165, "grad_norm": 0.24256391823291779, "learning_rate": 5.990546551159384e-06, "loss": 0.0024, "step": 22315 }, { "epoch": 5.684693097886068, "grad_norm": 0.7890423536300659, "learning_rate": 5.98647435559757e-06, "loss": 0.0026, "step": 22320 }, { "epoch": 5.685966550640972, "grad_norm": 0.04861881583929062, "learning_rate": 5.98240295323114e-06, "loss": 0.0042, "step": 22325 }, { "epoch": 5.687240003395874, "grad_norm": 0.3849143981933594, "learning_rate": 5.978332344864732e-06, "loss": 0.0018, "step": 22330 }, { "epoch": 5.688513456150777, "grad_norm": 0.5748382806777954, "learning_rate": 5.974262531302824e-06, "loss": 0.0059, "step": 22335 }, { "epoch": 5.689786908905679, "grad_norm": 0.5578311681747437, "learning_rate": 5.970193513349727e-06, "loss": 0.0029, "step": 22340 }, { "epoch": 5.691060361660583, "grad_norm": 0.17255207896232605, "learning_rate": 5.966125291809613e-06, "loss": 0.0016, "step": 22345 }, { "epoch": 5.692333814415485, "grad_norm": 1.2769184112548828, "learning_rate": 5.962057867486484e-06, "loss": 0.0036, "step": 22350 }, { "epoch": 5.693607267170388, "grad_norm": 1.246884822845459, "learning_rate": 5.957991241184184e-06, "loss": 0.0053, "step": 22355 }, { "epoch": 5.69488071992529, "grad_norm": 0.7266223430633545, "learning_rate": 5.953925413706405e-06, "loss": 0.0034, "step": 22360 }, { "epoch": 5.696154172680194, "grad_norm": 0.30835607647895813, "learning_rate": 5.949860385856681e-06, "loss": 0.0025, "step": 22365 }, { "epoch": 5.697427625435097, "grad_norm": 0.8229398727416992, "learning_rate": 5.9457961584383825e-06, "loss": 0.0027, "step": 22370 }, { "epoch": 5.698701078189999, "grad_norm": 0.630861759185791, "learning_rate": 5.941732732254726e-06, "loss": 0.0029, "step": 22375 }, { "epoch": 5.6999745309449015, "grad_norm": 2.108328104019165, "learning_rate": 5.93767010810877e-06, "loss": 0.0036, "step": 22380 }, { "epoch": 5.701247983699805, "grad_norm": 0.8337125182151794, "learning_rate": 5.933608286803414e-06, "loss": 0.0031, "step": 22385 }, { "epoch": 5.702521436454708, "grad_norm": 0.29597294330596924, "learning_rate": 5.9295472691413935e-06, "loss": 0.0024, "step": 22390 }, { "epoch": 5.70379488920961, "grad_norm": 0.41112565994262695, "learning_rate": 5.925487055925297e-06, "loss": 0.002, "step": 22395 }, { "epoch": 5.705068341964513, "grad_norm": 0.09853959083557129, "learning_rate": 5.921427647957538e-06, "loss": 0.0017, "step": 22400 }, { "epoch": 5.706341794719416, "grad_norm": 0.3956989347934723, "learning_rate": 5.917369046040384e-06, "loss": 0.002, "step": 22405 }, { "epoch": 5.707615247474319, "grad_norm": 0.6119239330291748, "learning_rate": 5.9133112509759435e-06, "loss": 0.0041, "step": 22410 }, { "epoch": 5.708888700229221, "grad_norm": 1.044118881225586, "learning_rate": 5.9092542635661535e-06, "loss": 0.003, "step": 22415 }, { "epoch": 5.710162152984124, "grad_norm": 0.2726011574268341, "learning_rate": 5.905198084612802e-06, "loss": 0.0033, "step": 22420 }, { "epoch": 5.711435605739027, "grad_norm": 0.1880582571029663, "learning_rate": 5.901142714917515e-06, "loss": 0.0028, "step": 22425 }, { "epoch": 5.71270905849393, "grad_norm": 0.8756456971168518, "learning_rate": 5.897088155281754e-06, "loss": 0.0023, "step": 22430 }, { "epoch": 5.713982511248833, "grad_norm": 0.3574633002281189, "learning_rate": 5.8930344065068305e-06, "loss": 0.0019, "step": 22435 }, { "epoch": 5.7152559640037355, "grad_norm": 0.307189017534256, "learning_rate": 5.88898146939388e-06, "loss": 0.0025, "step": 22440 }, { "epoch": 5.716529416758638, "grad_norm": 0.3675701320171356, "learning_rate": 5.884929344743891e-06, "loss": 0.0027, "step": 22445 }, { "epoch": 5.717802869513541, "grad_norm": 0.446490079164505, "learning_rate": 5.880878033357693e-06, "loss": 0.0022, "step": 22450 }, { "epoch": 5.719076322268444, "grad_norm": 0.4834707975387573, "learning_rate": 5.876827536035941e-06, "loss": 0.0018, "step": 22455 }, { "epoch": 5.7203497750233465, "grad_norm": 0.14454706013202667, "learning_rate": 5.872777853579138e-06, "loss": 0.004, "step": 22460 }, { "epoch": 5.72162322777825, "grad_norm": 0.6421947479248047, "learning_rate": 5.868728986787633e-06, "loss": 0.0022, "step": 22465 }, { "epoch": 5.722896680533152, "grad_norm": 0.7654571533203125, "learning_rate": 5.864680936461594e-06, "loss": 0.003, "step": 22470 }, { "epoch": 5.724170133288055, "grad_norm": 0.15684185922145844, "learning_rate": 5.860633703401047e-06, "loss": 0.0017, "step": 22475 }, { "epoch": 5.7254435860429576, "grad_norm": 0.5064588785171509, "learning_rate": 5.85658728840585e-06, "loss": 0.0027, "step": 22480 }, { "epoch": 5.726717038797861, "grad_norm": 0.12240644544363022, "learning_rate": 5.852541692275694e-06, "loss": 0.0021, "step": 22485 }, { "epoch": 5.727990491552763, "grad_norm": 0.490313321352005, "learning_rate": 5.848496915810113e-06, "loss": 0.0031, "step": 22490 }, { "epoch": 5.729263944307666, "grad_norm": 0.15130241215229034, "learning_rate": 5.844452959808484e-06, "loss": 0.002, "step": 22495 }, { "epoch": 5.730537397062569, "grad_norm": 0.6210802793502808, "learning_rate": 5.840409825070008e-06, "loss": 0.0023, "step": 22500 }, { "epoch": 5.731810849817472, "grad_norm": 0.5740182995796204, "learning_rate": 5.8363675123937395e-06, "loss": 0.0029, "step": 22505 }, { "epoch": 5.733084302572374, "grad_norm": 0.35999006032943726, "learning_rate": 5.832326022578558e-06, "loss": 0.0022, "step": 22510 }, { "epoch": 5.734357755327277, "grad_norm": 0.44077053666114807, "learning_rate": 5.828285356423185e-06, "loss": 0.0036, "step": 22515 }, { "epoch": 5.7356312080821805, "grad_norm": 0.9925187230110168, "learning_rate": 5.824245514726186e-06, "loss": 0.0035, "step": 22520 }, { "epoch": 5.736904660837083, "grad_norm": 0.15561579167842865, "learning_rate": 5.820206498285952e-06, "loss": 0.0027, "step": 22525 }, { "epoch": 5.738178113591986, "grad_norm": 0.33766117691993713, "learning_rate": 5.816168307900718e-06, "loss": 0.0035, "step": 22530 }, { "epoch": 5.739451566346888, "grad_norm": 0.3383485972881317, "learning_rate": 5.812130944368553e-06, "loss": 0.0022, "step": 22535 }, { "epoch": 5.740725019101792, "grad_norm": 0.15487636625766754, "learning_rate": 5.808094408487364e-06, "loss": 0.0011, "step": 22540 }, { "epoch": 5.741998471856694, "grad_norm": 0.6325805187225342, "learning_rate": 5.804058701054893e-06, "loss": 0.003, "step": 22545 }, { "epoch": 5.743271924611597, "grad_norm": 0.2664794921875, "learning_rate": 5.800023822868717e-06, "loss": 0.002, "step": 22550 }, { "epoch": 5.744545377366499, "grad_norm": 0.9941783547401428, "learning_rate": 5.795989774726258e-06, "loss": 0.0028, "step": 22555 }, { "epoch": 5.745818830121403, "grad_norm": 0.656891942024231, "learning_rate": 5.791956557424763e-06, "loss": 0.0032, "step": 22560 }, { "epoch": 5.747092282876305, "grad_norm": 0.13802608847618103, "learning_rate": 5.787924171761319e-06, "loss": 0.002, "step": 22565 }, { "epoch": 5.748365735631208, "grad_norm": 0.30509158968925476, "learning_rate": 5.7838926185328506e-06, "loss": 0.0038, "step": 22570 }, { "epoch": 5.74963918838611, "grad_norm": 0.2256477326154709, "learning_rate": 5.779861898536113e-06, "loss": 0.0024, "step": 22575 }, { "epoch": 5.750912641141014, "grad_norm": 0.3923169672489166, "learning_rate": 5.775832012567698e-06, "loss": 0.003, "step": 22580 }, { "epoch": 5.752186093895917, "grad_norm": 0.7404444217681885, "learning_rate": 5.7718029614240454e-06, "loss": 0.0025, "step": 22585 }, { "epoch": 5.753459546650819, "grad_norm": 0.6066553592681885, "learning_rate": 5.767774745901403e-06, "loss": 0.004, "step": 22590 }, { "epoch": 5.754732999405722, "grad_norm": 0.39326003193855286, "learning_rate": 5.763747366795882e-06, "loss": 0.003, "step": 22595 }, { "epoch": 5.756006452160625, "grad_norm": 0.23386502265930176, "learning_rate": 5.759720824903411e-06, "loss": 0.0021, "step": 22600 }, { "epoch": 5.757279904915528, "grad_norm": 0.5073645114898682, "learning_rate": 5.755695121019757e-06, "loss": 0.0043, "step": 22605 }, { "epoch": 5.75855335767043, "grad_norm": 0.6462920308113098, "learning_rate": 5.7516702559405205e-06, "loss": 0.0033, "step": 22610 }, { "epoch": 5.759826810425333, "grad_norm": 0.19411328434944153, "learning_rate": 5.747646230461148e-06, "loss": 0.0023, "step": 22615 }, { "epoch": 5.761100263180236, "grad_norm": 0.35833245515823364, "learning_rate": 5.743623045376894e-06, "loss": 0.0012, "step": 22620 }, { "epoch": 5.762373715935139, "grad_norm": 0.3502955734729767, "learning_rate": 5.739600701482875e-06, "loss": 0.0033, "step": 22625 }, { "epoch": 5.763647168690041, "grad_norm": 0.672657310962677, "learning_rate": 5.735579199574026e-06, "loss": 0.0022, "step": 22630 }, { "epoch": 5.764920621444944, "grad_norm": 0.7412696480751038, "learning_rate": 5.731558540445118e-06, "loss": 0.0039, "step": 22635 }, { "epoch": 5.766194074199847, "grad_norm": 0.4814690947532654, "learning_rate": 5.727538724890752e-06, "loss": 0.0054, "step": 22640 }, { "epoch": 5.76746752695475, "grad_norm": 0.45167356729507446, "learning_rate": 5.72351975370538e-06, "loss": 0.002, "step": 22645 }, { "epoch": 5.768740979709653, "grad_norm": 0.34380224347114563, "learning_rate": 5.719501627683255e-06, "loss": 0.0028, "step": 22650 }, { "epoch": 5.7700144324645555, "grad_norm": 0.7192511558532715, "learning_rate": 5.7154843476184965e-06, "loss": 0.0039, "step": 22655 }, { "epoch": 5.771287885219459, "grad_norm": 0.2191060334444046, "learning_rate": 5.711467914305036e-06, "loss": 0.0015, "step": 22660 }, { "epoch": 5.772561337974361, "grad_norm": 0.9484615921974182, "learning_rate": 5.707452328536643e-06, "loss": 0.0039, "step": 22665 }, { "epoch": 5.773834790729264, "grad_norm": 0.3051232099533081, "learning_rate": 5.703437591106922e-06, "loss": 0.0025, "step": 22670 }, { "epoch": 5.7751082434841665, "grad_norm": 0.37189173698425293, "learning_rate": 5.699423702809306e-06, "loss": 0.005, "step": 22675 }, { "epoch": 5.77638169623907, "grad_norm": 0.080584816634655, "learning_rate": 5.69541066443706e-06, "loss": 0.0032, "step": 22680 }, { "epoch": 5.777655148993972, "grad_norm": 0.4705783724784851, "learning_rate": 5.691398476783292e-06, "loss": 0.002, "step": 22685 }, { "epoch": 5.778928601748875, "grad_norm": 0.6456839442253113, "learning_rate": 5.6873871406409205e-06, "loss": 0.0046, "step": 22690 }, { "epoch": 5.7802020545037776, "grad_norm": 0.5715189576148987, "learning_rate": 5.683376656802719e-06, "loss": 0.0028, "step": 22695 }, { "epoch": 5.781475507258681, "grad_norm": 1.033736228942871, "learning_rate": 5.679367026061277e-06, "loss": 0.0046, "step": 22700 }, { "epoch": 5.782748960013583, "grad_norm": 0.35548457503318787, "learning_rate": 5.67535824920902e-06, "loss": 0.0022, "step": 22705 }, { "epoch": 5.784022412768486, "grad_norm": 0.41089707612991333, "learning_rate": 5.671350327038201e-06, "loss": 0.004, "step": 22710 }, { "epoch": 5.7852958655233895, "grad_norm": 0.756331741809845, "learning_rate": 5.66734326034092e-06, "loss": 0.004, "step": 22715 }, { "epoch": 5.786569318278292, "grad_norm": 0.2733241319656372, "learning_rate": 5.663337049909081e-06, "loss": 0.0013, "step": 22720 }, { "epoch": 5.787842771033195, "grad_norm": 0.854209840297699, "learning_rate": 5.659331696534445e-06, "loss": 0.004, "step": 22725 }, { "epoch": 5.789116223788097, "grad_norm": 0.7302265167236328, "learning_rate": 5.6553272010085895e-06, "loss": 0.0032, "step": 22730 }, { "epoch": 5.7903896765430005, "grad_norm": 0.3317854702472687, "learning_rate": 5.651323564122921e-06, "loss": 0.0024, "step": 22735 }, { "epoch": 5.791663129297903, "grad_norm": 0.32168787717819214, "learning_rate": 5.647320786668682e-06, "loss": 0.0017, "step": 22740 }, { "epoch": 5.792936582052806, "grad_norm": 0.11820748448371887, "learning_rate": 5.643318869436951e-06, "loss": 0.0024, "step": 22745 }, { "epoch": 5.794210034807708, "grad_norm": 0.5652557015419006, "learning_rate": 5.639317813218618e-06, "loss": 0.003, "step": 22750 }, { "epoch": 5.795483487562612, "grad_norm": 0.627910852432251, "learning_rate": 5.6353176188044215e-06, "loss": 0.0027, "step": 22755 }, { "epoch": 5.796756940317514, "grad_norm": 0.6794412136077881, "learning_rate": 5.631318286984922e-06, "loss": 0.002, "step": 22760 }, { "epoch": 5.798030393072417, "grad_norm": 0.3419494926929474, "learning_rate": 5.627319818550506e-06, "loss": 0.0023, "step": 22765 }, { "epoch": 5.799303845827319, "grad_norm": 0.7087470293045044, "learning_rate": 5.623322214291392e-06, "loss": 0.0033, "step": 22770 }, { "epoch": 5.800577298582223, "grad_norm": 0.3980162441730499, "learning_rate": 5.61932547499764e-06, "loss": 0.0032, "step": 22775 }, { "epoch": 5.801850751337126, "grad_norm": 0.49698975682258606, "learning_rate": 5.615329601459112e-06, "loss": 0.0024, "step": 22780 }, { "epoch": 5.803124204092028, "grad_norm": 0.23499006032943726, "learning_rate": 5.611334594465526e-06, "loss": 0.0016, "step": 22785 }, { "epoch": 5.804397656846931, "grad_norm": 0.4670346677303314, "learning_rate": 5.607340454806414e-06, "loss": 0.0032, "step": 22790 }, { "epoch": 5.805671109601834, "grad_norm": 0.9932557940483093, "learning_rate": 5.603347183271142e-06, "loss": 0.0038, "step": 22795 }, { "epoch": 5.806944562356737, "grad_norm": 0.38790568709373474, "learning_rate": 5.599354780648896e-06, "loss": 0.0023, "step": 22800 }, { "epoch": 5.808218015111639, "grad_norm": 1.2578469514846802, "learning_rate": 5.595363247728709e-06, "loss": 0.0043, "step": 22805 }, { "epoch": 5.809491467866542, "grad_norm": 0.9874067306518555, "learning_rate": 5.591372585299415e-06, "loss": 0.0033, "step": 22810 }, { "epoch": 5.810764920621445, "grad_norm": 2.009300470352173, "learning_rate": 5.587382794149703e-06, "loss": 0.0038, "step": 22815 }, { "epoch": 5.812038373376348, "grad_norm": 0.6964054107666016, "learning_rate": 5.5833938750680715e-06, "loss": 0.0039, "step": 22820 }, { "epoch": 5.81331182613125, "grad_norm": 0.4134794771671295, "learning_rate": 5.579405828842857e-06, "loss": 0.0029, "step": 22825 }, { "epoch": 5.814585278886153, "grad_norm": 0.2594463527202606, "learning_rate": 5.575418656262215e-06, "loss": 0.0036, "step": 22830 }, { "epoch": 5.815858731641056, "grad_norm": 0.4110799729824066, "learning_rate": 5.571432358114135e-06, "loss": 0.0053, "step": 22835 }, { "epoch": 5.817132184395959, "grad_norm": 0.29381057620048523, "learning_rate": 5.567446935186428e-06, "loss": 0.0022, "step": 22840 }, { "epoch": 5.818405637150862, "grad_norm": 0.37652432918548584, "learning_rate": 5.563462388266743e-06, "loss": 0.0025, "step": 22845 }, { "epoch": 5.819679089905764, "grad_norm": 0.6048930287361145, "learning_rate": 5.559478718142541e-06, "loss": 0.002, "step": 22850 }, { "epoch": 5.820952542660668, "grad_norm": 0.7142905592918396, "learning_rate": 5.555495925601123e-06, "loss": 0.003, "step": 22855 }, { "epoch": 5.82222599541557, "grad_norm": 0.6982975602149963, "learning_rate": 5.551514011429606e-06, "loss": 0.0033, "step": 22860 }, { "epoch": 5.823499448170473, "grad_norm": 0.13224728405475616, "learning_rate": 5.54753297641494e-06, "loss": 0.0029, "step": 22865 }, { "epoch": 5.8247729009253755, "grad_norm": 0.4732038676738739, "learning_rate": 5.543552821343898e-06, "loss": 0.0016, "step": 22870 }, { "epoch": 5.826046353680279, "grad_norm": 0.6856086850166321, "learning_rate": 5.5403693313750765e-06, "loss": 0.003, "step": 22875 }, { "epoch": 5.827319806435181, "grad_norm": 0.6105238199234009, "learning_rate": 5.536390762184671e-06, "loss": 0.0045, "step": 22880 }, { "epoch": 5.828593259190084, "grad_norm": 0.5384171009063721, "learning_rate": 5.532413075139937e-06, "loss": 0.0031, "step": 22885 }, { "epoch": 5.8298667119449865, "grad_norm": 0.669277548789978, "learning_rate": 5.5284362710269825e-06, "loss": 0.0032, "step": 22890 }, { "epoch": 5.83114016469989, "grad_norm": 0.6429836750030518, "learning_rate": 5.524460350631745e-06, "loss": 0.0058, "step": 22895 }, { "epoch": 5.832413617454792, "grad_norm": 0.8508428335189819, "learning_rate": 5.52048531473999e-06, "loss": 0.002, "step": 22900 }, { "epoch": 5.833687070209695, "grad_norm": 0.2671617567539215, "learning_rate": 5.516511164137302e-06, "loss": 0.003, "step": 22905 }, { "epoch": 5.834960522964598, "grad_norm": 0.39765727519989014, "learning_rate": 5.5125378996091e-06, "loss": 0.0033, "step": 22910 }, { "epoch": 5.836233975719501, "grad_norm": 0.39416396617889404, "learning_rate": 5.508565521940614e-06, "loss": 0.0028, "step": 22915 }, { "epoch": 5.837507428474404, "grad_norm": 0.6740007996559143, "learning_rate": 5.504594031916919e-06, "loss": 0.0029, "step": 22920 }, { "epoch": 5.838780881229306, "grad_norm": 0.37317219376564026, "learning_rate": 5.500623430322896e-06, "loss": 0.0021, "step": 22925 }, { "epoch": 5.8400543339842095, "grad_norm": 0.4295446574687958, "learning_rate": 5.496653717943259e-06, "loss": 0.003, "step": 22930 }, { "epoch": 5.841327786739112, "grad_norm": 0.3162093460559845, "learning_rate": 5.492684895562543e-06, "loss": 0.0016, "step": 22935 }, { "epoch": 5.842601239494015, "grad_norm": 0.4762533903121948, "learning_rate": 5.4887169639651105e-06, "loss": 0.0048, "step": 22940 }, { "epoch": 5.843874692248917, "grad_norm": 0.6774078607559204, "learning_rate": 5.484749923935144e-06, "loss": 0.0032, "step": 22945 }, { "epoch": 5.8451481450038205, "grad_norm": 0.4532949924468994, "learning_rate": 5.480783776256654e-06, "loss": 0.0044, "step": 22950 }, { "epoch": 5.846421597758723, "grad_norm": 0.9748526811599731, "learning_rate": 5.476818521713467e-06, "loss": 0.0033, "step": 22955 }, { "epoch": 5.847695050513626, "grad_norm": 0.7714719176292419, "learning_rate": 5.472854161089247e-06, "loss": 0.0042, "step": 22960 }, { "epoch": 5.848968503268528, "grad_norm": 0.589242160320282, "learning_rate": 5.46889069516747e-06, "loss": 0.0042, "step": 22965 }, { "epoch": 5.8502419560234316, "grad_norm": 0.8830773830413818, "learning_rate": 5.464928124731437e-06, "loss": 0.0026, "step": 22970 }, { "epoch": 5.851515408778335, "grad_norm": 0.37653183937072754, "learning_rate": 5.460966450564272e-06, "loss": 0.0028, "step": 22975 }, { "epoch": 5.852788861533237, "grad_norm": 0.27902549505233765, "learning_rate": 5.457005673448925e-06, "loss": 0.0043, "step": 22980 }, { "epoch": 5.85406231428814, "grad_norm": 0.4128091037273407, "learning_rate": 5.4530457941681615e-06, "loss": 0.0032, "step": 22985 }, { "epoch": 5.855335767043043, "grad_norm": 0.36892130970954895, "learning_rate": 5.449086813504587e-06, "loss": 0.003, "step": 22990 }, { "epoch": 5.856609219797946, "grad_norm": 0.7426274418830872, "learning_rate": 5.445128732240601e-06, "loss": 0.0044, "step": 22995 }, { "epoch": 5.857882672552848, "grad_norm": 0.4599062204360962, "learning_rate": 5.441171551158455e-06, "loss": 0.0025, "step": 23000 }, { "epoch": 5.859156125307751, "grad_norm": 0.3627505302429199, "learning_rate": 5.437215271040201e-06, "loss": 0.0021, "step": 23005 }, { "epoch": 5.860429578062654, "grad_norm": 0.7080380320549011, "learning_rate": 5.433259892667725e-06, "loss": 0.0031, "step": 23010 }, { "epoch": 5.861703030817557, "grad_norm": 0.6507861018180847, "learning_rate": 5.429305416822726e-06, "loss": 0.0027, "step": 23015 }, { "epoch": 5.862976483572459, "grad_norm": 0.6166746020317078, "learning_rate": 5.4253518442867384e-06, "loss": 0.0015, "step": 23020 }, { "epoch": 5.864249936327362, "grad_norm": 0.7038540244102478, "learning_rate": 5.421399175841096e-06, "loss": 0.0031, "step": 23025 }, { "epoch": 5.865523389082265, "grad_norm": 0.20168668031692505, "learning_rate": 5.4174474122669785e-06, "loss": 0.0024, "step": 23030 }, { "epoch": 5.866796841837168, "grad_norm": 0.5677368640899658, "learning_rate": 5.41349655434537e-06, "loss": 0.0028, "step": 23035 }, { "epoch": 5.868070294592071, "grad_norm": 0.940453827381134, "learning_rate": 5.409546602857081e-06, "loss": 0.0035, "step": 23040 }, { "epoch": 5.869343747346973, "grad_norm": 0.35516875982284546, "learning_rate": 5.40559755858274e-06, "loss": 0.0025, "step": 23045 }, { "epoch": 5.870617200101877, "grad_norm": 0.11690793186426163, "learning_rate": 5.40164942230281e-06, "loss": 0.0035, "step": 23050 }, { "epoch": 5.871890652856779, "grad_norm": 0.3829183578491211, "learning_rate": 5.3977021947975486e-06, "loss": 0.0016, "step": 23055 }, { "epoch": 5.873164105611682, "grad_norm": 0.5373998284339905, "learning_rate": 5.393755876847058e-06, "loss": 0.0024, "step": 23060 }, { "epoch": 5.874437558366584, "grad_norm": 0.4145232141017914, "learning_rate": 5.38981046923125e-06, "loss": 0.0027, "step": 23065 }, { "epoch": 5.875711011121488, "grad_norm": 0.4487011432647705, "learning_rate": 5.385865972729857e-06, "loss": 0.003, "step": 23070 }, { "epoch": 5.87698446387639, "grad_norm": 0.3587585389614105, "learning_rate": 5.381922388122427e-06, "loss": 0.0029, "step": 23075 }, { "epoch": 5.878257916631293, "grad_norm": 0.6918366551399231, "learning_rate": 5.377979716188346e-06, "loss": 0.0026, "step": 23080 }, { "epoch": 5.8795313693861955, "grad_norm": 1.244644284248352, "learning_rate": 5.374037957706791e-06, "loss": 0.0032, "step": 23085 }, { "epoch": 5.880804822141099, "grad_norm": 0.3191717565059662, "learning_rate": 5.370097113456785e-06, "loss": 0.0029, "step": 23090 }, { "epoch": 5.882078274896001, "grad_norm": 0.7874560356140137, "learning_rate": 5.366157184217155e-06, "loss": 0.0023, "step": 23095 }, { "epoch": 5.883351727650904, "grad_norm": 0.5825322270393372, "learning_rate": 5.362218170766551e-06, "loss": 0.0038, "step": 23100 }, { "epoch": 5.884625180405807, "grad_norm": 0.3985574543476105, "learning_rate": 5.358280073883442e-06, "loss": 0.0029, "step": 23105 }, { "epoch": 5.88589863316071, "grad_norm": 0.8723916411399841, "learning_rate": 5.354342894346125e-06, "loss": 0.0036, "step": 23110 }, { "epoch": 5.887172085915613, "grad_norm": 0.6782783269882202, "learning_rate": 5.350406632932692e-06, "loss": 0.0026, "step": 23115 }, { "epoch": 5.888445538670515, "grad_norm": 0.2588452100753784, "learning_rate": 5.346471290421081e-06, "loss": 0.0028, "step": 23120 }, { "epoch": 5.889718991425418, "grad_norm": 0.40437087416648865, "learning_rate": 5.34253686758903e-06, "loss": 0.0022, "step": 23125 }, { "epoch": 5.890992444180321, "grad_norm": 0.500554084777832, "learning_rate": 5.338603365214103e-06, "loss": 0.003, "step": 23130 }, { "epoch": 5.892265896935224, "grad_norm": 0.7738689184188843, "learning_rate": 5.334670784073678e-06, "loss": 0.0023, "step": 23135 }, { "epoch": 5.893539349690126, "grad_norm": 0.3794015944004059, "learning_rate": 5.330739124944962e-06, "loss": 0.0032, "step": 23140 }, { "epoch": 5.8948128024450295, "grad_norm": 1.0622966289520264, "learning_rate": 5.3268083886049585e-06, "loss": 0.0027, "step": 23145 }, { "epoch": 5.896086255199932, "grad_norm": 0.7607643008232117, "learning_rate": 5.322878575830509e-06, "loss": 0.0026, "step": 23150 }, { "epoch": 5.897359707954835, "grad_norm": 0.26288023591041565, "learning_rate": 5.318949687398263e-06, "loss": 0.003, "step": 23155 }, { "epoch": 5.898633160709737, "grad_norm": 0.44899412989616394, "learning_rate": 5.31502172408469e-06, "loss": 0.003, "step": 23160 }, { "epoch": 5.8999066134646405, "grad_norm": 0.31140321493148804, "learning_rate": 5.3110946866660706e-06, "loss": 0.0025, "step": 23165 }, { "epoch": 5.901180066219544, "grad_norm": 0.18434563279151917, "learning_rate": 5.30716857591852e-06, "loss": 0.0031, "step": 23170 }, { "epoch": 5.902453518974446, "grad_norm": 0.6369840502738953, "learning_rate": 5.3032433926179395e-06, "loss": 0.0033, "step": 23175 }, { "epoch": 5.903726971729348, "grad_norm": 0.29269590973854065, "learning_rate": 5.29931913754008e-06, "loss": 0.0033, "step": 23180 }, { "epoch": 5.9050004244842516, "grad_norm": 0.8171371221542358, "learning_rate": 5.295395811460489e-06, "loss": 0.0042, "step": 23185 }, { "epoch": 5.906273877239155, "grad_norm": 0.4687783122062683, "learning_rate": 5.291473415154535e-06, "loss": 0.0018, "step": 23190 }, { "epoch": 5.907547329994057, "grad_norm": 0.6673541069030762, "learning_rate": 5.287551949397406e-06, "loss": 0.0029, "step": 23195 }, { "epoch": 5.90882078274896, "grad_norm": 0.27846312522888184, "learning_rate": 5.283631414964101e-06, "loss": 0.0024, "step": 23200 }, { "epoch": 5.910094235503863, "grad_norm": 0.32412078976631165, "learning_rate": 5.279711812629436e-06, "loss": 0.0015, "step": 23205 }, { "epoch": 5.911367688258766, "grad_norm": 0.7372633814811707, "learning_rate": 5.275793143168052e-06, "loss": 0.003, "step": 23210 }, { "epoch": 5.912641141013668, "grad_norm": 0.11319863051176071, "learning_rate": 5.271875407354386e-06, "loss": 0.0015, "step": 23215 }, { "epoch": 5.913914593768571, "grad_norm": 0.48587700724601746, "learning_rate": 5.267958605962713e-06, "loss": 0.0024, "step": 23220 }, { "epoch": 5.915188046523474, "grad_norm": 0.33921679854393005, "learning_rate": 5.264042739767109e-06, "loss": 0.0021, "step": 23225 }, { "epoch": 5.916461499278377, "grad_norm": 0.18398375809192657, "learning_rate": 5.260127809541469e-06, "loss": 0.002, "step": 23230 }, { "epoch": 5.91773495203328, "grad_norm": 0.479402631521225, "learning_rate": 5.256213816059501e-06, "loss": 0.0023, "step": 23235 }, { "epoch": 5.919008404788182, "grad_norm": 0.15015995502471924, "learning_rate": 5.252300760094733e-06, "loss": 0.0024, "step": 23240 }, { "epoch": 5.920281857543085, "grad_norm": 0.18795908987522125, "learning_rate": 5.2483886424204965e-06, "loss": 0.0018, "step": 23245 }, { "epoch": 5.921555310297988, "grad_norm": 0.5108364224433899, "learning_rate": 5.244477463809958e-06, "loss": 0.0036, "step": 23250 }, { "epoch": 5.922828763052891, "grad_norm": 0.5960199236869812, "learning_rate": 5.24056722503608e-06, "loss": 0.0051, "step": 23255 }, { "epoch": 5.924102215807793, "grad_norm": 0.1953728348016739, "learning_rate": 5.236657926871645e-06, "loss": 0.0047, "step": 23260 }, { "epoch": 5.925375668562697, "grad_norm": 0.7592850923538208, "learning_rate": 5.232749570089248e-06, "loss": 0.0031, "step": 23265 }, { "epoch": 5.926649121317599, "grad_norm": 0.7425428628921509, "learning_rate": 5.228842155461303e-06, "loss": 0.0023, "step": 23270 }, { "epoch": 5.927922574072502, "grad_norm": 0.2714245319366455, "learning_rate": 5.224935683760033e-06, "loss": 0.0043, "step": 23275 }, { "epoch": 5.929196026827404, "grad_norm": 0.3782700300216675, "learning_rate": 5.2210301557574735e-06, "loss": 0.0024, "step": 23280 }, { "epoch": 5.930469479582308, "grad_norm": 0.17930062115192413, "learning_rate": 5.217125572225483e-06, "loss": 0.002, "step": 23285 }, { "epoch": 5.93174293233721, "grad_norm": 0.5030732154846191, "learning_rate": 5.213221933935723e-06, "loss": 0.0029, "step": 23290 }, { "epoch": 5.933016385092113, "grad_norm": 1.4694087505340576, "learning_rate": 5.209319241659672e-06, "loss": 0.0023, "step": 23295 }, { "epoch": 5.934289837847016, "grad_norm": 0.6051844954490662, "learning_rate": 5.205417496168621e-06, "loss": 0.0034, "step": 23300 }, { "epoch": 5.935563290601919, "grad_norm": 0.4924243986606598, "learning_rate": 5.201516698233673e-06, "loss": 0.0018, "step": 23305 }, { "epoch": 5.936836743356821, "grad_norm": 0.4347774088382721, "learning_rate": 5.197616848625744e-06, "loss": 0.0024, "step": 23310 }, { "epoch": 5.938110196111724, "grad_norm": 0.3600996136665344, "learning_rate": 5.193717948115573e-06, "loss": 0.0024, "step": 23315 }, { "epoch": 5.939383648866627, "grad_norm": 0.4436211585998535, "learning_rate": 5.1898199974736865e-06, "loss": 0.0031, "step": 23320 }, { "epoch": 5.94065710162153, "grad_norm": 0.6958094835281372, "learning_rate": 5.1859229974704515e-06, "loss": 0.0022, "step": 23325 }, { "epoch": 5.941930554376433, "grad_norm": 0.39608266949653625, "learning_rate": 5.182026948876029e-06, "loss": 0.002, "step": 23330 }, { "epoch": 5.943204007131335, "grad_norm": 0.2117258906364441, "learning_rate": 5.178131852460399e-06, "loss": 0.0023, "step": 23335 }, { "epoch": 5.944477459886238, "grad_norm": 0.9540325403213501, "learning_rate": 5.174237708993347e-06, "loss": 0.0038, "step": 23340 }, { "epoch": 5.945750912641141, "grad_norm": 0.5539766550064087, "learning_rate": 5.1703445192444856e-06, "loss": 0.0051, "step": 23345 }, { "epoch": 5.947024365396044, "grad_norm": 0.5918343663215637, "learning_rate": 5.166452283983214e-06, "loss": 0.0043, "step": 23350 }, { "epoch": 5.948297818150946, "grad_norm": 0.22927501797676086, "learning_rate": 5.162561003978767e-06, "loss": 0.0025, "step": 23355 }, { "epoch": 5.9495712709058495, "grad_norm": 0.2953946590423584, "learning_rate": 5.158670680000179e-06, "loss": 0.0021, "step": 23360 }, { "epoch": 5.950844723660753, "grad_norm": 0.24204938113689423, "learning_rate": 5.154781312816295e-06, "loss": 0.0034, "step": 23365 }, { "epoch": 5.952118176415655, "grad_norm": 0.957118570804596, "learning_rate": 5.150892903195769e-06, "loss": 0.0028, "step": 23370 }, { "epoch": 5.953391629170557, "grad_norm": 0.26956650614738464, "learning_rate": 5.147005451907081e-06, "loss": 0.0025, "step": 23375 }, { "epoch": 5.9546650819254605, "grad_norm": 0.3831212818622589, "learning_rate": 5.143118959718498e-06, "loss": 0.0021, "step": 23380 }, { "epoch": 5.955938534680364, "grad_norm": 0.6536152958869934, "learning_rate": 5.1392334273981174e-06, "loss": 0.0028, "step": 23385 }, { "epoch": 5.957211987435266, "grad_norm": 0.3427608609199524, "learning_rate": 5.135348855713837e-06, "loss": 0.0043, "step": 23390 }, { "epoch": 5.958485440190169, "grad_norm": 0.5882169008255005, "learning_rate": 5.131465245433366e-06, "loss": 0.0026, "step": 23395 }, { "epoch": 5.9597588929450716, "grad_norm": 1.720828890800476, "learning_rate": 5.127582597324221e-06, "loss": 0.0053, "step": 23400 }, { "epoch": 5.961032345699975, "grad_norm": 0.37233325839042664, "learning_rate": 5.123700912153746e-06, "loss": 0.002, "step": 23405 }, { "epoch": 5.962305798454877, "grad_norm": 0.09314246475696564, "learning_rate": 5.1198201906890624e-06, "loss": 0.0019, "step": 23410 }, { "epoch": 5.96357925120978, "grad_norm": 0.2668994069099426, "learning_rate": 5.115940433697131e-06, "loss": 0.0022, "step": 23415 }, { "epoch": 5.964852703964683, "grad_norm": 0.21115800738334656, "learning_rate": 5.1120616419447075e-06, "loss": 0.0022, "step": 23420 }, { "epoch": 5.966126156719586, "grad_norm": 0.773186206817627, "learning_rate": 5.10818381619836e-06, "loss": 0.0042, "step": 23425 }, { "epoch": 5.967399609474488, "grad_norm": 0.3280608057975769, "learning_rate": 5.1043069572244605e-06, "loss": 0.0023, "step": 23430 }, { "epoch": 5.968673062229391, "grad_norm": 0.09397431463003159, "learning_rate": 5.1004310657892075e-06, "loss": 0.0033, "step": 23435 }, { "epoch": 5.969946514984294, "grad_norm": 0.5465965867042542, "learning_rate": 5.09655614265858e-06, "loss": 0.0028, "step": 23440 }, { "epoch": 5.971219967739197, "grad_norm": 0.21222764253616333, "learning_rate": 5.092682188598397e-06, "loss": 0.0037, "step": 23445 }, { "epoch": 5.9724934204941, "grad_norm": 0.6419665813446045, "learning_rate": 5.088809204374253e-06, "loss": 0.0035, "step": 23450 }, { "epoch": 5.973766873249002, "grad_norm": 0.8893948197364807, "learning_rate": 5.084937190751582e-06, "loss": 0.0021, "step": 23455 }, { "epoch": 5.9750403260039056, "grad_norm": 0.3274373710155487, "learning_rate": 5.081066148495608e-06, "loss": 0.0026, "step": 23460 }, { "epoch": 5.976313778758808, "grad_norm": 1.2962653636932373, "learning_rate": 5.077196078371366e-06, "loss": 0.0057, "step": 23465 }, { "epoch": 5.977587231513711, "grad_norm": 0.21367377042770386, "learning_rate": 5.073326981143697e-06, "loss": 0.0011, "step": 23470 }, { "epoch": 5.978860684268613, "grad_norm": 0.6744191646575928, "learning_rate": 5.069458857577265e-06, "loss": 0.0047, "step": 23475 }, { "epoch": 5.980134137023517, "grad_norm": 0.7591440677642822, "learning_rate": 5.065591708436514e-06, "loss": 0.0025, "step": 23480 }, { "epoch": 5.981407589778419, "grad_norm": 0.09505365043878555, "learning_rate": 5.0617255344857216e-06, "loss": 0.0029, "step": 23485 }, { "epoch": 5.982681042533322, "grad_norm": 0.6595933437347412, "learning_rate": 5.05786033648896e-06, "loss": 0.0024, "step": 23490 }, { "epoch": 5.983954495288224, "grad_norm": 1.1597553491592407, "learning_rate": 5.053996115210107e-06, "loss": 0.0027, "step": 23495 }, { "epoch": 5.985227948043128, "grad_norm": 0.4597614109516144, "learning_rate": 5.050132871412852e-06, "loss": 0.0029, "step": 23500 }, { "epoch": 5.98650140079803, "grad_norm": 0.6744516491889954, "learning_rate": 5.046270605860697e-06, "loss": 0.005, "step": 23505 }, { "epoch": 5.987774853552933, "grad_norm": 0.5557984709739685, "learning_rate": 5.04240931931693e-06, "loss": 0.0021, "step": 23510 }, { "epoch": 5.989048306307836, "grad_norm": 0.3000294268131256, "learning_rate": 5.038549012544672e-06, "loss": 0.0025, "step": 23515 }, { "epoch": 5.990321759062739, "grad_norm": 0.1796206682920456, "learning_rate": 5.034689686306833e-06, "loss": 0.0017, "step": 23520 }, { "epoch": 5.991595211817642, "grad_norm": 0.527056097984314, "learning_rate": 5.030831341366134e-06, "loss": 0.0024, "step": 23525 }, { "epoch": 5.992868664572544, "grad_norm": 0.3421619236469269, "learning_rate": 5.0269739784851014e-06, "loss": 0.0022, "step": 23530 }, { "epoch": 5.994142117327447, "grad_norm": 0.4413299858570099, "learning_rate": 5.0231175984260685e-06, "loss": 0.0046, "step": 23535 }, { "epoch": 5.99541557008235, "grad_norm": 0.30276334285736084, "learning_rate": 5.01926220195117e-06, "loss": 0.0026, "step": 23540 }, { "epoch": 5.996689022837253, "grad_norm": 0.5056434869766235, "learning_rate": 5.015407789822356e-06, "loss": 0.0024, "step": 23545 }, { "epoch": 5.997962475592155, "grad_norm": 0.4689916670322418, "learning_rate": 5.011554362801376e-06, "loss": 0.002, "step": 23550 }, { "epoch": 5.999235928347058, "grad_norm": 0.7847785353660583, "learning_rate": 5.007701921649783e-06, "loss": 0.0019, "step": 23555 }, { "epoch": 6.000509381101961, "grad_norm": 0.35700780153274536, "learning_rate": 5.003850467128937e-06, "loss": 0.0024, "step": 23560 }, { "epoch": 6.001782833856864, "grad_norm": 0.1584547758102417, "learning_rate": 5.000000000000003e-06, "loss": 0.0038, "step": 23565 }, { "epoch": 6.003056286611766, "grad_norm": 0.21238592267036438, "learning_rate": 4.996150521023952e-06, "loss": 0.0029, "step": 23570 }, { "epoch": 6.0043297393666695, "grad_norm": 0.1054638996720314, "learning_rate": 4.992302030961553e-06, "loss": 0.0018, "step": 23575 }, { "epoch": 6.005603192121573, "grad_norm": 0.1998324692249298, "learning_rate": 4.988454530573395e-06, "loss": 0.0019, "step": 23580 }, { "epoch": 6.006876644876475, "grad_norm": 0.09561686217784882, "learning_rate": 4.984608020619859e-06, "loss": 0.0015, "step": 23585 }, { "epoch": 6.008150097631378, "grad_norm": 0.43148475885391235, "learning_rate": 4.980762501861129e-06, "loss": 0.0021, "step": 23590 }, { "epoch": 6.0094235503862805, "grad_norm": 0.07561642676591873, "learning_rate": 4.976917975057201e-06, "loss": 0.0012, "step": 23595 }, { "epoch": 6.010697003141184, "grad_norm": 0.07149233669042587, "learning_rate": 4.9730744409678685e-06, "loss": 0.0012, "step": 23600 }, { "epoch": 6.011970455896086, "grad_norm": 0.2533906102180481, "learning_rate": 4.969231900352729e-06, "loss": 0.0012, "step": 23605 }, { "epoch": 6.013243908650989, "grad_norm": 0.33156466484069824, "learning_rate": 4.965390353971197e-06, "loss": 0.0017, "step": 23610 }, { "epoch": 6.0145173614058915, "grad_norm": 0.3192024230957031, "learning_rate": 4.961549802582464e-06, "loss": 0.0012, "step": 23615 }, { "epoch": 6.015790814160795, "grad_norm": 0.12357386946678162, "learning_rate": 4.9577102469455514e-06, "loss": 0.0011, "step": 23620 }, { "epoch": 6.017064266915697, "grad_norm": 0.07144071906805038, "learning_rate": 4.953871687819269e-06, "loss": 0.0009, "step": 23625 }, { "epoch": 6.0183377196706, "grad_norm": 1.036564588546753, "learning_rate": 4.950034125962235e-06, "loss": 0.0015, "step": 23630 }, { "epoch": 6.019611172425503, "grad_norm": 0.3236915171146393, "learning_rate": 4.946197562132865e-06, "loss": 0.0019, "step": 23635 }, { "epoch": 6.020884625180406, "grad_norm": 0.2977951467037201, "learning_rate": 4.942361997089391e-06, "loss": 0.0014, "step": 23640 }, { "epoch": 6.022158077935309, "grad_norm": 0.17091627418994904, "learning_rate": 4.938527431589823e-06, "loss": 0.0011, "step": 23645 }, { "epoch": 6.023431530690211, "grad_norm": 0.09220495820045471, "learning_rate": 4.934693866392002e-06, "loss": 0.001, "step": 23650 }, { "epoch": 6.0247049834451145, "grad_norm": 0.469852477312088, "learning_rate": 4.930861302253551e-06, "loss": 0.0011, "step": 23655 }, { "epoch": 6.025978436200017, "grad_norm": 0.3023979663848877, "learning_rate": 4.927029739931903e-06, "loss": 0.0011, "step": 23660 }, { "epoch": 6.02725188895492, "grad_norm": 0.061338700354099274, "learning_rate": 4.9231991801842895e-06, "loss": 0.0008, "step": 23665 }, { "epoch": 6.028525341709822, "grad_norm": 0.18152384459972382, "learning_rate": 4.919369623767754e-06, "loss": 0.001, "step": 23670 }, { "epoch": 6.0297987944647256, "grad_norm": 0.2994607388973236, "learning_rate": 4.915541071439124e-06, "loss": 0.0013, "step": 23675 }, { "epoch": 6.031072247219628, "grad_norm": 0.8202417492866516, "learning_rate": 4.9117135239550465e-06, "loss": 0.0027, "step": 23680 }, { "epoch": 6.032345699974531, "grad_norm": 0.03711860999464989, "learning_rate": 4.907886982071957e-06, "loss": 0.0022, "step": 23685 }, { "epoch": 6.033619152729433, "grad_norm": 0.18818196654319763, "learning_rate": 4.9040614465461e-06, "loss": 0.0023, "step": 23690 }, { "epoch": 6.034892605484337, "grad_norm": 0.029408259317278862, "learning_rate": 4.9002369181335165e-06, "loss": 0.0031, "step": 23695 }, { "epoch": 6.036166058239239, "grad_norm": 0.3112916946411133, "learning_rate": 4.896413397590052e-06, "loss": 0.002, "step": 23700 }, { "epoch": 6.037439510994142, "grad_norm": 0.2279897928237915, "learning_rate": 4.892590885671344e-06, "loss": 0.0013, "step": 23705 }, { "epoch": 6.038712963749045, "grad_norm": 0.04078022763133049, "learning_rate": 4.888769383132853e-06, "loss": 0.001, "step": 23710 }, { "epoch": 6.039986416503948, "grad_norm": 0.4142187535762787, "learning_rate": 4.884948890729808e-06, "loss": 0.0011, "step": 23715 }, { "epoch": 6.041259869258851, "grad_norm": 0.07536479085683823, "learning_rate": 4.881129409217266e-06, "loss": 0.0013, "step": 23720 }, { "epoch": 6.042533322013753, "grad_norm": 0.50323086977005, "learning_rate": 4.877310939350069e-06, "loss": 0.0013, "step": 23725 }, { "epoch": 6.043806774768656, "grad_norm": 0.09288304299116135, "learning_rate": 4.873493481882865e-06, "loss": 0.0011, "step": 23730 }, { "epoch": 6.045080227523559, "grad_norm": 0.15812639892101288, "learning_rate": 4.869677037570096e-06, "loss": 0.0011, "step": 23735 }, { "epoch": 6.046353680278462, "grad_norm": 1.0757694244384766, "learning_rate": 4.865861607166019e-06, "loss": 0.0036, "step": 23740 }, { "epoch": 6.047627133033364, "grad_norm": 0.08667884021997452, "learning_rate": 4.862047191424665e-06, "loss": 0.0018, "step": 23745 }, { "epoch": 6.048900585788267, "grad_norm": 0.06329178810119629, "learning_rate": 4.85823379109989e-06, "loss": 0.0012, "step": 23750 }, { "epoch": 6.05017403854317, "grad_norm": 1.9252644777297974, "learning_rate": 4.854421406945336e-06, "loss": 0.0027, "step": 23755 }, { "epoch": 6.051447491298073, "grad_norm": 0.03635755926370621, "learning_rate": 4.850610039714444e-06, "loss": 0.0012, "step": 23760 }, { "epoch": 6.052720944052975, "grad_norm": 0.6263636350631714, "learning_rate": 4.846799690160455e-06, "loss": 0.0011, "step": 23765 }, { "epoch": 6.053994396807878, "grad_norm": 0.3476264774799347, "learning_rate": 4.842990359036423e-06, "loss": 0.002, "step": 23770 }, { "epoch": 6.055267849562782, "grad_norm": 0.5737224817276001, "learning_rate": 4.839182047095171e-06, "loss": 0.0017, "step": 23775 }, { "epoch": 6.056541302317684, "grad_norm": 0.25056129693984985, "learning_rate": 4.8353747550893505e-06, "loss": 0.0015, "step": 23780 }, { "epoch": 6.057814755072587, "grad_norm": 0.2943075895309448, "learning_rate": 4.8315684837713935e-06, "loss": 0.0011, "step": 23785 }, { "epoch": 6.0590882078274895, "grad_norm": 0.9909171462059021, "learning_rate": 4.827763233893537e-06, "loss": 0.0027, "step": 23790 }, { "epoch": 6.060361660582393, "grad_norm": 0.09639095515012741, "learning_rate": 4.8239590062078115e-06, "loss": 0.0012, "step": 23795 }, { "epoch": 6.061635113337295, "grad_norm": 0.5562769174575806, "learning_rate": 4.820155801466058e-06, "loss": 0.0009, "step": 23800 }, { "epoch": 6.062908566092198, "grad_norm": 0.3169366419315338, "learning_rate": 4.816353620419892e-06, "loss": 0.0017, "step": 23805 }, { "epoch": 6.0641820188471005, "grad_norm": 0.546333372592926, "learning_rate": 4.812552463820753e-06, "loss": 0.0014, "step": 23810 }, { "epoch": 6.065455471602004, "grad_norm": 0.028620397672057152, "learning_rate": 4.8087523324198614e-06, "loss": 0.0014, "step": 23815 }, { "epoch": 6.066728924356906, "grad_norm": 0.35471367835998535, "learning_rate": 4.8049532269682375e-06, "loss": 0.0009, "step": 23820 }, { "epoch": 6.068002377111809, "grad_norm": 0.4428870975971222, "learning_rate": 4.801155148216699e-06, "loss": 0.0016, "step": 23825 }, { "epoch": 6.0692758298667115, "grad_norm": 0.05855460464954376, "learning_rate": 4.797358096915875e-06, "loss": 0.0009, "step": 23830 }, { "epoch": 6.070549282621615, "grad_norm": 0.1210465133190155, "learning_rate": 4.79356207381616e-06, "loss": 0.002, "step": 23835 }, { "epoch": 6.071822735376518, "grad_norm": 0.31131958961486816, "learning_rate": 4.789767079667779e-06, "loss": 0.0028, "step": 23840 }, { "epoch": 6.07309618813142, "grad_norm": 0.9128492474555969, "learning_rate": 4.785973115220732e-06, "loss": 0.0021, "step": 23845 }, { "epoch": 6.0743696408863235, "grad_norm": 0.21414797008037567, "learning_rate": 4.782180181224826e-06, "loss": 0.0027, "step": 23850 }, { "epoch": 6.075643093641226, "grad_norm": 0.2377476543188095, "learning_rate": 4.778388278429657e-06, "loss": 0.0008, "step": 23855 }, { "epoch": 6.076916546396129, "grad_norm": 0.09709440916776657, "learning_rate": 4.774597407584624e-06, "loss": 0.001, "step": 23860 }, { "epoch": 6.078189999151031, "grad_norm": 0.1140919104218483, "learning_rate": 4.770807569438915e-06, "loss": 0.0015, "step": 23865 }, { "epoch": 6.0794634519059345, "grad_norm": 0.06437452882528305, "learning_rate": 4.7670187647415224e-06, "loss": 0.0017, "step": 23870 }, { "epoch": 6.080736904660837, "grad_norm": 0.1442648023366928, "learning_rate": 4.763230994241229e-06, "loss": 0.0014, "step": 23875 }, { "epoch": 6.08201035741574, "grad_norm": 0.05973583087325096, "learning_rate": 4.759444258686613e-06, "loss": 0.0017, "step": 23880 }, { "epoch": 6.083283810170642, "grad_norm": 0.716936469078064, "learning_rate": 4.755658558826049e-06, "loss": 0.0017, "step": 23885 }, { "epoch": 6.0845572629255456, "grad_norm": 0.07569888979196548, "learning_rate": 4.751873895407709e-06, "loss": 0.001, "step": 23890 }, { "epoch": 6.085830715680448, "grad_norm": 0.3738907277584076, "learning_rate": 4.7480902691795545e-06, "loss": 0.0011, "step": 23895 }, { "epoch": 6.087104168435351, "grad_norm": 0.10778386145830154, "learning_rate": 4.744307680889345e-06, "loss": 0.0019, "step": 23900 }, { "epoch": 6.088377621190254, "grad_norm": 0.610529899597168, "learning_rate": 4.740526131284643e-06, "loss": 0.0024, "step": 23905 }, { "epoch": 6.089651073945157, "grad_norm": 0.0788869559764862, "learning_rate": 4.736745621112793e-06, "loss": 0.0011, "step": 23910 }, { "epoch": 6.09092452670006, "grad_norm": 0.16612033545970917, "learning_rate": 4.73296615112094e-06, "loss": 0.0014, "step": 23915 }, { "epoch": 6.092197979454962, "grad_norm": 0.12893018126487732, "learning_rate": 4.7291877220560224e-06, "loss": 0.0007, "step": 23920 }, { "epoch": 6.093471432209865, "grad_norm": 0.12695401906967163, "learning_rate": 4.725410334664775e-06, "loss": 0.0023, "step": 23925 }, { "epoch": 6.094744884964768, "grad_norm": 1.224401831626892, "learning_rate": 4.721633989693719e-06, "loss": 0.0036, "step": 23930 }, { "epoch": 6.096018337719671, "grad_norm": 0.45676279067993164, "learning_rate": 4.71785868788919e-06, "loss": 0.0019, "step": 23935 }, { "epoch": 6.097291790474573, "grad_norm": 0.19277389347553253, "learning_rate": 4.714084429997284e-06, "loss": 0.0013, "step": 23940 }, { "epoch": 6.098565243229476, "grad_norm": 0.03531356528401375, "learning_rate": 4.710311216763924e-06, "loss": 0.0009, "step": 23945 }, { "epoch": 6.099838695984379, "grad_norm": 0.2074032425880432, "learning_rate": 4.706539048934808e-06, "loss": 0.001, "step": 23950 }, { "epoch": 6.101112148739282, "grad_norm": 0.11487238854169846, "learning_rate": 4.702767927255432e-06, "loss": 0.0021, "step": 23955 }, { "epoch": 6.102385601494184, "grad_norm": 0.3834715187549591, "learning_rate": 4.698997852471083e-06, "loss": 0.0011, "step": 23960 }, { "epoch": 6.103659054249087, "grad_norm": 0.15524716675281525, "learning_rate": 4.695228825326845e-06, "loss": 0.0033, "step": 23965 }, { "epoch": 6.104932507003991, "grad_norm": 0.07961779832839966, "learning_rate": 4.691460846567588e-06, "loss": 0.0009, "step": 23970 }, { "epoch": 6.106205959758893, "grad_norm": 0.3410117030143738, "learning_rate": 4.687693916937993e-06, "loss": 0.0022, "step": 23975 }, { "epoch": 6.107479412513796, "grad_norm": 0.764045774936676, "learning_rate": 4.6839280371825035e-06, "loss": 0.0009, "step": 23980 }, { "epoch": 6.108752865268698, "grad_norm": 0.39994189143180847, "learning_rate": 4.680163208045385e-06, "loss": 0.0013, "step": 23985 }, { "epoch": 6.110026318023602, "grad_norm": 0.23552767932415009, "learning_rate": 4.676399430270678e-06, "loss": 0.0009, "step": 23990 }, { "epoch": 6.111299770778504, "grad_norm": 0.1341448724269867, "learning_rate": 4.672636704602221e-06, "loss": 0.0018, "step": 23995 }, { "epoch": 6.112573223533407, "grad_norm": 0.3858896493911743, "learning_rate": 4.668875031783639e-06, "loss": 0.0017, "step": 24000 }, { "epoch": 6.1138466762883095, "grad_norm": 0.09384500980377197, "learning_rate": 4.665114412558366e-06, "loss": 0.0011, "step": 24005 }, { "epoch": 6.115120129043213, "grad_norm": 0.03156508505344391, "learning_rate": 4.661354847669599e-06, "loss": 0.001, "step": 24010 }, { "epoch": 6.116393581798115, "grad_norm": 0.6553557515144348, "learning_rate": 4.657596337860356e-06, "loss": 0.0015, "step": 24015 }, { "epoch": 6.117667034553018, "grad_norm": 0.6277816295623779, "learning_rate": 4.653838883873427e-06, "loss": 0.0015, "step": 24020 }, { "epoch": 6.1189404873079205, "grad_norm": 0.37803253531455994, "learning_rate": 4.6500824864514024e-06, "loss": 0.0008, "step": 24025 }, { "epoch": 6.120213940062824, "grad_norm": 0.9895496368408203, "learning_rate": 4.6463271463366565e-06, "loss": 0.0023, "step": 24030 }, { "epoch": 6.121487392817726, "grad_norm": 0.2916771471500397, "learning_rate": 4.642572864271371e-06, "loss": 0.0015, "step": 24035 }, { "epoch": 6.122760845572629, "grad_norm": 0.19210325181484222, "learning_rate": 4.63881964099749e-06, "loss": 0.001, "step": 24040 }, { "epoch": 6.124034298327532, "grad_norm": 0.1041603609919548, "learning_rate": 4.635067477256779e-06, "loss": 0.0014, "step": 24045 }, { "epoch": 6.125307751082435, "grad_norm": 0.2978006899356842, "learning_rate": 4.631316373790775e-06, "loss": 0.002, "step": 24050 }, { "epoch": 6.126581203837338, "grad_norm": 0.33590278029441833, "learning_rate": 4.627566331340812e-06, "loss": 0.0007, "step": 24055 }, { "epoch": 6.12785465659224, "grad_norm": 0.937678873538971, "learning_rate": 4.623817350648007e-06, "loss": 0.0013, "step": 24060 }, { "epoch": 6.1291281093471435, "grad_norm": 0.3698556423187256, "learning_rate": 4.6200694324532856e-06, "loss": 0.001, "step": 24065 }, { "epoch": 6.130401562102046, "grad_norm": 0.07118824869394302, "learning_rate": 4.616322577497337e-06, "loss": 0.0015, "step": 24070 }, { "epoch": 6.131675014856949, "grad_norm": 0.23044514656066895, "learning_rate": 4.612576786520665e-06, "loss": 0.0013, "step": 24075 }, { "epoch": 6.132948467611851, "grad_norm": 1.1398251056671143, "learning_rate": 4.608832060263548e-06, "loss": 0.004, "step": 24080 }, { "epoch": 6.1342219203667545, "grad_norm": 0.4799223244190216, "learning_rate": 4.60508839946606e-06, "loss": 0.0014, "step": 24085 }, { "epoch": 6.135495373121657, "grad_norm": 0.5568385124206543, "learning_rate": 4.601345804868057e-06, "loss": 0.0012, "step": 24090 }, { "epoch": 6.13676882587656, "grad_norm": 0.2792600989341736, "learning_rate": 4.5976042772092036e-06, "loss": 0.0014, "step": 24095 }, { "epoch": 6.138042278631462, "grad_norm": 0.09631266444921494, "learning_rate": 4.593863817228922e-06, "loss": 0.0024, "step": 24100 }, { "epoch": 6.1393157313863655, "grad_norm": 0.5461459755897522, "learning_rate": 4.590124425666456e-06, "loss": 0.0013, "step": 24105 }, { "epoch": 6.140589184141269, "grad_norm": 0.4495391845703125, "learning_rate": 4.586386103260818e-06, "loss": 0.0018, "step": 24110 }, { "epoch": 6.141862636896171, "grad_norm": 0.27191925048828125, "learning_rate": 4.582648850750815e-06, "loss": 0.0017, "step": 24115 }, { "epoch": 6.143136089651074, "grad_norm": 0.16806845366954803, "learning_rate": 4.578912668875039e-06, "loss": 0.0016, "step": 24120 }, { "epoch": 6.144409542405977, "grad_norm": 0.23137369751930237, "learning_rate": 4.575177558371886e-06, "loss": 0.0014, "step": 24125 }, { "epoch": 6.14568299516088, "grad_norm": 0.05103620886802673, "learning_rate": 4.5714435199795106e-06, "loss": 0.0016, "step": 24130 }, { "epoch": 6.146956447915782, "grad_norm": 0.07016859948635101, "learning_rate": 4.567710554435887e-06, "loss": 0.001, "step": 24135 }, { "epoch": 6.148229900670685, "grad_norm": 0.0566793829202652, "learning_rate": 4.563978662478757e-06, "loss": 0.0013, "step": 24140 }, { "epoch": 6.149503353425588, "grad_norm": 0.6908902525901794, "learning_rate": 4.5602478448456585e-06, "loss": 0.0021, "step": 24145 }, { "epoch": 6.150776806180491, "grad_norm": 0.2425536960363388, "learning_rate": 4.55651810227391e-06, "loss": 0.0013, "step": 24150 }, { "epoch": 6.152050258935393, "grad_norm": 0.0654391273856163, "learning_rate": 4.552789435500635e-06, "loss": 0.0017, "step": 24155 }, { "epoch": 6.153323711690296, "grad_norm": 0.1106349304318428, "learning_rate": 4.549061845262718e-06, "loss": 0.0013, "step": 24160 }, { "epoch": 6.154597164445199, "grad_norm": 0.11398033797740936, "learning_rate": 4.545335332296853e-06, "loss": 0.0008, "step": 24165 }, { "epoch": 6.155870617200102, "grad_norm": 0.0627758651971817, "learning_rate": 4.541609897339512e-06, "loss": 0.0012, "step": 24170 }, { "epoch": 6.157144069955005, "grad_norm": 0.5365540385246277, "learning_rate": 4.537885541126953e-06, "loss": 0.0021, "step": 24175 }, { "epoch": 6.158417522709907, "grad_norm": 0.014677859842777252, "learning_rate": 4.534162264395223e-06, "loss": 0.0008, "step": 24180 }, { "epoch": 6.159690975464811, "grad_norm": 0.08540328592061996, "learning_rate": 4.530440067880157e-06, "loss": 0.0012, "step": 24185 }, { "epoch": 6.160964428219713, "grad_norm": 0.06699799001216888, "learning_rate": 4.526718952317368e-06, "loss": 0.0008, "step": 24190 }, { "epoch": 6.162237880974616, "grad_norm": 0.5985924601554871, "learning_rate": 4.5229989184422726e-06, "loss": 0.0017, "step": 24195 }, { "epoch": 6.163511333729518, "grad_norm": 0.11007368564605713, "learning_rate": 4.519279966990059e-06, "loss": 0.0019, "step": 24200 }, { "epoch": 6.164784786484422, "grad_norm": 0.32131636142730713, "learning_rate": 4.515562098695704e-06, "loss": 0.0015, "step": 24205 }, { "epoch": 6.166058239239324, "grad_norm": 0.1725596785545349, "learning_rate": 4.511845314293974e-06, "loss": 0.001, "step": 24210 }, { "epoch": 6.167331691994227, "grad_norm": 0.24005241692066193, "learning_rate": 4.5081296145194185e-06, "loss": 0.0009, "step": 24215 }, { "epoch": 6.1686051447491295, "grad_norm": 0.09538282454013824, "learning_rate": 4.504415000106373e-06, "loss": 0.0008, "step": 24220 }, { "epoch": 6.169878597504033, "grad_norm": 0.4252987205982208, "learning_rate": 4.5007014717889586e-06, "loss": 0.0038, "step": 24225 }, { "epoch": 6.171152050258935, "grad_norm": 0.7367297410964966, "learning_rate": 4.49698903030108e-06, "loss": 0.0013, "step": 24230 }, { "epoch": 6.172425503013838, "grad_norm": 0.10871200263500214, "learning_rate": 4.4932776763764356e-06, "loss": 0.0016, "step": 24235 }, { "epoch": 6.173698955768741, "grad_norm": 0.1696961671113968, "learning_rate": 4.489567410748498e-06, "loss": 0.0016, "step": 24240 }, { "epoch": 6.174972408523644, "grad_norm": 0.14765676856040955, "learning_rate": 4.48585823415053e-06, "loss": 0.0027, "step": 24245 }, { "epoch": 6.176245861278547, "grad_norm": 0.19900284707546234, "learning_rate": 4.482150147315577e-06, "loss": 0.002, "step": 24250 }, { "epoch": 6.177519314033449, "grad_norm": 0.09109002351760864, "learning_rate": 4.4784431509764725e-06, "loss": 0.0012, "step": 24255 }, { "epoch": 6.178792766788352, "grad_norm": 0.4572557508945465, "learning_rate": 4.474737245865831e-06, "loss": 0.0019, "step": 24260 }, { "epoch": 6.180066219543255, "grad_norm": 0.014494095928966999, "learning_rate": 4.4710324327160494e-06, "loss": 0.0008, "step": 24265 }, { "epoch": 6.181339672298158, "grad_norm": 0.07961031794548035, "learning_rate": 4.467328712259324e-06, "loss": 0.0015, "step": 24270 }, { "epoch": 6.18261312505306, "grad_norm": 0.049982067197561264, "learning_rate": 4.463626085227606e-06, "loss": 0.0008, "step": 24275 }, { "epoch": 6.1838865778079635, "grad_norm": 1.083802342414856, "learning_rate": 4.45992455235266e-06, "loss": 0.0019, "step": 24280 }, { "epoch": 6.185160030562866, "grad_norm": 0.12660358846187592, "learning_rate": 4.456224114366019e-06, "loss": 0.0006, "step": 24285 }, { "epoch": 6.186433483317769, "grad_norm": 0.7673035860061646, "learning_rate": 4.452524771999002e-06, "loss": 0.0009, "step": 24290 }, { "epoch": 6.187706936072671, "grad_norm": 0.5018845796585083, "learning_rate": 4.448826525982709e-06, "loss": 0.0021, "step": 24295 }, { "epoch": 6.1889803888275745, "grad_norm": 0.08165518939495087, "learning_rate": 4.445129377048037e-06, "loss": 0.0015, "step": 24300 }, { "epoch": 6.190253841582478, "grad_norm": 0.31610915064811707, "learning_rate": 4.441433325925642e-06, "loss": 0.0011, "step": 24305 }, { "epoch": 6.19152729433738, "grad_norm": 0.19506695866584778, "learning_rate": 4.437738373345986e-06, "loss": 0.001, "step": 24310 }, { "epoch": 6.192800747092283, "grad_norm": 0.24011129140853882, "learning_rate": 4.434044520039302e-06, "loss": 0.001, "step": 24315 }, { "epoch": 6.1940741998471855, "grad_norm": 0.25521743297576904, "learning_rate": 4.430351766735609e-06, "loss": 0.0012, "step": 24320 }, { "epoch": 6.195347652602089, "grad_norm": 0.2273426353931427, "learning_rate": 4.4266601141647024e-06, "loss": 0.0012, "step": 24325 }, { "epoch": 6.196621105356991, "grad_norm": 0.09332738071680069, "learning_rate": 4.422969563056178e-06, "loss": 0.0014, "step": 24330 }, { "epoch": 6.197894558111894, "grad_norm": 0.1349940448999405, "learning_rate": 4.4192801141393884e-06, "loss": 0.0014, "step": 24335 }, { "epoch": 6.199168010866797, "grad_norm": 0.1241329237818718, "learning_rate": 4.415591768143489e-06, "loss": 0.0027, "step": 24340 }, { "epoch": 6.2004414636217, "grad_norm": 0.2993178963661194, "learning_rate": 4.411904525797408e-06, "loss": 0.0015, "step": 24345 }, { "epoch": 6.201714916376602, "grad_norm": 0.07088778913021088, "learning_rate": 4.408218387829858e-06, "loss": 0.0016, "step": 24350 }, { "epoch": 6.202988369131505, "grad_norm": 0.10630303621292114, "learning_rate": 4.404533354969327e-06, "loss": 0.0008, "step": 24355 }, { "epoch": 6.204261821886408, "grad_norm": 0.0841108113527298, "learning_rate": 4.400849427944103e-06, "loss": 0.0012, "step": 24360 }, { "epoch": 6.205535274641311, "grad_norm": 0.16163592040538788, "learning_rate": 4.397166607482226e-06, "loss": 0.0015, "step": 24365 }, { "epoch": 6.206808727396213, "grad_norm": 0.20124410092830658, "learning_rate": 4.393484894311547e-06, "loss": 0.0022, "step": 24370 }, { "epoch": 6.208082180151116, "grad_norm": 0.22066079080104828, "learning_rate": 4.389804289159681e-06, "loss": 0.0014, "step": 24375 }, { "epoch": 6.2093556329060196, "grad_norm": 0.637412428855896, "learning_rate": 4.3861247927540266e-06, "loss": 0.0013, "step": 24380 }, { "epoch": 6.210629085660922, "grad_norm": 0.49945691227912903, "learning_rate": 4.382446405821762e-06, "loss": 0.0015, "step": 24385 }, { "epoch": 6.211902538415825, "grad_norm": 0.3143484592437744, "learning_rate": 4.3787691290898605e-06, "loss": 0.0012, "step": 24390 }, { "epoch": 6.213175991170727, "grad_norm": 0.26319020986557007, "learning_rate": 4.375092963285049e-06, "loss": 0.001, "step": 24395 }, { "epoch": 6.214449443925631, "grad_norm": 0.19775009155273438, "learning_rate": 4.3714179091338615e-06, "loss": 0.0014, "step": 24400 }, { "epoch": 6.215722896680533, "grad_norm": 0.4431784152984619, "learning_rate": 4.367743967362598e-06, "loss": 0.0018, "step": 24405 }, { "epoch": 6.216996349435436, "grad_norm": 0.17816051840782166, "learning_rate": 4.3640711386973415e-06, "loss": 0.0016, "step": 24410 }, { "epoch": 6.218269802190338, "grad_norm": 0.7157361507415771, "learning_rate": 4.36039942386395e-06, "loss": 0.0012, "step": 24415 }, { "epoch": 6.219543254945242, "grad_norm": 0.3669544756412506, "learning_rate": 4.35672882358808e-06, "loss": 0.0007, "step": 24420 }, { "epoch": 6.220816707700144, "grad_norm": 0.4273756742477417, "learning_rate": 4.3530593385951384e-06, "loss": 0.0014, "step": 24425 }, { "epoch": 6.222090160455047, "grad_norm": 0.2855026125907898, "learning_rate": 4.349390969610339e-06, "loss": 0.0011, "step": 24430 }, { "epoch": 6.2233636132099495, "grad_norm": 0.157048299908638, "learning_rate": 4.3457237173586584e-06, "loss": 0.0013, "step": 24435 }, { "epoch": 6.224637065964853, "grad_norm": 0.6919676661491394, "learning_rate": 4.342057582564862e-06, "loss": 0.0012, "step": 24440 }, { "epoch": 6.225910518719756, "grad_norm": 0.24407657980918884, "learning_rate": 4.338392565953481e-06, "loss": 0.0025, "step": 24445 }, { "epoch": 6.227183971474658, "grad_norm": 0.07544821500778198, "learning_rate": 4.33472866824885e-06, "loss": 0.0016, "step": 24450 }, { "epoch": 6.228457424229561, "grad_norm": 0.11836795508861542, "learning_rate": 4.33106589017505e-06, "loss": 0.0011, "step": 24455 }, { "epoch": 6.229730876984464, "grad_norm": 0.20554126799106598, "learning_rate": 4.327404232455971e-06, "loss": 0.0009, "step": 24460 }, { "epoch": 6.231004329739367, "grad_norm": 0.21906115114688873, "learning_rate": 4.323743695815262e-06, "loss": 0.0017, "step": 24465 }, { "epoch": 6.232277782494269, "grad_norm": 0.2749914228916168, "learning_rate": 4.32008428097636e-06, "loss": 0.0014, "step": 24470 }, { "epoch": 6.233551235249172, "grad_norm": 0.014567673206329346, "learning_rate": 4.316425988662476e-06, "loss": 0.0014, "step": 24475 }, { "epoch": 6.234824688004075, "grad_norm": 0.40974023938179016, "learning_rate": 4.3127688195966e-06, "loss": 0.0014, "step": 24480 }, { "epoch": 6.236098140758978, "grad_norm": 0.0637388676404953, "learning_rate": 4.309112774501497e-06, "loss": 0.0017, "step": 24485 }, { "epoch": 6.23737159351388, "grad_norm": 0.292631059885025, "learning_rate": 4.305457854099725e-06, "loss": 0.0025, "step": 24490 }, { "epoch": 6.2386450462687835, "grad_norm": 0.412550151348114, "learning_rate": 4.3018040591135935e-06, "loss": 0.0017, "step": 24495 }, { "epoch": 6.239918499023686, "grad_norm": 0.1534014195203781, "learning_rate": 4.2981513902652135e-06, "loss": 0.0011, "step": 24500 }, { "epoch": 6.241191951778589, "grad_norm": 0.6024942398071289, "learning_rate": 4.294499848276461e-06, "loss": 0.0036, "step": 24505 }, { "epoch": 6.242465404533492, "grad_norm": 0.042047467082738876, "learning_rate": 4.290849433868993e-06, "loss": 0.0014, "step": 24510 }, { "epoch": 6.2437388572883945, "grad_norm": 0.07688385993242264, "learning_rate": 4.287200147764243e-06, "loss": 0.0022, "step": 24515 }, { "epoch": 6.245012310043298, "grad_norm": 0.07308317720890045, "learning_rate": 4.283551990683422e-06, "loss": 0.0013, "step": 24520 }, { "epoch": 6.2462857627982, "grad_norm": 0.30937156081199646, "learning_rate": 4.279904963347512e-06, "loss": 0.0013, "step": 24525 }, { "epoch": 6.247559215553103, "grad_norm": 0.12042021751403809, "learning_rate": 4.276259066477285e-06, "loss": 0.001, "step": 24530 }, { "epoch": 6.2488326683080055, "grad_norm": 0.1988590806722641, "learning_rate": 4.272614300793279e-06, "loss": 0.0016, "step": 24535 }, { "epoch": 6.250106121062909, "grad_norm": 0.10445544868707657, "learning_rate": 4.26897066701581e-06, "loss": 0.0015, "step": 24540 }, { "epoch": 6.251379573817811, "grad_norm": 0.775216817855835, "learning_rate": 4.265328165864972e-06, "loss": 0.0021, "step": 24545 }, { "epoch": 6.252653026572714, "grad_norm": 0.5312299728393555, "learning_rate": 4.261686798060636e-06, "loss": 0.0012, "step": 24550 }, { "epoch": 6.253926479327617, "grad_norm": 0.09171037375926971, "learning_rate": 4.258046564322446e-06, "loss": 0.0014, "step": 24555 }, { "epoch": 6.25519993208252, "grad_norm": 0.6202635765075684, "learning_rate": 4.2544074653698186e-06, "loss": 0.0019, "step": 24560 }, { "epoch": 6.256473384837422, "grad_norm": 0.1654689759016037, "learning_rate": 4.250769501921961e-06, "loss": 0.001, "step": 24565 }, { "epoch": 6.257746837592325, "grad_norm": 0.3037109673023224, "learning_rate": 4.24713267469784e-06, "loss": 0.0021, "step": 24570 }, { "epoch": 6.2590202903472285, "grad_norm": 0.19869737327098846, "learning_rate": 4.243496984416206e-06, "loss": 0.0007, "step": 24575 }, { "epoch": 6.260293743102131, "grad_norm": 0.14236980676651, "learning_rate": 4.23986243179558e-06, "loss": 0.001, "step": 24580 }, { "epoch": 6.261567195857034, "grad_norm": 0.2728886008262634, "learning_rate": 4.236229017554264e-06, "loss": 0.0028, "step": 24585 }, { "epoch": 6.262840648611936, "grad_norm": 0.4856249690055847, "learning_rate": 4.232596742410327e-06, "loss": 0.002, "step": 24590 }, { "epoch": 6.2641141013668395, "grad_norm": 0.14482198655605316, "learning_rate": 4.228965607081627e-06, "loss": 0.002, "step": 24595 }, { "epoch": 6.265387554121742, "grad_norm": 0.2016078233718872, "learning_rate": 4.225335612285773e-06, "loss": 0.0022, "step": 24600 }, { "epoch": 6.266661006876645, "grad_norm": 0.0745006576180458, "learning_rate": 4.221706758740175e-06, "loss": 0.0018, "step": 24605 }, { "epoch": 6.267934459631547, "grad_norm": 0.1618187129497528, "learning_rate": 4.218079047162001e-06, "loss": 0.0015, "step": 24610 }, { "epoch": 6.269207912386451, "grad_norm": 1.2239168882369995, "learning_rate": 4.214452478268199e-06, "loss": 0.0012, "step": 24615 }, { "epoch": 6.270481365141353, "grad_norm": 0.113683320581913, "learning_rate": 4.210827052775486e-06, "loss": 0.0024, "step": 24620 }, { "epoch": 6.271754817896256, "grad_norm": 0.7583709955215454, "learning_rate": 4.207202771400366e-06, "loss": 0.0016, "step": 24625 }, { "epoch": 6.273028270651158, "grad_norm": 0.1379140019416809, "learning_rate": 4.203579634859096e-06, "loss": 0.0012, "step": 24630 }, { "epoch": 6.274301723406062, "grad_norm": 0.2557721436023712, "learning_rate": 4.199957643867727e-06, "loss": 0.002, "step": 24635 }, { "epoch": 6.275575176160965, "grad_norm": 0.03375590220093727, "learning_rate": 4.196336799142072e-06, "loss": 0.001, "step": 24640 }, { "epoch": 6.276848628915867, "grad_norm": 0.11867669224739075, "learning_rate": 4.192717101397723e-06, "loss": 0.0016, "step": 24645 }, { "epoch": 6.27812208167077, "grad_norm": 0.12600722908973694, "learning_rate": 4.189098551350036e-06, "loss": 0.0014, "step": 24650 }, { "epoch": 6.279395534425673, "grad_norm": 0.022542988881468773, "learning_rate": 4.18548114971416e-06, "loss": 0.0023, "step": 24655 }, { "epoch": 6.280668987180576, "grad_norm": 0.12668117880821228, "learning_rate": 4.181864897204989e-06, "loss": 0.0011, "step": 24660 }, { "epoch": 6.281942439935478, "grad_norm": 0.14617636799812317, "learning_rate": 4.1782497945372155e-06, "loss": 0.0019, "step": 24665 }, { "epoch": 6.283215892690381, "grad_norm": 0.16789045929908752, "learning_rate": 4.174635842425293e-06, "loss": 0.001, "step": 24670 }, { "epoch": 6.284489345445284, "grad_norm": 0.6814597845077515, "learning_rate": 4.171023041583446e-06, "loss": 0.0018, "step": 24675 }, { "epoch": 6.285762798200187, "grad_norm": 0.34564390778541565, "learning_rate": 4.167411392725672e-06, "loss": 0.0023, "step": 24680 }, { "epoch": 6.287036250955089, "grad_norm": 0.13230681419372559, "learning_rate": 4.163800896565755e-06, "loss": 0.0007, "step": 24685 }, { "epoch": 6.288309703709992, "grad_norm": 0.4673471450805664, "learning_rate": 4.160191553817223e-06, "loss": 0.0019, "step": 24690 }, { "epoch": 6.289583156464895, "grad_norm": 0.13905023038387299, "learning_rate": 4.156583365193406e-06, "loss": 0.0012, "step": 24695 }, { "epoch": 6.290856609219798, "grad_norm": 0.5488401651382446, "learning_rate": 4.152976331407387e-06, "loss": 0.0014, "step": 24700 }, { "epoch": 6.292130061974701, "grad_norm": 0.20512652397155762, "learning_rate": 4.149370453172025e-06, "loss": 0.0016, "step": 24705 }, { "epoch": 6.2934035147296035, "grad_norm": 0.26397931575775146, "learning_rate": 4.145765731199951e-06, "loss": 0.0012, "step": 24710 }, { "epoch": 6.294676967484507, "grad_norm": 0.2390698343515396, "learning_rate": 4.142162166203577e-06, "loss": 0.0015, "step": 24715 }, { "epoch": 6.295950420239409, "grad_norm": 0.06802000105381012, "learning_rate": 4.1385597588950635e-06, "loss": 0.0006, "step": 24720 }, { "epoch": 6.297223872994312, "grad_norm": 0.049623794853687286, "learning_rate": 4.134958509986372e-06, "loss": 0.0016, "step": 24725 }, { "epoch": 6.2984973257492145, "grad_norm": 0.39312484860420227, "learning_rate": 4.131358420189203e-06, "loss": 0.0016, "step": 24730 }, { "epoch": 6.299770778504118, "grad_norm": 0.42812052369117737, "learning_rate": 4.127759490215057e-06, "loss": 0.0013, "step": 24735 }, { "epoch": 6.30104423125902, "grad_norm": 0.05614875257015228, "learning_rate": 4.12416172077519e-06, "loss": 0.0022, "step": 24740 }, { "epoch": 6.302317684013923, "grad_norm": 0.2916589379310608, "learning_rate": 4.120565112580627e-06, "loss": 0.0007, "step": 24745 }, { "epoch": 6.3035911367688255, "grad_norm": 0.19211779534816742, "learning_rate": 4.116969666342168e-06, "loss": 0.0004, "step": 24750 }, { "epoch": 6.304864589523729, "grad_norm": 0.1571478694677353, "learning_rate": 4.113375382770392e-06, "loss": 0.0013, "step": 24755 }, { "epoch": 6.306138042278631, "grad_norm": 0.4811771810054779, "learning_rate": 4.109782262575625e-06, "loss": 0.002, "step": 24760 }, { "epoch": 6.307411495033534, "grad_norm": 0.41836798191070557, "learning_rate": 4.10619030646799e-06, "loss": 0.0008, "step": 24765 }, { "epoch": 6.3086849477884375, "grad_norm": 0.5237107276916504, "learning_rate": 4.102599515157361e-06, "loss": 0.0016, "step": 24770 }, { "epoch": 6.30995840054334, "grad_norm": 0.47293153405189514, "learning_rate": 4.099009889353391e-06, "loss": 0.0019, "step": 24775 }, { "epoch": 6.311231853298243, "grad_norm": 0.2074776589870453, "learning_rate": 4.0954214297654945e-06, "loss": 0.0016, "step": 24780 }, { "epoch": 6.312505306053145, "grad_norm": 0.21101930737495422, "learning_rate": 4.091834137102872e-06, "loss": 0.0011, "step": 24785 }, { "epoch": 6.3137787588080485, "grad_norm": 0.5483812093734741, "learning_rate": 4.088248012074468e-06, "loss": 0.0017, "step": 24790 }, { "epoch": 6.315052211562951, "grad_norm": 0.07556889951229095, "learning_rate": 4.084663055389021e-06, "loss": 0.001, "step": 24795 }, { "epoch": 6.316325664317854, "grad_norm": 0.4713914394378662, "learning_rate": 4.081079267755024e-06, "loss": 0.0033, "step": 24800 }, { "epoch": 6.317599117072756, "grad_norm": 0.11505292356014252, "learning_rate": 4.077496649880744e-06, "loss": 0.0016, "step": 24805 }, { "epoch": 6.3188725698276595, "grad_norm": 0.15024282038211823, "learning_rate": 4.073915202474212e-06, "loss": 0.0009, "step": 24810 }, { "epoch": 6.320146022582562, "grad_norm": 0.5424999594688416, "learning_rate": 4.070334926243241e-06, "loss": 0.0031, "step": 24815 }, { "epoch": 6.321419475337465, "grad_norm": 0.15357932448387146, "learning_rate": 4.066755821895392e-06, "loss": 0.0017, "step": 24820 }, { "epoch": 6.322692928092367, "grad_norm": 0.1458486169576645, "learning_rate": 4.063177890138012e-06, "loss": 0.0016, "step": 24825 }, { "epoch": 6.323966380847271, "grad_norm": 0.48157432675361633, "learning_rate": 4.059601131678208e-06, "loss": 0.0015, "step": 24830 }, { "epoch": 6.325239833602174, "grad_norm": 0.5587522983551025, "learning_rate": 4.056025547222858e-06, "loss": 0.0029, "step": 24835 }, { "epoch": 6.326513286357076, "grad_norm": 0.12681682407855988, "learning_rate": 4.052451137478606e-06, "loss": 0.0011, "step": 24840 }, { "epoch": 6.327786739111979, "grad_norm": 0.20367147028446198, "learning_rate": 4.048877903151863e-06, "loss": 0.0012, "step": 24845 }, { "epoch": 6.329060191866882, "grad_norm": 0.16677476465702057, "learning_rate": 4.0453058449488095e-06, "loss": 0.001, "step": 24850 }, { "epoch": 6.330333644621785, "grad_norm": 0.18289418518543243, "learning_rate": 4.041734963575397e-06, "loss": 0.0014, "step": 24855 }, { "epoch": 6.331607097376687, "grad_norm": 0.13019368052482605, "learning_rate": 4.038165259737338e-06, "loss": 0.0008, "step": 24860 }, { "epoch": 6.33288055013159, "grad_norm": 0.7638862729072571, "learning_rate": 4.034596734140116e-06, "loss": 0.0022, "step": 24865 }, { "epoch": 6.334154002886493, "grad_norm": 0.14611080288887024, "learning_rate": 4.03102938748898e-06, "loss": 0.0018, "step": 24870 }, { "epoch": 6.335427455641396, "grad_norm": 0.25792768597602844, "learning_rate": 4.027463220488949e-06, "loss": 0.0013, "step": 24875 }, { "epoch": 6.336700908396298, "grad_norm": 0.2642567455768585, "learning_rate": 4.023898233844803e-06, "loss": 0.0026, "step": 24880 }, { "epoch": 6.337974361151201, "grad_norm": 0.0638241246342659, "learning_rate": 4.020334428261092e-06, "loss": 0.0006, "step": 24885 }, { "epoch": 6.339247813906104, "grad_norm": 0.297759473323822, "learning_rate": 4.016771804442138e-06, "loss": 0.0015, "step": 24890 }, { "epoch": 6.340521266661007, "grad_norm": 0.15377971529960632, "learning_rate": 4.013210363092021e-06, "loss": 0.002, "step": 24895 }, { "epoch": 6.34179471941591, "grad_norm": 0.16179564595222473, "learning_rate": 4.009650104914593e-06, "loss": 0.0013, "step": 24900 }, { "epoch": 6.343068172170812, "grad_norm": 0.05676034465432167, "learning_rate": 4.006091030613467e-06, "loss": 0.0015, "step": 24905 }, { "epoch": 6.344341624925716, "grad_norm": 0.25684261322021484, "learning_rate": 4.002533140892026e-06, "loss": 0.001, "step": 24910 }, { "epoch": 6.345615077680618, "grad_norm": 0.444742351770401, "learning_rate": 3.998976436453416e-06, "loss": 0.0012, "step": 24915 }, { "epoch": 6.346888530435521, "grad_norm": 0.23115606606006622, "learning_rate": 3.995420918000558e-06, "loss": 0.0008, "step": 24920 }, { "epoch": 6.3481619831904235, "grad_norm": 0.3168705701828003, "learning_rate": 3.9918665862361185e-06, "loss": 0.0009, "step": 24925 }, { "epoch": 6.349435435945327, "grad_norm": 0.13666115701198578, "learning_rate": 3.9883134418625535e-06, "loss": 0.0012, "step": 24930 }, { "epoch": 6.350708888700229, "grad_norm": 0.10810616612434387, "learning_rate": 3.984761485582068e-06, "loss": 0.0008, "step": 24935 }, { "epoch": 6.351982341455132, "grad_norm": 0.03291069716215134, "learning_rate": 3.981210718096638e-06, "loss": 0.0012, "step": 24940 }, { "epoch": 6.3532557942100345, "grad_norm": 0.36277899146080017, "learning_rate": 3.977661140108e-06, "loss": 0.002, "step": 24945 }, { "epoch": 6.354529246964938, "grad_norm": 0.15113312005996704, "learning_rate": 3.974112752317671e-06, "loss": 0.0014, "step": 24950 }, { "epoch": 6.35580269971984, "grad_norm": 0.548541784286499, "learning_rate": 3.970565555426904e-06, "loss": 0.001, "step": 24955 }, { "epoch": 6.357076152474743, "grad_norm": 0.30783042311668396, "learning_rate": 3.967019550136746e-06, "loss": 0.0013, "step": 24960 }, { "epoch": 6.358349605229646, "grad_norm": 0.03031071089208126, "learning_rate": 3.963474737147993e-06, "loss": 0.0008, "step": 24965 }, { "epoch": 6.359623057984549, "grad_norm": 0.08134100586175919, "learning_rate": 3.959931117161207e-06, "loss": 0.0017, "step": 24970 }, { "epoch": 6.360896510739452, "grad_norm": 0.3056614398956299, "learning_rate": 3.9563886908767166e-06, "loss": 0.0017, "step": 24975 }, { "epoch": 6.362169963494354, "grad_norm": 0.1692395657300949, "learning_rate": 3.9528474589946145e-06, "loss": 0.0011, "step": 24980 }, { "epoch": 6.3634434162492575, "grad_norm": 0.06798675656318665, "learning_rate": 3.949307422214751e-06, "loss": 0.0012, "step": 24985 }, { "epoch": 6.36471686900416, "grad_norm": 0.17098402976989746, "learning_rate": 3.9457685812367566e-06, "loss": 0.0016, "step": 24990 }, { "epoch": 6.365990321759063, "grad_norm": 0.08849765360355377, "learning_rate": 3.94223093676e-06, "loss": 0.0017, "step": 24995 }, { "epoch": 6.367263774513965, "grad_norm": 0.35103854537010193, "learning_rate": 3.93869448948364e-06, "loss": 0.0012, "step": 25000 }, { "epoch": 6.3685372272688685, "grad_norm": 0.1052624061703682, "learning_rate": 3.935159240106583e-06, "loss": 0.0017, "step": 25005 }, { "epoch": 6.369810680023771, "grad_norm": 0.3643682599067688, "learning_rate": 3.931625189327501e-06, "loss": 0.0014, "step": 25010 }, { "epoch": 6.371084132778674, "grad_norm": 0.35580921173095703, "learning_rate": 3.9280923378448275e-06, "loss": 0.0012, "step": 25015 }, { "epoch": 6.372357585533576, "grad_norm": 0.5556514263153076, "learning_rate": 3.924560686356773e-06, "loss": 0.0014, "step": 25020 }, { "epoch": 6.3736310382884795, "grad_norm": 1.2029873132705688, "learning_rate": 3.9210302355612854e-06, "loss": 0.0013, "step": 25025 }, { "epoch": 6.374904491043383, "grad_norm": 0.03949911519885063, "learning_rate": 3.9175009861561e-06, "loss": 0.0016, "step": 25030 }, { "epoch": 6.376177943798285, "grad_norm": 0.07436388731002808, "learning_rate": 3.9139729388387015e-06, "loss": 0.0021, "step": 25035 }, { "epoch": 6.377451396553188, "grad_norm": 0.3149266839027405, "learning_rate": 3.910446094306341e-06, "loss": 0.002, "step": 25040 }, { "epoch": 6.378724849308091, "grad_norm": 0.26116496324539185, "learning_rate": 3.906920453256024e-06, "loss": 0.0012, "step": 25045 }, { "epoch": 6.379998302062994, "grad_norm": 0.09407838433980942, "learning_rate": 3.903396016384538e-06, "loss": 0.0007, "step": 25050 }, { "epoch": 6.381271754817896, "grad_norm": 0.9484964609146118, "learning_rate": 3.899872784388405e-06, "loss": 0.0027, "step": 25055 }, { "epoch": 6.382545207572799, "grad_norm": 0.11441637575626373, "learning_rate": 3.896350757963935e-06, "loss": 0.0012, "step": 25060 }, { "epoch": 6.383818660327702, "grad_norm": 0.1917034238576889, "learning_rate": 3.8928299378071825e-06, "loss": 0.0016, "step": 25065 }, { "epoch": 6.385092113082605, "grad_norm": 0.12399791926145554, "learning_rate": 3.889310324613969e-06, "loss": 0.002, "step": 25070 }, { "epoch": 6.386365565837507, "grad_norm": 0.0396932028234005, "learning_rate": 3.885791919079878e-06, "loss": 0.0012, "step": 25075 }, { "epoch": 6.38763901859241, "grad_norm": 0.30285194516181946, "learning_rate": 3.88227472190026e-06, "loss": 0.0012, "step": 25080 }, { "epoch": 6.388912471347313, "grad_norm": 0.21529293060302734, "learning_rate": 3.878758733770209e-06, "loss": 0.0012, "step": 25085 }, { "epoch": 6.390185924102216, "grad_norm": 0.3406405746936798, "learning_rate": 3.875243955384602e-06, "loss": 0.0013, "step": 25090 }, { "epoch": 6.391459376857119, "grad_norm": 0.026233773678541183, "learning_rate": 3.871730387438063e-06, "loss": 0.0005, "step": 25095 }, { "epoch": 6.392732829612021, "grad_norm": 0.29763898253440857, "learning_rate": 3.868218030624979e-06, "loss": 0.0015, "step": 25100 }, { "epoch": 6.394006282366925, "grad_norm": 0.14228561520576477, "learning_rate": 3.864706885639499e-06, "loss": 0.0023, "step": 25105 }, { "epoch": 6.395279735121827, "grad_norm": 0.8918588757514954, "learning_rate": 3.86119695317554e-06, "loss": 0.0014, "step": 25110 }, { "epoch": 6.39655318787673, "grad_norm": 0.037927478551864624, "learning_rate": 3.857688233926759e-06, "loss": 0.0014, "step": 25115 }, { "epoch": 6.397826640631632, "grad_norm": 0.5195775628089905, "learning_rate": 3.854180728586598e-06, "loss": 0.0016, "step": 25120 }, { "epoch": 6.399100093386536, "grad_norm": 0.32621365785598755, "learning_rate": 3.850674437848243e-06, "loss": 0.0016, "step": 25125 }, { "epoch": 6.400373546141438, "grad_norm": 0.08744413405656815, "learning_rate": 3.847169362404643e-06, "loss": 0.0017, "step": 25130 }, { "epoch": 6.401646998896341, "grad_norm": 0.21973513066768646, "learning_rate": 3.843665502948508e-06, "loss": 0.0006, "step": 25135 }, { "epoch": 6.4029204516512435, "grad_norm": 0.17130929231643677, "learning_rate": 3.840162860172311e-06, "loss": 0.001, "step": 25140 }, { "epoch": 6.404193904406147, "grad_norm": 1.095815658569336, "learning_rate": 3.836661434768274e-06, "loss": 0.0014, "step": 25145 }, { "epoch": 6.405467357161049, "grad_norm": 0.06712604314088821, "learning_rate": 3.833161227428395e-06, "loss": 0.001, "step": 25150 }, { "epoch": 6.406740809915952, "grad_norm": 0.20037513971328735, "learning_rate": 3.829662238844417e-06, "loss": 0.0021, "step": 25155 }, { "epoch": 6.4080142626708545, "grad_norm": 0.026844089850783348, "learning_rate": 3.826164469707848e-06, "loss": 0.0012, "step": 25160 }, { "epoch": 6.409287715425758, "grad_norm": 0.3769230544567108, "learning_rate": 3.822667920709955e-06, "loss": 0.0012, "step": 25165 }, { "epoch": 6.410561168180661, "grad_norm": 0.6621493697166443, "learning_rate": 3.819172592541761e-06, "loss": 0.0018, "step": 25170 }, { "epoch": 6.411834620935563, "grad_norm": 0.04124891385436058, "learning_rate": 3.815678485894051e-06, "loss": 0.0009, "step": 25175 }, { "epoch": 6.413108073690466, "grad_norm": 0.31079235672950745, "learning_rate": 3.812185601457364e-06, "loss": 0.0011, "step": 25180 }, { "epoch": 6.414381526445369, "grad_norm": 0.11469393968582153, "learning_rate": 3.808693939922007e-06, "loss": 0.0013, "step": 25185 }, { "epoch": 6.415654979200272, "grad_norm": 0.6933194994926453, "learning_rate": 3.8052035019780367e-06, "loss": 0.0017, "step": 25190 }, { "epoch": 6.416928431955174, "grad_norm": 0.13623592257499695, "learning_rate": 3.801714288315268e-06, "loss": 0.0011, "step": 25195 }, { "epoch": 6.4182018847100775, "grad_norm": 0.2375497967004776, "learning_rate": 3.7982262996232787e-06, "loss": 0.0011, "step": 25200 }, { "epoch": 6.41947533746498, "grad_norm": 0.04562528431415558, "learning_rate": 3.7947395365914006e-06, "loss": 0.001, "step": 25205 }, { "epoch": 6.420748790219883, "grad_norm": 0.30931466817855835, "learning_rate": 3.791253999908723e-06, "loss": 0.0014, "step": 25210 }, { "epoch": 6.422022242974785, "grad_norm": 0.353670209646225, "learning_rate": 3.7877696902641027e-06, "loss": 0.0033, "step": 25215 }, { "epoch": 6.4232956957296885, "grad_norm": 0.5441415905952454, "learning_rate": 3.784286608346133e-06, "loss": 0.0026, "step": 25220 }, { "epoch": 6.424569148484591, "grad_norm": 0.021673142910003662, "learning_rate": 3.7808047548431882e-06, "loss": 0.001, "step": 25225 }, { "epoch": 6.425842601239494, "grad_norm": 0.24822302162647247, "learning_rate": 3.777324130443384e-06, "loss": 0.0013, "step": 25230 }, { "epoch": 6.427116053994396, "grad_norm": 0.4294116497039795, "learning_rate": 3.7738447358345997e-06, "loss": 0.0021, "step": 25235 }, { "epoch": 6.4283895067492995, "grad_norm": 0.46716228127479553, "learning_rate": 3.77036657170447e-06, "loss": 0.0013, "step": 25240 }, { "epoch": 6.429662959504203, "grad_norm": 0.12429912388324738, "learning_rate": 3.766889638740386e-06, "loss": 0.0007, "step": 25245 }, { "epoch": 6.430936412259105, "grad_norm": 0.5041886568069458, "learning_rate": 3.763413937629493e-06, "loss": 0.0009, "step": 25250 }, { "epoch": 6.432209865014008, "grad_norm": 0.252144992351532, "learning_rate": 3.7599394690587054e-06, "loss": 0.0024, "step": 25255 }, { "epoch": 6.433483317768911, "grad_norm": 0.4181070923805237, "learning_rate": 3.7564662337146707e-06, "loss": 0.0015, "step": 25260 }, { "epoch": 6.434756770523814, "grad_norm": 0.3363070487976074, "learning_rate": 3.7529942322838176e-06, "loss": 0.0013, "step": 25265 }, { "epoch": 6.436030223278716, "grad_norm": 0.1451306939125061, "learning_rate": 3.749523465452315e-06, "loss": 0.0013, "step": 25270 }, { "epoch": 6.437303676033619, "grad_norm": 0.07726234197616577, "learning_rate": 3.746053933906094e-06, "loss": 0.0012, "step": 25275 }, { "epoch": 6.438577128788522, "grad_norm": 0.2019227147102356, "learning_rate": 3.7425856383308356e-06, "loss": 0.0019, "step": 25280 }, { "epoch": 6.439850581543425, "grad_norm": 0.03508070856332779, "learning_rate": 3.739118579411991e-06, "loss": 0.0015, "step": 25285 }, { "epoch": 6.441124034298327, "grad_norm": 0.15462763607501984, "learning_rate": 3.7356527578347456e-06, "loss": 0.0006, "step": 25290 }, { "epoch": 6.44239748705323, "grad_norm": 0.33833080530166626, "learning_rate": 3.7321881742840594e-06, "loss": 0.0017, "step": 25295 }, { "epoch": 6.443670939808133, "grad_norm": 0.24810507893562317, "learning_rate": 3.7287248294446387e-06, "loss": 0.0006, "step": 25300 }, { "epoch": 6.444944392563036, "grad_norm": 0.14708344638347626, "learning_rate": 3.725262724000944e-06, "loss": 0.0012, "step": 25305 }, { "epoch": 6.446217845317939, "grad_norm": 0.3740890920162201, "learning_rate": 3.721801858637192e-06, "loss": 0.003, "step": 25310 }, { "epoch": 6.447491298072841, "grad_norm": 0.16702131927013397, "learning_rate": 3.7183422340373645e-06, "loss": 0.0012, "step": 25315 }, { "epoch": 6.448764750827745, "grad_norm": 0.018795669078826904, "learning_rate": 3.714883850885176e-06, "loss": 0.0013, "step": 25320 }, { "epoch": 6.450038203582647, "grad_norm": 0.05290602892637253, "learning_rate": 3.711426709864119e-06, "loss": 0.0013, "step": 25325 }, { "epoch": 6.45131165633755, "grad_norm": 0.11291875690221786, "learning_rate": 3.7079708116574263e-06, "loss": 0.0022, "step": 25330 }, { "epoch": 6.452585109092452, "grad_norm": 0.11423033475875854, "learning_rate": 3.7045161569480903e-06, "loss": 0.001, "step": 25335 }, { "epoch": 6.453858561847356, "grad_norm": 0.13749554753303528, "learning_rate": 3.701062746418851e-06, "loss": 0.0023, "step": 25340 }, { "epoch": 6.455132014602258, "grad_norm": 0.4353010654449463, "learning_rate": 3.69761058075222e-06, "loss": 0.0017, "step": 25345 }, { "epoch": 6.456405467357161, "grad_norm": 0.04134729132056236, "learning_rate": 3.694159660630435e-06, "loss": 0.0021, "step": 25350 }, { "epoch": 6.4576789201120635, "grad_norm": 0.44816848635673523, "learning_rate": 3.690709986735517e-06, "loss": 0.0019, "step": 25355 }, { "epoch": 6.458952372866967, "grad_norm": 0.10465448349714279, "learning_rate": 3.68726155974922e-06, "loss": 0.0012, "step": 25360 }, { "epoch": 6.460225825621869, "grad_norm": 0.16552259027957916, "learning_rate": 3.6838143803530614e-06, "loss": 0.0008, "step": 25365 }, { "epoch": 6.461499278376772, "grad_norm": 0.06745417416095734, "learning_rate": 3.6803684492283043e-06, "loss": 0.0011, "step": 25370 }, { "epoch": 6.462772731131675, "grad_norm": 0.4482142925262451, "learning_rate": 3.67692376705598e-06, "loss": 0.0024, "step": 25375 }, { "epoch": 6.464046183886578, "grad_norm": 0.44775068759918213, "learning_rate": 3.673480334516851e-06, "loss": 0.0015, "step": 25380 }, { "epoch": 6.465319636641481, "grad_norm": 0.024561816826462746, "learning_rate": 3.670038152291453e-06, "loss": 0.001, "step": 25385 }, { "epoch": 6.466593089396383, "grad_norm": 0.16919612884521484, "learning_rate": 3.6665972210600654e-06, "loss": 0.0015, "step": 25390 }, { "epoch": 6.467866542151286, "grad_norm": 0.15482819080352783, "learning_rate": 3.66315754150272e-06, "loss": 0.0015, "step": 25395 }, { "epoch": 6.469139994906189, "grad_norm": 0.5584961771965027, "learning_rate": 3.6597191142991983e-06, "loss": 0.0022, "step": 25400 }, { "epoch": 6.470413447661092, "grad_norm": 0.4154737591743469, "learning_rate": 3.6562819401290507e-06, "loss": 0.0031, "step": 25405 }, { "epoch": 6.471686900415994, "grad_norm": 0.2091488391160965, "learning_rate": 3.652846019671554e-06, "loss": 0.0016, "step": 25410 }, { "epoch": 6.4729603531708975, "grad_norm": 0.1256970763206482, "learning_rate": 3.649411353605761e-06, "loss": 0.0014, "step": 25415 }, { "epoch": 6.4742338059258, "grad_norm": 0.0674300491809845, "learning_rate": 3.6459779426104623e-06, "loss": 0.0013, "step": 25420 }, { "epoch": 6.475507258680703, "grad_norm": 0.03604666516184807, "learning_rate": 3.642545787364207e-06, "loss": 0.0011, "step": 25425 }, { "epoch": 6.476780711435605, "grad_norm": 1.3211439847946167, "learning_rate": 3.6391148885452874e-06, "loss": 0.002, "step": 25430 }, { "epoch": 6.4780541641905085, "grad_norm": 0.2175673395395279, "learning_rate": 3.6356852468317684e-06, "loss": 0.0014, "step": 25435 }, { "epoch": 6.479327616945412, "grad_norm": 0.051763955503702164, "learning_rate": 3.6322568629014353e-06, "loss": 0.0015, "step": 25440 }, { "epoch": 6.480601069700314, "grad_norm": 1.159070611000061, "learning_rate": 3.6288297374318526e-06, "loss": 0.0014, "step": 25445 }, { "epoch": 6.481874522455217, "grad_norm": 0.2870805263519287, "learning_rate": 3.6254038711003224e-06, "loss": 0.0012, "step": 25450 }, { "epoch": 6.4831479752101195, "grad_norm": 0.5150133967399597, "learning_rate": 3.6219792645839004e-06, "loss": 0.0009, "step": 25455 }, { "epoch": 6.484421427965023, "grad_norm": 0.4633120000362396, "learning_rate": 3.6185559185593924e-06, "loss": 0.0009, "step": 25460 }, { "epoch": 6.485694880719925, "grad_norm": 0.30668944120407104, "learning_rate": 3.615133833703358e-06, "loss": 0.0015, "step": 25465 }, { "epoch": 6.486968333474828, "grad_norm": 0.21115633845329285, "learning_rate": 3.6117130106921005e-06, "loss": 0.0015, "step": 25470 }, { "epoch": 6.488241786229731, "grad_norm": 0.4573938846588135, "learning_rate": 3.6082934502016874e-06, "loss": 0.001, "step": 25475 }, { "epoch": 6.489515238984634, "grad_norm": 0.26051777601242065, "learning_rate": 3.604875152907925e-06, "loss": 0.0014, "step": 25480 }, { "epoch": 6.490788691739536, "grad_norm": 0.35604944825172424, "learning_rate": 3.6014581194863718e-06, "loss": 0.0013, "step": 25485 }, { "epoch": 6.492062144494439, "grad_norm": 0.13803622126579285, "learning_rate": 3.59804235061234e-06, "loss": 0.0014, "step": 25490 }, { "epoch": 6.493335597249342, "grad_norm": 0.45392757654190063, "learning_rate": 3.5946278469608884e-06, "loss": 0.0011, "step": 25495 }, { "epoch": 6.494609050004245, "grad_norm": 0.22610358893871307, "learning_rate": 3.591214609206829e-06, "loss": 0.0013, "step": 25500 }, { "epoch": 6.495882502759148, "grad_norm": 0.1201622486114502, "learning_rate": 3.587802638024721e-06, "loss": 0.0008, "step": 25505 }, { "epoch": 6.49715595551405, "grad_norm": 1.5299187898635864, "learning_rate": 3.5843919340888712e-06, "loss": 0.0022, "step": 25510 }, { "epoch": 6.4984294082689535, "grad_norm": 0.12112840265035629, "learning_rate": 3.5809824980733445e-06, "loss": 0.0027, "step": 25515 }, { "epoch": 6.499702861023856, "grad_norm": 0.3263128399848938, "learning_rate": 3.5775743306519474e-06, "loss": 0.0013, "step": 25520 }, { "epoch": 6.500976313778759, "grad_norm": 0.37699609994888306, "learning_rate": 3.574167432498238e-06, "loss": 0.0042, "step": 25525 }, { "epoch": 6.502249766533661, "grad_norm": 0.5594006776809692, "learning_rate": 3.570761804285523e-06, "loss": 0.0021, "step": 25530 }, { "epoch": 6.503523219288565, "grad_norm": 0.15253205597400665, "learning_rate": 3.56735744668686e-06, "loss": 0.001, "step": 25535 }, { "epoch": 6.504796672043467, "grad_norm": 0.2261071801185608, "learning_rate": 3.5639543603750528e-06, "loss": 0.0018, "step": 25540 }, { "epoch": 6.50607012479837, "grad_norm": 0.1621900051832199, "learning_rate": 3.560552546022651e-06, "loss": 0.0021, "step": 25545 }, { "epoch": 6.507343577553272, "grad_norm": 0.3868546187877655, "learning_rate": 3.557152004301967e-06, "loss": 0.0007, "step": 25550 }, { "epoch": 6.508617030308176, "grad_norm": 0.06460695713758469, "learning_rate": 3.5537527358850456e-06, "loss": 0.0009, "step": 25555 }, { "epoch": 6.509890483063078, "grad_norm": 0.23544062674045563, "learning_rate": 3.5503547414436877e-06, "loss": 0.001, "step": 25560 }, { "epoch": 6.511163935817981, "grad_norm": 0.7062585353851318, "learning_rate": 3.5469580216494402e-06, "loss": 0.0014, "step": 25565 }, { "epoch": 6.512437388572884, "grad_norm": 0.7510907053947449, "learning_rate": 3.5435625771735993e-06, "loss": 0.0011, "step": 25570 }, { "epoch": 6.513710841327787, "grad_norm": 0.2842152714729309, "learning_rate": 3.5401684086872056e-06, "loss": 0.0019, "step": 25575 }, { "epoch": 6.51498429408269, "grad_norm": 0.02416975237429142, "learning_rate": 3.5367755168610597e-06, "loss": 0.0013, "step": 25580 }, { "epoch": 6.516257746837592, "grad_norm": 0.2038252353668213, "learning_rate": 3.5333839023656867e-06, "loss": 0.0007, "step": 25585 }, { "epoch": 6.517531199592495, "grad_norm": 0.2246650606393814, "learning_rate": 3.529993565871386e-06, "loss": 0.0013, "step": 25590 }, { "epoch": 6.518804652347398, "grad_norm": 0.12765444815158844, "learning_rate": 3.5266045080481857e-06, "loss": 0.0011, "step": 25595 }, { "epoch": 6.520078105102301, "grad_norm": 0.06604330241680145, "learning_rate": 3.5232167295658693e-06, "loss": 0.0006, "step": 25600 }, { "epoch": 6.521351557857203, "grad_norm": 0.9847479462623596, "learning_rate": 3.5198302310939603e-06, "loss": 0.0019, "step": 25605 }, { "epoch": 6.522625010612106, "grad_norm": 0.08838854730129242, "learning_rate": 3.5164450133017445e-06, "loss": 0.0016, "step": 25610 }, { "epoch": 6.523898463367009, "grad_norm": 0.5001994371414185, "learning_rate": 3.513061076858233e-06, "loss": 0.0034, "step": 25615 }, { "epoch": 6.525171916121912, "grad_norm": 0.3635755479335785, "learning_rate": 3.5096784224322044e-06, "loss": 0.0007, "step": 25620 }, { "epoch": 6.526445368876814, "grad_norm": 0.4506617784500122, "learning_rate": 3.5062970506921702e-06, "loss": 0.0007, "step": 25625 }, { "epoch": 6.5277188216317175, "grad_norm": 0.6378349661827087, "learning_rate": 3.502916962306393e-06, "loss": 0.0026, "step": 25630 }, { "epoch": 6.528992274386621, "grad_norm": 0.17926131188869476, "learning_rate": 3.499538157942879e-06, "loss": 0.0011, "step": 25635 }, { "epoch": 6.530265727141523, "grad_norm": 0.33800196647644043, "learning_rate": 3.4961606382693923e-06, "loss": 0.001, "step": 25640 }, { "epoch": 6.531539179896426, "grad_norm": 0.2767678499221802, "learning_rate": 3.492784403953423e-06, "loss": 0.0022, "step": 25645 }, { "epoch": 6.5328126326513285, "grad_norm": 0.34170809388160706, "learning_rate": 3.489409455662226e-06, "loss": 0.0014, "step": 25650 }, { "epoch": 6.534086085406232, "grad_norm": 0.059053774923086166, "learning_rate": 3.4860357940627908e-06, "loss": 0.0015, "step": 25655 }, { "epoch": 6.535359538161134, "grad_norm": 0.06406111270189285, "learning_rate": 3.482663419821858e-06, "loss": 0.0011, "step": 25660 }, { "epoch": 6.536632990916037, "grad_norm": 0.06744891405105591, "learning_rate": 3.479292333605907e-06, "loss": 0.0021, "step": 25665 }, { "epoch": 6.5379064436709395, "grad_norm": 0.2505331337451935, "learning_rate": 3.4759225360811776e-06, "loss": 0.0009, "step": 25670 }, { "epoch": 6.539179896425843, "grad_norm": 0.0725332722067833, "learning_rate": 3.472554027913633e-06, "loss": 0.0009, "step": 25675 }, { "epoch": 6.540453349180745, "grad_norm": 0.20955908298492432, "learning_rate": 3.469186809769002e-06, "loss": 0.0015, "step": 25680 }, { "epoch": 6.541726801935648, "grad_norm": 0.08869968354701996, "learning_rate": 3.465820882312746e-06, "loss": 0.0005, "step": 25685 }, { "epoch": 6.543000254690551, "grad_norm": 0.023131655529141426, "learning_rate": 3.4624562462100754e-06, "loss": 0.0007, "step": 25690 }, { "epoch": 6.544273707445454, "grad_norm": 0.07056085020303726, "learning_rate": 3.4590929021259436e-06, "loss": 0.0019, "step": 25695 }, { "epoch": 6.545547160200357, "grad_norm": 0.03140045702457428, "learning_rate": 3.4557308507250586e-06, "loss": 0.0021, "step": 25700 }, { "epoch": 6.546820612955259, "grad_norm": 0.10761331766843796, "learning_rate": 3.452370092671852e-06, "loss": 0.0014, "step": 25705 }, { "epoch": 6.5480940657101625, "grad_norm": 0.14294688403606415, "learning_rate": 3.4490106286305215e-06, "loss": 0.0008, "step": 25710 }, { "epoch": 6.549367518465065, "grad_norm": 0.33560827374458313, "learning_rate": 3.4456524592649976e-06, "loss": 0.0019, "step": 25715 }, { "epoch": 6.550640971219968, "grad_norm": 0.0509486198425293, "learning_rate": 3.442295585238956e-06, "loss": 0.0011, "step": 25720 }, { "epoch": 6.55191442397487, "grad_norm": 0.3622838854789734, "learning_rate": 3.4389400072158153e-06, "loss": 0.0011, "step": 25725 }, { "epoch": 6.5531878767297735, "grad_norm": 0.37888264656066895, "learning_rate": 3.435585725858751e-06, "loss": 0.0009, "step": 25730 }, { "epoch": 6.554461329484676, "grad_norm": 0.2940739691257477, "learning_rate": 3.432232741830657e-06, "loss": 0.0015, "step": 25735 }, { "epoch": 6.555734782239579, "grad_norm": 0.09896572679281235, "learning_rate": 3.428881055794194e-06, "loss": 0.0007, "step": 25740 }, { "epoch": 6.557008234994481, "grad_norm": 1.351056694984436, "learning_rate": 3.4255306684117583e-06, "loss": 0.0028, "step": 25745 }, { "epoch": 6.558281687749385, "grad_norm": 0.0679468959569931, "learning_rate": 3.4221815803454862e-06, "loss": 0.0011, "step": 25750 }, { "epoch": 6.559555140504287, "grad_norm": 0.17680127918720245, "learning_rate": 3.41883379225726e-06, "loss": 0.0016, "step": 25755 }, { "epoch": 6.56082859325919, "grad_norm": 0.42026668787002563, "learning_rate": 3.4154873048087067e-06, "loss": 0.0023, "step": 25760 }, { "epoch": 6.562102046014093, "grad_norm": 0.2680884003639221, "learning_rate": 3.4121421186611893e-06, "loss": 0.0021, "step": 25765 }, { "epoch": 6.563375498768996, "grad_norm": 0.5336704850196838, "learning_rate": 3.4087982344758284e-06, "loss": 0.0012, "step": 25770 }, { "epoch": 6.564648951523899, "grad_norm": 0.10826409608125687, "learning_rate": 3.4054556529134673e-06, "loss": 0.0021, "step": 25775 }, { "epoch": 6.565922404278801, "grad_norm": 0.1888151913881302, "learning_rate": 3.4021143746347096e-06, "loss": 0.001, "step": 25780 }, { "epoch": 6.567195857033704, "grad_norm": 0.42648959159851074, "learning_rate": 3.3987744002998923e-06, "loss": 0.0013, "step": 25785 }, { "epoch": 6.568469309788607, "grad_norm": 0.17493896186351776, "learning_rate": 3.3954357305690955e-06, "loss": 0.0018, "step": 25790 }, { "epoch": 6.56974276254351, "grad_norm": 0.4389711916446686, "learning_rate": 3.392098366102139e-06, "loss": 0.0026, "step": 25795 }, { "epoch": 6.571016215298412, "grad_norm": 0.14638932049274445, "learning_rate": 3.3887623075585986e-06, "loss": 0.0022, "step": 25800 }, { "epoch": 6.572289668053315, "grad_norm": 0.06586837023496628, "learning_rate": 3.385427555597769e-06, "loss": 0.0025, "step": 25805 }, { "epoch": 6.573563120808218, "grad_norm": 0.9571762084960938, "learning_rate": 3.3820941108787075e-06, "loss": 0.0021, "step": 25810 }, { "epoch": 6.574836573563121, "grad_norm": 0.26175424456596375, "learning_rate": 3.3787619740602017e-06, "loss": 0.0016, "step": 25815 }, { "epoch": 6.576110026318023, "grad_norm": 0.08293713629245758, "learning_rate": 3.375431145800785e-06, "loss": 0.0008, "step": 25820 }, { "epoch": 6.577383479072926, "grad_norm": 0.38488543033599854, "learning_rate": 3.3721016267587294e-06, "loss": 0.002, "step": 25825 }, { "epoch": 6.57865693182783, "grad_norm": 0.06658530235290527, "learning_rate": 3.3687734175920505e-06, "loss": 0.0013, "step": 25830 }, { "epoch": 6.579930384582732, "grad_norm": 0.5614635944366455, "learning_rate": 3.3654465189585008e-06, "loss": 0.0015, "step": 25835 }, { "epoch": 6.581203837337635, "grad_norm": 0.38133516907691956, "learning_rate": 3.3621209315155832e-06, "loss": 0.0023, "step": 25840 }, { "epoch": 6.5824772900925375, "grad_norm": 0.10754755884408951, "learning_rate": 3.358796655920532e-06, "loss": 0.0015, "step": 25845 }, { "epoch": 6.583750742847441, "grad_norm": 0.396379292011261, "learning_rate": 3.355473692830328e-06, "loss": 0.0011, "step": 25850 }, { "epoch": 6.585024195602343, "grad_norm": 0.09767447412014008, "learning_rate": 3.3521520429016875e-06, "loss": 0.0013, "step": 25855 }, { "epoch": 6.586297648357246, "grad_norm": 0.18449464440345764, "learning_rate": 3.3488317067910712e-06, "loss": 0.0009, "step": 25860 }, { "epoch": 6.5875711011121485, "grad_norm": 0.035427026450634, "learning_rate": 3.3455126851546794e-06, "loss": 0.0009, "step": 25865 }, { "epoch": 6.588844553867052, "grad_norm": 0.6375020742416382, "learning_rate": 3.3421949786484488e-06, "loss": 0.002, "step": 25870 }, { "epoch": 6.590118006621954, "grad_norm": 0.13486242294311523, "learning_rate": 3.338878587928066e-06, "loss": 0.0007, "step": 25875 }, { "epoch": 6.591391459376857, "grad_norm": 0.6236621141433716, "learning_rate": 3.335563513648947e-06, "loss": 0.0032, "step": 25880 }, { "epoch": 6.5926649121317595, "grad_norm": 0.1615826040506363, "learning_rate": 3.332249756466255e-06, "loss": 0.0012, "step": 25885 }, { "epoch": 6.593938364886663, "grad_norm": 0.08306018263101578, "learning_rate": 3.328937317034886e-06, "loss": 0.0008, "step": 25890 }, { "epoch": 6.595211817641566, "grad_norm": 0.033712226897478104, "learning_rate": 3.3256261960094805e-06, "loss": 0.0012, "step": 25895 }, { "epoch": 6.596485270396468, "grad_norm": 0.1731870472431183, "learning_rate": 3.322316394044415e-06, "loss": 0.0016, "step": 25900 }, { "epoch": 6.5977587231513715, "grad_norm": 0.062392253428697586, "learning_rate": 3.3190079117938167e-06, "loss": 0.0009, "step": 25905 }, { "epoch": 6.599032175906274, "grad_norm": 0.4227718114852905, "learning_rate": 3.31570074991153e-06, "loss": 0.0013, "step": 25910 }, { "epoch": 6.600305628661177, "grad_norm": 0.2672244608402252, "learning_rate": 3.312394909051161e-06, "loss": 0.0014, "step": 25915 }, { "epoch": 6.601579081416079, "grad_norm": 0.41096076369285583, "learning_rate": 3.3090903898660408e-06, "loss": 0.0025, "step": 25920 }, { "epoch": 6.6028525341709825, "grad_norm": 0.03700166568160057, "learning_rate": 3.3057871930092443e-06, "loss": 0.0019, "step": 25925 }, { "epoch": 6.604125986925885, "grad_norm": 0.04940301179885864, "learning_rate": 3.3024853191335817e-06, "loss": 0.0015, "step": 25930 }, { "epoch": 6.605399439680788, "grad_norm": 0.07221183180809021, "learning_rate": 3.299184768891612e-06, "loss": 0.0005, "step": 25935 }, { "epoch": 6.60667289243569, "grad_norm": 0.5025020837783813, "learning_rate": 3.295885542935613e-06, "loss": 0.0018, "step": 25940 }, { "epoch": 6.6079463451905935, "grad_norm": 0.4441552460193634, "learning_rate": 3.292587641917623e-06, "loss": 0.0012, "step": 25945 }, { "epoch": 6.609219797945496, "grad_norm": 0.11045824736356735, "learning_rate": 3.289291066489404e-06, "loss": 0.0014, "step": 25950 }, { "epoch": 6.610493250700399, "grad_norm": 0.23942561447620392, "learning_rate": 3.2859958173024597e-06, "loss": 0.0018, "step": 25955 }, { "epoch": 6.611766703455302, "grad_norm": 0.793390154838562, "learning_rate": 3.2827018950080304e-06, "loss": 0.0019, "step": 25960 }, { "epoch": 6.613040156210205, "grad_norm": 0.27768874168395996, "learning_rate": 3.279409300257104e-06, "loss": 0.001, "step": 25965 }, { "epoch": 6.614313608965108, "grad_norm": 0.5028412938117981, "learning_rate": 3.2761180337003852e-06, "loss": 0.0025, "step": 25970 }, { "epoch": 6.61558706172001, "grad_norm": 0.04092941805720329, "learning_rate": 3.2728280959883386e-06, "loss": 0.001, "step": 25975 }, { "epoch": 6.616860514474913, "grad_norm": 0.19462665915489197, "learning_rate": 3.269539487771154e-06, "loss": 0.0012, "step": 25980 }, { "epoch": 6.618133967229816, "grad_norm": 0.7448800206184387, "learning_rate": 3.2662522096987604e-06, "loss": 0.0026, "step": 25985 }, { "epoch": 6.619407419984719, "grad_norm": 0.021643580868840218, "learning_rate": 3.2629662624208214e-06, "loss": 0.0011, "step": 25990 }, { "epoch": 6.620680872739621, "grad_norm": 0.11182496696710587, "learning_rate": 3.2596816465867508e-06, "loss": 0.0013, "step": 25995 }, { "epoch": 6.621954325494524, "grad_norm": 0.11058531701564789, "learning_rate": 3.256398362845675e-06, "loss": 0.0013, "step": 26000 }, { "epoch": 6.623227778249427, "grad_norm": 0.22838784754276276, "learning_rate": 3.2531164118464855e-06, "loss": 0.0013, "step": 26005 }, { "epoch": 6.62450123100433, "grad_norm": 0.14462092518806458, "learning_rate": 3.2498357942377823e-06, "loss": 0.0022, "step": 26010 }, { "epoch": 6.625774683759232, "grad_norm": 0.2934172451496124, "learning_rate": 3.2465565106679253e-06, "loss": 0.0019, "step": 26015 }, { "epoch": 6.627048136514135, "grad_norm": 0.6012095808982849, "learning_rate": 3.243278561784999e-06, "loss": 0.0014, "step": 26020 }, { "epoch": 6.628321589269039, "grad_norm": 0.10057596862316132, "learning_rate": 3.240001948236826e-06, "loss": 0.0012, "step": 26025 }, { "epoch": 6.629595042023941, "grad_norm": 0.08752401918172836, "learning_rate": 3.236726670670962e-06, "loss": 0.0011, "step": 26030 }, { "epoch": 6.630868494778843, "grad_norm": 0.09763806313276291, "learning_rate": 3.233452729734711e-06, "loss": 0.002, "step": 26035 }, { "epoch": 6.632141947533746, "grad_norm": 0.057270124554634094, "learning_rate": 3.230180126075091e-06, "loss": 0.001, "step": 26040 }, { "epoch": 6.63341540028865, "grad_norm": 0.28123772144317627, "learning_rate": 3.22690886033888e-06, "loss": 0.0013, "step": 26045 }, { "epoch": 6.634688853043552, "grad_norm": 0.25544384121894836, "learning_rate": 3.223638933172575e-06, "loss": 0.0015, "step": 26050 }, { "epoch": 6.635962305798455, "grad_norm": 0.12230786681175232, "learning_rate": 3.2203703452224135e-06, "loss": 0.0019, "step": 26055 }, { "epoch": 6.6372357585533575, "grad_norm": 0.5300639867782593, "learning_rate": 3.2171030971343663e-06, "loss": 0.0019, "step": 26060 }, { "epoch": 6.638509211308261, "grad_norm": 0.02328132651746273, "learning_rate": 3.2138371895541507e-06, "loss": 0.0031, "step": 26065 }, { "epoch": 6.639782664063163, "grad_norm": 0.38975727558135986, "learning_rate": 3.2105726231271974e-06, "loss": 0.0024, "step": 26070 }, { "epoch": 6.641056116818066, "grad_norm": 0.22194236516952515, "learning_rate": 3.207309398498694e-06, "loss": 0.0024, "step": 26075 }, { "epoch": 6.6423295695729685, "grad_norm": 0.07160544395446777, "learning_rate": 3.2040475163135497e-06, "loss": 0.0017, "step": 26080 }, { "epoch": 6.643603022327872, "grad_norm": 0.16879400610923767, "learning_rate": 3.2007869772164123e-06, "loss": 0.0012, "step": 26085 }, { "epoch": 6.644876475082775, "grad_norm": 0.04745504632592201, "learning_rate": 3.1975277818516613e-06, "loss": 0.0009, "step": 26090 }, { "epoch": 6.646149927837677, "grad_norm": 0.19723764061927795, "learning_rate": 3.194269930863424e-06, "loss": 0.0015, "step": 26095 }, { "epoch": 6.6474233805925795, "grad_norm": 0.2693442404270172, "learning_rate": 3.191013424895536e-06, "loss": 0.0013, "step": 26100 }, { "epoch": 6.648696833347483, "grad_norm": 0.27126961946487427, "learning_rate": 3.187758264591594e-06, "loss": 0.001, "step": 26105 }, { "epoch": 6.649970286102386, "grad_norm": 0.08900262415409088, "learning_rate": 3.184504450594913e-06, "loss": 0.0013, "step": 26110 }, { "epoch": 6.651243738857288, "grad_norm": 0.028758633881807327, "learning_rate": 3.1812519835485465e-06, "loss": 0.0021, "step": 26115 }, { "epoch": 6.6525171916121915, "grad_norm": 0.10493691265583038, "learning_rate": 3.178000864095282e-06, "loss": 0.0021, "step": 26120 }, { "epoch": 6.653790644367094, "grad_norm": 0.1485927850008011, "learning_rate": 3.1747510928776393e-06, "loss": 0.0023, "step": 26125 }, { "epoch": 6.655064097121997, "grad_norm": 0.13998614251613617, "learning_rate": 3.1715026705378704e-06, "loss": 0.0016, "step": 26130 }, { "epoch": 6.656337549876899, "grad_norm": 0.4238103926181793, "learning_rate": 3.1682555977179676e-06, "loss": 0.0019, "step": 26135 }, { "epoch": 6.6576110026318025, "grad_norm": 0.3983476459980011, "learning_rate": 3.165009875059649e-06, "loss": 0.002, "step": 26140 }, { "epoch": 6.658884455386705, "grad_norm": 0.1523289978504181, "learning_rate": 3.1617655032043703e-06, "loss": 0.0015, "step": 26145 }, { "epoch": 6.660157908141608, "grad_norm": 0.42440444231033325, "learning_rate": 3.158522482793317e-06, "loss": 0.0018, "step": 26150 }, { "epoch": 6.661431360896511, "grad_norm": 0.11293594539165497, "learning_rate": 3.1552808144674095e-06, "loss": 0.0009, "step": 26155 }, { "epoch": 6.6627048136514135, "grad_norm": 0.10304325819015503, "learning_rate": 3.1520404988673005e-06, "loss": 0.0009, "step": 26160 }, { "epoch": 6.663978266406316, "grad_norm": 0.08633837848901749, "learning_rate": 3.1488015366333726e-06, "loss": 0.0022, "step": 26165 }, { "epoch": 6.665251719161219, "grad_norm": 0.04860633239150047, "learning_rate": 3.145563928405749e-06, "loss": 0.0015, "step": 26170 }, { "epoch": 6.666525171916122, "grad_norm": 0.29917892813682556, "learning_rate": 3.1423276748242793e-06, "loss": 0.0007, "step": 26175 }, { "epoch": 6.667798624671025, "grad_norm": 0.5187958478927612, "learning_rate": 3.139092776528544e-06, "loss": 0.0018, "step": 26180 }, { "epoch": 6.669072077425928, "grad_norm": 0.16362543404102325, "learning_rate": 3.1358592341578586e-06, "loss": 0.0015, "step": 26185 }, { "epoch": 6.67034553018083, "grad_norm": 0.6319637894630432, "learning_rate": 3.132627048351269e-06, "loss": 0.002, "step": 26190 }, { "epoch": 6.671618982935733, "grad_norm": 0.05530306324362755, "learning_rate": 3.1293962197475523e-06, "loss": 0.0018, "step": 26195 }, { "epoch": 6.672892435690636, "grad_norm": 0.39910387992858887, "learning_rate": 3.126166748985229e-06, "loss": 0.0025, "step": 26200 }, { "epoch": 6.674165888445539, "grad_norm": 0.07847902923822403, "learning_rate": 3.122938636702526e-06, "loss": 0.0009, "step": 26205 }, { "epoch": 6.675439341200441, "grad_norm": 0.1321837306022644, "learning_rate": 3.1197118835374284e-06, "loss": 0.0015, "step": 26210 }, { "epoch": 6.676712793955344, "grad_norm": 0.2192797064781189, "learning_rate": 3.116486490127638e-06, "loss": 0.0022, "step": 26215 }, { "epoch": 6.6779862467102475, "grad_norm": 0.1960121989250183, "learning_rate": 3.1132624571105907e-06, "loss": 0.0012, "step": 26220 }, { "epoch": 6.67925969946515, "grad_norm": 0.07258735597133636, "learning_rate": 3.110039785123451e-06, "loss": 0.0014, "step": 26225 }, { "epoch": 6.680533152220052, "grad_norm": 0.3475513756275177, "learning_rate": 3.1068184748031274e-06, "loss": 0.0019, "step": 26230 }, { "epoch": 6.681806604974955, "grad_norm": 0.07985743880271912, "learning_rate": 3.1035985267862356e-06, "loss": 0.0014, "step": 26235 }, { "epoch": 6.683080057729859, "grad_norm": 0.24300633370876312, "learning_rate": 3.100379941709146e-06, "loss": 0.001, "step": 26240 }, { "epoch": 6.684353510484761, "grad_norm": 0.08918902277946472, "learning_rate": 3.097162720207946e-06, "loss": 0.0004, "step": 26245 }, { "epoch": 6.685626963239664, "grad_norm": 0.09155329316854477, "learning_rate": 3.093946862918458e-06, "loss": 0.0018, "step": 26250 }, { "epoch": 6.686900415994566, "grad_norm": 0.735040009021759, "learning_rate": 3.0907323704762314e-06, "loss": 0.0024, "step": 26255 }, { "epoch": 6.68817386874947, "grad_norm": 0.2414732724428177, "learning_rate": 3.08751924351655e-06, "loss": 0.0013, "step": 26260 }, { "epoch": 6.689447321504372, "grad_norm": 0.13122516870498657, "learning_rate": 3.0843074826744225e-06, "loss": 0.0011, "step": 26265 }, { "epoch": 6.690720774259275, "grad_norm": 0.42046236991882324, "learning_rate": 3.0810970885846014e-06, "loss": 0.002, "step": 26270 }, { "epoch": 6.6919942270141775, "grad_norm": 0.12076231837272644, "learning_rate": 3.077888061881543e-06, "loss": 0.0014, "step": 26275 }, { "epoch": 6.693267679769081, "grad_norm": 0.06969895213842392, "learning_rate": 3.0746804031994624e-06, "loss": 0.0016, "step": 26280 }, { "epoch": 6.694541132523983, "grad_norm": 0.21809786558151245, "learning_rate": 3.071474113172286e-06, "loss": 0.0012, "step": 26285 }, { "epoch": 6.695814585278886, "grad_norm": 0.05566900223493576, "learning_rate": 3.0682691924336736e-06, "loss": 0.0012, "step": 26290 }, { "epoch": 6.6970880380337885, "grad_norm": 0.5123371481895447, "learning_rate": 3.0650656416170155e-06, "loss": 0.0015, "step": 26295 }, { "epoch": 6.698361490788692, "grad_norm": 0.21300917863845825, "learning_rate": 3.0618634613554377e-06, "loss": 0.0025, "step": 26300 }, { "epoch": 6.699634943543595, "grad_norm": 0.06950820982456207, "learning_rate": 3.0586626522817775e-06, "loss": 0.0016, "step": 26305 }, { "epoch": 6.700908396298497, "grad_norm": 0.5578799843788147, "learning_rate": 3.0554632150286224e-06, "loss": 0.0006, "step": 26310 }, { "epoch": 6.7021818490534, "grad_norm": 0.11407860368490219, "learning_rate": 3.052265150228276e-06, "loss": 0.0014, "step": 26315 }, { "epoch": 6.703455301808303, "grad_norm": 0.7925399541854858, "learning_rate": 3.049068458512773e-06, "loss": 0.0019, "step": 26320 }, { "epoch": 6.704728754563206, "grad_norm": 0.6940084099769592, "learning_rate": 3.045873140513874e-06, "loss": 0.0022, "step": 26325 }, { "epoch": 6.706002207318108, "grad_norm": 0.5257376432418823, "learning_rate": 3.042679196863083e-06, "loss": 0.0017, "step": 26330 }, { "epoch": 6.7072756600730115, "grad_norm": 0.06882429867982864, "learning_rate": 3.0394866281916057e-06, "loss": 0.001, "step": 26335 }, { "epoch": 6.708549112827914, "grad_norm": 0.5355520248413086, "learning_rate": 3.036295435130402e-06, "loss": 0.001, "step": 26340 }, { "epoch": 6.709822565582817, "grad_norm": 0.1569635272026062, "learning_rate": 3.033105618310146e-06, "loss": 0.0005, "step": 26345 }, { "epoch": 6.711096018337719, "grad_norm": 0.027574630454182625, "learning_rate": 3.0299171783612437e-06, "loss": 0.0007, "step": 26350 }, { "epoch": 6.7123694710926225, "grad_norm": 0.16612428426742554, "learning_rate": 3.026730115913824e-06, "loss": 0.0015, "step": 26355 }, { "epoch": 6.713642923847525, "grad_norm": 1.2952353954315186, "learning_rate": 3.023544431597758e-06, "loss": 0.0028, "step": 26360 }, { "epoch": 6.714916376602428, "grad_norm": 0.6190000772476196, "learning_rate": 3.0203601260426218e-06, "loss": 0.0015, "step": 26365 }, { "epoch": 6.716189829357331, "grad_norm": 0.27409666776657104, "learning_rate": 3.017177199877741e-06, "loss": 0.003, "step": 26370 }, { "epoch": 6.7174632821122335, "grad_norm": 0.27234017848968506, "learning_rate": 3.0139956537321546e-06, "loss": 0.0012, "step": 26375 }, { "epoch": 6.718736734867137, "grad_norm": 0.11509812623262405, "learning_rate": 3.0108154882346352e-06, "loss": 0.0021, "step": 26380 }, { "epoch": 6.720010187622039, "grad_norm": 0.10632980614900589, "learning_rate": 3.007636704013678e-06, "loss": 0.0018, "step": 26385 }, { "epoch": 6.721283640376942, "grad_norm": 0.3052425682544708, "learning_rate": 3.004459301697514e-06, "loss": 0.0019, "step": 26390 }, { "epoch": 6.722557093131845, "grad_norm": 0.03450571000576019, "learning_rate": 3.001283281914087e-06, "loss": 0.0011, "step": 26395 }, { "epoch": 6.723830545886748, "grad_norm": 0.0904288962483406, "learning_rate": 2.9981086452910825e-06, "loss": 0.0013, "step": 26400 }, { "epoch": 6.72510399864165, "grad_norm": 0.19072332978248596, "learning_rate": 2.9949353924559023e-06, "loss": 0.0015, "step": 26405 }, { "epoch": 6.726377451396553, "grad_norm": 0.8739868998527527, "learning_rate": 2.991763524035679e-06, "loss": 0.0023, "step": 26410 }, { "epoch": 6.727650904151456, "grad_norm": 0.185882106423378, "learning_rate": 2.9885930406572673e-06, "loss": 0.0007, "step": 26415 }, { "epoch": 6.728924356906359, "grad_norm": 0.07819723337888718, "learning_rate": 2.985423942947262e-06, "loss": 0.0018, "step": 26420 }, { "epoch": 6.730197809661261, "grad_norm": 0.23570333421230316, "learning_rate": 2.9822562315319604e-06, "loss": 0.001, "step": 26425 }, { "epoch": 6.731471262416164, "grad_norm": 0.17371198534965515, "learning_rate": 2.979089907037408e-06, "loss": 0.0013, "step": 26430 }, { "epoch": 6.7327447151710675, "grad_norm": 0.03610028699040413, "learning_rate": 2.9759249700893657e-06, "loss": 0.0019, "step": 26435 }, { "epoch": 6.73401816792597, "grad_norm": 0.09515460580587387, "learning_rate": 2.9727614213133203e-06, "loss": 0.0015, "step": 26440 }, { "epoch": 6.735291620680873, "grad_norm": 0.2849337160587311, "learning_rate": 2.9695992613344858e-06, "loss": 0.0013, "step": 26445 }, { "epoch": 6.736565073435775, "grad_norm": 0.33229944109916687, "learning_rate": 2.9664384907778034e-06, "loss": 0.004, "step": 26450 }, { "epoch": 6.737838526190679, "grad_norm": 0.22044211626052856, "learning_rate": 2.9632791102679338e-06, "loss": 0.0012, "step": 26455 }, { "epoch": 6.739111978945581, "grad_norm": 0.2903177738189697, "learning_rate": 2.9601211204292723e-06, "loss": 0.001, "step": 26460 }, { "epoch": 6.740385431700484, "grad_norm": 0.15429499745368958, "learning_rate": 2.956964521885933e-06, "loss": 0.0008, "step": 26465 }, { "epoch": 6.741658884455386, "grad_norm": 0.04103010892868042, "learning_rate": 2.953809315261755e-06, "loss": 0.0012, "step": 26470 }, { "epoch": 6.74293233721029, "grad_norm": 0.10031621158123016, "learning_rate": 2.950655501180305e-06, "loss": 0.0012, "step": 26475 }, { "epoch": 6.744205789965192, "grad_norm": 0.29542475938796997, "learning_rate": 2.9475030802648718e-06, "loss": 0.0025, "step": 26480 }, { "epoch": 6.745479242720095, "grad_norm": 0.10590671002864838, "learning_rate": 2.9443520531384706e-06, "loss": 0.002, "step": 26485 }, { "epoch": 6.7467526954749975, "grad_norm": 0.3197813332080841, "learning_rate": 2.9412024204238377e-06, "loss": 0.0012, "step": 26490 }, { "epoch": 6.748026148229901, "grad_norm": 0.27182435989379883, "learning_rate": 2.938054182743443e-06, "loss": 0.0025, "step": 26495 }, { "epoch": 6.749299600984804, "grad_norm": 0.10600925236940384, "learning_rate": 2.9349073407194706e-06, "loss": 0.0019, "step": 26500 }, { "epoch": 6.750573053739706, "grad_norm": 0.03045656532049179, "learning_rate": 2.9317618949738347e-06, "loss": 0.0014, "step": 26505 }, { "epoch": 6.751846506494609, "grad_norm": 0.049433670938014984, "learning_rate": 2.9286178461281687e-06, "loss": 0.0021, "step": 26510 }, { "epoch": 6.753119959249512, "grad_norm": 0.26194608211517334, "learning_rate": 2.925475194803836e-06, "loss": 0.0012, "step": 26515 }, { "epoch": 6.754393412004415, "grad_norm": 0.28703561425209045, "learning_rate": 2.9223339416219176e-06, "loss": 0.0008, "step": 26520 }, { "epoch": 6.755666864759317, "grad_norm": 0.2112417370080948, "learning_rate": 2.9191940872032233e-06, "loss": 0.0015, "step": 26525 }, { "epoch": 6.75694031751422, "grad_norm": 0.15541565418243408, "learning_rate": 2.9160556321682808e-06, "loss": 0.0013, "step": 26530 }, { "epoch": 6.758213770269123, "grad_norm": 0.037104878574609756, "learning_rate": 2.9129185771373502e-06, "loss": 0.0007, "step": 26535 }, { "epoch": 6.759487223024026, "grad_norm": 0.30825480818748474, "learning_rate": 2.9097829227304077e-06, "loss": 0.0024, "step": 26540 }, { "epoch": 6.760760675778928, "grad_norm": 0.13106293976306915, "learning_rate": 2.9066486695671526e-06, "loss": 0.0011, "step": 26545 }, { "epoch": 6.7620341285338315, "grad_norm": 0.5600691437721252, "learning_rate": 2.903515818267011e-06, "loss": 0.0018, "step": 26550 }, { "epoch": 6.763307581288734, "grad_norm": 0.09372754395008087, "learning_rate": 2.9003843694491284e-06, "loss": 0.0016, "step": 26555 }, { "epoch": 6.764581034043637, "grad_norm": 0.37960824370384216, "learning_rate": 2.8972543237323737e-06, "loss": 0.001, "step": 26560 }, { "epoch": 6.76585448679854, "grad_norm": 0.21098482608795166, "learning_rate": 2.894125681735347e-06, "loss": 0.0017, "step": 26565 }, { "epoch": 6.7671279395534425, "grad_norm": 0.3654367923736572, "learning_rate": 2.8909984440763526e-06, "loss": 0.0014, "step": 26570 }, { "epoch": 6.768401392308346, "grad_norm": 0.0986538827419281, "learning_rate": 2.8878726113734367e-06, "loss": 0.0011, "step": 26575 }, { "epoch": 6.769674845063248, "grad_norm": 0.03042343258857727, "learning_rate": 2.884748184244357e-06, "loss": 0.0014, "step": 26580 }, { "epoch": 6.770948297818151, "grad_norm": 0.2603353261947632, "learning_rate": 2.8816251633065963e-06, "loss": 0.0019, "step": 26585 }, { "epoch": 6.7722217505730535, "grad_norm": 0.49429816007614136, "learning_rate": 2.878503549177355e-06, "loss": 0.002, "step": 26590 }, { "epoch": 6.773495203327957, "grad_norm": 0.6855365037918091, "learning_rate": 2.8753833424735676e-06, "loss": 0.0014, "step": 26595 }, { "epoch": 6.774768656082859, "grad_norm": 0.47322070598602295, "learning_rate": 2.8722645438118724e-06, "loss": 0.0011, "step": 26600 }, { "epoch": 6.776042108837762, "grad_norm": 0.08979947119951248, "learning_rate": 2.869147153808648e-06, "loss": 0.0008, "step": 26605 }, { "epoch": 6.777315561592665, "grad_norm": 0.5725526809692383, "learning_rate": 2.8660311730799816e-06, "loss": 0.0014, "step": 26610 }, { "epoch": 6.778589014347568, "grad_norm": 0.18272225558757782, "learning_rate": 2.8629166022416877e-06, "loss": 0.0009, "step": 26615 }, { "epoch": 6.77986246710247, "grad_norm": 0.11945068091154099, "learning_rate": 2.859803441909297e-06, "loss": 0.0019, "step": 26620 }, { "epoch": 6.781135919857373, "grad_norm": 0.6580724120140076, "learning_rate": 2.8566916926980746e-06, "loss": 0.0026, "step": 26625 }, { "epoch": 6.7824093726122765, "grad_norm": 0.13304440677165985, "learning_rate": 2.853581355222985e-06, "loss": 0.0021, "step": 26630 }, { "epoch": 6.783682825367179, "grad_norm": 0.22293759882450104, "learning_rate": 2.850472430098735e-06, "loss": 0.0022, "step": 26635 }, { "epoch": 6.784956278122082, "grad_norm": 0.26990368962287903, "learning_rate": 2.8473649179397413e-06, "loss": 0.0015, "step": 26640 }, { "epoch": 6.786229730876984, "grad_norm": 0.6140449047088623, "learning_rate": 2.8442588193601404e-06, "loss": 0.0026, "step": 26645 }, { "epoch": 6.7875031836318875, "grad_norm": 0.04206901043653488, "learning_rate": 2.8411541349737926e-06, "loss": 0.0015, "step": 26650 }, { "epoch": 6.78877663638679, "grad_norm": 0.1057676449418068, "learning_rate": 2.838050865394286e-06, "loss": 0.001, "step": 26655 }, { "epoch": 6.790050089141693, "grad_norm": 0.5271881818771362, "learning_rate": 2.8349490112349087e-06, "loss": 0.0015, "step": 26660 }, { "epoch": 6.791323541896595, "grad_norm": 0.1641429364681244, "learning_rate": 2.831848573108693e-06, "loss": 0.0012, "step": 26665 }, { "epoch": 6.792596994651499, "grad_norm": 0.3747046887874603, "learning_rate": 2.828749551628376e-06, "loss": 0.0017, "step": 26670 }, { "epoch": 6.793870447406401, "grad_norm": 0.5839846134185791, "learning_rate": 2.8256519474064193e-06, "loss": 0.0017, "step": 26675 }, { "epoch": 6.795143900161304, "grad_norm": 0.23635230958461761, "learning_rate": 2.8225557610550015e-06, "loss": 0.0018, "step": 26680 }, { "epoch": 6.796417352916206, "grad_norm": 0.4508510231971741, "learning_rate": 2.819460993186032e-06, "loss": 0.0014, "step": 26685 }, { "epoch": 6.79769080567111, "grad_norm": 0.10106133669614792, "learning_rate": 2.81636764441112e-06, "loss": 0.0014, "step": 26690 }, { "epoch": 6.798964258426013, "grad_norm": 0.09784376621246338, "learning_rate": 2.8132757153416155e-06, "loss": 0.002, "step": 26695 }, { "epoch": 6.800237711180915, "grad_norm": 1.5991064310073853, "learning_rate": 2.8101852065885737e-06, "loss": 0.0015, "step": 26700 }, { "epoch": 6.801511163935818, "grad_norm": 0.28875863552093506, "learning_rate": 2.8070961187627742e-06, "loss": 0.0021, "step": 26705 }, { "epoch": 6.802784616690721, "grad_norm": 0.19750674068927765, "learning_rate": 2.8040084524747136e-06, "loss": 0.0014, "step": 26710 }, { "epoch": 6.804058069445624, "grad_norm": 0.2675807774066925, "learning_rate": 2.8009222083346155e-06, "loss": 0.0024, "step": 26715 }, { "epoch": 6.805331522200526, "grad_norm": 0.10321002453565598, "learning_rate": 2.797837386952407e-06, "loss": 0.0016, "step": 26720 }, { "epoch": 6.806604974955429, "grad_norm": 0.12595969438552856, "learning_rate": 2.7947539889377496e-06, "loss": 0.0017, "step": 26725 }, { "epoch": 6.807878427710332, "grad_norm": 0.163230761885643, "learning_rate": 2.7916720149000155e-06, "loss": 0.003, "step": 26730 }, { "epoch": 6.809151880465235, "grad_norm": 0.6715885996818542, "learning_rate": 2.788591465448297e-06, "loss": 0.0019, "step": 26735 }, { "epoch": 6.810425333220137, "grad_norm": 0.26780107617378235, "learning_rate": 2.785512341191404e-06, "loss": 0.0015, "step": 26740 }, { "epoch": 6.81169878597504, "grad_norm": 0.0839952602982521, "learning_rate": 2.782434642737867e-06, "loss": 0.0011, "step": 26745 }, { "epoch": 6.812972238729943, "grad_norm": 0.17392659187316895, "learning_rate": 2.7793583706959295e-06, "loss": 0.0014, "step": 26750 }, { "epoch": 6.814245691484846, "grad_norm": 0.16394400596618652, "learning_rate": 2.776283525673562e-06, "loss": 0.002, "step": 26755 }, { "epoch": 6.815519144239749, "grad_norm": 0.35811495780944824, "learning_rate": 2.7732101082784458e-06, "loss": 0.0013, "step": 26760 }, { "epoch": 6.8167925969946515, "grad_norm": 0.13854296505451202, "learning_rate": 2.7701381191179822e-06, "loss": 0.0011, "step": 26765 }, { "epoch": 6.818066049749555, "grad_norm": 0.23202496767044067, "learning_rate": 2.767067558799291e-06, "loss": 0.0026, "step": 26770 }, { "epoch": 6.819339502504457, "grad_norm": 0.21358275413513184, "learning_rate": 2.7639984279292075e-06, "loss": 0.0019, "step": 26775 }, { "epoch": 6.82061295525936, "grad_norm": 0.3728529214859009, "learning_rate": 2.760930727114286e-06, "loss": 0.0013, "step": 26780 }, { "epoch": 6.8218864080142625, "grad_norm": 0.34266892075538635, "learning_rate": 2.757864456960798e-06, "loss": 0.001, "step": 26785 }, { "epoch": 6.823159860769166, "grad_norm": 0.14005902409553528, "learning_rate": 2.7547996180747294e-06, "loss": 0.0021, "step": 26790 }, { "epoch": 6.824433313524068, "grad_norm": 0.19626203179359436, "learning_rate": 2.7517362110617928e-06, "loss": 0.0012, "step": 26795 }, { "epoch": 6.825706766278971, "grad_norm": 0.050760798156261444, "learning_rate": 2.7486742365274065e-06, "loss": 0.0008, "step": 26800 }, { "epoch": 6.8269802190338735, "grad_norm": 0.20790982246398926, "learning_rate": 2.7456136950767122e-06, "loss": 0.0013, "step": 26805 }, { "epoch": 6.828253671788777, "grad_norm": 0.4426571726799011, "learning_rate": 2.7425545873145642e-06, "loss": 0.0012, "step": 26810 }, { "epoch": 6.829527124543679, "grad_norm": 0.1604200154542923, "learning_rate": 2.7394969138455387e-06, "loss": 0.0015, "step": 26815 }, { "epoch": 6.830800577298582, "grad_norm": 0.29707691073417664, "learning_rate": 2.7364406752739224e-06, "loss": 0.0012, "step": 26820 }, { "epoch": 6.8320740300534855, "grad_norm": 0.21119506657123566, "learning_rate": 2.73338587220372e-06, "loss": 0.0011, "step": 26825 }, { "epoch": 6.833347482808388, "grad_norm": 0.09959077835083008, "learning_rate": 2.7303325052386596e-06, "loss": 0.0017, "step": 26830 }, { "epoch": 6.834620935563291, "grad_norm": 0.432028204202652, "learning_rate": 2.7272805749821785e-06, "loss": 0.0026, "step": 26835 }, { "epoch": 6.835894388318193, "grad_norm": 0.05708153545856476, "learning_rate": 2.724230082037429e-06, "loss": 0.0011, "step": 26840 }, { "epoch": 6.8371678410730965, "grad_norm": 1.295064926147461, "learning_rate": 2.7211810270072826e-06, "loss": 0.0024, "step": 26845 }, { "epoch": 6.838441293827999, "grad_norm": 0.03441912308335304, "learning_rate": 2.718133410494327e-06, "loss": 0.0018, "step": 26850 }, { "epoch": 6.839714746582902, "grad_norm": 0.18657535314559937, "learning_rate": 2.7150872331008594e-06, "loss": 0.0013, "step": 26855 }, { "epoch": 6.840988199337804, "grad_norm": 0.1389084756374359, "learning_rate": 2.712042495428908e-06, "loss": 0.0009, "step": 26860 }, { "epoch": 6.8422616520927075, "grad_norm": 0.7296568155288696, "learning_rate": 2.708999198080194e-06, "loss": 0.0014, "step": 26865 }, { "epoch": 6.84353510484761, "grad_norm": 0.0888187512755394, "learning_rate": 2.7059573416561746e-06, "loss": 0.0007, "step": 26870 }, { "epoch": 6.844808557602513, "grad_norm": 0.23232683539390564, "learning_rate": 2.7029169267580112e-06, "loss": 0.0015, "step": 26875 }, { "epoch": 6.846082010357415, "grad_norm": 0.33897295594215393, "learning_rate": 2.6998779539865818e-06, "loss": 0.0011, "step": 26880 }, { "epoch": 6.847355463112319, "grad_norm": 0.13793320953845978, "learning_rate": 2.6968404239424783e-06, "loss": 0.0008, "step": 26885 }, { "epoch": 6.848628915867222, "grad_norm": 0.08259813487529755, "learning_rate": 2.693804337226018e-06, "loss": 0.0014, "step": 26890 }, { "epoch": 6.849902368622124, "grad_norm": 0.1557045876979828, "learning_rate": 2.690769694437212e-06, "loss": 0.0019, "step": 26895 }, { "epoch": 6.851175821377027, "grad_norm": 0.30324140191078186, "learning_rate": 2.687736496175808e-06, "loss": 0.0007, "step": 26900 }, { "epoch": 6.85244927413193, "grad_norm": 0.2570309638977051, "learning_rate": 2.6847047430412567e-06, "loss": 0.0015, "step": 26905 }, { "epoch": 6.853722726886833, "grad_norm": 0.4100746810436249, "learning_rate": 2.681674435632723e-06, "loss": 0.0011, "step": 26910 }, { "epoch": 6.854996179641735, "grad_norm": 0.12173288315534592, "learning_rate": 2.678645574549088e-06, "loss": 0.0015, "step": 26915 }, { "epoch": 6.856269632396638, "grad_norm": 1.6661938428878784, "learning_rate": 2.675618160388953e-06, "loss": 0.0026, "step": 26920 }, { "epoch": 6.857543085151541, "grad_norm": 0.1612020581960678, "learning_rate": 2.6725921937506194e-06, "loss": 0.0016, "step": 26925 }, { "epoch": 6.858816537906444, "grad_norm": 0.628300130367279, "learning_rate": 2.6695676752321167e-06, "loss": 0.0024, "step": 26930 }, { "epoch": 6.860089990661346, "grad_norm": 0.23442420363426208, "learning_rate": 2.6665446054311806e-06, "loss": 0.0017, "step": 26935 }, { "epoch": 6.861363443416249, "grad_norm": 0.08840113133192062, "learning_rate": 2.6635229849452616e-06, "loss": 0.0009, "step": 26940 }, { "epoch": 6.862636896171152, "grad_norm": 0.11562806367874146, "learning_rate": 2.660502814371522e-06, "loss": 0.0015, "step": 26945 }, { "epoch": 6.863910348926055, "grad_norm": 0.10112591832876205, "learning_rate": 2.657484094306848e-06, "loss": 0.0016, "step": 26950 }, { "epoch": 6.865183801680958, "grad_norm": 0.37177714705467224, "learning_rate": 2.654466825347819e-06, "loss": 0.0015, "step": 26955 }, { "epoch": 6.86645725443586, "grad_norm": 0.3012223243713379, "learning_rate": 2.6514510080907495e-06, "loss": 0.0006, "step": 26960 }, { "epoch": 6.867730707190763, "grad_norm": 0.2291334867477417, "learning_rate": 2.6484366431316534e-06, "loss": 0.0012, "step": 26965 }, { "epoch": 6.869004159945666, "grad_norm": 0.06081872805953026, "learning_rate": 2.6454237310662624e-06, "loss": 0.0008, "step": 26970 }, { "epoch": 6.870277612700569, "grad_norm": 0.4403991103172302, "learning_rate": 2.6424122724900157e-06, "loss": 0.0013, "step": 26975 }, { "epoch": 6.8715510654554715, "grad_norm": 0.22714418172836304, "learning_rate": 2.639402267998079e-06, "loss": 0.0005, "step": 26980 }, { "epoch": 6.872824518210375, "grad_norm": 0.2694714367389679, "learning_rate": 2.6363937181853094e-06, "loss": 0.0011, "step": 26985 }, { "epoch": 6.874097970965277, "grad_norm": 0.09496811032295227, "learning_rate": 2.633386623646298e-06, "loss": 0.0007, "step": 26990 }, { "epoch": 6.87537142372018, "grad_norm": 0.17384277284145355, "learning_rate": 2.6303809849753346e-06, "loss": 0.0008, "step": 26995 }, { "epoch": 6.8766448764750825, "grad_norm": 0.08038120716810226, "learning_rate": 2.627376802766426e-06, "loss": 0.0009, "step": 27000 }, { "epoch": 6.877918329229986, "grad_norm": 0.4152168035507202, "learning_rate": 2.6243740776132875e-06, "loss": 0.0027, "step": 27005 }, { "epoch": 6.879191781984888, "grad_norm": 0.5290306806564331, "learning_rate": 2.6213728101093573e-06, "loss": 0.0012, "step": 27010 }, { "epoch": 6.880465234739791, "grad_norm": 0.30385786294937134, "learning_rate": 2.6183730008477657e-06, "loss": 0.0014, "step": 27015 }, { "epoch": 6.881738687494694, "grad_norm": 0.14252787828445435, "learning_rate": 2.6153746504213783e-06, "loss": 0.0005, "step": 27020 }, { "epoch": 6.883012140249597, "grad_norm": 0.07795543968677521, "learning_rate": 2.612377759422755e-06, "loss": 0.0014, "step": 27025 }, { "epoch": 6.884285593004499, "grad_norm": 0.08037873357534409, "learning_rate": 2.6093823284441746e-06, "loss": 0.0006, "step": 27030 }, { "epoch": 6.885559045759402, "grad_norm": 0.11283718049526215, "learning_rate": 2.606388358077624e-06, "loss": 0.0012, "step": 27035 }, { "epoch": 6.8868324985143055, "grad_norm": 0.09469795227050781, "learning_rate": 2.603395848914806e-06, "loss": 0.0021, "step": 27040 }, { "epoch": 6.888105951269208, "grad_norm": 0.03929697722196579, "learning_rate": 2.600404801547126e-06, "loss": 0.0007, "step": 27045 }, { "epoch": 6.889379404024111, "grad_norm": 1.0556514263153076, "learning_rate": 2.5974152165657185e-06, "loss": 0.0016, "step": 27050 }, { "epoch": 6.890652856779013, "grad_norm": 0.22343844175338745, "learning_rate": 2.5944270945614023e-06, "loss": 0.0013, "step": 27055 }, { "epoch": 6.8919263095339165, "grad_norm": 0.03165876865386963, "learning_rate": 2.5914404361247315e-06, "loss": 0.001, "step": 27060 }, { "epoch": 6.893199762288819, "grad_norm": 0.2658275067806244, "learning_rate": 2.5884552418459585e-06, "loss": 0.0013, "step": 27065 }, { "epoch": 6.894473215043722, "grad_norm": 0.043835435062646866, "learning_rate": 2.5854715123150497e-06, "loss": 0.0004, "step": 27070 }, { "epoch": 6.895746667798624, "grad_norm": 0.1611531674861908, "learning_rate": 2.582489248121677e-06, "loss": 0.0042, "step": 27075 }, { "epoch": 6.8970201205535275, "grad_norm": 0.08550646901130676, "learning_rate": 2.5795084498552357e-06, "loss": 0.001, "step": 27080 }, { "epoch": 6.898293573308431, "grad_norm": 0.26905936002731323, "learning_rate": 2.576529118104811e-06, "loss": 0.0007, "step": 27085 }, { "epoch": 6.899567026063333, "grad_norm": 0.7556374669075012, "learning_rate": 2.573551253459221e-06, "loss": 0.0022, "step": 27090 }, { "epoch": 6.900840478818235, "grad_norm": 0.08040893077850342, "learning_rate": 2.5705748565069766e-06, "loss": 0.0024, "step": 27095 }, { "epoch": 6.902113931573139, "grad_norm": 0.050003450363874435, "learning_rate": 2.5675999278363074e-06, "loss": 0.001, "step": 27100 }, { "epoch": 6.903387384328042, "grad_norm": 0.21054670214653015, "learning_rate": 2.5646264680351483e-06, "loss": 0.0011, "step": 27105 }, { "epoch": 6.904660837082944, "grad_norm": 0.8928487300872803, "learning_rate": 2.561654477691148e-06, "loss": 0.0009, "step": 27110 }, { "epoch": 6.905934289837847, "grad_norm": 0.07235687971115112, "learning_rate": 2.5586839573916565e-06, "loss": 0.0012, "step": 27115 }, { "epoch": 6.90720774259275, "grad_norm": 0.2982655167579651, "learning_rate": 2.5557149077237498e-06, "loss": 0.0025, "step": 27120 }, { "epoch": 6.908481195347653, "grad_norm": 0.2579464316368103, "learning_rate": 2.5527473292741953e-06, "loss": 0.0011, "step": 27125 }, { "epoch": 6.909754648102555, "grad_norm": 0.16077622771263123, "learning_rate": 2.5497812226294806e-06, "loss": 0.0012, "step": 27130 }, { "epoch": 6.911028100857458, "grad_norm": 0.08134682476520538, "learning_rate": 2.546816588375798e-06, "loss": 0.0015, "step": 27135 }, { "epoch": 6.912301553612361, "grad_norm": 0.2654016315937042, "learning_rate": 2.543853427099049e-06, "loss": 0.001, "step": 27140 }, { "epoch": 6.913575006367264, "grad_norm": 0.07035350799560547, "learning_rate": 2.5408917393848463e-06, "loss": 0.0036, "step": 27145 }, { "epoch": 6.914848459122167, "grad_norm": 0.18970957398414612, "learning_rate": 2.5379315258185068e-06, "loss": 0.001, "step": 27150 }, { "epoch": 6.916121911877069, "grad_norm": 0.18906250596046448, "learning_rate": 2.5349727869850638e-06, "loss": 0.001, "step": 27155 }, { "epoch": 6.917395364631972, "grad_norm": 0.4032861292362213, "learning_rate": 2.532015523469252e-06, "loss": 0.0009, "step": 27160 }, { "epoch": 6.918668817386875, "grad_norm": 0.05799808353185654, "learning_rate": 2.529059735855518e-06, "loss": 0.0027, "step": 27165 }, { "epoch": 6.919942270141778, "grad_norm": 0.06794276088476181, "learning_rate": 2.526105424728016e-06, "loss": 0.0015, "step": 27170 }, { "epoch": 6.92121572289668, "grad_norm": 0.14930406212806702, "learning_rate": 2.523152590670608e-06, "loss": 0.0022, "step": 27175 }, { "epoch": 6.922489175651584, "grad_norm": 0.1775776445865631, "learning_rate": 2.520201234266859e-06, "loss": 0.0018, "step": 27180 }, { "epoch": 6.923762628406486, "grad_norm": 0.2978338599205017, "learning_rate": 2.51725135610006e-06, "loss": 0.0016, "step": 27185 }, { "epoch": 6.925036081161389, "grad_norm": 0.18787500262260437, "learning_rate": 2.5143029567531816e-06, "loss": 0.0022, "step": 27190 }, { "epoch": 6.9263095339162914, "grad_norm": 0.7221565246582031, "learning_rate": 2.5113560368089296e-06, "loss": 0.001, "step": 27195 }, { "epoch": 6.927582986671195, "grad_norm": 0.7676315903663635, "learning_rate": 2.508410596849701e-06, "loss": 0.0007, "step": 27200 }, { "epoch": 6.928856439426097, "grad_norm": 0.026200883090496063, "learning_rate": 2.5054666374576053e-06, "loss": 0.0006, "step": 27205 }, { "epoch": 6.930129892181, "grad_norm": 0.15126048028469086, "learning_rate": 2.5025241592144546e-06, "loss": 0.0013, "step": 27210 }, { "epoch": 6.931403344935903, "grad_norm": 0.06234501674771309, "learning_rate": 2.499583162701783e-06, "loss": 0.001, "step": 27215 }, { "epoch": 6.932676797690806, "grad_norm": 0.04396003484725952, "learning_rate": 2.4966436485008094e-06, "loss": 0.0009, "step": 27220 }, { "epoch": 6.933950250445708, "grad_norm": 0.09449639171361923, "learning_rate": 2.49370561719248e-06, "loss": 0.0012, "step": 27225 }, { "epoch": 6.935223703200611, "grad_norm": 0.18157580494880676, "learning_rate": 2.490769069357437e-06, "loss": 0.0006, "step": 27230 }, { "epoch": 6.936497155955514, "grad_norm": 0.09444716572761536, "learning_rate": 2.487834005576032e-06, "loss": 0.0011, "step": 27235 }, { "epoch": 6.937770608710417, "grad_norm": 0.07910092175006866, "learning_rate": 2.4849004264283193e-06, "loss": 0.0009, "step": 27240 }, { "epoch": 6.93904406146532, "grad_norm": 0.20453794300556183, "learning_rate": 2.4819683324940748e-06, "loss": 0.0011, "step": 27245 }, { "epoch": 6.940317514220222, "grad_norm": 0.04707667976617813, "learning_rate": 2.4790377243527566e-06, "loss": 0.0011, "step": 27250 }, { "epoch": 6.9415909669751255, "grad_norm": 0.20168977975845337, "learning_rate": 2.476108602583551e-06, "loss": 0.001, "step": 27255 }, { "epoch": 6.942864419730028, "grad_norm": 0.3596051335334778, "learning_rate": 2.473180967765341e-06, "loss": 0.0017, "step": 27260 }, { "epoch": 6.944137872484931, "grad_norm": 0.08074595034122467, "learning_rate": 2.470254820476714e-06, "loss": 0.0021, "step": 27265 }, { "epoch": 6.945411325239833, "grad_norm": 0.311870813369751, "learning_rate": 2.4673301612959653e-06, "loss": 0.0012, "step": 27270 }, { "epoch": 6.9466847779947365, "grad_norm": 0.28228095173835754, "learning_rate": 2.4644069908011058e-06, "loss": 0.0018, "step": 27275 }, { "epoch": 6.947958230749639, "grad_norm": Infinity, "learning_rate": 2.4620695266472804e-06, "loss": 0.0027, "step": 27280 }, { "epoch": 6.949231683504542, "grad_norm": 0.1215919554233551, "learning_rate": 2.459149037242633e-06, "loss": 0.0013, "step": 27285 }, { "epoch": 6.950505136259444, "grad_norm": 0.2996014952659607, "learning_rate": 2.4562300381407047e-06, "loss": 0.0007, "step": 27290 }, { "epoch": 6.9517785890143475, "grad_norm": 0.24746063351631165, "learning_rate": 2.4533125299183903e-06, "loss": 0.0012, "step": 27295 }, { "epoch": 6.953052041769251, "grad_norm": 0.6980964541435242, "learning_rate": 2.4503965131522646e-06, "loss": 0.0011, "step": 27300 }, { "epoch": 6.954325494524153, "grad_norm": 0.10259774327278137, "learning_rate": 2.4474819884186305e-06, "loss": 0.0018, "step": 27305 }, { "epoch": 6.955598947279056, "grad_norm": 0.15012967586517334, "learning_rate": 2.444568956293486e-06, "loss": 0.0008, "step": 27310 }, { "epoch": 6.956872400033959, "grad_norm": 0.11375753581523895, "learning_rate": 2.441657417352534e-06, "loss": 0.002, "step": 27315 }, { "epoch": 6.958145852788862, "grad_norm": 0.07054677605628967, "learning_rate": 2.438747372171182e-06, "loss": 0.0014, "step": 27320 }, { "epoch": 6.959419305543764, "grad_norm": 0.1734706312417984, "learning_rate": 2.435838821324551e-06, "loss": 0.0011, "step": 27325 }, { "epoch": 6.960692758298667, "grad_norm": 0.27339789271354675, "learning_rate": 2.4329317653874485e-06, "loss": 0.0019, "step": 27330 }, { "epoch": 6.96196621105357, "grad_norm": 0.2619645893573761, "learning_rate": 2.430026204934406e-06, "loss": 0.0014, "step": 27335 }, { "epoch": 6.963239663808473, "grad_norm": 0.2216620296239853, "learning_rate": 2.4271221405396496e-06, "loss": 0.0012, "step": 27340 }, { "epoch": 6.964513116563375, "grad_norm": 0.351624995470047, "learning_rate": 2.4242195727771102e-06, "loss": 0.0006, "step": 27345 }, { "epoch": 6.965786569318278, "grad_norm": 0.1547124832868576, "learning_rate": 2.4213185022204213e-06, "loss": 0.0015, "step": 27350 }, { "epoch": 6.967060022073181, "grad_norm": 0.1562298983335495, "learning_rate": 2.4184189294429304e-06, "loss": 0.0021, "step": 27355 }, { "epoch": 6.968333474828084, "grad_norm": 0.12707296013832092, "learning_rate": 2.4155208550176724e-06, "loss": 0.0014, "step": 27360 }, { "epoch": 6.969606927582987, "grad_norm": 0.5370073318481445, "learning_rate": 2.4126242795174025e-06, "loss": 0.0014, "step": 27365 }, { "epoch": 6.970880380337889, "grad_norm": 0.1056453213095665, "learning_rate": 2.4097292035145714e-06, "loss": 0.0009, "step": 27370 }, { "epoch": 6.972153833092793, "grad_norm": 0.15179137885570526, "learning_rate": 2.4068356275813333e-06, "loss": 0.001, "step": 27375 }, { "epoch": 6.973427285847695, "grad_norm": 0.18551287055015564, "learning_rate": 2.403943552289545e-06, "loss": 0.0017, "step": 27380 }, { "epoch": 6.974700738602598, "grad_norm": 0.22094641625881195, "learning_rate": 2.401052978210776e-06, "loss": 0.0012, "step": 27385 }, { "epoch": 6.9759741913575, "grad_norm": 0.3106890320777893, "learning_rate": 2.398163905916283e-06, "loss": 0.0012, "step": 27390 }, { "epoch": 6.977247644112404, "grad_norm": 0.02032497338950634, "learning_rate": 2.3952763359770427e-06, "loss": 0.0006, "step": 27395 }, { "epoch": 6.978521096867306, "grad_norm": 0.37391048669815063, "learning_rate": 2.3923902689637236e-06, "loss": 0.0014, "step": 27400 }, { "epoch": 6.979794549622209, "grad_norm": 0.3754820227622986, "learning_rate": 2.3895057054467006e-06, "loss": 0.0013, "step": 27405 }, { "epoch": 6.9810680023771114, "grad_norm": 0.34744858741760254, "learning_rate": 2.386622645996052e-06, "loss": 0.0012, "step": 27410 }, { "epoch": 6.982341455132015, "grad_norm": 0.27761679887771606, "learning_rate": 2.383741091181563e-06, "loss": 0.001, "step": 27415 }, { "epoch": 6.983614907886917, "grad_norm": 0.39619186520576477, "learning_rate": 2.3808610415727073e-06, "loss": 0.0011, "step": 27420 }, { "epoch": 6.98488836064182, "grad_norm": 0.883313775062561, "learning_rate": 2.377982497738679e-06, "loss": 0.0017, "step": 27425 }, { "epoch": 6.986161813396723, "grad_norm": 0.32193124294281006, "learning_rate": 2.3751054602483647e-06, "loss": 0.001, "step": 27430 }, { "epoch": 6.987435266151626, "grad_norm": 0.3773587942123413, "learning_rate": 2.3722299296703533e-06, "loss": 0.0022, "step": 27435 }, { "epoch": 6.988708718906529, "grad_norm": 0.16544559597969055, "learning_rate": 2.3693559065729353e-06, "loss": 0.0034, "step": 27440 }, { "epoch": 6.989982171661431, "grad_norm": 0.22914361953735352, "learning_rate": 2.366483391524115e-06, "loss": 0.0011, "step": 27445 }, { "epoch": 6.991255624416334, "grad_norm": 0.26918599009513855, "learning_rate": 2.363612385091576e-06, "loss": 0.0018, "step": 27450 }, { "epoch": 6.992529077171237, "grad_norm": 0.08936522901058197, "learning_rate": 2.3607428878427274e-06, "loss": 0.0012, "step": 27455 }, { "epoch": 6.99380252992614, "grad_norm": 0.10485661029815674, "learning_rate": 2.3578749003446657e-06, "loss": 0.0011, "step": 27460 }, { "epoch": 6.995075982681042, "grad_norm": 0.14582093060016632, "learning_rate": 2.3550084231641923e-06, "loss": 0.0013, "step": 27465 }, { "epoch": 6.9963494354359455, "grad_norm": 0.13021999597549438, "learning_rate": 2.3521434568678123e-06, "loss": 0.0019, "step": 27470 }, { "epoch": 6.997622888190848, "grad_norm": 0.3123355209827423, "learning_rate": 2.34928000202173e-06, "loss": 0.001, "step": 27475 }, { "epoch": 6.998896340945751, "grad_norm": 0.44629037380218506, "learning_rate": 2.3464180591918484e-06, "loss": 0.0019, "step": 27480 }, { "epoch": 7.000169793700653, "grad_norm": 0.403434693813324, "learning_rate": 2.343557628943781e-06, "loss": 0.0017, "step": 27485 }, { "epoch": 7.0014432464555565, "grad_norm": 0.05031701177358627, "learning_rate": 2.3406987118428338e-06, "loss": 0.0006, "step": 27490 }, { "epoch": 7.00271669921046, "grad_norm": 0.4266816973686218, "learning_rate": 2.3378413084540154e-06, "loss": 0.0015, "step": 27495 }, { "epoch": 7.003990151965362, "grad_norm": 0.1874672919511795, "learning_rate": 2.334985419342035e-06, "loss": 0.0006, "step": 27500 }, { "epoch": 7.005263604720265, "grad_norm": 0.03442785516381264, "learning_rate": 2.3321310450713066e-06, "loss": 0.0008, "step": 27505 }, { "epoch": 7.0065370574751675, "grad_norm": 0.06763175874948502, "learning_rate": 2.32927818620594e-06, "loss": 0.0011, "step": 27510 }, { "epoch": 7.007810510230071, "grad_norm": 0.05501735210418701, "learning_rate": 2.3264268433097446e-06, "loss": 0.0007, "step": 27515 }, { "epoch": 7.009083962984973, "grad_norm": 0.1366427093744278, "learning_rate": 2.323577016946239e-06, "loss": 0.0008, "step": 27520 }, { "epoch": 7.010357415739876, "grad_norm": 0.03031003102660179, "learning_rate": 2.320728707678633e-06, "loss": 0.001, "step": 27525 }, { "epoch": 7.011630868494779, "grad_norm": 0.1458681970834732, "learning_rate": 2.3178819160698395e-06, "loss": 0.0009, "step": 27530 }, { "epoch": 7.012904321249682, "grad_norm": 0.09608486294746399, "learning_rate": 2.315036642682471e-06, "loss": 0.0005, "step": 27535 }, { "epoch": 7.014177774004584, "grad_norm": 0.25446009635925293, "learning_rate": 2.3121928880788424e-06, "loss": 0.0014, "step": 27540 }, { "epoch": 7.015451226759487, "grad_norm": 0.5342730283737183, "learning_rate": 2.3093506528209642e-06, "loss": 0.0005, "step": 27545 }, { "epoch": 7.01672467951439, "grad_norm": 0.2438707798719406, "learning_rate": 2.3065099374705503e-06, "loss": 0.0013, "step": 27550 }, { "epoch": 7.017998132269293, "grad_norm": 0.09319956600666046, "learning_rate": 2.3036707425890102e-06, "loss": 0.0007, "step": 27555 }, { "epoch": 7.019271585024196, "grad_norm": 0.1894800215959549, "learning_rate": 2.3008330687374613e-06, "loss": 0.0012, "step": 27560 }, { "epoch": 7.020545037779098, "grad_norm": 0.06957361102104187, "learning_rate": 2.2979969164767115e-06, "loss": 0.0008, "step": 27565 }, { "epoch": 7.0218184905340015, "grad_norm": 0.14305247366428375, "learning_rate": 2.2951622863672708e-06, "loss": 0.0005, "step": 27570 }, { "epoch": 7.023091943288904, "grad_norm": 0.19173955917358398, "learning_rate": 2.292329178969349e-06, "loss": 0.0017, "step": 27575 }, { "epoch": 7.024365396043807, "grad_norm": 0.04975759610533714, "learning_rate": 2.2894975948428556e-06, "loss": 0.0011, "step": 27580 }, { "epoch": 7.025638848798709, "grad_norm": 0.07980827242136002, "learning_rate": 2.2866675345473944e-06, "loss": 0.0009, "step": 27585 }, { "epoch": 7.026912301553613, "grad_norm": 0.17753705382347107, "learning_rate": 2.2838389986422816e-06, "loss": 0.0009, "step": 27590 }, { "epoch": 7.028185754308515, "grad_norm": 0.0810336321592331, "learning_rate": 2.281011987686509e-06, "loss": 0.0007, "step": 27595 }, { "epoch": 7.029459207063418, "grad_norm": 0.02018255926668644, "learning_rate": 2.2781865022387905e-06, "loss": 0.0013, "step": 27600 }, { "epoch": 7.03073265981832, "grad_norm": 0.04532336816191673, "learning_rate": 2.2753625428575253e-06, "loss": 0.0006, "step": 27605 }, { "epoch": 7.032006112573224, "grad_norm": 0.13983944058418274, "learning_rate": 2.2725401101008137e-06, "loss": 0.0008, "step": 27610 }, { "epoch": 7.033279565328126, "grad_norm": 0.027133949100971222, "learning_rate": 2.269719204526453e-06, "loss": 0.0009, "step": 27615 }, { "epoch": 7.034553018083029, "grad_norm": 0.29501381516456604, "learning_rate": 2.2668998266919474e-06, "loss": 0.0009, "step": 27620 }, { "epoch": 7.035826470837932, "grad_norm": 0.4035589396953583, "learning_rate": 2.264081977154482e-06, "loss": 0.0015, "step": 27625 }, { "epoch": 7.037099923592835, "grad_norm": 0.6139189004898071, "learning_rate": 2.261265656470957e-06, "loss": 0.0007, "step": 27630 }, { "epoch": 7.038373376347738, "grad_norm": 0.14093893766403198, "learning_rate": 2.2584508651979618e-06, "loss": 0.0007, "step": 27635 }, { "epoch": 7.03964682910264, "grad_norm": 0.021331727504730225, "learning_rate": 2.255637603891786e-06, "loss": 0.0011, "step": 27640 }, { "epoch": 7.040920281857543, "grad_norm": 0.027051450684666634, "learning_rate": 2.2528258731084117e-06, "loss": 0.0011, "step": 27645 }, { "epoch": 7.042193734612446, "grad_norm": 0.03294706717133522, "learning_rate": 2.250015673403533e-06, "loss": 0.0007, "step": 27650 }, { "epoch": 7.043467187367349, "grad_norm": 0.07566328346729279, "learning_rate": 2.2472070053325178e-06, "loss": 0.0006, "step": 27655 }, { "epoch": 7.044740640122251, "grad_norm": 0.03758895397186279, "learning_rate": 2.244399869450454e-06, "loss": 0.0006, "step": 27660 }, { "epoch": 7.046014092877154, "grad_norm": 0.4192628264427185, "learning_rate": 2.241594266312116e-06, "loss": 0.0006, "step": 27665 }, { "epoch": 7.047287545632057, "grad_norm": 0.015474233776330948, "learning_rate": 2.238790196471976e-06, "loss": 0.0006, "step": 27670 }, { "epoch": 7.04856099838696, "grad_norm": 0.03179602697491646, "learning_rate": 2.2359876604842e-06, "loss": 0.0008, "step": 27675 }, { "epoch": 7.049834451141862, "grad_norm": 0.05915069207549095, "learning_rate": 2.233186658902665e-06, "loss": 0.0011, "step": 27680 }, { "epoch": 7.0511079038967654, "grad_norm": 0.04895728826522827, "learning_rate": 2.2303871922809227e-06, "loss": 0.0008, "step": 27685 }, { "epoch": 7.052381356651669, "grad_norm": 0.06213415414094925, "learning_rate": 2.2275892611722403e-06, "loss": 0.0007, "step": 27690 }, { "epoch": 7.053654809406571, "grad_norm": 0.04978347197175026, "learning_rate": 2.224792866129575e-06, "loss": 0.0007, "step": 27695 }, { "epoch": 7.054928262161474, "grad_norm": 0.05627454072237015, "learning_rate": 2.2219980077055756e-06, "loss": 0.0007, "step": 27700 }, { "epoch": 7.0562017149163765, "grad_norm": 0.10711448639631271, "learning_rate": 2.2192046864525927e-06, "loss": 0.0005, "step": 27705 }, { "epoch": 7.05747516767128, "grad_norm": 0.04697881266474724, "learning_rate": 2.216412902922679e-06, "loss": 0.0004, "step": 27710 }, { "epoch": 7.058748620426182, "grad_norm": 0.023626018315553665, "learning_rate": 2.2136226576675647e-06, "loss": 0.0007, "step": 27715 }, { "epoch": 7.060022073181085, "grad_norm": 0.35820600390434265, "learning_rate": 2.2108339512386957e-06, "loss": 0.0021, "step": 27720 }, { "epoch": 7.0612955259359875, "grad_norm": 0.07340419292449951, "learning_rate": 2.2080467841872033e-06, "loss": 0.0011, "step": 27725 }, { "epoch": 7.062568978690891, "grad_norm": 0.050871025770902634, "learning_rate": 2.205261157063917e-06, "loss": 0.001, "step": 27730 }, { "epoch": 7.063842431445793, "grad_norm": 0.128639817237854, "learning_rate": 2.202477070419359e-06, "loss": 0.0009, "step": 27735 }, { "epoch": 7.065115884200696, "grad_norm": 0.07851850241422653, "learning_rate": 2.1996945248037592e-06, "loss": 0.0006, "step": 27740 }, { "epoch": 7.066389336955599, "grad_norm": 0.06342916935682297, "learning_rate": 2.196913520767021e-06, "loss": 0.0011, "step": 27745 }, { "epoch": 7.067662789710502, "grad_norm": 0.01486915722489357, "learning_rate": 2.1941340588587656e-06, "loss": 0.0007, "step": 27750 }, { "epoch": 7.068936242465405, "grad_norm": 0.032177530229091644, "learning_rate": 2.1913561396282957e-06, "loss": 0.0002, "step": 27755 }, { "epoch": 7.070209695220307, "grad_norm": 0.40311554074287415, "learning_rate": 2.188579763624613e-06, "loss": 0.0006, "step": 27760 }, { "epoch": 7.0714831479752105, "grad_norm": 0.07237514853477478, "learning_rate": 2.1858049313964124e-06, "loss": 0.0014, "step": 27765 }, { "epoch": 7.072756600730113, "grad_norm": 0.044421352446079254, "learning_rate": 2.1830316434920938e-06, "loss": 0.0004, "step": 27770 }, { "epoch": 7.074030053485016, "grad_norm": 0.1045602485537529, "learning_rate": 2.1802599004597334e-06, "loss": 0.001, "step": 27775 }, { "epoch": 7.075303506239918, "grad_norm": 0.044587962329387665, "learning_rate": 2.1774897028471185e-06, "loss": 0.0006, "step": 27780 }, { "epoch": 7.0765769589948215, "grad_norm": 0.33050668239593506, "learning_rate": 2.174721051201724e-06, "loss": 0.0009, "step": 27785 }, { "epoch": 7.077850411749724, "grad_norm": 0.09477122873067856, "learning_rate": 2.1719539460707185e-06, "loss": 0.0008, "step": 27790 }, { "epoch": 7.079123864504627, "grad_norm": 0.014981633052229881, "learning_rate": 2.169188388000968e-06, "loss": 0.0006, "step": 27795 }, { "epoch": 7.080397317259529, "grad_norm": 0.09329162538051605, "learning_rate": 2.1664243775390314e-06, "loss": 0.0009, "step": 27800 }, { "epoch": 7.081670770014433, "grad_norm": 0.040172331035137177, "learning_rate": 2.1636619152311576e-06, "loss": 0.0007, "step": 27805 }, { "epoch": 7.082944222769335, "grad_norm": 0.04189338535070419, "learning_rate": 2.1609010016233024e-06, "loss": 0.0004, "step": 27810 }, { "epoch": 7.084217675524238, "grad_norm": 0.22046217322349548, "learning_rate": 2.1581416372610965e-06, "loss": 0.0007, "step": 27815 }, { "epoch": 7.085491128279141, "grad_norm": 0.566709041595459, "learning_rate": 2.1553838226898814e-06, "loss": 0.0011, "step": 27820 }, { "epoch": 7.086764581034044, "grad_norm": 0.07459443062543869, "learning_rate": 2.1526275584546853e-06, "loss": 0.0004, "step": 27825 }, { "epoch": 7.088038033788947, "grad_norm": 0.09185706824064255, "learning_rate": 2.149872845100228e-06, "loss": 0.0015, "step": 27830 }, { "epoch": 7.089311486543849, "grad_norm": 0.04598160460591316, "learning_rate": 2.1471196831709263e-06, "loss": 0.0005, "step": 27835 }, { "epoch": 7.090584939298752, "grad_norm": 0.06894075125455856, "learning_rate": 2.1443680732108886e-06, "loss": 0.0009, "step": 27840 }, { "epoch": 7.091858392053655, "grad_norm": 0.12199904024600983, "learning_rate": 2.1416180157639142e-06, "loss": 0.0009, "step": 27845 }, { "epoch": 7.093131844808558, "grad_norm": 0.02968740463256836, "learning_rate": 2.1388695113735047e-06, "loss": 0.0019, "step": 27850 }, { "epoch": 7.09440529756346, "grad_norm": 0.04785989969968796, "learning_rate": 2.136122560582846e-06, "loss": 0.0006, "step": 27855 }, { "epoch": 7.095678750318363, "grad_norm": 0.02865496650338173, "learning_rate": 2.133377163934819e-06, "loss": 0.0005, "step": 27860 }, { "epoch": 7.096952203073266, "grad_norm": 0.06112245097756386, "learning_rate": 2.130633321971998e-06, "loss": 0.0009, "step": 27865 }, { "epoch": 7.098225655828169, "grad_norm": 0.5386813879013062, "learning_rate": 2.12789103523665e-06, "loss": 0.0017, "step": 27870 }, { "epoch": 7.099499108583071, "grad_norm": 0.26158493757247925, "learning_rate": 2.1251503042707343e-06, "loss": 0.0011, "step": 27875 }, { "epoch": 7.100772561337974, "grad_norm": 0.13970233500003815, "learning_rate": 2.122411129615902e-06, "loss": 0.0009, "step": 27880 }, { "epoch": 7.102046014092877, "grad_norm": 0.03266651928424835, "learning_rate": 2.1196735118135027e-06, "loss": 0.0012, "step": 27885 }, { "epoch": 7.10331946684778, "grad_norm": 0.08322858810424805, "learning_rate": 2.1169374514045692e-06, "loss": 0.0007, "step": 27890 }, { "epoch": 7.104592919602683, "grad_norm": 0.009622827172279358, "learning_rate": 2.1142029489298322e-06, "loss": 0.0008, "step": 27895 }, { "epoch": 7.1058663723575854, "grad_norm": 0.05385354906320572, "learning_rate": 2.111470004929713e-06, "loss": 0.0007, "step": 27900 }, { "epoch": 7.107139825112489, "grad_norm": 0.027141794562339783, "learning_rate": 2.108738619944324e-06, "loss": 0.0006, "step": 27905 }, { "epoch": 7.108413277867391, "grad_norm": 0.05689695477485657, "learning_rate": 2.1060087945134677e-06, "loss": 0.0004, "step": 27910 }, { "epoch": 7.109686730622294, "grad_norm": 0.13069164752960205, "learning_rate": 2.1032805291766502e-06, "loss": 0.0009, "step": 27915 }, { "epoch": 7.1109601833771965, "grad_norm": 0.05188168212771416, "learning_rate": 2.1005538244730493e-06, "loss": 0.0007, "step": 27920 }, { "epoch": 7.1122336361321, "grad_norm": 0.09342820942401886, "learning_rate": 2.0978286809415516e-06, "loss": 0.0011, "step": 27925 }, { "epoch": 7.113507088887002, "grad_norm": 0.0403350293636322, "learning_rate": 2.0951050991207276e-06, "loss": 0.001, "step": 27930 }, { "epoch": 7.114780541641905, "grad_norm": 0.16064514219760895, "learning_rate": 2.0923830795488407e-06, "loss": 0.0003, "step": 27935 }, { "epoch": 7.1160539943968075, "grad_norm": 0.038105178624391556, "learning_rate": 2.0896626227638406e-06, "loss": 0.0006, "step": 27940 }, { "epoch": 7.117327447151711, "grad_norm": 0.061245739459991455, "learning_rate": 2.0869437293033835e-06, "loss": 0.0018, "step": 27945 }, { "epoch": 7.118600899906613, "grad_norm": 0.05362643674015999, "learning_rate": 2.084226399704793e-06, "loss": 0.0008, "step": 27950 }, { "epoch": 7.119874352661516, "grad_norm": 0.05654933303594589, "learning_rate": 2.081510634505105e-06, "loss": 0.0011, "step": 27955 }, { "epoch": 7.1211478054164195, "grad_norm": 0.02250077575445175, "learning_rate": 2.078796434241035e-06, "loss": 0.0003, "step": 27960 }, { "epoch": 7.122421258171322, "grad_norm": 0.03090522438287735, "learning_rate": 2.0760837994489924e-06, "loss": 0.0006, "step": 27965 }, { "epoch": 7.123694710926225, "grad_norm": 0.030941566452383995, "learning_rate": 2.073372730665074e-06, "loss": 0.0007, "step": 27970 }, { "epoch": 7.124968163681127, "grad_norm": 0.006548775359988213, "learning_rate": 2.0706632284250783e-06, "loss": 0.0012, "step": 27975 }, { "epoch": 7.1262416164360305, "grad_norm": 0.18445608019828796, "learning_rate": 2.067955293264474e-06, "loss": 0.0006, "step": 27980 }, { "epoch": 7.127515069190933, "grad_norm": 0.014254794456064701, "learning_rate": 2.0652489257184416e-06, "loss": 0.0005, "step": 27985 }, { "epoch": 7.128788521945836, "grad_norm": 0.09091099351644516, "learning_rate": 2.0625441263218373e-06, "loss": 0.0008, "step": 27990 }, { "epoch": 7.130061974700738, "grad_norm": 0.09675980359315872, "learning_rate": 2.059840895609214e-06, "loss": 0.0012, "step": 27995 }, { "epoch": 7.1313354274556415, "grad_norm": 0.07120856642723083, "learning_rate": 2.057139234114809e-06, "loss": 0.0004, "step": 28000 }, { "epoch": 7.132608880210544, "grad_norm": 0.04743858054280281, "learning_rate": 2.0544391423725607e-06, "loss": 0.0005, "step": 28005 }, { "epoch": 7.133882332965447, "grad_norm": 0.1690356433391571, "learning_rate": 2.0517406209160815e-06, "loss": 0.0004, "step": 28010 }, { "epoch": 7.135155785720349, "grad_norm": 0.08029022812843323, "learning_rate": 2.0490436702786874e-06, "loss": 0.0009, "step": 28015 }, { "epoch": 7.136429238475253, "grad_norm": 0.05813177675008774, "learning_rate": 2.0463482909933764e-06, "loss": 0.0004, "step": 28020 }, { "epoch": 7.137702691230156, "grad_norm": 0.054495710879564285, "learning_rate": 2.0436544835928373e-06, "loss": 0.0004, "step": 28025 }, { "epoch": 7.138976143985058, "grad_norm": 0.010348464362323284, "learning_rate": 2.040962248609448e-06, "loss": 0.0003, "step": 28030 }, { "epoch": 7.140249596739961, "grad_norm": 0.29183313250541687, "learning_rate": 2.0382715865752824e-06, "loss": 0.0015, "step": 28035 }, { "epoch": 7.141523049494864, "grad_norm": 0.08818286657333374, "learning_rate": 2.0355824980220884e-06, "loss": 0.001, "step": 28040 }, { "epoch": 7.142796502249767, "grad_norm": 0.06396568566560745, "learning_rate": 2.032894983481318e-06, "loss": 0.0009, "step": 28045 }, { "epoch": 7.144069955004669, "grad_norm": 0.013045608066022396, "learning_rate": 2.0302090434841072e-06, "loss": 0.0009, "step": 28050 }, { "epoch": 7.145343407759572, "grad_norm": 0.15345099568367004, "learning_rate": 2.027524678561276e-06, "loss": 0.0009, "step": 28055 }, { "epoch": 7.146616860514475, "grad_norm": 0.05679116025567055, "learning_rate": 2.0248418892433397e-06, "loss": 0.0006, "step": 28060 }, { "epoch": 7.147890313269378, "grad_norm": 0.07554196566343307, "learning_rate": 2.022160676060497e-06, "loss": 0.0002, "step": 28065 }, { "epoch": 7.14916376602428, "grad_norm": 0.036048199981451035, "learning_rate": 2.019481039542638e-06, "loss": 0.0005, "step": 28070 }, { "epoch": 7.150437218779183, "grad_norm": 0.04341089725494385, "learning_rate": 2.016802980219347e-06, "loss": 0.0012, "step": 28075 }, { "epoch": 7.151710671534086, "grad_norm": 0.05343206226825714, "learning_rate": 2.0141264986198796e-06, "loss": 0.0008, "step": 28080 }, { "epoch": 7.152984124288989, "grad_norm": 0.05337134003639221, "learning_rate": 2.011451595273198e-06, "loss": 0.0014, "step": 28085 }, { "epoch": 7.154257577043892, "grad_norm": 0.020894397050142288, "learning_rate": 2.008778270707944e-06, "loss": 0.0006, "step": 28090 }, { "epoch": 7.155531029798794, "grad_norm": 0.033221155405044556, "learning_rate": 2.006106525452447e-06, "loss": 0.0011, "step": 28095 }, { "epoch": 7.156804482553698, "grad_norm": 0.05413881316781044, "learning_rate": 2.003436360034724e-06, "loss": 0.0007, "step": 28100 }, { "epoch": 7.1580779353086, "grad_norm": 0.07665431499481201, "learning_rate": 2.0007677749824872e-06, "loss": 0.0011, "step": 28105 }, { "epoch": 7.159351388063503, "grad_norm": 0.2277309149503708, "learning_rate": 1.9981007708231216e-06, "loss": 0.0021, "step": 28110 }, { "epoch": 7.1606248408184054, "grad_norm": 0.06350535154342651, "learning_rate": 1.9954353480837164e-06, "loss": 0.0006, "step": 28115 }, { "epoch": 7.161898293573309, "grad_norm": 0.025005152449011803, "learning_rate": 1.9927715072910383e-06, "loss": 0.0011, "step": 28120 }, { "epoch": 7.163171746328211, "grad_norm": 0.05730096623301506, "learning_rate": 1.9901092489715433e-06, "loss": 0.0005, "step": 28125 }, { "epoch": 7.164445199083114, "grad_norm": 0.21955613791942596, "learning_rate": 1.987448573651375e-06, "loss": 0.0005, "step": 28130 }, { "epoch": 7.1657186518380165, "grad_norm": 0.06474330276250839, "learning_rate": 1.984789481856364e-06, "loss": 0.0004, "step": 28135 }, { "epoch": 7.16699210459292, "grad_norm": 0.2920245826244354, "learning_rate": 1.982131974112026e-06, "loss": 0.0009, "step": 28140 }, { "epoch": 7.168265557347822, "grad_norm": 0.09197250008583069, "learning_rate": 1.9794760509435706e-06, "loss": 0.0007, "step": 28145 }, { "epoch": 7.169539010102725, "grad_norm": 0.21226981282234192, "learning_rate": 1.9768217128758882e-06, "loss": 0.0007, "step": 28150 }, { "epoch": 7.170812462857628, "grad_norm": 0.17580202221870422, "learning_rate": 1.9741689604335544e-06, "loss": 0.0006, "step": 28155 }, { "epoch": 7.172085915612531, "grad_norm": 0.012836907058954239, "learning_rate": 1.971517794140837e-06, "loss": 0.0013, "step": 28160 }, { "epoch": 7.173359368367434, "grad_norm": 0.20646005868911743, "learning_rate": 1.9688682145216863e-06, "loss": 0.0017, "step": 28165 }, { "epoch": 7.174632821122336, "grad_norm": 0.2144348919391632, "learning_rate": 1.9662202220997396e-06, "loss": 0.0007, "step": 28170 }, { "epoch": 7.1759062738772394, "grad_norm": 0.06457852572202682, "learning_rate": 1.9635738173983197e-06, "loss": 0.0005, "step": 28175 }, { "epoch": 7.177179726632142, "grad_norm": 0.033310309052467346, "learning_rate": 1.9609290009404415e-06, "loss": 0.0005, "step": 28180 }, { "epoch": 7.178453179387045, "grad_norm": 0.06332152336835861, "learning_rate": 1.9582857732488e-06, "loss": 0.0005, "step": 28185 }, { "epoch": 7.179726632141947, "grad_norm": 0.04505985602736473, "learning_rate": 1.9556441348457766e-06, "loss": 0.001, "step": 28190 }, { "epoch": 7.1810000848968505, "grad_norm": 0.05903678014874458, "learning_rate": 1.9530040862534407e-06, "loss": 0.0004, "step": 28195 }, { "epoch": 7.182273537651753, "grad_norm": 0.009820960462093353, "learning_rate": 1.950365627993546e-06, "loss": 0.0018, "step": 28200 }, { "epoch": 7.183546990406656, "grad_norm": 0.0600137934088707, "learning_rate": 1.9477287605875306e-06, "loss": 0.0005, "step": 28205 }, { "epoch": 7.184820443161558, "grad_norm": 0.3647083044052124, "learning_rate": 1.9450934845565272e-06, "loss": 0.0007, "step": 28210 }, { "epoch": 7.1860938959164615, "grad_norm": 0.32850581407546997, "learning_rate": 1.942459800421337e-06, "loss": 0.0005, "step": 28215 }, { "epoch": 7.187367348671364, "grad_norm": 0.7086085081100464, "learning_rate": 1.939827708702464e-06, "loss": 0.0018, "step": 28220 }, { "epoch": 7.188640801426267, "grad_norm": 0.0452355295419693, "learning_rate": 1.9371972099200876e-06, "loss": 0.001, "step": 28225 }, { "epoch": 7.18991425418117, "grad_norm": 1.748152732849121, "learning_rate": 1.9345683045940745e-06, "loss": 0.0025, "step": 28230 }, { "epoch": 7.191187706936073, "grad_norm": 0.09229777008295059, "learning_rate": 1.9319409932439747e-06, "loss": 0.0008, "step": 28235 }, { "epoch": 7.192461159690976, "grad_norm": 0.0251677967607975, "learning_rate": 1.929315276389033e-06, "loss": 0.0005, "step": 28240 }, { "epoch": 7.193734612445878, "grad_norm": 0.5493375062942505, "learning_rate": 1.9266911545481616e-06, "loss": 0.0008, "step": 28245 }, { "epoch": 7.195008065200781, "grad_norm": 0.19867953658103943, "learning_rate": 1.9240686282399734e-06, "loss": 0.0005, "step": 28250 }, { "epoch": 7.196281517955684, "grad_norm": 0.0589664950966835, "learning_rate": 1.9214476979827577e-06, "loss": 0.001, "step": 28255 }, { "epoch": 7.197554970710587, "grad_norm": 0.027587737888097763, "learning_rate": 1.9188283642944906e-06, "loss": 0.0003, "step": 28260 }, { "epoch": 7.198828423465489, "grad_norm": 0.11721368134021759, "learning_rate": 1.9162106276928307e-06, "loss": 0.001, "step": 28265 }, { "epoch": 7.200101876220392, "grad_norm": 0.03753001615405083, "learning_rate": 1.9135944886951296e-06, "loss": 0.0007, "step": 28270 }, { "epoch": 7.201375328975295, "grad_norm": 0.0416657030582428, "learning_rate": 1.9109799478184056e-06, "loss": 0.0007, "step": 28275 }, { "epoch": 7.202648781730198, "grad_norm": 0.0895952433347702, "learning_rate": 1.9083670055793814e-06, "loss": 0.001, "step": 28280 }, { "epoch": 7.2039222344851, "grad_norm": 0.18512603640556335, "learning_rate": 1.9057556624944507e-06, "loss": 0.0007, "step": 28285 }, { "epoch": 7.205195687240003, "grad_norm": 0.06203995645046234, "learning_rate": 1.9031459190796942e-06, "loss": 0.001, "step": 28290 }, { "epoch": 7.206469139994907, "grad_norm": 0.24802736937999725, "learning_rate": 1.9005377758508746e-06, "loss": 0.0006, "step": 28295 }, { "epoch": 7.207742592749809, "grad_norm": 0.08823564648628235, "learning_rate": 1.897931233323449e-06, "loss": 0.0018, "step": 28300 }, { "epoch": 7.209016045504712, "grad_norm": 0.21016912162303925, "learning_rate": 1.8953262920125403e-06, "loss": 0.0009, "step": 28305 }, { "epoch": 7.210289498259614, "grad_norm": 0.08209124952554703, "learning_rate": 1.8927229524329737e-06, "loss": 0.0011, "step": 28310 }, { "epoch": 7.211562951014518, "grad_norm": 0.2859971821308136, "learning_rate": 1.8901212150992376e-06, "loss": 0.0009, "step": 28315 }, { "epoch": 7.21283640376942, "grad_norm": 0.02574995532631874, "learning_rate": 1.887521080525525e-06, "loss": 0.0007, "step": 28320 }, { "epoch": 7.214109856524323, "grad_norm": 0.043147262185811996, "learning_rate": 1.8849225492256973e-06, "loss": 0.0009, "step": 28325 }, { "epoch": 7.215383309279225, "grad_norm": 0.38476693630218506, "learning_rate": 1.8823256217133045e-06, "loss": 0.0006, "step": 28330 }, { "epoch": 7.216656762034129, "grad_norm": 0.040547724813222885, "learning_rate": 1.8797302985015765e-06, "loss": 0.0006, "step": 28335 }, { "epoch": 7.217930214789031, "grad_norm": 0.017158277332782745, "learning_rate": 1.8771365801034358e-06, "loss": 0.0005, "step": 28340 }, { "epoch": 7.219203667543934, "grad_norm": 0.023098042234778404, "learning_rate": 1.8745444670314716e-06, "loss": 0.0003, "step": 28345 }, { "epoch": 7.2204771202988365, "grad_norm": 0.013730787672102451, "learning_rate": 1.8719539597979696e-06, "loss": 0.0008, "step": 28350 }, { "epoch": 7.22175057305374, "grad_norm": 0.025876251980662346, "learning_rate": 1.8693650589148938e-06, "loss": 0.0007, "step": 28355 }, { "epoch": 7.223024025808643, "grad_norm": 0.05456395447254181, "learning_rate": 1.866777764893888e-06, "loss": 0.0011, "step": 28360 }, { "epoch": 7.224297478563545, "grad_norm": 0.0584106519818306, "learning_rate": 1.8641920782462786e-06, "loss": 0.0009, "step": 28365 }, { "epoch": 7.225570931318448, "grad_norm": 0.177164688706398, "learning_rate": 1.8616079994830839e-06, "loss": 0.0004, "step": 28370 }, { "epoch": 7.226844384073351, "grad_norm": 0.06973160058259964, "learning_rate": 1.8590255291149873e-06, "loss": 0.0005, "step": 28375 }, { "epoch": 7.228117836828254, "grad_norm": 0.05427113175392151, "learning_rate": 1.8564446676523717e-06, "loss": 0.0007, "step": 28380 }, { "epoch": 7.229391289583156, "grad_norm": 0.07991410791873932, "learning_rate": 1.8538654156052903e-06, "loss": 0.0004, "step": 28385 }, { "epoch": 7.2306647423380594, "grad_norm": 0.054890625178813934, "learning_rate": 1.8512877734834834e-06, "loss": 0.0007, "step": 28390 }, { "epoch": 7.231938195092962, "grad_norm": 0.06844119727611542, "learning_rate": 1.8487117417963697e-06, "loss": 0.0005, "step": 28395 }, { "epoch": 7.233211647847865, "grad_norm": 0.026276059448719025, "learning_rate": 1.8461373210530575e-06, "loss": 0.0006, "step": 28400 }, { "epoch": 7.234485100602767, "grad_norm": 0.16881518065929413, "learning_rate": 1.8435645117623224e-06, "loss": 0.0011, "step": 28405 }, { "epoch": 7.2357585533576705, "grad_norm": 0.07510518282651901, "learning_rate": 1.8409933144326375e-06, "loss": 0.0011, "step": 28410 }, { "epoch": 7.237032006112573, "grad_norm": 0.04200557619333267, "learning_rate": 1.8384237295721486e-06, "loss": 0.0005, "step": 28415 }, { "epoch": 7.238305458867476, "grad_norm": 0.03190629184246063, "learning_rate": 1.8358557576886816e-06, "loss": 0.0004, "step": 28420 }, { "epoch": 7.239578911622379, "grad_norm": 0.007624906953424215, "learning_rate": 1.8332893992897461e-06, "loss": 0.0005, "step": 28425 }, { "epoch": 7.2408523643772815, "grad_norm": 0.044321946799755096, "learning_rate": 1.8307246548825397e-06, "loss": 0.0004, "step": 28430 }, { "epoch": 7.242125817132185, "grad_norm": 0.20155030488967896, "learning_rate": 1.828161524973925e-06, "loss": 0.0008, "step": 28435 }, { "epoch": 7.243399269887087, "grad_norm": 0.14208389818668365, "learning_rate": 1.8256000100704608e-06, "loss": 0.0008, "step": 28440 }, { "epoch": 7.24467272264199, "grad_norm": 0.07315386831760406, "learning_rate": 1.8230401106783791e-06, "loss": 0.0005, "step": 28445 }, { "epoch": 7.245946175396893, "grad_norm": 0.19571274518966675, "learning_rate": 1.820481827303594e-06, "loss": 0.0033, "step": 28450 }, { "epoch": 7.247219628151796, "grad_norm": 0.12598079442977905, "learning_rate": 1.8179251604517013e-06, "loss": 0.0008, "step": 28455 }, { "epoch": 7.248493080906698, "grad_norm": 0.10436660051345825, "learning_rate": 1.8153701106279765e-06, "loss": 0.0011, "step": 28460 }, { "epoch": 7.249766533661601, "grad_norm": 0.05911054462194443, "learning_rate": 1.8128166783373723e-06, "loss": 0.0006, "step": 28465 }, { "epoch": 7.251039986416504, "grad_norm": 0.04872290790081024, "learning_rate": 1.81026486408453e-06, "loss": 0.0007, "step": 28470 }, { "epoch": 7.252313439171407, "grad_norm": 0.07003361731767654, "learning_rate": 1.8077146683737644e-06, "loss": 0.001, "step": 28475 }, { "epoch": 7.253586891926309, "grad_norm": 0.00682580703869462, "learning_rate": 1.8051660917090718e-06, "loss": 0.0014, "step": 28480 }, { "epoch": 7.254860344681212, "grad_norm": 0.08882641047239304, "learning_rate": 1.802619134594128e-06, "loss": 0.0007, "step": 28485 }, { "epoch": 7.2561337974361155, "grad_norm": 0.04177949205040932, "learning_rate": 1.8000737975322912e-06, "loss": 0.0008, "step": 28490 }, { "epoch": 7.257407250191018, "grad_norm": 0.05031478404998779, "learning_rate": 1.7975300810265972e-06, "loss": 0.001, "step": 28495 }, { "epoch": 7.258680702945921, "grad_norm": 0.04178287088871002, "learning_rate": 1.7949879855797592e-06, "loss": 0.0008, "step": 28500 }, { "epoch": 7.259954155700823, "grad_norm": 0.051822345703840256, "learning_rate": 1.7924475116941774e-06, "loss": 0.001, "step": 28505 }, { "epoch": 7.261227608455727, "grad_norm": 0.03602265939116478, "learning_rate": 1.7899086598719262e-06, "loss": 0.0005, "step": 28510 }, { "epoch": 7.262501061210629, "grad_norm": 0.13576161861419678, "learning_rate": 1.7873714306147606e-06, "loss": 0.0013, "step": 28515 }, { "epoch": 7.263774513965532, "grad_norm": 0.09393459558486938, "learning_rate": 1.7848358244241126e-06, "loss": 0.001, "step": 28520 }, { "epoch": 7.265047966720434, "grad_norm": 0.0416736826300621, "learning_rate": 1.7823018418010963e-06, "loss": 0.0005, "step": 28525 }, { "epoch": 7.266321419475338, "grad_norm": 0.11255747824907303, "learning_rate": 1.779769483246503e-06, "loss": 0.0015, "step": 28530 }, { "epoch": 7.26759487223024, "grad_norm": 0.06311454623937607, "learning_rate": 1.77723874926081e-06, "loss": 0.0006, "step": 28535 }, { "epoch": 7.268868324985143, "grad_norm": 0.5801591873168945, "learning_rate": 1.7747096403441577e-06, "loss": 0.0009, "step": 28540 }, { "epoch": 7.270141777740045, "grad_norm": 0.10512123256921768, "learning_rate": 1.7721821569963848e-06, "loss": 0.001, "step": 28545 }, { "epoch": 7.271415230494949, "grad_norm": 0.03371576592326164, "learning_rate": 1.7696562997169953e-06, "loss": 0.0004, "step": 28550 }, { "epoch": 7.272688683249852, "grad_norm": 0.04768312722444534, "learning_rate": 1.7671320690051764e-06, "loss": 0.0006, "step": 28555 }, { "epoch": 7.273962136004754, "grad_norm": 0.029826218262314796, "learning_rate": 1.7646094653597922e-06, "loss": 0.0009, "step": 28560 }, { "epoch": 7.275235588759657, "grad_norm": 0.09721368551254272, "learning_rate": 1.7620884892793865e-06, "loss": 0.0013, "step": 28565 }, { "epoch": 7.27650904151456, "grad_norm": 0.04844309017062187, "learning_rate": 1.7595691412621807e-06, "loss": 0.0006, "step": 28570 }, { "epoch": 7.277782494269463, "grad_norm": 0.10757654160261154, "learning_rate": 1.7570514218060797e-06, "loss": 0.0005, "step": 28575 }, { "epoch": 7.279055947024365, "grad_norm": 0.07521063834428787, "learning_rate": 1.7545353314086545e-06, "loss": 0.0012, "step": 28580 }, { "epoch": 7.280329399779268, "grad_norm": 0.15691575407981873, "learning_rate": 1.7520208705671671e-06, "loss": 0.0004, "step": 28585 }, { "epoch": 7.281602852534171, "grad_norm": 0.019269142299890518, "learning_rate": 1.7495080397785503e-06, "loss": 0.0005, "step": 28590 }, { "epoch": 7.282876305289074, "grad_norm": 0.026247357949614525, "learning_rate": 1.7469968395394154e-06, "loss": 0.0009, "step": 28595 }, { "epoch": 7.284149758043976, "grad_norm": 0.07505328208208084, "learning_rate": 1.7444872703460502e-06, "loss": 0.0006, "step": 28600 }, { "epoch": 7.2854232107988794, "grad_norm": 0.029088178649544716, "learning_rate": 1.7419793326944313e-06, "loss": 0.0007, "step": 28605 }, { "epoch": 7.286696663553782, "grad_norm": 0.35413658618927, "learning_rate": 1.739473027080192e-06, "loss": 0.0008, "step": 28610 }, { "epoch": 7.287970116308685, "grad_norm": 0.06655263900756836, "learning_rate": 1.7369683539986626e-06, "loss": 0.0006, "step": 28615 }, { "epoch": 7.289243569063588, "grad_norm": 0.18808797001838684, "learning_rate": 1.7344653139448408e-06, "loss": 0.0005, "step": 28620 }, { "epoch": 7.2905170218184905, "grad_norm": 0.06294000893831253, "learning_rate": 1.7319639074134054e-06, "loss": 0.001, "step": 28625 }, { "epoch": 7.291790474573394, "grad_norm": 0.026625866070389748, "learning_rate": 1.7294641348987062e-06, "loss": 0.0003, "step": 28630 }, { "epoch": 7.293063927328296, "grad_norm": 0.047913357615470886, "learning_rate": 1.7269659968947828e-06, "loss": 0.0006, "step": 28635 }, { "epoch": 7.294337380083199, "grad_norm": 0.0766582041978836, "learning_rate": 1.7244694938953333e-06, "loss": 0.0012, "step": 28640 }, { "epoch": 7.2956108328381015, "grad_norm": 0.010723312385380268, "learning_rate": 1.721974626393752e-06, "loss": 0.0005, "step": 28645 }, { "epoch": 7.296884285593005, "grad_norm": 0.10872642695903778, "learning_rate": 1.719481394883097e-06, "loss": 0.0012, "step": 28650 }, { "epoch": 7.298157738347907, "grad_norm": 0.011705453507602215, "learning_rate": 1.716989799856108e-06, "loss": 0.0004, "step": 28655 }, { "epoch": 7.29943119110281, "grad_norm": 0.2763889729976654, "learning_rate": 1.7144998418051972e-06, "loss": 0.0018, "step": 28660 }, { "epoch": 7.300704643857713, "grad_norm": 0.14650443196296692, "learning_rate": 1.7120115212224642e-06, "loss": 0.0008, "step": 28665 }, { "epoch": 7.301978096612616, "grad_norm": 0.07004541158676147, "learning_rate": 1.7095248385996678e-06, "loss": 0.0007, "step": 28670 }, { "epoch": 7.303251549367518, "grad_norm": 0.01961079053580761, "learning_rate": 1.707039794428259e-06, "loss": 0.0005, "step": 28675 }, { "epoch": 7.304525002122421, "grad_norm": 0.04727395996451378, "learning_rate": 1.7045563891993567e-06, "loss": 0.0006, "step": 28680 }, { "epoch": 7.3057984548773245, "grad_norm": 0.27703672647476196, "learning_rate": 1.7020746234037577e-06, "loss": 0.0013, "step": 28685 }, { "epoch": 7.307071907632227, "grad_norm": 0.12758293747901917, "learning_rate": 1.6995944975319312e-06, "loss": 0.001, "step": 28690 }, { "epoch": 7.30834536038713, "grad_norm": 0.0913301631808281, "learning_rate": 1.6971160120740348e-06, "loss": 0.0005, "step": 28695 }, { "epoch": 7.309618813142032, "grad_norm": 0.035147774964571, "learning_rate": 1.6946391675198838e-06, "loss": 0.0006, "step": 28700 }, { "epoch": 7.3108922658969355, "grad_norm": 0.23156407475471497, "learning_rate": 1.6921639643589827e-06, "loss": 0.0007, "step": 28705 }, { "epoch": 7.312165718651838, "grad_norm": 0.0927402600646019, "learning_rate": 1.6896904030805072e-06, "loss": 0.0007, "step": 28710 }, { "epoch": 7.313439171406741, "grad_norm": 0.07033326476812363, "learning_rate": 1.6872184841733085e-06, "loss": 0.0004, "step": 28715 }, { "epoch": 7.314712624161643, "grad_norm": 0.028346940875053406, "learning_rate": 1.6847482081259113e-06, "loss": 0.0011, "step": 28720 }, { "epoch": 7.315986076916547, "grad_norm": 0.05438331142067909, "learning_rate": 1.6822795754265231e-06, "loss": 0.0007, "step": 28725 }, { "epoch": 7.317259529671449, "grad_norm": 0.01130136288702488, "learning_rate": 1.6798125865630132e-06, "loss": 0.0013, "step": 28730 }, { "epoch": 7.318532982426352, "grad_norm": 0.3454141914844513, "learning_rate": 1.677347242022941e-06, "loss": 0.0007, "step": 28735 }, { "epoch": 7.319806435181254, "grad_norm": 0.09123975038528442, "learning_rate": 1.6748835422935317e-06, "loss": 0.0006, "step": 28740 }, { "epoch": 7.321079887936158, "grad_norm": 0.10337559878826141, "learning_rate": 1.6724214878616862e-06, "loss": 0.0009, "step": 28745 }, { "epoch": 7.322353340691061, "grad_norm": 0.07736646384000778, "learning_rate": 1.669961079213981e-06, "loss": 0.0009, "step": 28750 }, { "epoch": 7.323626793445963, "grad_norm": 0.05993662402033806, "learning_rate": 1.6675023168366755e-06, "loss": 0.0008, "step": 28755 }, { "epoch": 7.324900246200866, "grad_norm": 0.4799269139766693, "learning_rate": 1.6650452012156848e-06, "loss": 0.001, "step": 28760 }, { "epoch": 7.326173698955769, "grad_norm": 0.04736965149641037, "learning_rate": 1.662589732836619e-06, "loss": 0.0014, "step": 28765 }, { "epoch": 7.327447151710672, "grad_norm": 0.6052928566932678, "learning_rate": 1.6601359121847515e-06, "loss": 0.0015, "step": 28770 }, { "epoch": 7.328720604465574, "grad_norm": 0.02826092392206192, "learning_rate": 1.6576837397450307e-06, "loss": 0.0007, "step": 28775 }, { "epoch": 7.329994057220477, "grad_norm": 0.02200390212237835, "learning_rate": 1.6552332160020812e-06, "loss": 0.0009, "step": 28780 }, { "epoch": 7.33126750997538, "grad_norm": 0.05613941699266434, "learning_rate": 1.6527843414402034e-06, "loss": 0.0003, "step": 28785 }, { "epoch": 7.332540962730283, "grad_norm": 0.054567523300647736, "learning_rate": 1.6503371165433668e-06, "loss": 0.0005, "step": 28790 }, { "epoch": 7.333814415485185, "grad_norm": 0.009282965213060379, "learning_rate": 1.6478915417952179e-06, "loss": 0.0006, "step": 28795 }, { "epoch": 7.335087868240088, "grad_norm": 0.04415144771337509, "learning_rate": 1.6454476176790811e-06, "loss": 0.0004, "step": 28800 }, { "epoch": 7.336361320994991, "grad_norm": 0.12668520212173462, "learning_rate": 1.6430053446779481e-06, "loss": 0.0011, "step": 28805 }, { "epoch": 7.337634773749894, "grad_norm": 0.030228862538933754, "learning_rate": 1.640564723274486e-06, "loss": 0.0005, "step": 28810 }, { "epoch": 7.338908226504797, "grad_norm": 0.012516364455223083, "learning_rate": 1.6381257539510375e-06, "loss": 0.0004, "step": 28815 }, { "epoch": 7.340181679259699, "grad_norm": 0.10653059184551239, "learning_rate": 1.6356884371896165e-06, "loss": 0.0007, "step": 28820 }, { "epoch": 7.341455132014603, "grad_norm": 0.025435011833906174, "learning_rate": 1.6332527734719117e-06, "loss": 0.0005, "step": 28825 }, { "epoch": 7.342728584769505, "grad_norm": 0.07124637067317963, "learning_rate": 1.6308187632792838e-06, "loss": 0.0009, "step": 28830 }, { "epoch": 7.344002037524408, "grad_norm": 0.037657372653484344, "learning_rate": 1.628386407092767e-06, "loss": 0.0007, "step": 28835 }, { "epoch": 7.3452754902793105, "grad_norm": 0.0447714626789093, "learning_rate": 1.625955705393073e-06, "loss": 0.0012, "step": 28840 }, { "epoch": 7.346548943034214, "grad_norm": 0.015067098662257195, "learning_rate": 1.6235266586605814e-06, "loss": 0.0011, "step": 28845 }, { "epoch": 7.347822395789116, "grad_norm": 0.03814665228128433, "learning_rate": 1.6210992673753434e-06, "loss": 0.0011, "step": 28850 }, { "epoch": 7.349095848544019, "grad_norm": 0.03179893642663956, "learning_rate": 1.6186735320170889e-06, "loss": 0.0007, "step": 28855 }, { "epoch": 7.3503693012989215, "grad_norm": 0.05260135605931282, "learning_rate": 1.6162494530652163e-06, "loss": 0.0005, "step": 28860 }, { "epoch": 7.351642754053825, "grad_norm": 0.04257279634475708, "learning_rate": 1.6138270309987947e-06, "loss": 0.0006, "step": 28865 }, { "epoch": 7.352916206808727, "grad_norm": 0.052112605422735214, "learning_rate": 1.6114062662965757e-06, "loss": 0.001, "step": 28870 }, { "epoch": 7.35418965956363, "grad_norm": 0.005376116838306189, "learning_rate": 1.608987159436969e-06, "loss": 0.0005, "step": 28875 }, { "epoch": 7.3554631123185334, "grad_norm": 0.03830369934439659, "learning_rate": 1.6065697108980682e-06, "loss": 0.0006, "step": 28880 }, { "epoch": 7.356736565073436, "grad_norm": 0.029420457780361176, "learning_rate": 1.6041539211576352e-06, "loss": 0.0009, "step": 28885 }, { "epoch": 7.358010017828339, "grad_norm": 0.08044400811195374, "learning_rate": 1.601739790693102e-06, "loss": 0.0005, "step": 28890 }, { "epoch": 7.359283470583241, "grad_norm": 0.032601580023765564, "learning_rate": 1.599327319981574e-06, "loss": 0.0005, "step": 28895 }, { "epoch": 7.3605569233381445, "grad_norm": 0.06008419021964073, "learning_rate": 1.5969165094998352e-06, "loss": 0.0005, "step": 28900 }, { "epoch": 7.361830376093047, "grad_norm": 0.5176844000816345, "learning_rate": 1.5945073597243265e-06, "loss": 0.0019, "step": 28905 }, { "epoch": 7.36310382884795, "grad_norm": 0.018574310466647148, "learning_rate": 1.5920998711311764e-06, "loss": 0.0007, "step": 28910 }, { "epoch": 7.364377281602852, "grad_norm": 0.008077915757894516, "learning_rate": 1.5896940441961762e-06, "loss": 0.0008, "step": 28915 }, { "epoch": 7.3656507343577555, "grad_norm": 0.007083128672093153, "learning_rate": 1.58728987939479e-06, "loss": 0.0007, "step": 28920 }, { "epoch": 7.366924187112658, "grad_norm": 0.05193199962377548, "learning_rate": 1.5848873772021522e-06, "loss": 0.0006, "step": 28925 }, { "epoch": 7.368197639867561, "grad_norm": 0.02939523011445999, "learning_rate": 1.5824865380930777e-06, "loss": 0.0009, "step": 28930 }, { "epoch": 7.369471092622463, "grad_norm": 0.09900929778814316, "learning_rate": 1.5800873625420376e-06, "loss": 0.0004, "step": 28935 }, { "epoch": 7.370744545377367, "grad_norm": 0.10819419473409653, "learning_rate": 1.5776898510231887e-06, "loss": 0.0009, "step": 28940 }, { "epoch": 7.372017998132269, "grad_norm": 0.09490072727203369, "learning_rate": 1.5752940040103493e-06, "loss": 0.0007, "step": 28945 }, { "epoch": 7.373291450887172, "grad_norm": 0.012495859526097775, "learning_rate": 1.572899821977012e-06, "loss": 0.0016, "step": 28950 }, { "epoch": 7.374564903642075, "grad_norm": 0.10400612652301788, "learning_rate": 1.5705073053963404e-06, "loss": 0.0012, "step": 28955 }, { "epoch": 7.375838356396978, "grad_norm": 0.05963843688368797, "learning_rate": 1.5681164547411731e-06, "loss": 0.0008, "step": 28960 }, { "epoch": 7.377111809151881, "grad_norm": 0.3329477608203888, "learning_rate": 1.5657272704840077e-06, "loss": 0.0021, "step": 28965 }, { "epoch": 7.378385261906783, "grad_norm": 0.029543351382017136, "learning_rate": 1.5633397530970262e-06, "loss": 0.0009, "step": 28970 }, { "epoch": 7.379658714661686, "grad_norm": 0.031822919845581055, "learning_rate": 1.5609539030520725e-06, "loss": 0.0005, "step": 28975 }, { "epoch": 7.380932167416589, "grad_norm": 0.08595740795135498, "learning_rate": 1.558569720820665e-06, "loss": 0.0005, "step": 28980 }, { "epoch": 7.382205620171492, "grad_norm": 0.23329958319664001, "learning_rate": 1.556187206873987e-06, "loss": 0.0012, "step": 28985 }, { "epoch": 7.383479072926394, "grad_norm": 0.03001919761300087, "learning_rate": 1.5538063616829046e-06, "loss": 0.0011, "step": 28990 }, { "epoch": 7.384752525681297, "grad_norm": 0.07469483464956284, "learning_rate": 1.5514271857179353e-06, "loss": 0.0007, "step": 28995 }, { "epoch": 7.3860259784362, "grad_norm": 0.14087431132793427, "learning_rate": 1.5490496794492837e-06, "loss": 0.0007, "step": 29000 }, { "epoch": 7.387299431191103, "grad_norm": 0.02397497557103634, "learning_rate": 1.5466738433468153e-06, "loss": 0.0007, "step": 29005 }, { "epoch": 7.388572883946005, "grad_norm": 0.10661831498146057, "learning_rate": 1.5442996778800702e-06, "loss": 0.0008, "step": 29010 }, { "epoch": 7.389846336700908, "grad_norm": 0.03800567612051964, "learning_rate": 1.5419271835182504e-06, "loss": 0.0006, "step": 29015 }, { "epoch": 7.391119789455811, "grad_norm": 0.030576881021261215, "learning_rate": 1.5395563607302433e-06, "loss": 0.0003, "step": 29020 }, { "epoch": 7.392393242210714, "grad_norm": 0.06076652184128761, "learning_rate": 1.5371872099845853e-06, "loss": 0.001, "step": 29025 }, { "epoch": 7.393666694965617, "grad_norm": 0.10870476067066193, "learning_rate": 1.5348197317494994e-06, "loss": 0.0005, "step": 29030 }, { "epoch": 7.394940147720519, "grad_norm": 0.053864479064941406, "learning_rate": 1.5324539264928706e-06, "loss": 0.0005, "step": 29035 }, { "epoch": 7.396213600475423, "grad_norm": 0.1551145613193512, "learning_rate": 1.5300897946822535e-06, "loss": 0.0012, "step": 29040 }, { "epoch": 7.397487053230325, "grad_norm": 0.17105011641979218, "learning_rate": 1.5277273367848712e-06, "loss": 0.0012, "step": 29045 }, { "epoch": 7.398760505985228, "grad_norm": 0.01007336750626564, "learning_rate": 1.5253665532676242e-06, "loss": 0.0005, "step": 29050 }, { "epoch": 7.4000339587401305, "grad_norm": 0.07952424138784409, "learning_rate": 1.5230074445970654e-06, "loss": 0.0005, "step": 29055 }, { "epoch": 7.401307411495034, "grad_norm": 0.06996924430131912, "learning_rate": 1.5206500112394352e-06, "loss": 0.0008, "step": 29060 }, { "epoch": 7.402580864249936, "grad_norm": 0.004332204815000296, "learning_rate": 1.5182942536606316e-06, "loss": 0.001, "step": 29065 }, { "epoch": 7.403854317004839, "grad_norm": 0.44829535484313965, "learning_rate": 1.5159401723262246e-06, "loss": 0.0008, "step": 29070 }, { "epoch": 7.4051277697597415, "grad_norm": 0.05435642972588539, "learning_rate": 1.5135877677014521e-06, "loss": 0.001, "step": 29075 }, { "epoch": 7.406401222514645, "grad_norm": 0.05119767785072327, "learning_rate": 1.511237040251221e-06, "loss": 0.001, "step": 29080 }, { "epoch": 7.407674675269547, "grad_norm": 0.16670097410678864, "learning_rate": 1.5088879904401066e-06, "loss": 0.0016, "step": 29085 }, { "epoch": 7.40894812802445, "grad_norm": 0.023682456463575363, "learning_rate": 1.5065406187323572e-06, "loss": 0.0005, "step": 29090 }, { "epoch": 7.4102215807793534, "grad_norm": 0.006272170692682266, "learning_rate": 1.5041949255918764e-06, "loss": 0.001, "step": 29095 }, { "epoch": 7.411495033534256, "grad_norm": 0.3447013199329376, "learning_rate": 1.5018509114822543e-06, "loss": 0.0009, "step": 29100 }, { "epoch": 7.412768486289159, "grad_norm": 0.06341859698295593, "learning_rate": 1.499508576866735e-06, "loss": 0.0007, "step": 29105 }, { "epoch": 7.414041939044061, "grad_norm": 0.05423000082373619, "learning_rate": 1.4971679222082358e-06, "loss": 0.0008, "step": 29110 }, { "epoch": 7.4153153917989645, "grad_norm": 0.04250835254788399, "learning_rate": 1.4948289479693423e-06, "loss": 0.0007, "step": 29115 }, { "epoch": 7.416588844553867, "grad_norm": 0.08641579002141953, "learning_rate": 1.4924916546123069e-06, "loss": 0.0009, "step": 29120 }, { "epoch": 7.41786229730877, "grad_norm": 0.046221401542425156, "learning_rate": 1.490156042599048e-06, "loss": 0.0009, "step": 29125 }, { "epoch": 7.419135750063672, "grad_norm": 0.005584875121712685, "learning_rate": 1.4878221123911585e-06, "loss": 0.0009, "step": 29130 }, { "epoch": 7.4204092028185755, "grad_norm": 0.06591122597455978, "learning_rate": 1.4854898644498927e-06, "loss": 0.0004, "step": 29135 }, { "epoch": 7.421682655573478, "grad_norm": 0.017209215089678764, "learning_rate": 1.4831592992361732e-06, "loss": 0.0006, "step": 29140 }, { "epoch": 7.422956108328381, "grad_norm": 0.13668909668922424, "learning_rate": 1.4808304172105914e-06, "loss": 0.0004, "step": 29145 }, { "epoch": 7.424229561083283, "grad_norm": 0.02570364810526371, "learning_rate": 1.4785032188334048e-06, "loss": 0.0008, "step": 29150 }, { "epoch": 7.425503013838187, "grad_norm": 0.03097875788807869, "learning_rate": 1.4761777045645409e-06, "loss": 0.0009, "step": 29155 }, { "epoch": 7.42677646659309, "grad_norm": 0.5718064308166504, "learning_rate": 1.4738538748635877e-06, "loss": 0.0009, "step": 29160 }, { "epoch": 7.428049919347992, "grad_norm": 0.017241811379790306, "learning_rate": 1.4715317301898113e-06, "loss": 0.0006, "step": 29165 }, { "epoch": 7.429323372102895, "grad_norm": 0.03183137625455856, "learning_rate": 1.4692112710021366e-06, "loss": 0.0009, "step": 29170 }, { "epoch": 7.430596824857798, "grad_norm": 0.040178339928388596, "learning_rate": 1.4668924977591548e-06, "loss": 0.0007, "step": 29175 }, { "epoch": 7.431870277612701, "grad_norm": 0.04707891494035721, "learning_rate": 1.464575410919129e-06, "loss": 0.0006, "step": 29180 }, { "epoch": 7.433143730367603, "grad_norm": 0.03663383424282074, "learning_rate": 1.4622600109399854e-06, "loss": 0.0003, "step": 29185 }, { "epoch": 7.434417183122506, "grad_norm": 0.08301182091236115, "learning_rate": 1.4599462982793155e-06, "loss": 0.0003, "step": 29190 }, { "epoch": 7.435690635877409, "grad_norm": 0.03013903833925724, "learning_rate": 1.4576342733943861e-06, "loss": 0.0005, "step": 29195 }, { "epoch": 7.436964088632312, "grad_norm": 0.046666741371154785, "learning_rate": 1.4553239367421157e-06, "loss": 0.0009, "step": 29200 }, { "epoch": 7.438237541387214, "grad_norm": 0.057873230427503586, "learning_rate": 1.4530152887791037e-06, "loss": 0.0008, "step": 29205 }, { "epoch": 7.439510994142117, "grad_norm": 0.03619721531867981, "learning_rate": 1.4507083299616077e-06, "loss": 0.0003, "step": 29210 }, { "epoch": 7.44078444689702, "grad_norm": 0.32874831557273865, "learning_rate": 1.4484030607455534e-06, "loss": 0.0012, "step": 29215 }, { "epoch": 7.442057899651923, "grad_norm": 0.0324983224272728, "learning_rate": 1.4460994815865292e-06, "loss": 0.0008, "step": 29220 }, { "epoch": 7.443331352406826, "grad_norm": 0.2235514372587204, "learning_rate": 1.4437975929398006e-06, "loss": 0.0006, "step": 29225 }, { "epoch": 7.444604805161728, "grad_norm": 0.032894376665353775, "learning_rate": 1.4414973952602819e-06, "loss": 0.0007, "step": 29230 }, { "epoch": 7.445878257916632, "grad_norm": 0.1541079580783844, "learning_rate": 1.4391988890025689e-06, "loss": 0.001, "step": 29235 }, { "epoch": 7.447151710671534, "grad_norm": 0.17879490554332733, "learning_rate": 1.436902074620915e-06, "loss": 0.0005, "step": 29240 }, { "epoch": 7.448425163426437, "grad_norm": 0.025272799655795097, "learning_rate": 1.4346069525692407e-06, "loss": 0.0007, "step": 29245 }, { "epoch": 7.449698616181339, "grad_norm": 0.0589541532099247, "learning_rate": 1.4323135233011298e-06, "loss": 0.0014, "step": 29250 }, { "epoch": 7.450972068936243, "grad_norm": 0.020985879004001617, "learning_rate": 1.4300217872698418e-06, "loss": 0.0006, "step": 29255 }, { "epoch": 7.452245521691145, "grad_norm": 0.05822550877928734, "learning_rate": 1.4277317449282834e-06, "loss": 0.0006, "step": 29260 }, { "epoch": 7.453518974446048, "grad_norm": 0.11857731640338898, "learning_rate": 1.4254433967290438e-06, "loss": 0.0006, "step": 29265 }, { "epoch": 7.4547924272009505, "grad_norm": 0.2576240301132202, "learning_rate": 1.4231567431243697e-06, "loss": 0.0012, "step": 29270 }, { "epoch": 7.456065879955854, "grad_norm": 0.3047160804271698, "learning_rate": 1.4208717845661735e-06, "loss": 0.0009, "step": 29275 }, { "epoch": 7.457339332710756, "grad_norm": 0.1774214804172516, "learning_rate": 1.4185885215060292e-06, "loss": 0.0007, "step": 29280 }, { "epoch": 7.458612785465659, "grad_norm": 0.09465721249580383, "learning_rate": 1.4163069543951867e-06, "loss": 0.0004, "step": 29285 }, { "epoch": 7.459886238220562, "grad_norm": 0.2704653739929199, "learning_rate": 1.414027083684546e-06, "loss": 0.0017, "step": 29290 }, { "epoch": 7.461159690975465, "grad_norm": 0.330124169588089, "learning_rate": 1.411748909824684e-06, "loss": 0.0008, "step": 29295 }, { "epoch": 7.462433143730368, "grad_norm": 0.025944510474801064, "learning_rate": 1.409472433265836e-06, "loss": 0.0003, "step": 29300 }, { "epoch": 7.46370659648527, "grad_norm": 0.020834840834140778, "learning_rate": 1.4071976544579035e-06, "loss": 0.0007, "step": 29305 }, { "epoch": 7.464980049240173, "grad_norm": 0.05752132460474968, "learning_rate": 1.4049245738504503e-06, "loss": 0.0008, "step": 29310 }, { "epoch": 7.466253501995076, "grad_norm": 0.01781790889799595, "learning_rate": 1.4026531918927133e-06, "loss": 0.0005, "step": 29315 }, { "epoch": 7.467526954749979, "grad_norm": 0.013360419310629368, "learning_rate": 1.4003835090335772e-06, "loss": 0.0008, "step": 29320 }, { "epoch": 7.468800407504881, "grad_norm": 0.35262948274612427, "learning_rate": 1.3981155257216084e-06, "loss": 0.0024, "step": 29325 }, { "epoch": 7.4700738602597845, "grad_norm": 0.06769407540559769, "learning_rate": 1.3958492424050274e-06, "loss": 0.0011, "step": 29330 }, { "epoch": 7.471347313014687, "grad_norm": 0.015149627812206745, "learning_rate": 1.3935846595317203e-06, "loss": 0.0004, "step": 29335 }, { "epoch": 7.47262076576959, "grad_norm": 0.09830199927091599, "learning_rate": 1.3913217775492382e-06, "loss": 0.0008, "step": 29340 }, { "epoch": 7.473894218524492, "grad_norm": 0.02889879047870636, "learning_rate": 1.3890605969047976e-06, "loss": 0.0005, "step": 29345 }, { "epoch": 7.4751676712793955, "grad_norm": 0.06040016561746597, "learning_rate": 1.3868011180452723e-06, "loss": 0.0007, "step": 29350 }, { "epoch": 7.476441124034299, "grad_norm": 0.023778703063726425, "learning_rate": 1.3845433414172116e-06, "loss": 0.0006, "step": 29355 }, { "epoch": 7.477714576789201, "grad_norm": 0.09673107415437698, "learning_rate": 1.3822872674668132e-06, "loss": 0.001, "step": 29360 }, { "epoch": 7.478988029544104, "grad_norm": 0.04759581387042999, "learning_rate": 1.3800328966399523e-06, "loss": 0.0006, "step": 29365 }, { "epoch": 7.480261482299007, "grad_norm": 0.5917465686798096, "learning_rate": 1.3777802293821596e-06, "loss": 0.001, "step": 29370 }, { "epoch": 7.48153493505391, "grad_norm": 0.02368742600083351, "learning_rate": 1.375529266138631e-06, "loss": 0.0004, "step": 29375 }, { "epoch": 7.482808387808812, "grad_norm": 0.08186540752649307, "learning_rate": 1.3732800073542229e-06, "loss": 0.0008, "step": 29380 }, { "epoch": 7.484081840563715, "grad_norm": 0.03677685931324959, "learning_rate": 1.3710324534734643e-06, "loss": 0.0005, "step": 29385 }, { "epoch": 7.485355293318618, "grad_norm": 0.061640821397304535, "learning_rate": 1.368786604940533e-06, "loss": 0.0012, "step": 29390 }, { "epoch": 7.486628746073521, "grad_norm": 0.28960904479026794, "learning_rate": 1.3665424621992817e-06, "loss": 0.0016, "step": 29395 }, { "epoch": 7.487902198828423, "grad_norm": 0.1245136708021164, "learning_rate": 1.3643000256932215e-06, "loss": 0.0007, "step": 29400 }, { "epoch": 7.489175651583326, "grad_norm": 0.0323193296790123, "learning_rate": 1.3620592958655265e-06, "loss": 0.0018, "step": 29405 }, { "epoch": 7.490449104338229, "grad_norm": 0.23462262749671936, "learning_rate": 1.3598202731590292e-06, "loss": 0.0011, "step": 29410 }, { "epoch": 7.491722557093132, "grad_norm": 0.24360045790672302, "learning_rate": 1.357582958016238e-06, "loss": 0.0005, "step": 29415 }, { "epoch": 7.492996009848035, "grad_norm": 0.12222479283809662, "learning_rate": 1.3553473508793036e-06, "loss": 0.0005, "step": 29420 }, { "epoch": 7.494269462602937, "grad_norm": 0.04756096005439758, "learning_rate": 1.353113452190058e-06, "loss": 0.0004, "step": 29425 }, { "epoch": 7.495542915357841, "grad_norm": 0.039718858897686005, "learning_rate": 1.350881262389987e-06, "loss": 0.0009, "step": 29430 }, { "epoch": 7.496816368112743, "grad_norm": 0.019291065633296967, "learning_rate": 1.3486507819202365e-06, "loss": 0.0009, "step": 29435 }, { "epoch": 7.498089820867646, "grad_norm": 0.06692135334014893, "learning_rate": 1.346422011221621e-06, "loss": 0.001, "step": 29440 }, { "epoch": 7.499363273622548, "grad_norm": 0.02425249293446541, "learning_rate": 1.3441949507346119e-06, "loss": 0.0007, "step": 29445 }, { "epoch": 7.500636726377452, "grad_norm": 0.04272910952568054, "learning_rate": 1.3419696008993422e-06, "loss": 0.0008, "step": 29450 }, { "epoch": 7.501910179132354, "grad_norm": 0.01837480627000332, "learning_rate": 1.339745962155613e-06, "loss": 0.0013, "step": 29455 }, { "epoch": 7.503183631887257, "grad_norm": 0.09244048595428467, "learning_rate": 1.337524034942883e-06, "loss": 0.0016, "step": 29460 }, { "epoch": 7.504457084642159, "grad_norm": 0.026409387588500977, "learning_rate": 1.3353038197002721e-06, "loss": 0.0005, "step": 29465 }, { "epoch": 7.505730537397063, "grad_norm": 0.06040642783045769, "learning_rate": 1.3330853168665614e-06, "loss": 0.0008, "step": 29470 }, { "epoch": 7.507003990151965, "grad_norm": 0.04448267072439194, "learning_rate": 1.3308685268801968e-06, "loss": 0.0006, "step": 29475 }, { "epoch": 7.508277442906868, "grad_norm": 0.08461955189704895, "learning_rate": 1.3286534501792826e-06, "loss": 0.0013, "step": 29480 }, { "epoch": 7.509550895661771, "grad_norm": 0.02416006103157997, "learning_rate": 1.3264400872015836e-06, "loss": 0.0013, "step": 29485 }, { "epoch": 7.510824348416674, "grad_norm": 0.017833521589636803, "learning_rate": 1.3242284383845339e-06, "loss": 0.0005, "step": 29490 }, { "epoch": 7.512097801171577, "grad_norm": 0.03677654266357422, "learning_rate": 1.3220185041652178e-06, "loss": 0.0013, "step": 29495 }, { "epoch": 7.513371253926479, "grad_norm": 0.20597127079963684, "learning_rate": 1.3198102849803884e-06, "loss": 0.0008, "step": 29500 }, { "epoch": 7.514644706681382, "grad_norm": 0.03292957320809364, "learning_rate": 1.3176037812664566e-06, "loss": 0.0006, "step": 29505 }, { "epoch": 7.515918159436285, "grad_norm": 0.0711844190955162, "learning_rate": 1.3153989934594946e-06, "loss": 0.0012, "step": 29510 }, { "epoch": 7.517191612191188, "grad_norm": 0.03328844532370567, "learning_rate": 1.313195921995234e-06, "loss": 0.0007, "step": 29515 }, { "epoch": 7.51846506494609, "grad_norm": 0.09394124895334244, "learning_rate": 1.3109945673090763e-06, "loss": 0.0006, "step": 29520 }, { "epoch": 7.519738517700993, "grad_norm": 0.062070101499557495, "learning_rate": 1.3087949298360658e-06, "loss": 0.0007, "step": 29525 }, { "epoch": 7.521011970455896, "grad_norm": 0.10299202054738998, "learning_rate": 1.306597010010927e-06, "loss": 0.0008, "step": 29530 }, { "epoch": 7.522285423210799, "grad_norm": 0.010139637626707554, "learning_rate": 1.3044008082680327e-06, "loss": 0.0003, "step": 29535 }, { "epoch": 7.523558875965701, "grad_norm": 0.14731226861476898, "learning_rate": 1.3022063250414186e-06, "loss": 0.0007, "step": 29540 }, { "epoch": 7.5248323287206045, "grad_norm": 0.06644017994403839, "learning_rate": 1.3000135607647823e-06, "loss": 0.0008, "step": 29545 }, { "epoch": 7.526105781475508, "grad_norm": 0.03846036642789841, "learning_rate": 1.2978225158714842e-06, "loss": 0.0006, "step": 29550 }, { "epoch": 7.52737923423041, "grad_norm": 0.3822004497051239, "learning_rate": 1.2956331907945352e-06, "loss": 0.0013, "step": 29555 }, { "epoch": 7.528652686985313, "grad_norm": 0.05531487986445427, "learning_rate": 1.2934455859666194e-06, "loss": 0.004, "step": 29560 }, { "epoch": 7.5299261397402155, "grad_norm": 0.03194171190261841, "learning_rate": 1.291259701820071e-06, "loss": 0.001, "step": 29565 }, { "epoch": 7.531199592495119, "grad_norm": 0.042747847735881805, "learning_rate": 1.2890755387868892e-06, "loss": 0.0007, "step": 29570 }, { "epoch": 7.532473045250021, "grad_norm": 0.025018995627760887, "learning_rate": 1.2868930972987282e-06, "loss": 0.0007, "step": 29575 }, { "epoch": 7.533746498004924, "grad_norm": 0.17857694625854492, "learning_rate": 1.2847123777869119e-06, "loss": 0.0004, "step": 29580 }, { "epoch": 7.535019950759827, "grad_norm": 0.07292819023132324, "learning_rate": 1.282533380682407e-06, "loss": 0.0011, "step": 29585 }, { "epoch": 7.53629340351473, "grad_norm": 0.07492934167385101, "learning_rate": 1.2803561064158609e-06, "loss": 0.0009, "step": 29590 }, { "epoch": 7.537566856269632, "grad_norm": 0.02200443111360073, "learning_rate": 1.2781805554175607e-06, "loss": 0.0012, "step": 29595 }, { "epoch": 7.538840309024535, "grad_norm": 0.025095025077462196, "learning_rate": 1.2760067281174672e-06, "loss": 0.0011, "step": 29600 }, { "epoch": 7.540113761779438, "grad_norm": 0.39628422260284424, "learning_rate": 1.2738346249451926e-06, "loss": 0.0011, "step": 29605 }, { "epoch": 7.541387214534341, "grad_norm": 0.042192790657281876, "learning_rate": 1.2716642463300122e-06, "loss": 0.0012, "step": 29610 }, { "epoch": 7.542660667289244, "grad_norm": 0.01601947657763958, "learning_rate": 1.2694955927008568e-06, "loss": 0.0005, "step": 29615 }, { "epoch": 7.543934120044146, "grad_norm": 0.02408805675804615, "learning_rate": 1.2673286644863247e-06, "loss": 0.0004, "step": 29620 }, { "epoch": 7.5452075727990495, "grad_norm": 0.05938407778739929, "learning_rate": 1.265163462114658e-06, "loss": 0.0005, "step": 29625 }, { "epoch": 7.546481025553952, "grad_norm": 0.049089204519987106, "learning_rate": 1.2629999860137755e-06, "loss": 0.001, "step": 29630 }, { "epoch": 7.547754478308855, "grad_norm": 0.025473663583397865, "learning_rate": 1.2608382366112415e-06, "loss": 0.0008, "step": 29635 }, { "epoch": 7.549027931063757, "grad_norm": 0.3680427670478821, "learning_rate": 1.2586782143342846e-06, "loss": 0.0005, "step": 29640 }, { "epoch": 7.550301383818661, "grad_norm": 0.07789319008588791, "learning_rate": 1.25651991960979e-06, "loss": 0.0006, "step": 29645 }, { "epoch": 7.551574836573563, "grad_norm": 0.22246153652668, "learning_rate": 1.2543633528643084e-06, "loss": 0.0009, "step": 29650 }, { "epoch": 7.552848289328466, "grad_norm": 0.0508730486035347, "learning_rate": 1.2522085145240347e-06, "loss": 0.0006, "step": 29655 }, { "epoch": 7.554121742083368, "grad_norm": 0.07869452238082886, "learning_rate": 1.2500554050148383e-06, "loss": 0.0009, "step": 29660 }, { "epoch": 7.555395194838272, "grad_norm": 0.03665422275662422, "learning_rate": 1.2479040247622354e-06, "loss": 0.0008, "step": 29665 }, { "epoch": 7.556668647593174, "grad_norm": 0.20413054525852203, "learning_rate": 1.2457543741914069e-06, "loss": 0.001, "step": 29670 }, { "epoch": 7.557942100348077, "grad_norm": 0.06586132198572159, "learning_rate": 1.2436064537271853e-06, "loss": 0.0006, "step": 29675 }, { "epoch": 7.55921555310298, "grad_norm": 0.045972950756549835, "learning_rate": 1.241460263794072e-06, "loss": 0.0004, "step": 29680 }, { "epoch": 7.560489005857883, "grad_norm": 0.015957336872816086, "learning_rate": 1.2393158048162135e-06, "loss": 0.0009, "step": 29685 }, { "epoch": 7.561762458612786, "grad_norm": 0.03500857576727867, "learning_rate": 1.2371730772174239e-06, "loss": 0.0007, "step": 29690 }, { "epoch": 7.563035911367688, "grad_norm": 0.03585292026400566, "learning_rate": 1.2350320814211702e-06, "loss": 0.0008, "step": 29695 }, { "epoch": 7.564309364122591, "grad_norm": 0.03648331016302109, "learning_rate": 1.2328928178505805e-06, "loss": 0.0009, "step": 29700 }, { "epoch": 7.565582816877494, "grad_norm": 0.046672843396663666, "learning_rate": 1.2307552869284344e-06, "loss": 0.001, "step": 29705 }, { "epoch": 7.566856269632397, "grad_norm": 0.13144853711128235, "learning_rate": 1.2286194890771808e-06, "loss": 0.0008, "step": 29710 }, { "epoch": 7.568129722387299, "grad_norm": 0.050135765224695206, "learning_rate": 1.2264854247189106e-06, "loss": 0.0006, "step": 29715 }, { "epoch": 7.569403175142202, "grad_norm": 0.06138063967227936, "learning_rate": 1.2243530942753856e-06, "loss": 0.0008, "step": 29720 }, { "epoch": 7.570676627897105, "grad_norm": 0.049556780606508255, "learning_rate": 1.2222224981680175e-06, "loss": 0.0007, "step": 29725 }, { "epoch": 7.571950080652008, "grad_norm": 0.14490699768066406, "learning_rate": 1.2200936368178772e-06, "loss": 0.0012, "step": 29730 }, { "epoch": 7.57322353340691, "grad_norm": 0.025336187332868576, "learning_rate": 1.217966510645694e-06, "loss": 0.0005, "step": 29735 }, { "epoch": 7.574496986161813, "grad_norm": 1.1101281642913818, "learning_rate": 1.2158411200718512e-06, "loss": 0.0016, "step": 29740 }, { "epoch": 7.575770438916717, "grad_norm": 0.04031212255358696, "learning_rate": 1.2137174655163907e-06, "loss": 0.0004, "step": 29745 }, { "epoch": 7.577043891671619, "grad_norm": 0.04088936001062393, "learning_rate": 1.2115955473990137e-06, "loss": 0.0006, "step": 29750 }, { "epoch": 7.578317344426522, "grad_norm": 0.015877846628427505, "learning_rate": 1.2094753661390756e-06, "loss": 0.0009, "step": 29755 }, { "epoch": 7.5795907971814245, "grad_norm": 0.07582803070545197, "learning_rate": 1.2073569221555892e-06, "loss": 0.0006, "step": 29760 }, { "epoch": 7.580864249936328, "grad_norm": 0.03395044803619385, "learning_rate": 1.2052402158672228e-06, "loss": 0.0003, "step": 29765 }, { "epoch": 7.58213770269123, "grad_norm": 0.04964430257678032, "learning_rate": 1.2031252476923029e-06, "loss": 0.001, "step": 29770 }, { "epoch": 7.583411155446133, "grad_norm": 0.11290343105792999, "learning_rate": 1.2010120180488115e-06, "loss": 0.0007, "step": 29775 }, { "epoch": 7.5846846082010355, "grad_norm": 0.06451843678951263, "learning_rate": 1.1989005273543852e-06, "loss": 0.0009, "step": 29780 }, { "epoch": 7.585958060955939, "grad_norm": 0.0232315082103014, "learning_rate": 1.1967907760263232e-06, "loss": 0.0007, "step": 29785 }, { "epoch": 7.587231513710841, "grad_norm": 0.06292226165533066, "learning_rate": 1.1946827644815763e-06, "loss": 0.0007, "step": 29790 }, { "epoch": 7.588504966465744, "grad_norm": 0.03422201797366142, "learning_rate": 1.19257649313675e-06, "loss": 0.0008, "step": 29795 }, { "epoch": 7.5897784192206466, "grad_norm": 0.013594763353466988, "learning_rate": 1.1904719624081096e-06, "loss": 0.0011, "step": 29800 }, { "epoch": 7.59105187197555, "grad_norm": 0.03550697863101959, "learning_rate": 1.1883691727115732e-06, "loss": 0.0008, "step": 29805 }, { "epoch": 7.592325324730453, "grad_norm": 0.10858067870140076, "learning_rate": 1.186268124462715e-06, "loss": 0.0018, "step": 29810 }, { "epoch": 7.593598777485355, "grad_norm": 0.10534416884183884, "learning_rate": 1.184168818076773e-06, "loss": 0.0008, "step": 29815 }, { "epoch": 7.594872230240258, "grad_norm": 0.15072421729564667, "learning_rate": 1.1820712539686253e-06, "loss": 0.0006, "step": 29820 }, { "epoch": 7.596145682995161, "grad_norm": 0.0068412357941269875, "learning_rate": 1.1799754325528213e-06, "loss": 0.0004, "step": 29825 }, { "epoch": 7.597419135750064, "grad_norm": 0.05072836950421333, "learning_rate": 1.1778813542435574e-06, "loss": 0.0015, "step": 29830 }, { "epoch": 7.598692588504966, "grad_norm": 0.023896370083093643, "learning_rate": 1.1757890194546872e-06, "loss": 0.0008, "step": 29835 }, { "epoch": 7.5999660412598695, "grad_norm": 0.06561899185180664, "learning_rate": 1.1741164072361645e-06, "loss": 0.0014, "step": 29840 }, { "epoch": 7.601239494014772, "grad_norm": 1.008940577507019, "learning_rate": 1.1720272118258157e-06, "loss": 0.0014, "step": 29845 }, { "epoch": 7.602512946769675, "grad_norm": 0.03637376055121422, "learning_rate": 1.1699397610928176e-06, "loss": 0.0008, "step": 29850 }, { "epoch": 7.603786399524577, "grad_norm": 0.2150777131319046, "learning_rate": 1.167854055449713e-06, "loss": 0.0007, "step": 29855 }, { "epoch": 7.605059852279481, "grad_norm": 0.08668972551822662, "learning_rate": 1.1657700953087058e-06, "loss": 0.0006, "step": 29860 }, { "epoch": 7.606333305034383, "grad_norm": 0.025180159136652946, "learning_rate": 1.1636878810816476e-06, "loss": 0.0005, "step": 29865 }, { "epoch": 7.607606757789286, "grad_norm": 0.5626616477966309, "learning_rate": 1.161607413180047e-06, "loss": 0.0009, "step": 29870 }, { "epoch": 7.608880210544189, "grad_norm": 0.03962705656886101, "learning_rate": 1.1595286920150695e-06, "loss": 0.0007, "step": 29875 }, { "epoch": 7.610153663299092, "grad_norm": 0.03640831634402275, "learning_rate": 1.157451717997532e-06, "loss": 0.0007, "step": 29880 }, { "epoch": 7.611427116053994, "grad_norm": 0.023675795644521713, "learning_rate": 1.1553764915379095e-06, "loss": 0.0003, "step": 29885 }, { "epoch": 7.612700568808897, "grad_norm": 0.48642459511756897, "learning_rate": 1.1533030130463285e-06, "loss": 0.0009, "step": 29890 }, { "epoch": 7.6139740215638, "grad_norm": 0.04385720565915108, "learning_rate": 1.1512312829325744e-06, "loss": 0.0014, "step": 29895 }, { "epoch": 7.615247474318703, "grad_norm": 0.2059989720582962, "learning_rate": 1.1491613016060843e-06, "loss": 0.0005, "step": 29900 }, { "epoch": 7.616520927073606, "grad_norm": 0.23918752372264862, "learning_rate": 1.1470930694759475e-06, "loss": 0.0008, "step": 29905 }, { "epoch": 7.617794379828508, "grad_norm": 0.1999645233154297, "learning_rate": 1.1450265869509113e-06, "loss": 0.0005, "step": 29910 }, { "epoch": 7.619067832583411, "grad_norm": 0.05800430849194527, "learning_rate": 1.142961854439375e-06, "loss": 0.0006, "step": 29915 }, { "epoch": 7.620341285338314, "grad_norm": 0.6720203161239624, "learning_rate": 1.140898872349393e-06, "loss": 0.0011, "step": 29920 }, { "epoch": 7.621614738093217, "grad_norm": 0.016051942482590675, "learning_rate": 1.138837641088677e-06, "loss": 0.0006, "step": 29925 }, { "epoch": 7.622888190848119, "grad_norm": 0.01831863820552826, "learning_rate": 1.1367781610645822e-06, "loss": 0.0006, "step": 29930 }, { "epoch": 7.624161643603022, "grad_norm": 0.16363434493541718, "learning_rate": 1.134720432684131e-06, "loss": 0.001, "step": 29935 }, { "epoch": 7.625435096357926, "grad_norm": 0.02765808254480362, "learning_rate": 1.1326644563539924e-06, "loss": 0.0006, "step": 29940 }, { "epoch": 7.626708549112828, "grad_norm": 0.029313204810023308, "learning_rate": 1.130610232480489e-06, "loss": 0.0011, "step": 29945 }, { "epoch": 7.62798200186773, "grad_norm": 0.012662946246564388, "learning_rate": 1.1285577614695964e-06, "loss": 0.0007, "step": 29950 }, { "epoch": 7.629255454622633, "grad_norm": 0.06148570403456688, "learning_rate": 1.126507043726952e-06, "loss": 0.0007, "step": 29955 }, { "epoch": 7.630528907377537, "grad_norm": 0.02471117489039898, "learning_rate": 1.1244580796578308e-06, "loss": 0.0006, "step": 29960 }, { "epoch": 7.631802360132439, "grad_norm": 0.07601889222860336, "learning_rate": 1.1224108696671799e-06, "loss": 0.001, "step": 29965 }, { "epoch": 7.633075812887342, "grad_norm": 0.006821748800575733, "learning_rate": 1.1203654141595855e-06, "loss": 0.0006, "step": 29970 }, { "epoch": 7.6343492656422445, "grad_norm": 0.19428810477256775, "learning_rate": 1.1183217135392943e-06, "loss": 0.0013, "step": 29975 }, { "epoch": 7.635622718397148, "grad_norm": 0.03665010258555412, "learning_rate": 1.1162797682102e-06, "loss": 0.0006, "step": 29980 }, { "epoch": 7.63689617115205, "grad_norm": 0.03855932131409645, "learning_rate": 1.114239578575861e-06, "loss": 0.0005, "step": 29985 }, { "epoch": 7.638169623906953, "grad_norm": 0.04659051075577736, "learning_rate": 1.1122011450394732e-06, "loss": 0.0011, "step": 29990 }, { "epoch": 7.6394430766618555, "grad_norm": 0.017901914194226265, "learning_rate": 1.110164468003898e-06, "loss": 0.0004, "step": 29995 }, { "epoch": 7.640716529416759, "grad_norm": 0.2609769105911255, "learning_rate": 1.1081295478716447e-06, "loss": 0.0013, "step": 30000 }, { "epoch": 7.641989982171662, "grad_norm": 0.29331931471824646, "learning_rate": 1.1060963850448747e-06, "loss": 0.0008, "step": 30005 }, { "epoch": 7.643263434926564, "grad_norm": 0.012506762519478798, "learning_rate": 1.1040649799254011e-06, "loss": 0.0006, "step": 30010 }, { "epoch": 7.6445368876814666, "grad_norm": 0.05611291900277138, "learning_rate": 1.1020353329146982e-06, "loss": 0.0008, "step": 30015 }, { "epoch": 7.64581034043637, "grad_norm": 0.3358372747898102, "learning_rate": 1.1000074444138787e-06, "loss": 0.0006, "step": 30020 }, { "epoch": 7.647083793191273, "grad_norm": 0.06788834929466248, "learning_rate": 1.0979813148237206e-06, "loss": 0.0009, "step": 30025 }, { "epoch": 7.648357245946175, "grad_norm": 0.015045172534883022, "learning_rate": 1.0959569445446471e-06, "loss": 0.0006, "step": 30030 }, { "epoch": 7.6496306987010785, "grad_norm": 0.02449234202504158, "learning_rate": 1.093934333976735e-06, "loss": 0.0008, "step": 30035 }, { "epoch": 7.650904151455981, "grad_norm": 0.06372339278459549, "learning_rate": 1.0919134835197142e-06, "loss": 0.0006, "step": 30040 }, { "epoch": 7.652177604210884, "grad_norm": 0.006417204160243273, "learning_rate": 1.08989439357297e-06, "loss": 0.0008, "step": 30045 }, { "epoch": 7.653451056965786, "grad_norm": 0.01906777173280716, "learning_rate": 1.0878770645355296e-06, "loss": 0.0007, "step": 30050 }, { "epoch": 7.6547245097206895, "grad_norm": 0.07578594237565994, "learning_rate": 1.0858614968060854e-06, "loss": 0.0012, "step": 30055 }, { "epoch": 7.655997962475592, "grad_norm": 0.09368161857128143, "learning_rate": 1.083847690782972e-06, "loss": 0.0008, "step": 30060 }, { "epoch": 7.657271415230495, "grad_norm": 0.06471487879753113, "learning_rate": 1.08183564686418e-06, "loss": 0.0006, "step": 30065 }, { "epoch": 7.658544867985398, "grad_norm": 0.08832593262195587, "learning_rate": 1.0798253654473478e-06, "loss": 0.0006, "step": 30070 }, { "epoch": 7.659818320740301, "grad_norm": 0.02565835602581501, "learning_rate": 1.0778168469297756e-06, "loss": 0.0006, "step": 30075 }, { "epoch": 7.661091773495203, "grad_norm": 0.06579869240522385, "learning_rate": 1.075810091708399e-06, "loss": 0.0013, "step": 30080 }, { "epoch": 7.662365226250106, "grad_norm": 0.07253700494766235, "learning_rate": 1.0738051001798211e-06, "loss": 0.0009, "step": 30085 }, { "epoch": 7.663638679005009, "grad_norm": 0.05406057834625244, "learning_rate": 1.0718018727402878e-06, "loss": 0.001, "step": 30090 }, { "epoch": 7.664912131759912, "grad_norm": 0.04325077310204506, "learning_rate": 1.0698004097856961e-06, "loss": 0.0005, "step": 30095 }, { "epoch": 7.666185584514815, "grad_norm": 0.06411541998386383, "learning_rate": 1.0678007117115984e-06, "loss": 0.0011, "step": 30100 }, { "epoch": 7.667459037269717, "grad_norm": 0.030330033972859383, "learning_rate": 1.0658027789131963e-06, "loss": 0.0004, "step": 30105 }, { "epoch": 7.66873249002462, "grad_norm": 0.020230328664183617, "learning_rate": 1.0638066117853385e-06, "loss": 0.001, "step": 30110 }, { "epoch": 7.670005942779523, "grad_norm": 0.06670531630516052, "learning_rate": 1.061812210722536e-06, "loss": 0.0005, "step": 30115 }, { "epoch": 7.671279395534426, "grad_norm": 0.05576726794242859, "learning_rate": 1.0598195761189345e-06, "loss": 0.0015, "step": 30120 }, { "epoch": 7.672552848289328, "grad_norm": 0.09878210723400116, "learning_rate": 1.057828708368347e-06, "loss": 0.0025, "step": 30125 }, { "epoch": 7.673826301044231, "grad_norm": 0.1381784826517105, "learning_rate": 1.0558396078642264e-06, "loss": 0.0003, "step": 30130 }, { "epoch": 7.675099753799134, "grad_norm": 0.25003716349601746, "learning_rate": 1.0538522749996804e-06, "loss": 0.0009, "step": 30135 }, { "epoch": 7.676373206554037, "grad_norm": 0.056145258247852325, "learning_rate": 1.0518667101674673e-06, "loss": 0.001, "step": 30140 }, { "epoch": 7.677646659308939, "grad_norm": 0.08932922035455704, "learning_rate": 1.0498829137599942e-06, "loss": 0.0006, "step": 30145 }, { "epoch": 7.678920112063842, "grad_norm": 0.06910941004753113, "learning_rate": 1.0479008861693195e-06, "loss": 0.0008, "step": 30150 }, { "epoch": 7.680193564818746, "grad_norm": 3.249476671218872, "learning_rate": 1.0459206277871547e-06, "loss": 0.0013, "step": 30155 }, { "epoch": 7.681467017573648, "grad_norm": 0.5727332830429077, "learning_rate": 1.0439421390048587e-06, "loss": 0.0013, "step": 30160 }, { "epoch": 7.682740470328551, "grad_norm": 0.06695128977298737, "learning_rate": 1.0419654202134399e-06, "loss": 0.0011, "step": 30165 }, { "epoch": 7.684013923083453, "grad_norm": 0.04632008448243141, "learning_rate": 1.0399904718035603e-06, "loss": 0.0011, "step": 30170 }, { "epoch": 7.685287375838357, "grad_norm": 0.06221376359462738, "learning_rate": 1.0380172941655298e-06, "loss": 0.0006, "step": 30175 }, { "epoch": 7.686560828593259, "grad_norm": 0.03039252944290638, "learning_rate": 1.0360458876893065e-06, "loss": 0.0007, "step": 30180 }, { "epoch": 7.687834281348162, "grad_norm": 0.02025575004518032, "learning_rate": 1.034076252764501e-06, "loss": 0.0008, "step": 30185 }, { "epoch": 7.6891077341030645, "grad_norm": 0.04841305688023567, "learning_rate": 1.0321083897803764e-06, "loss": 0.0006, "step": 30190 }, { "epoch": 7.690381186857968, "grad_norm": 0.4783269166946411, "learning_rate": 1.0301422991258402e-06, "loss": 0.0015, "step": 30195 }, { "epoch": 7.69165463961287, "grad_norm": 0.02653445303440094, "learning_rate": 1.0281779811894531e-06, "loss": 0.0003, "step": 30200 }, { "epoch": 7.692928092367773, "grad_norm": 0.03501703962683678, "learning_rate": 1.0262154363594235e-06, "loss": 0.0006, "step": 30205 }, { "epoch": 7.6942015451226755, "grad_norm": 0.13561058044433594, "learning_rate": 1.0242546650236107e-06, "loss": 0.0008, "step": 30210 }, { "epoch": 7.695474997877579, "grad_norm": 0.03580952063202858, "learning_rate": 1.0222956675695206e-06, "loss": 0.0011, "step": 30215 }, { "epoch": 7.696748450632482, "grad_norm": 0.03551136329770088, "learning_rate": 1.0203384443843178e-06, "loss": 0.001, "step": 30220 }, { "epoch": 7.698021903387384, "grad_norm": 0.028294140473008156, "learning_rate": 1.0183829958548008e-06, "loss": 0.0005, "step": 30225 }, { "epoch": 7.699295356142287, "grad_norm": 0.029099833220243454, "learning_rate": 1.0164293223674327e-06, "loss": 0.0016, "step": 30230 }, { "epoch": 7.70056880889719, "grad_norm": 0.06670483201742172, "learning_rate": 1.0144774243083156e-06, "loss": 0.0005, "step": 30235 }, { "epoch": 7.701842261652093, "grad_norm": 0.01983489841222763, "learning_rate": 1.0125273020632064e-06, "loss": 0.001, "step": 30240 }, { "epoch": 7.703115714406995, "grad_norm": 0.07848694175481796, "learning_rate": 1.010578956017505e-06, "loss": 0.0011, "step": 30245 }, { "epoch": 7.7043891671618985, "grad_norm": 0.04864300414919853, "learning_rate": 1.0086323865562708e-06, "loss": 0.0005, "step": 30250 }, { "epoch": 7.705662619916801, "grad_norm": 0.04024000093340874, "learning_rate": 1.0066875940641985e-06, "loss": 0.0005, "step": 30255 }, { "epoch": 7.706936072671704, "grad_norm": 0.05581493675708771, "learning_rate": 1.0047445789256427e-06, "loss": 0.0008, "step": 30260 }, { "epoch": 7.708209525426606, "grad_norm": 0.05890538915991783, "learning_rate": 1.002803341524602e-06, "loss": 0.0011, "step": 30265 }, { "epoch": 7.7094829781815095, "grad_norm": 0.02181101217865944, "learning_rate": 1.0008638822447225e-06, "loss": 0.0009, "step": 30270 }, { "epoch": 7.710756430936412, "grad_norm": 0.2777887284755707, "learning_rate": 9.989262014693013e-07, "loss": 0.0007, "step": 30275 }, { "epoch": 7.712029883691315, "grad_norm": 0.5077818632125854, "learning_rate": 9.969902995812874e-07, "loss": 0.0016, "step": 30280 }, { "epoch": 7.713303336446218, "grad_norm": 0.5261831879615784, "learning_rate": 9.950561769632671e-07, "loss": 0.0017, "step": 30285 }, { "epoch": 7.7145767892011206, "grad_norm": 0.02794620767235756, "learning_rate": 9.93123833997488e-07, "loss": 0.0009, "step": 30290 }, { "epoch": 7.715850241956024, "grad_norm": 0.05575382336974144, "learning_rate": 9.91193271065838e-07, "loss": 0.0004, "step": 30295 }, { "epoch": 7.717123694710926, "grad_norm": 0.5871603488922119, "learning_rate": 9.892644885498548e-07, "loss": 0.0005, "step": 30300 }, { "epoch": 7.718397147465829, "grad_norm": 0.01350435707718134, "learning_rate": 9.873374868307238e-07, "loss": 0.0004, "step": 30305 }, { "epoch": 7.719670600220732, "grad_norm": 0.04450949281454086, "learning_rate": 9.854122662892852e-07, "loss": 0.0009, "step": 30310 }, { "epoch": 7.720944052975635, "grad_norm": 0.10400956869125366, "learning_rate": 9.83488827306013e-07, "loss": 0.0007, "step": 30315 }, { "epoch": 7.722217505730537, "grad_norm": 0.04678022116422653, "learning_rate": 9.81567170261044e-07, "loss": 0.0014, "step": 30320 }, { "epoch": 7.72349095848544, "grad_norm": 0.05062606930732727, "learning_rate": 9.79647295534154e-07, "loss": 0.0005, "step": 30325 }, { "epoch": 7.724764411240343, "grad_norm": 0.04322045296430588, "learning_rate": 9.77729203504769e-07, "loss": 0.0005, "step": 30330 }, { "epoch": 7.726037863995246, "grad_norm": 0.004625050816684961, "learning_rate": 9.758128945519618e-07, "loss": 0.0007, "step": 30335 }, { "epoch": 7.727311316750148, "grad_norm": 0.07100250571966171, "learning_rate": 9.738983690544579e-07, "loss": 0.001, "step": 30340 }, { "epoch": 7.728584769505051, "grad_norm": 0.08966552466154099, "learning_rate": 9.719856273906192e-07, "loss": 0.0009, "step": 30345 }, { "epoch": 7.729858222259955, "grad_norm": 0.034271690994501114, "learning_rate": 9.700746699384667e-07, "loss": 0.001, "step": 30350 }, { "epoch": 7.731131675014857, "grad_norm": 0.17510443925857544, "learning_rate": 9.681654970756637e-07, "loss": 0.0015, "step": 30355 }, { "epoch": 7.73240512776976, "grad_norm": 0.09101936966180801, "learning_rate": 9.662581091795199e-07, "loss": 0.0007, "step": 30360 }, { "epoch": 7.733678580524662, "grad_norm": 0.038476716727018356, "learning_rate": 9.643525066269942e-07, "loss": 0.0006, "step": 30365 }, { "epoch": 7.734952033279566, "grad_norm": 0.02175919897854328, "learning_rate": 9.624486897946916e-07, "loss": 0.0011, "step": 30370 }, { "epoch": 7.736225486034468, "grad_norm": 0.04706332087516785, "learning_rate": 9.60546659058863e-07, "loss": 0.0015, "step": 30375 }, { "epoch": 7.737498938789371, "grad_norm": 0.33536580204963684, "learning_rate": 9.586464147954133e-07, "loss": 0.0011, "step": 30380 }, { "epoch": 7.738772391544273, "grad_norm": 0.09474770724773407, "learning_rate": 9.5674795737988e-07, "loss": 0.0008, "step": 30385 }, { "epoch": 7.740045844299177, "grad_norm": 0.25163838267326355, "learning_rate": 9.548512871874638e-07, "loss": 0.0012, "step": 30390 }, { "epoch": 7.741319297054079, "grad_norm": 0.031020989641547203, "learning_rate": 9.529564045930029e-07, "loss": 0.0007, "step": 30395 }, { "epoch": 7.742592749808982, "grad_norm": 0.0490390844643116, "learning_rate": 9.51063309970982e-07, "loss": 0.0006, "step": 30400 }, { "epoch": 7.7438662025638845, "grad_norm": 0.1868973672389984, "learning_rate": 9.491720036955354e-07, "loss": 0.0007, "step": 30405 }, { "epoch": 7.745139655318788, "grad_norm": 0.050836220383644104, "learning_rate": 9.472824861404461e-07, "loss": 0.0008, "step": 30410 }, { "epoch": 7.746413108073691, "grad_norm": 0.019415143877267838, "learning_rate": 9.453947576791356e-07, "loss": 0.0004, "step": 30415 }, { "epoch": 7.747686560828593, "grad_norm": 0.06682460755109787, "learning_rate": 9.435088186846797e-07, "loss": 0.0008, "step": 30420 }, { "epoch": 7.748960013583496, "grad_norm": 0.021362638100981712, "learning_rate": 9.41624669529797e-07, "loss": 0.001, "step": 30425 }, { "epoch": 7.750233466338399, "grad_norm": 1.6827630996704102, "learning_rate": 9.39742310586853e-07, "loss": 0.0015, "step": 30430 }, { "epoch": 7.751506919093302, "grad_norm": 0.024319522082805634, "learning_rate": 9.37861742227858e-07, "loss": 0.0005, "step": 30435 }, { "epoch": 7.752780371848204, "grad_norm": 0.05989805608987808, "learning_rate": 9.359829648244745e-07, "loss": 0.0024, "step": 30440 }, { "epoch": 7.754053824603107, "grad_norm": 0.04598044604063034, "learning_rate": 9.34105978748e-07, "loss": 0.001, "step": 30445 }, { "epoch": 7.75532727735801, "grad_norm": 0.08356106281280518, "learning_rate": 9.32230784369389e-07, "loss": 0.0008, "step": 30450 }, { "epoch": 7.756600730112913, "grad_norm": 0.056602805852890015, "learning_rate": 9.303573820592348e-07, "loss": 0.0007, "step": 30455 }, { "epoch": 7.757874182867815, "grad_norm": 0.522574782371521, "learning_rate": 9.284857721877805e-07, "loss": 0.0006, "step": 30460 }, { "epoch": 7.7591476356227185, "grad_norm": 0.02105811797082424, "learning_rate": 9.266159551249132e-07, "loss": 0.0007, "step": 30465 }, { "epoch": 7.760421088377621, "grad_norm": 0.004336753860116005, "learning_rate": 9.247479312401642e-07, "loss": 0.0005, "step": 30470 }, { "epoch": 7.761694541132524, "grad_norm": 0.011836514808237553, "learning_rate": 9.228817009027124e-07, "loss": 0.001, "step": 30475 }, { "epoch": 7.762967993887427, "grad_norm": 0.07782084494829178, "learning_rate": 9.210172644813842e-07, "loss": 0.0011, "step": 30480 }, { "epoch": 7.7642414466423295, "grad_norm": 0.3749431073665619, "learning_rate": 9.19154622344649e-07, "loss": 0.0013, "step": 30485 }, { "epoch": 7.765514899397233, "grad_norm": 0.08426939696073532, "learning_rate": 9.172937748606204e-07, "loss": 0.0008, "step": 30490 }, { "epoch": 7.766788352152135, "grad_norm": 0.023126671090722084, "learning_rate": 9.154347223970594e-07, "loss": 0.0008, "step": 30495 }, { "epoch": 7.768061804907038, "grad_norm": 0.06539815664291382, "learning_rate": 9.135774653213714e-07, "loss": 0.0011, "step": 30500 }, { "epoch": 7.7693352576619406, "grad_norm": 0.03796878084540367, "learning_rate": 9.11722004000607e-07, "loss": 0.0008, "step": 30505 }, { "epoch": 7.770608710416844, "grad_norm": 0.11200808733701706, "learning_rate": 9.098683388014595e-07, "loss": 0.0009, "step": 30510 }, { "epoch": 7.771882163171746, "grad_norm": 0.2676243185997009, "learning_rate": 9.080164700902761e-07, "loss": 0.0004, "step": 30515 }, { "epoch": 7.773155615926649, "grad_norm": 0.02487954869866371, "learning_rate": 9.061663982330382e-07, "loss": 0.0007, "step": 30520 }, { "epoch": 7.774429068681552, "grad_norm": 0.11487146466970444, "learning_rate": 9.043181235953779e-07, "loss": 0.001, "step": 30525 }, { "epoch": 7.775702521436455, "grad_norm": 0.02275981567800045, "learning_rate": 9.024716465425709e-07, "loss": 0.001, "step": 30530 }, { "epoch": 7.776975974191357, "grad_norm": 0.04887242242693901, "learning_rate": 9.006269674395374e-07, "loss": 0.0007, "step": 30535 }, { "epoch": 7.77824942694626, "grad_norm": 0.10958637297153473, "learning_rate": 8.98784086650839e-07, "loss": 0.0004, "step": 30540 }, { "epoch": 7.7795228797011635, "grad_norm": 0.04104718938469887, "learning_rate": 8.969430045406923e-07, "loss": 0.0018, "step": 30545 }, { "epoch": 7.780796332456066, "grad_norm": 0.03983444720506668, "learning_rate": 8.951037214729441e-07, "loss": 0.0008, "step": 30550 }, { "epoch": 7.782069785210969, "grad_norm": 0.2298312932252884, "learning_rate": 8.93266237811099e-07, "loss": 0.0008, "step": 30555 }, { "epoch": 7.783343237965871, "grad_norm": 0.3291536271572113, "learning_rate": 8.914305539182966e-07, "loss": 0.0008, "step": 30560 }, { "epoch": 7.784616690720775, "grad_norm": 0.42746904492378235, "learning_rate": 8.895966701573244e-07, "loss": 0.0013, "step": 30565 }, { "epoch": 7.785890143475677, "grad_norm": 0.04745202884078026, "learning_rate": 8.877645868906126e-07, "loss": 0.0009, "step": 30570 }, { "epoch": 7.78716359623058, "grad_norm": 0.008641241118311882, "learning_rate": 8.859343044802437e-07, "loss": 0.0007, "step": 30575 }, { "epoch": 7.788437048985482, "grad_norm": 0.038310807198286057, "learning_rate": 8.841058232879273e-07, "loss": 0.0007, "step": 30580 }, { "epoch": 7.789710501740386, "grad_norm": 0.06073303520679474, "learning_rate": 8.822791436750344e-07, "loss": 0.0014, "step": 30585 }, { "epoch": 7.790983954495288, "grad_norm": 0.1899721622467041, "learning_rate": 8.804542660025717e-07, "loss": 0.0009, "step": 30590 }, { "epoch": 7.792257407250191, "grad_norm": 0.2109941989183426, "learning_rate": 8.786311906311884e-07, "loss": 0.0006, "step": 30595 }, { "epoch": 7.793530860005093, "grad_norm": 0.1100655272603035, "learning_rate": 8.768099179211808e-07, "loss": 0.0012, "step": 30600 }, { "epoch": 7.794804312759997, "grad_norm": 0.03972446545958519, "learning_rate": 8.749904482324911e-07, "loss": 0.0009, "step": 30605 }, { "epoch": 7.7960777655149, "grad_norm": 0.05766676366329193, "learning_rate": 8.731727819246971e-07, "loss": 0.0005, "step": 30610 }, { "epoch": 7.797351218269802, "grad_norm": 0.1546669900417328, "learning_rate": 8.713569193570304e-07, "loss": 0.0007, "step": 30615 }, { "epoch": 7.798624671024705, "grad_norm": 0.05031011253595352, "learning_rate": 8.69542860888355e-07, "loss": 0.0024, "step": 30620 }, { "epoch": 7.799898123779608, "grad_norm": 0.058252137154340744, "learning_rate": 8.677306068771896e-07, "loss": 0.0007, "step": 30625 }, { "epoch": 7.801171576534511, "grad_norm": 0.032815273851156235, "learning_rate": 8.6592015768169e-07, "loss": 0.0012, "step": 30630 }, { "epoch": 7.802445029289413, "grad_norm": 0.01654016599059105, "learning_rate": 8.641115136596545e-07, "loss": 0.0006, "step": 30635 }, { "epoch": 7.803718482044316, "grad_norm": 0.097771055996418, "learning_rate": 8.62304675168526e-07, "loss": 0.0012, "step": 30640 }, { "epoch": 7.804991934799219, "grad_norm": 0.05332505702972412, "learning_rate": 8.604996425653955e-07, "loss": 0.0007, "step": 30645 }, { "epoch": 7.806265387554122, "grad_norm": 0.04983246698975563, "learning_rate": 8.586964162069855e-07, "loss": 0.0006, "step": 30650 }, { "epoch": 7.807538840309024, "grad_norm": 0.059964925050735474, "learning_rate": 8.568949964496753e-07, "loss": 0.0005, "step": 30655 }, { "epoch": 7.808812293063927, "grad_norm": 0.33608198165893555, "learning_rate": 8.550953836494779e-07, "loss": 0.0011, "step": 30660 }, { "epoch": 7.81008574581883, "grad_norm": 0.04354246333241463, "learning_rate": 8.532975781620511e-07, "loss": 0.0005, "step": 30665 }, { "epoch": 7.811359198573733, "grad_norm": 0.06842490285634995, "learning_rate": 8.515015803426962e-07, "loss": 0.0007, "step": 30670 }, { "epoch": 7.812632651328636, "grad_norm": 0.28992703557014465, "learning_rate": 8.497073905463616e-07, "loss": 0.001, "step": 30675 }, { "epoch": 7.8139061040835385, "grad_norm": 0.021045606583356857, "learning_rate": 8.479150091276256e-07, "loss": 0.0004, "step": 30680 }, { "epoch": 7.815179556838442, "grad_norm": 0.020248111337423325, "learning_rate": 8.461244364407251e-07, "loss": 0.0006, "step": 30685 }, { "epoch": 7.816453009593344, "grad_norm": 0.8116331100463867, "learning_rate": 8.443356728395302e-07, "loss": 0.0015, "step": 30690 }, { "epoch": 7.817726462348247, "grad_norm": 0.040425341576337814, "learning_rate": 8.425487186775538e-07, "loss": 0.0009, "step": 30695 }, { "epoch": 7.8189999151031495, "grad_norm": 0.045605067163705826, "learning_rate": 8.407635743079512e-07, "loss": 0.0008, "step": 30700 }, { "epoch": 7.820273367858053, "grad_norm": 0.2874923348426819, "learning_rate": 8.389802400835279e-07, "loss": 0.0004, "step": 30705 }, { "epoch": 7.821546820612955, "grad_norm": 0.313692182302475, "learning_rate": 8.371987163567175e-07, "loss": 0.0007, "step": 30710 }, { "epoch": 7.822820273367858, "grad_norm": 0.027086371555924416, "learning_rate": 8.354190034796084e-07, "loss": 0.0002, "step": 30715 }, { "epoch": 7.8240937261227606, "grad_norm": 0.05571671202778816, "learning_rate": 8.336411018039248e-07, "loss": 0.0007, "step": 30720 }, { "epoch": 7.825367178877664, "grad_norm": 0.1758987158536911, "learning_rate": 8.318650116810346e-07, "loss": 0.0011, "step": 30725 }, { "epoch": 7.826640631632566, "grad_norm": 0.10890331119298935, "learning_rate": 8.300907334619468e-07, "loss": 0.0014, "step": 30730 }, { "epoch": 7.827914084387469, "grad_norm": 0.13300780951976776, "learning_rate": 8.283182674973167e-07, "loss": 0.0007, "step": 30735 }, { "epoch": 7.8291875371423725, "grad_norm": 0.10017555952072144, "learning_rate": 8.265476141374306e-07, "loss": 0.0005, "step": 30740 }, { "epoch": 7.830460989897275, "grad_norm": 0.05141451209783554, "learning_rate": 8.247787737322311e-07, "loss": 0.0008, "step": 30745 }, { "epoch": 7.831734442652177, "grad_norm": 0.02449016459286213, "learning_rate": 8.230117466312915e-07, "loss": 0.0006, "step": 30750 }, { "epoch": 7.83300789540708, "grad_norm": 0.03589927777647972, "learning_rate": 8.212465331838304e-07, "loss": 0.0008, "step": 30755 }, { "epoch": 7.8342813481619835, "grad_norm": 0.11038616299629211, "learning_rate": 8.194831337387077e-07, "loss": 0.0006, "step": 30760 }, { "epoch": 7.835554800916886, "grad_norm": 0.08145425468683243, "learning_rate": 8.177215486444279e-07, "loss": 0.0009, "step": 30765 }, { "epoch": 7.836828253671789, "grad_norm": 0.05505979061126709, "learning_rate": 8.159617782491292e-07, "loss": 0.0009, "step": 30770 }, { "epoch": 7.838101706426691, "grad_norm": 0.058925144374370575, "learning_rate": 8.142038229006011e-07, "loss": 0.0004, "step": 30775 }, { "epoch": 7.8393751591815946, "grad_norm": 0.020344892516732216, "learning_rate": 8.124476829462669e-07, "loss": 0.0007, "step": 30780 }, { "epoch": 7.840648611936497, "grad_norm": 0.10402020812034607, "learning_rate": 8.106933587331933e-07, "loss": 0.0013, "step": 30785 }, { "epoch": 7.8419220646914, "grad_norm": 0.019863776862621307, "learning_rate": 8.089408506080887e-07, "loss": 0.0004, "step": 30790 }, { "epoch": 7.843195517446302, "grad_norm": 0.029552467167377472, "learning_rate": 8.071901589173025e-07, "loss": 0.0006, "step": 30795 }, { "epoch": 7.844468970201206, "grad_norm": 0.36115777492523193, "learning_rate": 8.054412840068238e-07, "loss": 0.0008, "step": 30800 }, { "epoch": 7.845742422956109, "grad_norm": 0.0909019410610199, "learning_rate": 8.036942262222857e-07, "loss": 0.0007, "step": 30805 }, { "epoch": 7.847015875711011, "grad_norm": 0.025681113824248314, "learning_rate": 8.019489859089602e-07, "loss": 0.0003, "step": 30810 }, { "epoch": 7.848289328465913, "grad_norm": 0.07725505530834198, "learning_rate": 8.002055634117578e-07, "loss": 0.0008, "step": 30815 }, { "epoch": 7.849562781220817, "grad_norm": 0.009605423547327518, "learning_rate": 7.984639590752352e-07, "loss": 0.0011, "step": 30820 }, { "epoch": 7.85083623397572, "grad_norm": 0.01013736892491579, "learning_rate": 7.967241732435838e-07, "loss": 0.0002, "step": 30825 }, { "epoch": 7.852109686730622, "grad_norm": 0.02065444178879261, "learning_rate": 7.949862062606406e-07, "loss": 0.0003, "step": 30830 }, { "epoch": 7.853383139485525, "grad_norm": 0.040100496262311935, "learning_rate": 7.932500584698776e-07, "loss": 0.0007, "step": 30835 }, { "epoch": 7.854656592240428, "grad_norm": 0.046775270253419876, "learning_rate": 7.91515730214415e-07, "loss": 0.0006, "step": 30840 }, { "epoch": 7.855930044995331, "grad_norm": 0.597552478313446, "learning_rate": 7.897832218370083e-07, "loss": 0.0018, "step": 30845 }, { "epoch": 7.857203497750233, "grad_norm": 0.02674339897930622, "learning_rate": 7.880525336800515e-07, "loss": 0.0006, "step": 30850 }, { "epoch": 7.858476950505136, "grad_norm": 0.14697694778442383, "learning_rate": 7.863236660855844e-07, "loss": 0.0013, "step": 30855 }, { "epoch": 7.859750403260039, "grad_norm": 0.03453289344906807, "learning_rate": 7.845966193952825e-07, "loss": 0.0004, "step": 30860 }, { "epoch": 7.861023856014942, "grad_norm": 0.05502195283770561, "learning_rate": 7.828713939504628e-07, "loss": 0.0006, "step": 30865 }, { "epoch": 7.862297308769845, "grad_norm": 0.1030983030796051, "learning_rate": 7.811479900920837e-07, "loss": 0.0005, "step": 30870 }, { "epoch": 7.863570761524747, "grad_norm": 0.014577705413103104, "learning_rate": 7.794264081607406e-07, "loss": 0.0012, "step": 30875 }, { "epoch": 7.86484421427965, "grad_norm": 0.051634591072797775, "learning_rate": 7.777066484966733e-07, "loss": 0.0011, "step": 30880 }, { "epoch": 7.866117667034553, "grad_norm": 0.023911530151963234, "learning_rate": 7.75988711439758e-07, "loss": 0.0008, "step": 30885 }, { "epoch": 7.867391119789456, "grad_norm": 0.011778619140386581, "learning_rate": 7.742725973295107e-07, "loss": 0.0006, "step": 30890 }, { "epoch": 7.8686645725443585, "grad_norm": 0.09174652397632599, "learning_rate": 7.72558306505089e-07, "loss": 0.001, "step": 30895 }, { "epoch": 7.869938025299262, "grad_norm": 0.06575087457895279, "learning_rate": 7.708458393052887e-07, "loss": 0.0009, "step": 30900 }, { "epoch": 7.871211478054164, "grad_norm": 0.028819138184189796, "learning_rate": 7.691351960685445e-07, "loss": 0.0006, "step": 30905 }, { "epoch": 7.872484930809067, "grad_norm": 0.03022732585668564, "learning_rate": 7.67426377132936e-07, "loss": 0.0006, "step": 30910 }, { "epoch": 7.8737583835639695, "grad_norm": 0.06152746081352234, "learning_rate": 7.65719382836172e-07, "loss": 0.0008, "step": 30915 }, { "epoch": 7.875031836318873, "grad_norm": 0.05507427826523781, "learning_rate": 7.640142135156114e-07, "loss": 0.0012, "step": 30920 }, { "epoch": 7.876305289073775, "grad_norm": 0.11828593164682388, "learning_rate": 7.62310869508246e-07, "loss": 0.0007, "step": 30925 }, { "epoch": 7.877578741828678, "grad_norm": 0.042615026235580444, "learning_rate": 7.606093511507095e-07, "loss": 0.0008, "step": 30930 }, { "epoch": 7.878852194583581, "grad_norm": 0.11777424812316895, "learning_rate": 7.58909658779271e-07, "loss": 0.001, "step": 30935 }, { "epoch": 7.880125647338484, "grad_norm": 0.13235019147396088, "learning_rate": 7.572117927298483e-07, "loss": 0.0006, "step": 30940 }, { "epoch": 7.881399100093386, "grad_norm": 0.06066840514540672, "learning_rate": 7.555157533379831e-07, "loss": 0.0006, "step": 30945 }, { "epoch": 7.882672552848289, "grad_norm": 0.18309298157691956, "learning_rate": 7.538215409388716e-07, "loss": 0.0007, "step": 30950 }, { "epoch": 7.8839460056031925, "grad_norm": 0.02909201942384243, "learning_rate": 7.521291558673393e-07, "loss": 0.0009, "step": 30955 }, { "epoch": 7.885219458358095, "grad_norm": 0.31609976291656494, "learning_rate": 7.504385984578533e-07, "loss": 0.0004, "step": 30960 }, { "epoch": 7.886492911112998, "grad_norm": 0.05397779867053032, "learning_rate": 7.487498690445172e-07, "loss": 0.0007, "step": 30965 }, { "epoch": 7.8877663638679, "grad_norm": 0.3299925923347473, "learning_rate": 7.470629679610808e-07, "loss": 0.0013, "step": 30970 }, { "epoch": 7.8890398166228035, "grad_norm": 0.12306839972734451, "learning_rate": 7.453778955409219e-07, "loss": 0.0008, "step": 30975 }, { "epoch": 7.890313269377706, "grad_norm": 0.03270142525434494, "learning_rate": 7.436946521170663e-07, "loss": 0.0004, "step": 30980 }, { "epoch": 7.891586722132609, "grad_norm": 0.10067091882228851, "learning_rate": 7.420132380221723e-07, "loss": 0.0009, "step": 30985 }, { "epoch": 7.892860174887511, "grad_norm": 0.06580211222171783, "learning_rate": 7.403336535885397e-07, "loss": 0.0009, "step": 30990 }, { "epoch": 7.8941336276424146, "grad_norm": 0.029595157131552696, "learning_rate": 7.38655899148104e-07, "loss": 0.0005, "step": 30995 }, { "epoch": 7.895407080397318, "grad_norm": 0.04156484454870224, "learning_rate": 7.369799750324446e-07, "loss": 0.0008, "step": 31000 }, { "epoch": 7.89668053315222, "grad_norm": 0.19435158371925354, "learning_rate": 7.353058815727698e-07, "loss": 0.0004, "step": 31005 }, { "epoch": 7.897953985907122, "grad_norm": 0.07410074025392532, "learning_rate": 7.336336190999371e-07, "loss": 0.0011, "step": 31010 }, { "epoch": 7.899227438662026, "grad_norm": 0.5475974082946777, "learning_rate": 7.319631879444333e-07, "loss": 0.0005, "step": 31015 }, { "epoch": 7.900500891416929, "grad_norm": 0.5277621746063232, "learning_rate": 7.302945884363876e-07, "loss": 0.0004, "step": 31020 }, { "epoch": 7.901774344171831, "grad_norm": 0.060120999813079834, "learning_rate": 7.286278209055653e-07, "loss": 0.0009, "step": 31025 }, { "epoch": 7.903047796926734, "grad_norm": 0.0381508395075798, "learning_rate": 7.269628856813748e-07, "loss": 0.0008, "step": 31030 }, { "epoch": 7.904321249681637, "grad_norm": 0.09880051016807556, "learning_rate": 7.252997830928521e-07, "loss": 0.0006, "step": 31035 }, { "epoch": 7.90559470243654, "grad_norm": 0.06211576610803604, "learning_rate": 7.236385134686808e-07, "loss": 0.0006, "step": 31040 }, { "epoch": 7.906868155191442, "grad_norm": 0.1428145319223404, "learning_rate": 7.219790771371793e-07, "loss": 0.0007, "step": 31045 }, { "epoch": 7.908141607946345, "grad_norm": 0.03902352228760719, "learning_rate": 7.203214744262998e-07, "loss": 0.0005, "step": 31050 }, { "epoch": 7.909415060701248, "grad_norm": 0.04549102857708931, "learning_rate": 7.18665705663637e-07, "loss": 0.0012, "step": 31055 }, { "epoch": 7.910688513456151, "grad_norm": 0.05685890093445778, "learning_rate": 7.170117711764235e-07, "loss": 0.0008, "step": 31060 }, { "epoch": 7.911961966211053, "grad_norm": 0.1003265380859375, "learning_rate": 7.153596712915223e-07, "loss": 0.0007, "step": 31065 }, { "epoch": 7.913235418965956, "grad_norm": 0.05268599092960358, "learning_rate": 7.137094063354444e-07, "loss": 0.0006, "step": 31070 }, { "epoch": 7.914508871720859, "grad_norm": 0.1289108395576477, "learning_rate": 7.120609766343289e-07, "loss": 0.0007, "step": 31075 }, { "epoch": 7.915782324475762, "grad_norm": 0.0782439112663269, "learning_rate": 7.104143825139576e-07, "loss": 0.0008, "step": 31080 }, { "epoch": 7.917055777230665, "grad_norm": 0.057679612189531326, "learning_rate": 7.087696242997466e-07, "loss": 0.0008, "step": 31085 }, { "epoch": 7.918329229985567, "grad_norm": 0.0781409963965416, "learning_rate": 7.071267023167527e-07, "loss": 0.0009, "step": 31090 }, { "epoch": 7.919602682740471, "grad_norm": 0.33160242438316345, "learning_rate": 7.054856168896629e-07, "loss": 0.0011, "step": 31095 }, { "epoch": 7.920876135495373, "grad_norm": 0.12529993057250977, "learning_rate": 7.03846368342811e-07, "loss": 0.0007, "step": 31100 }, { "epoch": 7.922149588250276, "grad_norm": 0.03888900205492973, "learning_rate": 7.0220895700016e-07, "loss": 0.0006, "step": 31105 }, { "epoch": 7.9234230410051785, "grad_norm": 0.0327703058719635, "learning_rate": 7.005733831853135e-07, "loss": 0.0009, "step": 31110 }, { "epoch": 7.924696493760082, "grad_norm": 0.015581842511892319, "learning_rate": 6.989396472215105e-07, "loss": 0.0004, "step": 31115 }, { "epoch": 7.925969946514984, "grad_norm": 0.05191977322101593, "learning_rate": 6.973077494316272e-07, "loss": 0.0007, "step": 31120 }, { "epoch": 7.927243399269887, "grad_norm": 0.08880626410245895, "learning_rate": 6.956776901381757e-07, "loss": 0.0008, "step": 31125 }, { "epoch": 7.9285168520247895, "grad_norm": 0.03277003765106201, "learning_rate": 6.94049469663306e-07, "loss": 0.0014, "step": 31130 }, { "epoch": 7.929790304779693, "grad_norm": 0.009904921054840088, "learning_rate": 6.924230883288041e-07, "loss": 0.0009, "step": 31135 }, { "epoch": 7.931063757534595, "grad_norm": 0.06996649503707886, "learning_rate": 6.907985464560929e-07, "loss": 0.0004, "step": 31140 }, { "epoch": 7.932337210289498, "grad_norm": 0.051733486354351044, "learning_rate": 6.891758443662333e-07, "loss": 0.0007, "step": 31145 }, { "epoch": 7.933610663044401, "grad_norm": 0.03988548740744591, "learning_rate": 6.875549823799188e-07, "loss": 0.0016, "step": 31150 }, { "epoch": 7.934884115799304, "grad_norm": 0.07952258735895157, "learning_rate": 6.859359608174809e-07, "loss": 0.0008, "step": 31155 }, { "epoch": 7.936157568554207, "grad_norm": 0.04348332807421684, "learning_rate": 6.843187799988893e-07, "loss": 0.0012, "step": 31160 }, { "epoch": 7.937431021309109, "grad_norm": 0.24389442801475525, "learning_rate": 6.827034402437483e-07, "loss": 0.0005, "step": 31165 }, { "epoch": 7.9387044740640125, "grad_norm": 0.5628142952919006, "learning_rate": 6.810899418712946e-07, "loss": 0.0012, "step": 31170 }, { "epoch": 7.939977926818915, "grad_norm": 0.07056409120559692, "learning_rate": 6.794782852004112e-07, "loss": 0.0006, "step": 31175 }, { "epoch": 7.941251379573818, "grad_norm": 0.1676056683063507, "learning_rate": 6.778684705496064e-07, "loss": 0.0005, "step": 31180 }, { "epoch": 7.94252483232872, "grad_norm": 0.23576070368289948, "learning_rate": 6.762604982370303e-07, "loss": 0.0007, "step": 31185 }, { "epoch": 7.9437982850836235, "grad_norm": 0.16335099935531616, "learning_rate": 6.746543685804674e-07, "loss": 0.0011, "step": 31190 }, { "epoch": 7.945071737838526, "grad_norm": 0.0941755548119545, "learning_rate": 6.730500818973373e-07, "loss": 0.0004, "step": 31195 }, { "epoch": 7.946345190593429, "grad_norm": 0.08077029883861542, "learning_rate": 6.714476385046953e-07, "loss": 0.001, "step": 31200 }, { "epoch": 7.947618643348331, "grad_norm": 0.02187070995569229, "learning_rate": 6.698470387192368e-07, "loss": 0.0005, "step": 31205 }, { "epoch": 7.9488920961032346, "grad_norm": 0.04058700054883957, "learning_rate": 6.682482828572845e-07, "loss": 0.0007, "step": 31210 }, { "epoch": 7.950165548858138, "grad_norm": 0.04726651310920715, "learning_rate": 6.666513712348044e-07, "loss": 0.0009, "step": 31215 }, { "epoch": 7.95143900161304, "grad_norm": 0.05896517634391785, "learning_rate": 6.650563041673952e-07, "loss": 0.0008, "step": 31220 }, { "epoch": 7.952712454367943, "grad_norm": 0.5519835948944092, "learning_rate": 6.634630819702903e-07, "loss": 0.0009, "step": 31225 }, { "epoch": 7.953985907122846, "grad_norm": 0.061497095972299576, "learning_rate": 6.618717049583567e-07, "loss": 0.0006, "step": 31230 }, { "epoch": 7.955259359877749, "grad_norm": 0.05342726409435272, "learning_rate": 6.602821734461051e-07, "loss": 0.0004, "step": 31235 }, { "epoch": 7.956532812632651, "grad_norm": 0.03163178265094757, "learning_rate": 6.586944877476686e-07, "loss": 0.0005, "step": 31240 }, { "epoch": 7.957806265387554, "grad_norm": 0.15625153481960297, "learning_rate": 6.571086481768274e-07, "loss": 0.001, "step": 31245 }, { "epoch": 7.959079718142457, "grad_norm": 0.05589514225721359, "learning_rate": 6.555246550469907e-07, "loss": 0.0007, "step": 31250 }, { "epoch": 7.96035317089736, "grad_norm": 0.3675939738750458, "learning_rate": 6.539425086712026e-07, "loss": 0.0006, "step": 31255 }, { "epoch": 7.961626623652262, "grad_norm": 0.04655294120311737, "learning_rate": 6.523622093621429e-07, "loss": 0.0005, "step": 31260 }, { "epoch": 7.962900076407165, "grad_norm": 0.08746842294931412, "learning_rate": 6.507837574321319e-07, "loss": 0.0007, "step": 31265 }, { "epoch": 7.964173529162068, "grad_norm": 0.13040803372859955, "learning_rate": 6.492071531931144e-07, "loss": 0.0004, "step": 31270 }, { "epoch": 7.965446981916971, "grad_norm": 0.04074086248874664, "learning_rate": 6.476323969566789e-07, "loss": 0.0007, "step": 31275 }, { "epoch": 7.966720434671874, "grad_norm": 0.05125979706645012, "learning_rate": 6.460594890340443e-07, "loss": 0.0005, "step": 31280 }, { "epoch": 7.967993887426776, "grad_norm": 0.04254594445228577, "learning_rate": 6.444884297360654e-07, "loss": 0.0006, "step": 31285 }, { "epoch": 7.96926734018168, "grad_norm": 0.04109544679522514, "learning_rate": 6.429192193732303e-07, "loss": 0.001, "step": 31290 }, { "epoch": 7.970540792936582, "grad_norm": 0.0236030425876379, "learning_rate": 6.413518582556666e-07, "loss": 0.0004, "step": 31295 }, { "epoch": 7.971814245691485, "grad_norm": 0.029971711337566376, "learning_rate": 6.397863466931276e-07, "loss": 0.0004, "step": 31300 }, { "epoch": 7.973087698446387, "grad_norm": 0.03512503206729889, "learning_rate": 6.382226849950113e-07, "loss": 0.0004, "step": 31305 }, { "epoch": 7.974361151201291, "grad_norm": 0.08817495405673981, "learning_rate": 6.366608734703416e-07, "loss": 0.0008, "step": 31310 }, { "epoch": 7.975634603956193, "grad_norm": 0.15917620062828064, "learning_rate": 6.35100912427783e-07, "loss": 0.001, "step": 31315 }, { "epoch": 7.976908056711096, "grad_norm": 0.0478566437959671, "learning_rate": 6.335428021756274e-07, "loss": 0.0007, "step": 31320 }, { "epoch": 7.9781815094659985, "grad_norm": 0.007801924366503954, "learning_rate": 6.319865430218098e-07, "loss": 0.0001, "step": 31325 }, { "epoch": 7.979454962220902, "grad_norm": 0.02999206818640232, "learning_rate": 6.304321352738907e-07, "loss": 0.0008, "step": 31330 }, { "epoch": 7.980728414975804, "grad_norm": 0.32460859417915344, "learning_rate": 6.288795792390712e-07, "loss": 0.0006, "step": 31335 }, { "epoch": 7.982001867730707, "grad_norm": 0.029901660978794098, "learning_rate": 6.273288752241823e-07, "loss": 0.0008, "step": 31340 }, { "epoch": 7.98327532048561, "grad_norm": 0.10497844964265823, "learning_rate": 6.257800235356926e-07, "loss": 0.0007, "step": 31345 }, { "epoch": 7.984548773240513, "grad_norm": 0.044847287237644196, "learning_rate": 6.242330244796979e-07, "loss": 0.0006, "step": 31350 }, { "epoch": 7.985822225995416, "grad_norm": 0.0845291018486023, "learning_rate": 6.226878783619395e-07, "loss": 0.0011, "step": 31355 }, { "epoch": 7.987095678750318, "grad_norm": 0.055186424404382706, "learning_rate": 6.211445854877785e-07, "loss": 0.001, "step": 31360 }, { "epoch": 7.988369131505221, "grad_norm": 0.06696662306785583, "learning_rate": 6.196031461622221e-07, "loss": 0.0009, "step": 31365 }, { "epoch": 7.989642584260124, "grad_norm": 0.049657877534627914, "learning_rate": 6.180635606899021e-07, "loss": 0.0008, "step": 31370 }, { "epoch": 7.990916037015027, "grad_norm": 0.023691046983003616, "learning_rate": 6.165258293750909e-07, "loss": 0.001, "step": 31375 }, { "epoch": 7.992189489769929, "grad_norm": 0.056727681308984756, "learning_rate": 6.149899525216885e-07, "loss": 0.0005, "step": 31380 }, { "epoch": 7.9934629425248325, "grad_norm": 0.05200489982962608, "learning_rate": 6.134559304332333e-07, "loss": 0.0007, "step": 31385 }, { "epoch": 7.994736395279735, "grad_norm": 0.01619589328765869, "learning_rate": 6.119237634128916e-07, "loss": 0.0008, "step": 31390 }, { "epoch": 7.996009848034638, "grad_norm": 0.05440843850374222, "learning_rate": 6.103934517634713e-07, "loss": 0.001, "step": 31395 }, { "epoch": 7.99728330078954, "grad_norm": 0.038610659539699554, "learning_rate": 6.088649957874038e-07, "loss": 0.0007, "step": 31400 }, { "epoch": 7.9985567535444435, "grad_norm": 0.044039949774742126, "learning_rate": 6.073383957867629e-07, "loss": 0.0012, "step": 31405 }, { "epoch": 7.999830206299347, "grad_norm": 0.03847966343164444, "learning_rate": 6.058136520632496e-07, "loss": 0.0009, "step": 31410 }, { "epoch": 8.00110365905425, "grad_norm": 0.05561217665672302, "learning_rate": 6.042907649181995e-07, "loss": 0.0008, "step": 31415 }, { "epoch": 8.002377111809151, "grad_norm": 0.015531455166637897, "learning_rate": 6.027697346525807e-07, "loss": 0.0005, "step": 31420 }, { "epoch": 8.003650564564055, "grad_norm": 0.1744609922170639, "learning_rate": 6.012505615670006e-07, "loss": 0.0005, "step": 31425 }, { "epoch": 8.004924017318958, "grad_norm": 0.012598424218595028, "learning_rate": 5.997332459616866e-07, "loss": 0.0005, "step": 31430 }, { "epoch": 8.006197470073861, "grad_norm": 0.016801487654447556, "learning_rate": 5.982177881365114e-07, "loss": 0.0008, "step": 31435 }, { "epoch": 8.007470922828762, "grad_norm": 0.005696412641555071, "learning_rate": 5.967041883909753e-07, "loss": 0.0002, "step": 31440 }, { "epoch": 8.008744375583666, "grad_norm": 0.051126401871442795, "learning_rate": 5.951924470242121e-07, "loss": 0.0006, "step": 31445 }, { "epoch": 8.010017828338569, "grad_norm": 0.07235357910394669, "learning_rate": 5.936825643349864e-07, "loss": 0.0008, "step": 31450 }, { "epoch": 8.011291281093472, "grad_norm": 0.08509232103824615, "learning_rate": 5.921745406216984e-07, "loss": 0.0009, "step": 31455 }, { "epoch": 8.012564733848373, "grad_norm": 0.0389719195663929, "learning_rate": 5.906683761823783e-07, "loss": 0.0002, "step": 31460 }, { "epoch": 8.013838186603277, "grad_norm": 0.08834030479192734, "learning_rate": 5.891640713146929e-07, "loss": 0.0005, "step": 31465 }, { "epoch": 8.01511163935818, "grad_norm": 0.10921753197908401, "learning_rate": 5.876616263159385e-07, "loss": 0.001, "step": 31470 }, { "epoch": 8.016385092113083, "grad_norm": 0.014856905676424503, "learning_rate": 5.86161041483041e-07, "loss": 0.0002, "step": 31475 }, { "epoch": 8.017658544867986, "grad_norm": 0.013566067442297935, "learning_rate": 5.846623171125654e-07, "loss": 0.0006, "step": 31480 }, { "epoch": 8.018931997622888, "grad_norm": 0.06735485792160034, "learning_rate": 5.831654535007036e-07, "loss": 0.0007, "step": 31485 }, { "epoch": 8.02020545037779, "grad_norm": 0.025279972702264786, "learning_rate": 5.816704509432824e-07, "loss": 0.0008, "step": 31490 }, { "epoch": 8.021478903132694, "grad_norm": 0.036978114396333694, "learning_rate": 5.801773097357577e-07, "loss": 0.0003, "step": 31495 }, { "epoch": 8.022752355887597, "grad_norm": 0.01899624615907669, "learning_rate": 5.786860301732233e-07, "loss": 0.0004, "step": 31500 }, { "epoch": 8.024025808642499, "grad_norm": 0.026021741330623627, "learning_rate": 5.771966125503992e-07, "loss": 0.0005, "step": 31505 }, { "epoch": 8.025299261397402, "grad_norm": 0.0531916543841362, "learning_rate": 5.757090571616419e-07, "loss": 0.0006, "step": 31510 }, { "epoch": 8.026572714152305, "grad_norm": 0.019722292199730873, "learning_rate": 5.742233643009365e-07, "loss": 0.0005, "step": 31515 }, { "epoch": 8.027846166907208, "grad_norm": 0.0054718623869121075, "learning_rate": 5.727395342619013e-07, "loss": 0.0002, "step": 31520 }, { "epoch": 8.02911961966211, "grad_norm": 0.011190514080226421, "learning_rate": 5.712575673377852e-07, "loss": 0.0003, "step": 31525 }, { "epoch": 8.030393072417013, "grad_norm": 0.03943561017513275, "learning_rate": 5.697774638214748e-07, "loss": 0.0003, "step": 31530 }, { "epoch": 8.031666525171916, "grad_norm": 0.044521115720272064, "learning_rate": 5.682992240054786e-07, "loss": 0.0002, "step": 31535 }, { "epoch": 8.03293997792682, "grad_norm": 0.05469819903373718, "learning_rate": 5.668228481819449e-07, "loss": 0.0007, "step": 31540 }, { "epoch": 8.034213430681723, "grad_norm": 0.07305307686328888, "learning_rate": 5.653483366426493e-07, "loss": 0.0008, "step": 31545 }, { "epoch": 8.035486883436624, "grad_norm": 0.03191568702459335, "learning_rate": 5.638756896790021e-07, "loss": 0.0008, "step": 31550 }, { "epoch": 8.036760336191527, "grad_norm": 0.06545528024435043, "learning_rate": 5.624049075820392e-07, "loss": 0.0006, "step": 31555 }, { "epoch": 8.03803378894643, "grad_norm": 0.04148465022444725, "learning_rate": 5.609359906424394e-07, "loss": 0.0004, "step": 31560 }, { "epoch": 8.039307241701334, "grad_norm": 0.05029460787773132, "learning_rate": 5.594689391504981e-07, "loss": 0.0019, "step": 31565 }, { "epoch": 8.040580694456235, "grad_norm": 0.08166522532701492, "learning_rate": 5.580037533961546e-07, "loss": 0.0004, "step": 31570 }, { "epoch": 8.041854147211138, "grad_norm": 0.017771035432815552, "learning_rate": 5.565404336689717e-07, "loss": 0.0003, "step": 31575 }, { "epoch": 8.043127599966041, "grad_norm": 0.028765372931957245, "learning_rate": 5.55078980258148e-07, "loss": 0.0005, "step": 31580 }, { "epoch": 8.044401052720945, "grad_norm": 0.01804567687213421, "learning_rate": 5.536193934525092e-07, "loss": 0.0005, "step": 31585 }, { "epoch": 8.045674505475846, "grad_norm": 0.006997008342295885, "learning_rate": 5.521616735405167e-07, "loss": 0.0004, "step": 31590 }, { "epoch": 8.04694795823075, "grad_norm": 0.029618825763463974, "learning_rate": 5.507058208102578e-07, "loss": 0.0004, "step": 31595 }, { "epoch": 8.048221410985652, "grad_norm": 0.019521698355674744, "learning_rate": 5.492518355494558e-07, "loss": 0.0005, "step": 31600 }, { "epoch": 8.049494863740556, "grad_norm": 0.002981384750455618, "learning_rate": 5.47799718045462e-07, "loss": 0.0006, "step": 31605 }, { "epoch": 8.050768316495459, "grad_norm": 0.028338143602013588, "learning_rate": 5.463494685852599e-07, "loss": 0.0007, "step": 31610 }, { "epoch": 8.05204176925036, "grad_norm": 0.0071997870691120625, "learning_rate": 5.449010874554595e-07, "loss": 0.0006, "step": 31615 }, { "epoch": 8.053315222005264, "grad_norm": 0.07647773623466492, "learning_rate": 5.434545749423125e-07, "loss": 0.0007, "step": 31620 }, { "epoch": 8.054588674760167, "grad_norm": 0.05043678358197212, "learning_rate": 5.420099313316851e-07, "loss": 0.0006, "step": 31625 }, { "epoch": 8.05586212751507, "grad_norm": 0.02494548074901104, "learning_rate": 5.405671569090898e-07, "loss": 0.0005, "step": 31630 }, { "epoch": 8.057135580269971, "grad_norm": 0.022618360817432404, "learning_rate": 5.391262519596607e-07, "loss": 0.0003, "step": 31635 }, { "epoch": 8.058409033024875, "grad_norm": 0.06179908663034439, "learning_rate": 5.376872167681634e-07, "loss": 0.0004, "step": 31640 }, { "epoch": 8.059682485779778, "grad_norm": 0.023226972669363022, "learning_rate": 5.362500516189984e-07, "loss": 0.0011, "step": 31645 }, { "epoch": 8.060955938534681, "grad_norm": 0.027902420610189438, "learning_rate": 5.348147567961903e-07, "loss": 0.0006, "step": 31650 }, { "epoch": 8.062229391289582, "grad_norm": 0.27718234062194824, "learning_rate": 5.33381332583397e-07, "loss": 0.001, "step": 31655 }, { "epoch": 8.063502844044486, "grad_norm": 0.07618485391139984, "learning_rate": 5.319497792639116e-07, "loss": 0.0006, "step": 31660 }, { "epoch": 8.064776296799389, "grad_norm": 0.006339207757264376, "learning_rate": 5.305200971206459e-07, "loss": 0.0002, "step": 31665 }, { "epoch": 8.066049749554292, "grad_norm": 0.04210580885410309, "learning_rate": 5.290922864361548e-07, "loss": 0.0006, "step": 31670 }, { "epoch": 8.067323202309195, "grad_norm": 0.009672902524471283, "learning_rate": 5.276663474926147e-07, "loss": 0.0013, "step": 31675 }, { "epoch": 8.068596655064097, "grad_norm": 0.03723973408341408, "learning_rate": 5.26242280571836e-07, "loss": 0.0008, "step": 31680 }, { "epoch": 8.069870107819, "grad_norm": 0.010850664228200912, "learning_rate": 5.248200859552544e-07, "loss": 0.0006, "step": 31685 }, { "epoch": 8.071143560573903, "grad_norm": 0.0623532310128212, "learning_rate": 5.233997639239452e-07, "loss": 0.0007, "step": 31690 }, { "epoch": 8.072417013328806, "grad_norm": 0.053137168288230896, "learning_rate": 5.219813147586006e-07, "loss": 0.0006, "step": 31695 }, { "epoch": 8.073690466083708, "grad_norm": 0.018749576061964035, "learning_rate": 5.205647387395552e-07, "loss": 0.0003, "step": 31700 }, { "epoch": 8.07496391883861, "grad_norm": 0.042203713208436966, "learning_rate": 5.191500361467639e-07, "loss": 0.0008, "step": 31705 }, { "epoch": 8.076237371593514, "grad_norm": 0.021865330636501312, "learning_rate": 5.177372072598174e-07, "loss": 0.0006, "step": 31710 }, { "epoch": 8.077510824348417, "grad_norm": 0.02797483280301094, "learning_rate": 5.163262523579315e-07, "loss": 0.0003, "step": 31715 }, { "epoch": 8.078784277103319, "grad_norm": 0.1929125338792801, "learning_rate": 5.149171717199586e-07, "loss": 0.0005, "step": 31720 }, { "epoch": 8.080057729858222, "grad_norm": 0.029749535024166107, "learning_rate": 5.135099656243703e-07, "loss": 0.0008, "step": 31725 }, { "epoch": 8.081331182613125, "grad_norm": 0.05781005322933197, "learning_rate": 5.121046343492786e-07, "loss": 0.0007, "step": 31730 }, { "epoch": 8.082604635368028, "grad_norm": 0.09172949939966202, "learning_rate": 5.107011781724169e-07, "loss": 0.0006, "step": 31735 }, { "epoch": 8.083878088122932, "grad_norm": 0.033376295119524, "learning_rate": 5.092995973711523e-07, "loss": 0.0004, "step": 31740 }, { "epoch": 8.085151540877833, "grad_norm": 0.1100776195526123, "learning_rate": 5.078998922224787e-07, "loss": 0.0009, "step": 31745 }, { "epoch": 8.086424993632736, "grad_norm": 0.023219775408506393, "learning_rate": 5.065020630030226e-07, "loss": 0.0005, "step": 31750 }, { "epoch": 8.08769844638764, "grad_norm": 0.022452231496572495, "learning_rate": 5.051061099890353e-07, "loss": 0.0004, "step": 31755 }, { "epoch": 8.088971899142543, "grad_norm": 0.009618542157113552, "learning_rate": 5.037120334564027e-07, "loss": 0.0005, "step": 31760 }, { "epoch": 8.090245351897444, "grad_norm": 0.07849901169538498, "learning_rate": 5.023198336806345e-07, "loss": 0.0012, "step": 31765 }, { "epoch": 8.091518804652347, "grad_norm": 0.03951258584856987, "learning_rate": 5.00929510936874e-07, "loss": 0.0009, "step": 31770 }, { "epoch": 8.09279225740725, "grad_norm": 0.09081132709980011, "learning_rate": 4.9954106549989e-07, "loss": 0.0007, "step": 31775 }, { "epoch": 8.094065710162154, "grad_norm": 0.02655741572380066, "learning_rate": 4.981544976440822e-07, "loss": 0.0006, "step": 31780 }, { "epoch": 8.095339162917055, "grad_norm": 0.008257636800408363, "learning_rate": 4.967698076434791e-07, "loss": 0.0003, "step": 31785 }, { "epoch": 8.096612615671958, "grad_norm": 0.0077345240861177444, "learning_rate": 4.953869957717361e-07, "loss": 0.0004, "step": 31790 }, { "epoch": 8.097886068426861, "grad_norm": 0.05529780685901642, "learning_rate": 4.940060623021414e-07, "loss": 0.001, "step": 31795 }, { "epoch": 8.099159521181765, "grad_norm": 0.043925926089286804, "learning_rate": 4.926270075076089e-07, "loss": 0.0004, "step": 31800 }, { "epoch": 8.100432973936666, "grad_norm": 0.04537048563361168, "learning_rate": 4.912498316606818e-07, "loss": 0.0004, "step": 31805 }, { "epoch": 8.10170642669157, "grad_norm": 0.05606811121106148, "learning_rate": 4.898745350335332e-07, "loss": 0.0003, "step": 31810 }, { "epoch": 8.102979879446472, "grad_norm": 0.013562186621129513, "learning_rate": 4.885011178979615e-07, "loss": 0.0007, "step": 31815 }, { "epoch": 8.104253332201376, "grad_norm": 0.026128986850380898, "learning_rate": 4.871295805253961e-07, "loss": 0.0006, "step": 31820 }, { "epoch": 8.105526784956279, "grad_norm": 0.05381229147315025, "learning_rate": 4.857599231868993e-07, "loss": 0.0005, "step": 31825 }, { "epoch": 8.10680023771118, "grad_norm": 0.00983002595603466, "learning_rate": 4.843921461531509e-07, "loss": 0.0006, "step": 31830 }, { "epoch": 8.108073690466084, "grad_norm": 0.06275976449251175, "learning_rate": 4.830262496944693e-07, "loss": 0.0005, "step": 31835 }, { "epoch": 8.109347143220987, "grad_norm": 0.0529756024479866, "learning_rate": 4.816622340807963e-07, "loss": 0.0006, "step": 31840 }, { "epoch": 8.11062059597589, "grad_norm": 0.007865095511078835, "learning_rate": 4.803000995817042e-07, "loss": 0.0003, "step": 31845 }, { "epoch": 8.111894048730791, "grad_norm": 0.006174393463879824, "learning_rate": 4.789398464663897e-07, "loss": 0.0004, "step": 31850 }, { "epoch": 8.113167501485695, "grad_norm": 0.005186140071600676, "learning_rate": 4.775814750036845e-07, "loss": 0.0009, "step": 31855 }, { "epoch": 8.114440954240598, "grad_norm": 0.026428556069731712, "learning_rate": 4.762249854620404e-07, "loss": 0.0007, "step": 31860 }, { "epoch": 8.115714406995501, "grad_norm": 0.013306742534041405, "learning_rate": 4.748703781095432e-07, "loss": 0.0009, "step": 31865 }, { "epoch": 8.116987859750402, "grad_norm": 0.0567626953125, "learning_rate": 4.7351765321390407e-07, "loss": 0.0012, "step": 31870 }, { "epoch": 8.118261312505306, "grad_norm": 0.02191409096121788, "learning_rate": 4.7216681104246377e-07, "loss": 0.0003, "step": 31875 }, { "epoch": 8.119534765260209, "grad_norm": 0.18982112407684326, "learning_rate": 4.7081785186218866e-07, "loss": 0.0009, "step": 31880 }, { "epoch": 8.120808218015112, "grad_norm": 0.039089739322662354, "learning_rate": 4.694707759396766e-07, "loss": 0.0006, "step": 31885 }, { "epoch": 8.122081670770015, "grad_norm": 0.014354060404002666, "learning_rate": 4.681255835411469e-07, "loss": 0.0003, "step": 31890 }, { "epoch": 8.123355123524917, "grad_norm": 0.03450622782111168, "learning_rate": 4.6678227493245685e-07, "loss": 0.0004, "step": 31895 }, { "epoch": 8.12462857627982, "grad_norm": 0.009349013678729534, "learning_rate": 4.654408503790775e-07, "loss": 0.0007, "step": 31900 }, { "epoch": 8.125902029034723, "grad_norm": 0.0501907654106617, "learning_rate": 4.6410131014612226e-07, "loss": 0.0004, "step": 31905 }, { "epoch": 8.127175481789626, "grad_norm": 0.05228195711970329, "learning_rate": 4.627636544983216e-07, "loss": 0.0006, "step": 31910 }, { "epoch": 8.128448934544528, "grad_norm": 0.014261312782764435, "learning_rate": 4.6142788370003855e-07, "loss": 0.0006, "step": 31915 }, { "epoch": 8.12972238729943, "grad_norm": 0.010443439707159996, "learning_rate": 4.6009399801525965e-07, "loss": 0.0003, "step": 31920 }, { "epoch": 8.130995840054334, "grad_norm": 0.022461747750639915, "learning_rate": 4.587619977076074e-07, "loss": 0.0004, "step": 31925 }, { "epoch": 8.132269292809237, "grad_norm": 0.0339573472738266, "learning_rate": 4.57431883040319e-07, "loss": 0.0003, "step": 31930 }, { "epoch": 8.133542745564139, "grad_norm": 0.023278770968317986, "learning_rate": 4.561036542762698e-07, "loss": 0.0005, "step": 31935 }, { "epoch": 8.134816198319042, "grad_norm": 0.009902134537696838, "learning_rate": 4.547773116779575e-07, "loss": 0.0002, "step": 31940 }, { "epoch": 8.136089651073945, "grad_norm": 0.05353224277496338, "learning_rate": 4.5345285550750795e-07, "loss": 0.0005, "step": 31945 }, { "epoch": 8.137363103828848, "grad_norm": 0.022982899099588394, "learning_rate": 4.5213028602667295e-07, "loss": 0.0005, "step": 31950 }, { "epoch": 8.138636556583752, "grad_norm": 0.006998537108302116, "learning_rate": 4.508096034968368e-07, "loss": 0.0003, "step": 31955 }, { "epoch": 8.139910009338653, "grad_norm": 0.013657117262482643, "learning_rate": 4.494908081790006e-07, "loss": 0.0003, "step": 31960 }, { "epoch": 8.141183462093556, "grad_norm": 0.055337704718112946, "learning_rate": 4.481739003338037e-07, "loss": 0.0004, "step": 31965 }, { "epoch": 8.14245691484846, "grad_norm": 0.02894754149019718, "learning_rate": 4.468588802215057e-07, "loss": 0.0007, "step": 31970 }, { "epoch": 8.143730367603363, "grad_norm": 0.03507180139422417, "learning_rate": 4.4554574810199314e-07, "loss": 0.0006, "step": 31975 }, { "epoch": 8.145003820358264, "grad_norm": 0.062042832374572754, "learning_rate": 4.4423450423478285e-07, "loss": 0.0011, "step": 31980 }, { "epoch": 8.146277273113167, "grad_norm": 0.027825741097331047, "learning_rate": 4.429251488790176e-07, "loss": 0.0006, "step": 31985 }, { "epoch": 8.14755072586807, "grad_norm": 0.05355367437005043, "learning_rate": 4.416176822934626e-07, "loss": 0.0008, "step": 31990 }, { "epoch": 8.148824178622974, "grad_norm": 0.05270984023809433, "learning_rate": 4.4031210473651665e-07, "loss": 0.0008, "step": 31995 }, { "epoch": 8.150097631377875, "grad_norm": 0.07620184868574142, "learning_rate": 4.390084164662012e-07, "loss": 0.0011, "step": 32000 }, { "epoch": 8.151371084132778, "grad_norm": 0.09974347054958344, "learning_rate": 4.377066177401634e-07, "loss": 0.0005, "step": 32005 }, { "epoch": 8.152644536887681, "grad_norm": 0.04327122122049332, "learning_rate": 4.364067088156787e-07, "loss": 0.0005, "step": 32010 }, { "epoch": 8.153917989642585, "grad_norm": 0.024536222219467163, "learning_rate": 4.351086899496526e-07, "loss": 0.0003, "step": 32015 }, { "epoch": 8.155191442397488, "grad_norm": 0.0346437506377697, "learning_rate": 4.3381256139860663e-07, "loss": 0.0009, "step": 32020 }, { "epoch": 8.15646489515239, "grad_norm": 0.04531392827630043, "learning_rate": 4.3251832341870026e-07, "loss": 0.001, "step": 32025 }, { "epoch": 8.157738347907292, "grad_norm": 0.01839621365070343, "learning_rate": 4.312259762657145e-07, "loss": 0.0002, "step": 32030 }, { "epoch": 8.159011800662196, "grad_norm": 0.024088464677333832, "learning_rate": 4.299355201950539e-07, "loss": 0.0008, "step": 32035 }, { "epoch": 8.160285253417099, "grad_norm": 0.033596742898225784, "learning_rate": 4.286469554617534e-07, "loss": 0.0005, "step": 32040 }, { "epoch": 8.161558706172, "grad_norm": 0.05119864642620087, "learning_rate": 4.2736028232047476e-07, "loss": 0.0004, "step": 32045 }, { "epoch": 8.162832158926904, "grad_norm": 0.009005671367049217, "learning_rate": 4.260755010255002e-07, "loss": 0.0002, "step": 32050 }, { "epoch": 8.164105611681807, "grad_norm": 0.2536018192768097, "learning_rate": 4.247926118307444e-07, "loss": 0.0006, "step": 32055 }, { "epoch": 8.16537906443671, "grad_norm": 0.008064321242272854, "learning_rate": 4.235116149897456e-07, "loss": 0.0004, "step": 32060 }, { "epoch": 8.166652517191611, "grad_norm": 0.030742404982447624, "learning_rate": 4.222325107556657e-07, "loss": 0.0005, "step": 32065 }, { "epoch": 8.167925969946515, "grad_norm": 0.00944022461771965, "learning_rate": 4.209552993812982e-07, "loss": 0.0003, "step": 32070 }, { "epoch": 8.169199422701418, "grad_norm": 0.004569592420011759, "learning_rate": 4.196799811190555e-07, "loss": 0.0003, "step": 32075 }, { "epoch": 8.170472875456321, "grad_norm": 0.016937999054789543, "learning_rate": 4.1840655622098047e-07, "loss": 0.0005, "step": 32080 }, { "epoch": 8.171746328211224, "grad_norm": 0.032123807817697525, "learning_rate": 4.1713502493874294e-07, "loss": 0.0004, "step": 32085 }, { "epoch": 8.173019780966126, "grad_norm": 0.034432172775268555, "learning_rate": 4.1586538752363516e-07, "loss": 0.0003, "step": 32090 }, { "epoch": 8.174293233721029, "grad_norm": 0.06980584561824799, "learning_rate": 4.1459764422657533e-07, "loss": 0.0004, "step": 32095 }, { "epoch": 8.175566686475932, "grad_norm": 0.012423710897564888, "learning_rate": 4.1333179529810976e-07, "loss": 0.0004, "step": 32100 }, { "epoch": 8.176840139230835, "grad_norm": 0.03010762669146061, "learning_rate": 4.1206784098840715e-07, "loss": 0.0006, "step": 32105 }, { "epoch": 8.178113591985737, "grad_norm": 0.03682197630405426, "learning_rate": 4.1080578154726545e-07, "loss": 0.0005, "step": 32110 }, { "epoch": 8.17938704474064, "grad_norm": 0.06620913743972778, "learning_rate": 4.095456172241041e-07, "loss": 0.0011, "step": 32115 }, { "epoch": 8.180660497495543, "grad_norm": 0.0619419701397419, "learning_rate": 4.0828734826797166e-07, "loss": 0.0005, "step": 32120 }, { "epoch": 8.181933950250446, "grad_norm": 0.02226055972278118, "learning_rate": 4.070309749275414e-07, "loss": 0.0002, "step": 32125 }, { "epoch": 8.183207403005348, "grad_norm": 0.0036504094023257494, "learning_rate": 4.057764974511091e-07, "loss": 0.0002, "step": 32130 }, { "epoch": 8.18448085576025, "grad_norm": 0.00909386295825243, "learning_rate": 4.045239160865988e-07, "loss": 0.0007, "step": 32135 }, { "epoch": 8.185754308515154, "grad_norm": 0.018345821648836136, "learning_rate": 4.032732310815579e-07, "loss": 0.0003, "step": 32140 }, { "epoch": 8.187027761270057, "grad_norm": 0.05437495931982994, "learning_rate": 4.020244426831621e-07, "loss": 0.0005, "step": 32145 }, { "epoch": 8.18830121402496, "grad_norm": 0.04767156019806862, "learning_rate": 4.007775511382084e-07, "loss": 0.0005, "step": 32150 }, { "epoch": 8.189574666779862, "grad_norm": 0.031801674515008926, "learning_rate": 3.9953255669311964e-07, "loss": 0.0006, "step": 32155 }, { "epoch": 8.190848119534765, "grad_norm": 0.05465434119105339, "learning_rate": 3.982894595939479e-07, "loss": 0.0009, "step": 32160 }, { "epoch": 8.192121572289668, "grad_norm": 0.04189922660589218, "learning_rate": 3.9704826008636566e-07, "loss": 0.0005, "step": 32165 }, { "epoch": 8.193395025044572, "grad_norm": 0.018795672804117203, "learning_rate": 3.9580895841567213e-07, "loss": 0.0003, "step": 32170 }, { "epoch": 8.194668477799473, "grad_norm": 0.014025761745870113, "learning_rate": 3.9457155482679033e-07, "loss": 0.0003, "step": 32175 }, { "epoch": 8.195941930554376, "grad_norm": 0.0031943994108587503, "learning_rate": 3.933360495642702e-07, "loss": 0.0005, "step": 32180 }, { "epoch": 8.19721538330928, "grad_norm": 0.010362026281654835, "learning_rate": 3.9210244287228304e-07, "loss": 0.0004, "step": 32185 }, { "epoch": 8.198488836064183, "grad_norm": 0.03149779886007309, "learning_rate": 3.908707349946317e-07, "loss": 0.0006, "step": 32190 }, { "epoch": 8.199762288819084, "grad_norm": 0.04970312863588333, "learning_rate": 3.896409261747336e-07, "loss": 0.0005, "step": 32195 }, { "epoch": 8.201035741573987, "grad_norm": 0.01728593185544014, "learning_rate": 3.8841301665564215e-07, "loss": 0.0008, "step": 32200 }, { "epoch": 8.20230919432889, "grad_norm": 0.07210080325603485, "learning_rate": 3.871870066800265e-07, "loss": 0.0007, "step": 32205 }, { "epoch": 8.203582647083794, "grad_norm": 0.03381721302866936, "learning_rate": 3.85962896490184e-07, "loss": 0.0002, "step": 32210 }, { "epoch": 8.204856099838697, "grad_norm": 0.028390005230903625, "learning_rate": 3.8474068632803563e-07, "loss": 0.0006, "step": 32215 }, { "epoch": 8.206129552593598, "grad_norm": 0.02156647853553295, "learning_rate": 3.835203764351314e-07, "loss": 0.0005, "step": 32220 }, { "epoch": 8.207403005348501, "grad_norm": 0.03910187631845474, "learning_rate": 3.8230196705263734e-07, "loss": 0.0004, "step": 32225 }, { "epoch": 8.208676458103405, "grad_norm": 0.008117332123219967, "learning_rate": 3.810854584213508e-07, "loss": 0.0002, "step": 32230 }, { "epoch": 8.209949910858308, "grad_norm": 0.03253589943051338, "learning_rate": 3.7987085078168953e-07, "loss": 0.0004, "step": 32235 }, { "epoch": 8.21122336361321, "grad_norm": 0.08468163758516312, "learning_rate": 3.786581443736992e-07, "loss": 0.0006, "step": 32240 }, { "epoch": 8.212496816368112, "grad_norm": 0.012148312292993069, "learning_rate": 3.774473394370448e-07, "loss": 0.0007, "step": 32245 }, { "epoch": 8.213770269123016, "grad_norm": 0.044468581676483154, "learning_rate": 3.762384362110216e-07, "loss": 0.0004, "step": 32250 }, { "epoch": 8.215043721877919, "grad_norm": 0.027821555733680725, "learning_rate": 3.75031434934543e-07, "loss": 0.0006, "step": 32255 }, { "epoch": 8.21631717463282, "grad_norm": 0.054920461028814316, "learning_rate": 3.7382633584615134e-07, "loss": 0.0006, "step": 32260 }, { "epoch": 8.217590627387724, "grad_norm": 0.01898830384016037, "learning_rate": 3.726231391840107e-07, "loss": 0.0005, "step": 32265 }, { "epoch": 8.218864080142627, "grad_norm": 0.026509759947657585, "learning_rate": 3.7142184518590975e-07, "loss": 0.0004, "step": 32270 }, { "epoch": 8.22013753289753, "grad_norm": 0.1261952668428421, "learning_rate": 3.702224540892585e-07, "loss": 0.0014, "step": 32275 }, { "epoch": 8.221410985652433, "grad_norm": 0.02622668445110321, "learning_rate": 3.6902496613109737e-07, "loss": 0.0009, "step": 32280 }, { "epoch": 8.222684438407335, "grad_norm": 0.01667352207005024, "learning_rate": 3.678293815480827e-07, "loss": 0.0005, "step": 32285 }, { "epoch": 8.223957891162238, "grad_norm": 0.008025082759559155, "learning_rate": 3.6663570057650087e-07, "loss": 0.0003, "step": 32290 }, { "epoch": 8.225231343917141, "grad_norm": 0.0038247595075517893, "learning_rate": 3.6544392345225885e-07, "loss": 0.0004, "step": 32295 }, { "epoch": 8.226504796672044, "grad_norm": 0.007236695848405361, "learning_rate": 3.6425405041088935e-07, "loss": 0.0002, "step": 32300 }, { "epoch": 8.227778249426946, "grad_norm": 0.02955908328294754, "learning_rate": 3.6306608168754533e-07, "loss": 0.0007, "step": 32305 }, { "epoch": 8.229051702181849, "grad_norm": 0.013451053760945797, "learning_rate": 3.618800175170101e-07, "loss": 0.0007, "step": 32310 }, { "epoch": 8.230325154936752, "grad_norm": 0.017150962725281715, "learning_rate": 3.606958581336795e-07, "loss": 0.0005, "step": 32315 }, { "epoch": 8.231598607691655, "grad_norm": 0.0055786133743822575, "learning_rate": 3.595136037715863e-07, "loss": 0.0004, "step": 32320 }, { "epoch": 8.232872060446557, "grad_norm": 0.02803444117307663, "learning_rate": 3.5833325466437697e-07, "loss": 0.0006, "step": 32325 }, { "epoch": 8.23414551320146, "grad_norm": 0.10667990148067474, "learning_rate": 3.571548110453238e-07, "loss": 0.0009, "step": 32330 }, { "epoch": 8.235418965956363, "grad_norm": 0.07491201162338257, "learning_rate": 3.5597827314732383e-07, "loss": 0.0007, "step": 32335 }, { "epoch": 8.236692418711266, "grad_norm": 0.11099310964345932, "learning_rate": 3.54803641202901e-07, "loss": 0.0005, "step": 32340 }, { "epoch": 8.23796587146617, "grad_norm": 0.04525706544518471, "learning_rate": 3.536309154441908e-07, "loss": 0.0009, "step": 32345 }, { "epoch": 8.23923932422107, "grad_norm": 0.053140461444854736, "learning_rate": 3.524600961029667e-07, "loss": 0.0007, "step": 32350 }, { "epoch": 8.240512776975974, "grad_norm": 0.02191057987511158, "learning_rate": 3.5129118341061475e-07, "loss": 0.0007, "step": 32355 }, { "epoch": 8.241786229730877, "grad_norm": 0.03576280176639557, "learning_rate": 3.5012417759814785e-07, "loss": 0.001, "step": 32360 }, { "epoch": 8.24305968248578, "grad_norm": 0.030294638127088547, "learning_rate": 3.4895907889620275e-07, "loss": 0.0011, "step": 32365 }, { "epoch": 8.244333135240682, "grad_norm": 0.0964961126446724, "learning_rate": 3.477958875350407e-07, "loss": 0.0005, "step": 32370 }, { "epoch": 8.245606587995585, "grad_norm": 0.04076676815748215, "learning_rate": 3.4663460374454005e-07, "loss": 0.0007, "step": 32375 }, { "epoch": 8.246880040750488, "grad_norm": 0.010757862590253353, "learning_rate": 3.454752277542084e-07, "loss": 0.0005, "step": 32380 }, { "epoch": 8.248153493505392, "grad_norm": 0.10550539195537567, "learning_rate": 3.443177597931735e-07, "loss": 0.0008, "step": 32385 }, { "epoch": 8.249426946260293, "grad_norm": 0.009588992223143578, "learning_rate": 3.4316220009018577e-07, "loss": 0.0003, "step": 32390 }, { "epoch": 8.250700399015196, "grad_norm": 0.04054344817996025, "learning_rate": 3.420085488736202e-07, "loss": 0.0006, "step": 32395 }, { "epoch": 8.2519738517701, "grad_norm": 0.06242061406373978, "learning_rate": 3.4085680637147234e-07, "loss": 0.0005, "step": 32400 }, { "epoch": 8.253247304525003, "grad_norm": 0.024003373458981514, "learning_rate": 3.397069728113622e-07, "loss": 0.0002, "step": 32405 }, { "epoch": 8.254520757279906, "grad_norm": 0.040340058505535126, "learning_rate": 3.3855904842053367e-07, "loss": 0.0006, "step": 32410 }, { "epoch": 8.255794210034807, "grad_norm": 0.043726131319999695, "learning_rate": 3.374130334258474e-07, "loss": 0.0006, "step": 32415 }, { "epoch": 8.25706766278971, "grad_norm": 0.03581329062581062, "learning_rate": 3.3626892805379565e-07, "loss": 0.0005, "step": 32420 }, { "epoch": 8.258341115544614, "grad_norm": 0.0691170021891594, "learning_rate": 3.351267325304863e-07, "loss": 0.0005, "step": 32425 }, { "epoch": 8.259614568299517, "grad_norm": 0.0333302803337574, "learning_rate": 3.339864470816534e-07, "loss": 0.0009, "step": 32430 }, { "epoch": 8.260888021054418, "grad_norm": 0.05220102518796921, "learning_rate": 3.32848071932651e-07, "loss": 0.0018, "step": 32435 }, { "epoch": 8.262161473809321, "grad_norm": 0.02663818560540676, "learning_rate": 3.3171160730845695e-07, "loss": 0.0002, "step": 32440 }, { "epoch": 8.263434926564225, "grad_norm": 0.03082575835287571, "learning_rate": 3.3057705343367054e-07, "loss": 0.0003, "step": 32445 }, { "epoch": 8.264708379319128, "grad_norm": 0.008778182789683342, "learning_rate": 3.294444105325167e-07, "loss": 0.0002, "step": 32450 }, { "epoch": 8.26598183207403, "grad_norm": 0.04123760387301445, "learning_rate": 3.283136788288388e-07, "loss": 0.0009, "step": 32455 }, { "epoch": 8.267255284828932, "grad_norm": 0.03394158557057381, "learning_rate": 3.271848585461046e-07, "loss": 0.0004, "step": 32460 }, { "epoch": 8.268528737583836, "grad_norm": 0.09283669292926788, "learning_rate": 3.260579499074035e-07, "loss": 0.0008, "step": 32465 }, { "epoch": 8.269802190338739, "grad_norm": 0.0233004130423069, "learning_rate": 3.2493295313544613e-07, "loss": 0.0003, "step": 32470 }, { "epoch": 8.271075643093642, "grad_norm": 0.03239966928958893, "learning_rate": 3.23809868452567e-07, "loss": 0.0005, "step": 32475 }, { "epoch": 8.272349095848543, "grad_norm": 0.048175107687711716, "learning_rate": 3.226886960807207e-07, "loss": 0.0005, "step": 32480 }, { "epoch": 8.273622548603447, "grad_norm": 0.12962882220745087, "learning_rate": 3.2156943624148783e-07, "loss": 0.0008, "step": 32485 }, { "epoch": 8.27489600135835, "grad_norm": 0.0640827864408493, "learning_rate": 3.20452089156067e-07, "loss": 0.0006, "step": 32490 }, { "epoch": 8.276169454113253, "grad_norm": 0.028847403824329376, "learning_rate": 3.1933665504527925e-07, "loss": 0.0004, "step": 32495 }, { "epoch": 8.277442906868155, "grad_norm": 0.020963728427886963, "learning_rate": 3.182231341295694e-07, "loss": 0.0006, "step": 32500 }, { "epoch": 8.278716359623058, "grad_norm": 0.026695258915424347, "learning_rate": 3.171115266290037e-07, "loss": 0.0008, "step": 32505 }, { "epoch": 8.279989812377961, "grad_norm": 0.023103198036551476, "learning_rate": 3.160018327632675e-07, "loss": 0.0004, "step": 32510 }, { "epoch": 8.281263265132864, "grad_norm": 0.09595537930727005, "learning_rate": 3.1489405275167415e-07, "loss": 0.0006, "step": 32515 }, { "epoch": 8.282536717887766, "grad_norm": 0.08381387591362, "learning_rate": 3.137881868131498e-07, "loss": 0.0004, "step": 32520 }, { "epoch": 8.283810170642669, "grad_norm": 0.022450152784585953, "learning_rate": 3.126842351662518e-07, "loss": 0.0006, "step": 32525 }, { "epoch": 8.285083623397572, "grad_norm": 0.009881746023893356, "learning_rate": 3.1158219802915336e-07, "loss": 0.0002, "step": 32530 }, { "epoch": 8.286357076152475, "grad_norm": 0.07760348170995712, "learning_rate": 3.1048207561965026e-07, "loss": 0.0015, "step": 32535 }, { "epoch": 8.287630528907378, "grad_norm": 0.04814085736870766, "learning_rate": 3.0938386815515974e-07, "loss": 0.0005, "step": 32540 }, { "epoch": 8.28890398166228, "grad_norm": 0.09553040564060211, "learning_rate": 3.0828757585272375e-07, "loss": 0.0008, "step": 32545 }, { "epoch": 8.290177434417183, "grad_norm": 0.2171875685453415, "learning_rate": 3.071931989289989e-07, "loss": 0.0004, "step": 32550 }, { "epoch": 8.291450887172086, "grad_norm": 0.06556446105241776, "learning_rate": 3.0610073760027223e-07, "loss": 0.0004, "step": 32555 }, { "epoch": 8.29272433992699, "grad_norm": 0.036132119596004486, "learning_rate": 3.0501019208244443e-07, "loss": 0.0004, "step": 32560 }, { "epoch": 8.29399779268189, "grad_norm": 0.04164735600352287, "learning_rate": 3.039215625910419e-07, "loss": 0.0006, "step": 32565 }, { "epoch": 8.295271245436794, "grad_norm": 0.022585995495319366, "learning_rate": 3.028348493412092e-07, "loss": 0.0002, "step": 32570 }, { "epoch": 8.296544698191697, "grad_norm": 0.012222534976899624, "learning_rate": 3.0175005254771796e-07, "loss": 0.0005, "step": 32575 }, { "epoch": 8.2978181509466, "grad_norm": 0.023989252746105194, "learning_rate": 3.0066717242495216e-07, "loss": 0.0005, "step": 32580 }, { "epoch": 8.299091603701502, "grad_norm": 0.004684292711317539, "learning_rate": 2.9958620918692726e-07, "loss": 0.0009, "step": 32585 }, { "epoch": 8.300365056456405, "grad_norm": 0.08601146936416626, "learning_rate": 2.9850716304727023e-07, "loss": 0.0009, "step": 32590 }, { "epoch": 8.301638509211308, "grad_norm": 0.04593789577484131, "learning_rate": 2.974300342192371e-07, "loss": 0.0005, "step": 32595 }, { "epoch": 8.302911961966212, "grad_norm": 0.039363350719213486, "learning_rate": 2.963548229156976e-07, "loss": 0.0008, "step": 32600 }, { "epoch": 8.304185414721115, "grad_norm": 0.04560606926679611, "learning_rate": 2.9528152934915177e-07, "loss": 0.0005, "step": 32605 }, { "epoch": 8.305458867476016, "grad_norm": 0.04429725185036659, "learning_rate": 2.942101537317088e-07, "loss": 0.0005, "step": 32610 }, { "epoch": 8.30673232023092, "grad_norm": 0.06907155364751816, "learning_rate": 2.9314069627511045e-07, "loss": 0.0007, "step": 32615 }, { "epoch": 8.308005772985823, "grad_norm": 0.010855840519070625, "learning_rate": 2.9207315719071096e-07, "loss": 0.0006, "step": 32620 }, { "epoch": 8.309279225740726, "grad_norm": 0.05614578351378441, "learning_rate": 2.9100753668949156e-07, "loss": 0.0007, "step": 32625 }, { "epoch": 8.310552678495627, "grad_norm": 0.04113015532493591, "learning_rate": 2.8994383498204825e-07, "loss": 0.0004, "step": 32630 }, { "epoch": 8.31182613125053, "grad_norm": 0.7616878747940063, "learning_rate": 2.88882052278604e-07, "loss": 0.001, "step": 32635 }, { "epoch": 8.313099584005434, "grad_norm": 0.00910609494894743, "learning_rate": 2.8782218878899646e-07, "loss": 0.0005, "step": 32640 }, { "epoch": 8.314373036760337, "grad_norm": 0.039670586585998535, "learning_rate": 2.8676424472269036e-07, "loss": 0.0008, "step": 32645 }, { "epoch": 8.315646489515238, "grad_norm": 0.003220077371224761, "learning_rate": 2.857082202887662e-07, "loss": 0.0004, "step": 32650 }, { "epoch": 8.316919942270141, "grad_norm": 0.054439593106508255, "learning_rate": 2.8465411569592703e-07, "loss": 0.0005, "step": 32655 }, { "epoch": 8.318193395025045, "grad_norm": 0.16194278001785278, "learning_rate": 2.8360193115249514e-07, "loss": 0.0005, "step": 32660 }, { "epoch": 8.319466847779948, "grad_norm": 0.04165615141391754, "learning_rate": 2.8255166686641524e-07, "loss": 0.0005, "step": 32665 }, { "epoch": 8.320740300534851, "grad_norm": 0.06316583603620529, "learning_rate": 2.815033230452502e-07, "loss": 0.0007, "step": 32670 }, { "epoch": 8.322013753289752, "grad_norm": 0.053146932274103165, "learning_rate": 2.8045689989618874e-07, "loss": 0.0006, "step": 32675 }, { "epoch": 8.323287206044656, "grad_norm": 0.013070215471088886, "learning_rate": 2.79412397626031e-07, "loss": 0.001, "step": 32680 }, { "epoch": 8.324560658799559, "grad_norm": 0.040775902569293976, "learning_rate": 2.783698164412063e-07, "loss": 0.0005, "step": 32685 }, { "epoch": 8.325834111554462, "grad_norm": 0.013108663260936737, "learning_rate": 2.773291565477587e-07, "loss": 0.0002, "step": 32690 }, { "epoch": 8.327107564309363, "grad_norm": 0.0352046899497509, "learning_rate": 2.762904181513548e-07, "loss": 0.0005, "step": 32695 }, { "epoch": 8.328381017064267, "grad_norm": 0.014789761044085026, "learning_rate": 2.752536014572815e-07, "loss": 0.0006, "step": 32700 }, { "epoch": 8.32965446981917, "grad_norm": 0.06454328447580338, "learning_rate": 2.7421870667044605e-07, "loss": 0.0006, "step": 32705 }, { "epoch": 8.330927922574073, "grad_norm": 0.06210288032889366, "learning_rate": 2.731857339953736e-07, "loss": 0.0009, "step": 32710 }, { "epoch": 8.332201375328975, "grad_norm": 0.003311991225928068, "learning_rate": 2.721546836362121e-07, "loss": 0.0005, "step": 32715 }, { "epoch": 8.333474828083878, "grad_norm": 0.04565794765949249, "learning_rate": 2.711255557967285e-07, "loss": 0.0002, "step": 32720 }, { "epoch": 8.334748280838781, "grad_norm": 0.6605680584907532, "learning_rate": 2.700983506803101e-07, "loss": 0.001, "step": 32725 }, { "epoch": 8.336021733593684, "grad_norm": 0.02738565392792225, "learning_rate": 2.690730684899645e-07, "loss": 0.0007, "step": 32730 }, { "epoch": 8.337295186348587, "grad_norm": 0.026291880756616592, "learning_rate": 2.6804970942831743e-07, "loss": 0.0005, "step": 32735 }, { "epoch": 8.338568639103489, "grad_norm": 0.03427678719162941, "learning_rate": 2.67028273697616e-07, "loss": 0.0005, "step": 32740 }, { "epoch": 8.339842091858392, "grad_norm": 0.03885680064558983, "learning_rate": 2.660087614997298e-07, "loss": 0.0004, "step": 32745 }, { "epoch": 8.341115544613295, "grad_norm": 0.013037662953138351, "learning_rate": 2.6499117303614206e-07, "loss": 0.0003, "step": 32750 }, { "epoch": 8.342388997368198, "grad_norm": 0.005640119314193726, "learning_rate": 2.6397550850796203e-07, "loss": 0.0006, "step": 32755 }, { "epoch": 8.3436624501231, "grad_norm": 0.08349486440420151, "learning_rate": 2.629617681159147e-07, "loss": 0.0005, "step": 32760 }, { "epoch": 8.344935902878003, "grad_norm": 0.067917600274086, "learning_rate": 2.6194995206034636e-07, "loss": 0.0005, "step": 32765 }, { "epoch": 8.346209355632906, "grad_norm": 0.010657728649675846, "learning_rate": 2.609400605412238e-07, "loss": 0.0005, "step": 32770 }, { "epoch": 8.34748280838781, "grad_norm": 0.07299024611711502, "learning_rate": 2.599320937581296e-07, "loss": 0.001, "step": 32775 }, { "epoch": 8.34875626114271, "grad_norm": 0.059701792895793915, "learning_rate": 2.589260519102721e-07, "loss": 0.0007, "step": 32780 }, { "epoch": 8.350029713897614, "grad_norm": 0.05400882661342621, "learning_rate": 2.579219351964757e-07, "loss": 0.0004, "step": 32785 }, { "epoch": 8.351303166652517, "grad_norm": 0.09583677351474762, "learning_rate": 2.5691974381518156e-07, "loss": 0.0005, "step": 32790 }, { "epoch": 8.35257661940742, "grad_norm": 0.1373244971036911, "learning_rate": 2.5591947796445695e-07, "loss": 0.0004, "step": 32795 }, { "epoch": 8.353850072162324, "grad_norm": 0.006468929350376129, "learning_rate": 2.5492113784198246e-07, "loss": 0.0003, "step": 32800 }, { "epoch": 8.355123524917225, "grad_norm": 0.05724632740020752, "learning_rate": 2.539247236450615e-07, "loss": 0.0004, "step": 32805 }, { "epoch": 8.356396977672128, "grad_norm": 0.009616665542125702, "learning_rate": 2.529302355706165e-07, "loss": 0.0008, "step": 32810 }, { "epoch": 8.357670430427032, "grad_norm": 0.049166541546583176, "learning_rate": 2.5193767381518683e-07, "loss": 0.0003, "step": 32815 }, { "epoch": 8.358943883181935, "grad_norm": 0.06318197399377823, "learning_rate": 2.5094703857493575e-07, "loss": 0.0004, "step": 32820 }, { "epoch": 8.360217335936836, "grad_norm": 0.039140939712524414, "learning_rate": 2.499583300456421e-07, "loss": 0.0008, "step": 32825 }, { "epoch": 8.36149078869174, "grad_norm": 0.02604858949780464, "learning_rate": 2.48971548422704e-07, "loss": 0.0005, "step": 32830 }, { "epoch": 8.362764241446643, "grad_norm": 0.14946486055850983, "learning_rate": 2.479866939011388e-07, "loss": 0.0007, "step": 32835 }, { "epoch": 8.364037694201546, "grad_norm": 0.004979562480002642, "learning_rate": 2.470037666755876e-07, "loss": 0.0005, "step": 32840 }, { "epoch": 8.365311146956447, "grad_norm": 0.026116106659173965, "learning_rate": 2.460227669403026e-07, "loss": 0.0009, "step": 32845 }, { "epoch": 8.36658459971135, "grad_norm": 0.03906213864684105, "learning_rate": 2.4504369488916324e-07, "loss": 0.0004, "step": 32850 }, { "epoch": 8.367858052466254, "grad_norm": 0.26191532611846924, "learning_rate": 2.4406655071566033e-07, "loss": 0.0009, "step": 32855 }, { "epoch": 8.369131505221157, "grad_norm": 0.06646355241537094, "learning_rate": 2.4309133461291046e-07, "loss": 0.0006, "step": 32860 }, { "epoch": 8.37040495797606, "grad_norm": 0.03967375308275223, "learning_rate": 2.421180467736439e-07, "loss": 0.0009, "step": 32865 }, { "epoch": 8.371678410730961, "grad_norm": 0.01745600253343582, "learning_rate": 2.4114668739021464e-07, "loss": 0.0004, "step": 32870 }, { "epoch": 8.372951863485865, "grad_norm": 0.023241804912686348, "learning_rate": 2.4017725665459015e-07, "loss": 0.0003, "step": 32875 }, { "epoch": 8.374225316240768, "grad_norm": 0.03520219027996063, "learning_rate": 2.392097547583616e-07, "loss": 0.0006, "step": 32880 }, { "epoch": 8.375498768995671, "grad_norm": 0.05175965279340744, "learning_rate": 2.3824418189273614e-07, "loss": 0.0005, "step": 32885 }, { "epoch": 8.376772221750572, "grad_norm": 0.04877376928925514, "learning_rate": 2.3728053824854102e-07, "loss": 0.0003, "step": 32890 }, { "epoch": 8.378045674505476, "grad_norm": 0.02772596850991249, "learning_rate": 2.3631882401621953e-07, "loss": 0.0004, "step": 32895 }, { "epoch": 8.379319127260379, "grad_norm": 0.046116724610328674, "learning_rate": 2.353590393858407e-07, "loss": 0.0004, "step": 32900 }, { "epoch": 8.380592580015282, "grad_norm": 0.06527494639158249, "learning_rate": 2.344011845470817e-07, "loss": 0.001, "step": 32905 }, { "epoch": 8.381866032770183, "grad_norm": 0.028639478608965874, "learning_rate": 2.334452596892467e-07, "loss": 0.0004, "step": 32910 }, { "epoch": 8.383139485525087, "grad_norm": 0.03478922322392464, "learning_rate": 2.3249126500125673e-07, "loss": 0.0004, "step": 32915 }, { "epoch": 8.38441293827999, "grad_norm": 0.017085717990994453, "learning_rate": 2.315392006716477e-07, "loss": 0.0007, "step": 32920 }, { "epoch": 8.385686391034893, "grad_norm": 0.05885407701134682, "learning_rate": 2.3058906688857907e-07, "loss": 0.0008, "step": 32925 }, { "epoch": 8.386959843789796, "grad_norm": 0.03723491355776787, "learning_rate": 2.2964086383982508e-07, "loss": 0.0003, "step": 32930 }, { "epoch": 8.388233296544698, "grad_norm": 0.02693098969757557, "learning_rate": 2.286945917127781e-07, "loss": 0.0003, "step": 32935 }, { "epoch": 8.389506749299601, "grad_norm": 0.016963601112365723, "learning_rate": 2.27750250694454e-07, "loss": 0.0005, "step": 32940 }, { "epoch": 8.390780202054504, "grad_norm": 0.02187582291662693, "learning_rate": 2.2680784097148022e-07, "loss": 0.0001, "step": 32945 }, { "epoch": 8.392053654809407, "grad_norm": 0.018093246966600418, "learning_rate": 2.258673627301078e-07, "loss": 0.001, "step": 32950 }, { "epoch": 8.393327107564309, "grad_norm": 0.05948103591799736, "learning_rate": 2.2492881615620244e-07, "loss": 0.0005, "step": 32955 }, { "epoch": 8.394600560319212, "grad_norm": 0.11691523343324661, "learning_rate": 2.2399220143525135e-07, "loss": 0.0009, "step": 32960 }, { "epoch": 8.395874013074115, "grad_norm": 0.018910732120275497, "learning_rate": 2.2305751875235648e-07, "loss": 0.001, "step": 32965 }, { "epoch": 8.397147465829018, "grad_norm": 0.00990990735590458, "learning_rate": 2.2212476829224227e-07, "loss": 0.0007, "step": 32970 }, { "epoch": 8.39842091858392, "grad_norm": 0.03281794488430023, "learning_rate": 2.211939502392446e-07, "loss": 0.001, "step": 32975 }, { "epoch": 8.399694371338823, "grad_norm": 0.003852215362712741, "learning_rate": 2.2026506477732522e-07, "loss": 0.0006, "step": 32980 }, { "epoch": 8.400967824093726, "grad_norm": 0.031516049057245255, "learning_rate": 2.1933811209005952e-07, "loss": 0.0005, "step": 32985 }, { "epoch": 8.40224127684863, "grad_norm": 0.19715392589569092, "learning_rate": 2.184130923606409e-07, "loss": 0.0006, "step": 32990 }, { "epoch": 8.40351472960353, "grad_norm": 0.03965148702263832, "learning_rate": 2.1749000577188094e-07, "loss": 0.0004, "step": 32995 }, { "epoch": 8.404788182358434, "grad_norm": 0.06952237337827682, "learning_rate": 2.1656885250621262e-07, "loss": 0.0008, "step": 33000 }, { "epoch": 8.406061635113337, "grad_norm": 0.06899528205394745, "learning_rate": 2.1564963274568028e-07, "loss": 0.0004, "step": 33005 }, { "epoch": 8.40733508786824, "grad_norm": 0.04036574438214302, "learning_rate": 2.1473234667195198e-07, "loss": 0.0005, "step": 33010 }, { "epoch": 8.408608540623144, "grad_norm": 0.03757728636264801, "learning_rate": 2.1381699446631154e-07, "loss": 0.0005, "step": 33015 }, { "epoch": 8.409881993378045, "grad_norm": 0.023424701765179634, "learning_rate": 2.1290357630966096e-07, "loss": 0.0009, "step": 33020 }, { "epoch": 8.411155446132948, "grad_norm": 0.09892252087593079, "learning_rate": 2.1199209238251696e-07, "loss": 0.0006, "step": 33025 }, { "epoch": 8.412428898887852, "grad_norm": 0.055493876338005066, "learning_rate": 2.1108254286502095e-07, "loss": 0.0006, "step": 33030 }, { "epoch": 8.413702351642755, "grad_norm": 0.06044374033808708, "learning_rate": 2.101749279369225e-07, "loss": 0.0009, "step": 33035 }, { "epoch": 8.414975804397656, "grad_norm": 0.012120824307203293, "learning_rate": 2.0926924777759926e-07, "loss": 0.0004, "step": 33040 }, { "epoch": 8.41624925715256, "grad_norm": 0.071334607899189, "learning_rate": 2.0836550256603805e-07, "loss": 0.0005, "step": 33045 }, { "epoch": 8.417522709907463, "grad_norm": 0.07064548134803772, "learning_rate": 2.0746369248084708e-07, "loss": 0.0005, "step": 33050 }, { "epoch": 8.418796162662366, "grad_norm": 0.06466453522443771, "learning_rate": 2.065638177002527e-07, "loss": 0.001, "step": 33055 }, { "epoch": 8.420069615417267, "grad_norm": 0.012038787826895714, "learning_rate": 2.0566587840209595e-07, "loss": 0.0003, "step": 33060 }, { "epoch": 8.42134306817217, "grad_norm": 0.07791174948215485, "learning_rate": 2.0476987476383824e-07, "loss": 0.0005, "step": 33065 }, { "epoch": 8.422616520927074, "grad_norm": 0.003797530895099044, "learning_rate": 2.038758069625568e-07, "loss": 0.0004, "step": 33070 }, { "epoch": 8.423889973681977, "grad_norm": 0.12647484242916107, "learning_rate": 2.0298367517494812e-07, "loss": 0.0009, "step": 33075 }, { "epoch": 8.42516342643688, "grad_norm": 0.005452955607324839, "learning_rate": 2.0209347957732328e-07, "loss": 0.0006, "step": 33080 }, { "epoch": 8.426436879191781, "grad_norm": 0.21439588069915771, "learning_rate": 2.0120522034561275e-07, "loss": 0.0004, "step": 33085 }, { "epoch": 8.427710331946685, "grad_norm": 0.021581733599305153, "learning_rate": 2.003188976553627e-07, "loss": 0.0002, "step": 33090 }, { "epoch": 8.428983784701588, "grad_norm": 0.012653627432882786, "learning_rate": 1.9943451168173866e-07, "loss": 0.0005, "step": 33095 }, { "epoch": 8.430257237456491, "grad_norm": 0.02127200923860073, "learning_rate": 1.9855206259952188e-07, "loss": 0.0007, "step": 33100 }, { "epoch": 8.431530690211392, "grad_norm": 0.049183305352926254, "learning_rate": 1.9767155058311173e-07, "loss": 0.0008, "step": 33105 }, { "epoch": 8.432804142966296, "grad_norm": 0.03455103188753128, "learning_rate": 1.9679297580652457e-07, "loss": 0.0003, "step": 33110 }, { "epoch": 8.434077595721199, "grad_norm": 0.024287454783916473, "learning_rate": 1.9591633844339263e-07, "loss": 0.0009, "step": 33115 }, { "epoch": 8.435351048476102, "grad_norm": 0.060650162398815155, "learning_rate": 1.9504163866696734e-07, "loss": 0.0004, "step": 33120 }, { "epoch": 8.436624501231003, "grad_norm": 0.04666263610124588, "learning_rate": 1.9416887665011486e-07, "loss": 0.0004, "step": 33125 }, { "epoch": 8.437897953985907, "grad_norm": 0.006905653979629278, "learning_rate": 1.9329805256531942e-07, "loss": 0.0003, "step": 33130 }, { "epoch": 8.43917140674081, "grad_norm": 0.025118889287114143, "learning_rate": 1.924291665846856e-07, "loss": 0.0005, "step": 33135 }, { "epoch": 8.440444859495713, "grad_norm": 0.030866090208292007, "learning_rate": 1.9156221887992821e-07, "loss": 0.0005, "step": 33140 }, { "epoch": 8.441718312250616, "grad_norm": 0.1824769228696823, "learning_rate": 1.906972096223847e-07, "loss": 0.0005, "step": 33145 }, { "epoch": 8.442991765005518, "grad_norm": 0.04050230234861374, "learning_rate": 1.8983413898300606e-07, "loss": 0.0005, "step": 33150 }, { "epoch": 8.444265217760421, "grad_norm": 0.04259530082345009, "learning_rate": 1.8897300713236256e-07, "loss": 0.0007, "step": 33155 }, { "epoch": 8.445538670515324, "grad_norm": 0.04310164600610733, "learning_rate": 1.88113814240638e-07, "loss": 0.0005, "step": 33160 }, { "epoch": 8.446812123270227, "grad_norm": 0.10890605300664902, "learning_rate": 1.8725656047763996e-07, "loss": 0.0008, "step": 33165 }, { "epoch": 8.448085576025129, "grad_norm": 0.0019736764952540398, "learning_rate": 1.8640124601278175e-07, "loss": 0.0003, "step": 33170 }, { "epoch": 8.449359028780032, "grad_norm": 0.03884386271238327, "learning_rate": 1.8554787101510484e-07, "loss": 0.0003, "step": 33175 }, { "epoch": 8.450632481534935, "grad_norm": 0.026086676865816116, "learning_rate": 1.8469643565325988e-07, "loss": 0.0004, "step": 33180 }, { "epoch": 8.451905934289838, "grad_norm": 0.0862264335155487, "learning_rate": 1.8384694009551563e-07, "loss": 0.0009, "step": 33185 }, { "epoch": 8.45317938704474, "grad_norm": 0.007415685337036848, "learning_rate": 1.8299938450976108e-07, "loss": 0.0006, "step": 33190 }, { "epoch": 8.454452839799643, "grad_norm": 0.13800092041492462, "learning_rate": 1.821537690634967e-07, "loss": 0.0004, "step": 33195 }, { "epoch": 8.455726292554546, "grad_norm": 0.02017914317548275, "learning_rate": 1.8131009392384324e-07, "loss": 0.0005, "step": 33200 }, { "epoch": 8.45699974530945, "grad_norm": 0.041915591806173325, "learning_rate": 1.8046835925753735e-07, "loss": 0.0005, "step": 33205 }, { "epoch": 8.458273198064353, "grad_norm": 0.034011393785476685, "learning_rate": 1.796285652309282e-07, "loss": 0.0005, "step": 33210 }, { "epoch": 8.459546650819254, "grad_norm": 0.05486088991165161, "learning_rate": 1.7879071200998855e-07, "loss": 0.0011, "step": 33215 }, { "epoch": 8.460820103574157, "grad_norm": 0.03607821837067604, "learning_rate": 1.7795479976030262e-07, "loss": 0.0002, "step": 33220 }, { "epoch": 8.46209355632906, "grad_norm": 0.01730327308177948, "learning_rate": 1.7712082864707158e-07, "loss": 0.0003, "step": 33225 }, { "epoch": 8.463367009083964, "grad_norm": 0.029618501663208008, "learning_rate": 1.7628879883511364e-07, "loss": 0.0004, "step": 33230 }, { "epoch": 8.464640461838865, "grad_norm": 0.049455977976322174, "learning_rate": 1.7545871048886498e-07, "loss": 0.0007, "step": 33235 }, { "epoch": 8.465913914593768, "grad_norm": 0.047657787799835205, "learning_rate": 1.746305637723744e-07, "loss": 0.0013, "step": 33240 }, { "epoch": 8.467187367348672, "grad_norm": 0.05869800224900246, "learning_rate": 1.7380435884930992e-07, "loss": 0.0008, "step": 33245 }, { "epoch": 8.468460820103575, "grad_norm": 0.039773695170879364, "learning_rate": 1.7298009588295416e-07, "loss": 0.0005, "step": 33250 }, { "epoch": 8.469734272858476, "grad_norm": 0.11474480479955673, "learning_rate": 1.7215777503620801e-07, "loss": 0.002, "step": 33255 }, { "epoch": 8.47100772561338, "grad_norm": 0.006065260618925095, "learning_rate": 1.7133739647158586e-07, "loss": 0.0005, "step": 33260 }, { "epoch": 8.472281178368283, "grad_norm": 0.02629201114177704, "learning_rate": 1.705189603512214e-07, "loss": 0.0007, "step": 33265 }, { "epoch": 8.473554631123186, "grad_norm": 0.050169847905635834, "learning_rate": 1.6970246683685965e-07, "loss": 0.0007, "step": 33270 }, { "epoch": 8.474828083878089, "grad_norm": 0.28381845355033875, "learning_rate": 1.6888791608986822e-07, "loss": 0.0006, "step": 33275 }, { "epoch": 8.47610153663299, "grad_norm": 0.004444583784788847, "learning_rate": 1.6807530827122499e-07, "loss": 0.0004, "step": 33280 }, { "epoch": 8.477374989387894, "grad_norm": 0.048391398042440414, "learning_rate": 1.6726464354152594e-07, "loss": 0.0004, "step": 33285 }, { "epoch": 8.478648442142797, "grad_norm": 0.041042450815439224, "learning_rate": 1.6645592206098404e-07, "loss": 0.0006, "step": 33290 }, { "epoch": 8.4799218948977, "grad_norm": 0.099955715239048, "learning_rate": 1.656491439894292e-07, "loss": 0.001, "step": 33295 }, { "epoch": 8.481195347652601, "grad_norm": 0.0523289293050766, "learning_rate": 1.648443094863006e-07, "loss": 0.0005, "step": 33300 }, { "epoch": 8.482468800407505, "grad_norm": 0.027195701375603676, "learning_rate": 1.6404141871066314e-07, "loss": 0.0007, "step": 33305 }, { "epoch": 8.483742253162408, "grad_norm": 0.06713664531707764, "learning_rate": 1.6324047182118996e-07, "loss": 0.0006, "step": 33310 }, { "epoch": 8.485015705917311, "grad_norm": 0.06506675481796265, "learning_rate": 1.624414689761733e-07, "loss": 0.0006, "step": 33315 }, { "epoch": 8.486289158672212, "grad_norm": 0.013515118509531021, "learning_rate": 1.6164441033351908e-07, "loss": 0.0003, "step": 33320 }, { "epoch": 8.487562611427116, "grad_norm": 0.03710680082440376, "learning_rate": 1.608492960507546e-07, "loss": 0.0005, "step": 33325 }, { "epoch": 8.488836064182019, "grad_norm": 0.06151629984378815, "learning_rate": 1.600561262850131e-07, "loss": 0.0007, "step": 33330 }, { "epoch": 8.490109516936922, "grad_norm": 0.04736845940351486, "learning_rate": 1.592649011930536e-07, "loss": 0.0004, "step": 33335 }, { "epoch": 8.491382969691825, "grad_norm": 0.016920674592256546, "learning_rate": 1.5847562093124546e-07, "loss": 0.0005, "step": 33340 }, { "epoch": 8.492656422446727, "grad_norm": 0.06321495026350021, "learning_rate": 1.5768828565557283e-07, "loss": 0.0007, "step": 33345 }, { "epoch": 8.49392987520163, "grad_norm": 0.03238917514681816, "learning_rate": 1.5690289552163894e-07, "loss": 0.0004, "step": 33350 }, { "epoch": 8.495203327956533, "grad_norm": 0.04023140296339989, "learning_rate": 1.5611945068465974e-07, "loss": 0.0007, "step": 33355 }, { "epoch": 8.496476780711436, "grad_norm": 0.015077505260705948, "learning_rate": 1.5533795129946793e-07, "loss": 0.0005, "step": 33360 }, { "epoch": 8.497750233466338, "grad_norm": 0.08540943264961243, "learning_rate": 1.5455839752051338e-07, "loss": 0.0005, "step": 33365 }, { "epoch": 8.499023686221241, "grad_norm": 0.02047811448574066, "learning_rate": 1.537807895018584e-07, "loss": 0.0003, "step": 33370 }, { "epoch": 8.500297138976144, "grad_norm": 0.06295090168714523, "learning_rate": 1.5300512739718222e-07, "loss": 0.0005, "step": 33375 }, { "epoch": 8.501570591731047, "grad_norm": 0.09074476361274719, "learning_rate": 1.5223141135978004e-07, "loss": 0.0005, "step": 33380 }, { "epoch": 8.502844044485949, "grad_norm": 0.021477479487657547, "learning_rate": 1.5145964154256066e-07, "loss": 0.0003, "step": 33385 }, { "epoch": 8.504117497240852, "grad_norm": 0.05256323143839836, "learning_rate": 1.5068981809804984e-07, "loss": 0.0005, "step": 33390 }, { "epoch": 8.505390949995755, "grad_norm": 0.03806120157241821, "learning_rate": 1.49921941178387e-07, "loss": 0.0004, "step": 33395 }, { "epoch": 8.506664402750658, "grad_norm": 0.0186834167689085, "learning_rate": 1.4915601093533184e-07, "loss": 0.0004, "step": 33400 }, { "epoch": 8.50793785550556, "grad_norm": 0.04121297225356102, "learning_rate": 1.4839202752025218e-07, "loss": 0.0008, "step": 33405 }, { "epoch": 8.509211308260463, "grad_norm": 0.050698716193437576, "learning_rate": 1.4762999108413613e-07, "loss": 0.0006, "step": 33410 }, { "epoch": 8.510484761015366, "grad_norm": 0.00662909634411335, "learning_rate": 1.4686990177758432e-07, "loss": 0.0005, "step": 33415 }, { "epoch": 8.51175821377027, "grad_norm": 0.027430780231952667, "learning_rate": 1.4611175975081326e-07, "loss": 0.0009, "step": 33420 }, { "epoch": 8.513031666525173, "grad_norm": 0.014861801639199257, "learning_rate": 1.4535556515365755e-07, "loss": 0.0003, "step": 33425 }, { "epoch": 8.514305119280074, "grad_norm": 0.07241980731487274, "learning_rate": 1.4460131813556212e-07, "loss": 0.0004, "step": 33430 }, { "epoch": 8.515578572034977, "grad_norm": 0.0630682110786438, "learning_rate": 1.4384901884558877e-07, "loss": 0.0005, "step": 33435 }, { "epoch": 8.51685202478988, "grad_norm": 0.04769793897867203, "learning_rate": 1.430986674324164e-07, "loss": 0.0004, "step": 33440 }, { "epoch": 8.518125477544784, "grad_norm": 0.02953922562301159, "learning_rate": 1.423502640443375e-07, "loss": 0.0003, "step": 33445 }, { "epoch": 8.519398930299685, "grad_norm": 0.050959497690200806, "learning_rate": 1.4160380882925817e-07, "loss": 0.0005, "step": 33450 }, { "epoch": 8.520672383054588, "grad_norm": 0.04682760685682297, "learning_rate": 1.4085930193470153e-07, "loss": 0.0003, "step": 33455 }, { "epoch": 8.521945835809491, "grad_norm": 0.017188509926199913, "learning_rate": 1.4011674350780547e-07, "loss": 0.0002, "step": 33460 }, { "epoch": 8.523219288564395, "grad_norm": 0.06759592145681381, "learning_rate": 1.3937613369532032e-07, "loss": 0.0006, "step": 33465 }, { "epoch": 8.524492741319296, "grad_norm": 0.032659467309713364, "learning_rate": 1.386374726436157e-07, "loss": 0.0006, "step": 33470 }, { "epoch": 8.5257661940742, "grad_norm": 0.09722547978162766, "learning_rate": 1.3790076049867152e-07, "loss": 0.0002, "step": 33475 }, { "epoch": 8.527039646829103, "grad_norm": 0.04318841174244881, "learning_rate": 1.3716599740608571e-07, "loss": 0.0003, "step": 33480 }, { "epoch": 8.528313099584006, "grad_norm": 0.04202130436897278, "learning_rate": 1.3643318351106995e-07, "loss": 0.0006, "step": 33485 }, { "epoch": 8.529586552338909, "grad_norm": 0.06254838407039642, "learning_rate": 1.3570231895845054e-07, "loss": 0.0006, "step": 33490 }, { "epoch": 8.53086000509381, "grad_norm": 0.008120332844555378, "learning_rate": 1.3497340389266754e-07, "loss": 0.0003, "step": 33495 }, { "epoch": 8.532133457848714, "grad_norm": 0.04687036946415901, "learning_rate": 1.3424643845778017e-07, "loss": 0.0008, "step": 33500 }, { "epoch": 8.533406910603617, "grad_norm": 0.008013220503926277, "learning_rate": 1.3352142279745462e-07, "loss": 0.0004, "step": 33505 }, { "epoch": 8.53468036335852, "grad_norm": 0.08324408531188965, "learning_rate": 1.3279835705497847e-07, "loss": 0.0009, "step": 33510 }, { "epoch": 8.535953816113421, "grad_norm": 0.022431474179029465, "learning_rate": 1.3207724137325296e-07, "loss": 0.0002, "step": 33515 }, { "epoch": 8.537227268868325, "grad_norm": 0.06420823931694031, "learning_rate": 1.3135807589478966e-07, "loss": 0.0012, "step": 33520 }, { "epoch": 8.538500721623228, "grad_norm": 0.0668562650680542, "learning_rate": 1.3064086076171935e-07, "loss": 0.0004, "step": 33525 }, { "epoch": 8.539774174378131, "grad_norm": 0.0021426002494990826, "learning_rate": 1.2992559611578637e-07, "loss": 0.0005, "step": 33530 }, { "epoch": 8.541047627133032, "grad_norm": 0.005449036601930857, "learning_rate": 1.2921228209834547e-07, "loss": 0.0005, "step": 33535 }, { "epoch": 8.542321079887936, "grad_norm": 0.036828696727752686, "learning_rate": 1.2850091885037384e-07, "loss": 0.0006, "step": 33540 }, { "epoch": 8.543594532642839, "grad_norm": 0.024595316499471664, "learning_rate": 1.2779150651245575e-07, "loss": 0.0007, "step": 33545 }, { "epoch": 8.544867985397742, "grad_norm": 0.06721708178520203, "learning_rate": 1.2708404522479456e-07, "loss": 0.0003, "step": 33550 }, { "epoch": 8.546141438152645, "grad_norm": 0.012136760167777538, "learning_rate": 1.2637853512720289e-07, "loss": 0.0007, "step": 33555 }, { "epoch": 8.547414890907547, "grad_norm": 0.0237245075404644, "learning_rate": 1.2567497635911585e-07, "loss": 0.0005, "step": 33560 }, { "epoch": 8.54868834366245, "grad_norm": 0.007861475460231304, "learning_rate": 1.2497336905957448e-07, "loss": 0.0005, "step": 33565 }, { "epoch": 8.549961796417353, "grad_norm": 0.011083767749369144, "learning_rate": 1.2427371336724004e-07, "loss": 0.0007, "step": 33570 }, { "epoch": 8.551235249172256, "grad_norm": 0.046650514006614685, "learning_rate": 1.2357600942038417e-07, "loss": 0.0006, "step": 33575 }, { "epoch": 8.552508701927158, "grad_norm": 0.03572867810726166, "learning_rate": 1.2288025735689546e-07, "loss": 0.0004, "step": 33580 }, { "epoch": 8.553782154682061, "grad_norm": 0.05097821727395058, "learning_rate": 1.2218645731427503e-07, "loss": 0.0004, "step": 33585 }, { "epoch": 8.555055607436964, "grad_norm": 0.057627078145742416, "learning_rate": 1.2149460942964097e-07, "loss": 0.0005, "step": 33590 }, { "epoch": 8.556329060191867, "grad_norm": 0.08201763033866882, "learning_rate": 1.2080471383972058e-07, "loss": 0.0007, "step": 33595 }, { "epoch": 8.557602512946769, "grad_norm": 0.029886692762374878, "learning_rate": 1.2011677068086036e-07, "loss": 0.0006, "step": 33600 }, { "epoch": 8.558875965701672, "grad_norm": 0.00535322492942214, "learning_rate": 1.1943078008901933e-07, "loss": 0.0003, "step": 33605 }, { "epoch": 8.560149418456575, "grad_norm": 0.020974302664399147, "learning_rate": 1.18746742199769e-07, "loss": 0.0003, "step": 33610 }, { "epoch": 8.561422871211478, "grad_norm": 0.02123364619910717, "learning_rate": 1.1806465714829574e-07, "loss": 0.0004, "step": 33615 }, { "epoch": 8.562696323966382, "grad_norm": 0.031112585216760635, "learning_rate": 1.1738452506940278e-07, "loss": 0.0004, "step": 33620 }, { "epoch": 8.563969776721283, "grad_norm": 0.009734038263559341, "learning_rate": 1.1670634609750154e-07, "loss": 0.0008, "step": 33625 }, { "epoch": 8.565243229476186, "grad_norm": 0.028426993638277054, "learning_rate": 1.1603012036662364e-07, "loss": 0.0006, "step": 33630 }, { "epoch": 8.56651668223109, "grad_norm": 0.02268953062593937, "learning_rate": 1.1535584801041112e-07, "loss": 0.0004, "step": 33635 }, { "epoch": 8.567790134985993, "grad_norm": 0.008561089634895325, "learning_rate": 1.1468352916212177e-07, "loss": 0.0006, "step": 33640 }, { "epoch": 8.569063587740894, "grad_norm": 0.03484739363193512, "learning_rate": 1.1401316395462381e-07, "loss": 0.0005, "step": 33645 }, { "epoch": 8.570337040495797, "grad_norm": 0.011499580927193165, "learning_rate": 1.1334475252040455e-07, "loss": 0.0003, "step": 33650 }, { "epoch": 8.5716104932507, "grad_norm": 0.05285044386982918, "learning_rate": 1.1267829499156058e-07, "loss": 0.0006, "step": 33655 }, { "epoch": 8.572883946005604, "grad_norm": 0.008213642053306103, "learning_rate": 1.1201379149980652e-07, "loss": 0.0002, "step": 33660 }, { "epoch": 8.574157398760505, "grad_norm": 0.035174593329429626, "learning_rate": 1.113512421764662e-07, "loss": 0.0004, "step": 33665 }, { "epoch": 8.575430851515408, "grad_norm": 0.06232050806283951, "learning_rate": 1.1069064715248157e-07, "loss": 0.001, "step": 33670 }, { "epoch": 8.576704304270311, "grad_norm": 0.09057635813951492, "learning_rate": 1.1003200655840485e-07, "loss": 0.0011, "step": 33675 }, { "epoch": 8.577977757025215, "grad_norm": 0.03582562506198883, "learning_rate": 1.0937532052440414e-07, "loss": 0.0005, "step": 33680 }, { "epoch": 8.579251209780118, "grad_norm": 0.005231249611824751, "learning_rate": 1.0872058918026008e-07, "loss": 0.0002, "step": 33685 }, { "epoch": 8.58052466253502, "grad_norm": 0.00969318300485611, "learning_rate": 1.0806781265537025e-07, "loss": 0.0002, "step": 33690 }, { "epoch": 8.581798115289923, "grad_norm": 0.04984629154205322, "learning_rate": 1.0741699107873926e-07, "loss": 0.0009, "step": 33695 }, { "epoch": 8.583071568044826, "grad_norm": 0.1452092081308365, "learning_rate": 1.06768124578992e-07, "loss": 0.0008, "step": 33700 }, { "epoch": 8.584345020799729, "grad_norm": 0.01940852589905262, "learning_rate": 1.0612121328436364e-07, "loss": 0.0002, "step": 33705 }, { "epoch": 8.58561847355463, "grad_norm": 0.01332137081772089, "learning_rate": 1.0547625732270416e-07, "loss": 0.0004, "step": 33710 }, { "epoch": 8.586891926309534, "grad_norm": 0.026770619675517082, "learning_rate": 1.0483325682147604e-07, "loss": 0.0005, "step": 33715 }, { "epoch": 8.588165379064437, "grad_norm": 0.00564334262162447, "learning_rate": 1.0419221190775541e-07, "loss": 0.0005, "step": 33720 }, { "epoch": 8.58943883181934, "grad_norm": 0.0356370285153389, "learning_rate": 1.0355312270823315e-07, "loss": 0.0004, "step": 33725 }, { "epoch": 8.590712284574241, "grad_norm": 0.048342570662498474, "learning_rate": 1.0291598934921266e-07, "loss": 0.0009, "step": 33730 }, { "epoch": 8.591985737329145, "grad_norm": 0.1705581098794937, "learning_rate": 1.02280811956611e-07, "loss": 0.0005, "step": 33735 }, { "epoch": 8.593259190084048, "grad_norm": 0.06315833330154419, "learning_rate": 1.0164759065595775e-07, "loss": 0.0008, "step": 33740 }, { "epoch": 8.594532642838951, "grad_norm": 0.11904191225767136, "learning_rate": 1.0101632557239838e-07, "loss": 0.0006, "step": 33745 }, { "epoch": 8.595806095593854, "grad_norm": 0.013296365737915039, "learning_rate": 1.0038701683068863e-07, "loss": 0.0005, "step": 33750 }, { "epoch": 8.597079548348756, "grad_norm": 0.03253663703799248, "learning_rate": 9.975966455520015e-08, "loss": 0.0006, "step": 33755 }, { "epoch": 8.598353001103659, "grad_norm": 0.035716935992240906, "learning_rate": 9.913426886991595e-08, "loss": 0.0006, "step": 33760 }, { "epoch": 8.599626453858562, "grad_norm": 0.047035571187734604, "learning_rate": 9.851082989843497e-08, "loss": 0.0003, "step": 33765 }, { "epoch": 8.600899906613465, "grad_norm": 0.007545944303274155, "learning_rate": 9.788934776396641e-08, "loss": 0.0006, "step": 33770 }, { "epoch": 8.602173359368367, "grad_norm": 0.05958074703812599, "learning_rate": 9.726982258933426e-08, "loss": 0.0006, "step": 33775 }, { "epoch": 8.60344681212327, "grad_norm": 0.052667707204818726, "learning_rate": 9.665225449697724e-08, "loss": 0.0008, "step": 33780 }, { "epoch": 8.604720264878173, "grad_norm": 0.03539160266518593, "learning_rate": 9.603664360894327e-08, "loss": 0.001, "step": 33785 }, { "epoch": 8.605993717633076, "grad_norm": 0.049461837857961655, "learning_rate": 9.542299004689726e-08, "loss": 0.0006, "step": 33790 }, { "epoch": 8.607267170387978, "grad_norm": 0.03253176063299179, "learning_rate": 9.481129393211552e-08, "loss": 0.0006, "step": 33795 }, { "epoch": 8.608540623142881, "grad_norm": 0.063555046916008, "learning_rate": 9.420155538548803e-08, "loss": 0.0003, "step": 33800 }, { "epoch": 8.609814075897784, "grad_norm": 0.011753259226679802, "learning_rate": 9.359377452751839e-08, "loss": 0.0003, "step": 33805 }, { "epoch": 8.611087528652687, "grad_norm": 0.007849846966564655, "learning_rate": 9.298795147832163e-08, "loss": 0.0006, "step": 33810 }, { "epoch": 8.61236098140759, "grad_norm": 0.009818264283239841, "learning_rate": 9.238408635762752e-08, "loss": 0.0004, "step": 33815 }, { "epoch": 8.613634434162492, "grad_norm": 0.059460144490003586, "learning_rate": 9.178217928477729e-08, "loss": 0.0009, "step": 33820 }, { "epoch": 8.614907886917395, "grad_norm": 0.07777761667966843, "learning_rate": 9.118223037872908e-08, "loss": 0.0006, "step": 33825 }, { "epoch": 8.616181339672298, "grad_norm": 0.01229078508913517, "learning_rate": 9.058423975804698e-08, "loss": 0.0003, "step": 33830 }, { "epoch": 8.617454792427202, "grad_norm": 0.005951831117272377, "learning_rate": 8.99882075409153e-08, "loss": 0.0005, "step": 33835 }, { "epoch": 8.618728245182103, "grad_norm": 0.08091524988412857, "learning_rate": 8.939413384512763e-08, "loss": 0.0006, "step": 33840 }, { "epoch": 8.620001697937006, "grad_norm": 0.02439255267381668, "learning_rate": 8.880201878809003e-08, "loss": 0.0003, "step": 33845 }, { "epoch": 8.62127515069191, "grad_norm": 0.030129266902804375, "learning_rate": 8.821186248682334e-08, "loss": 0.0004, "step": 33850 }, { "epoch": 8.622548603446813, "grad_norm": 0.049615345895290375, "learning_rate": 8.762366505796094e-08, "loss": 0.0004, "step": 33855 }, { "epoch": 8.623822056201714, "grad_norm": 0.013506405055522919, "learning_rate": 8.703742661774761e-08, "loss": 0.0003, "step": 33860 }, { "epoch": 8.625095508956617, "grad_norm": 0.2600776255130768, "learning_rate": 8.645314728204179e-08, "loss": 0.0005, "step": 33865 }, { "epoch": 8.62636896171152, "grad_norm": 0.01391504891216755, "learning_rate": 8.587082716631668e-08, "loss": 0.0006, "step": 33870 }, { "epoch": 8.627642414466424, "grad_norm": 0.05023716017603874, "learning_rate": 8.529046638565463e-08, "loss": 0.0005, "step": 33875 }, { "epoch": 8.628915867221327, "grad_norm": 0.03584897890686989, "learning_rate": 8.471206505475394e-08, "loss": 0.0006, "step": 33880 }, { "epoch": 8.630189319976228, "grad_norm": 0.10987048596143723, "learning_rate": 8.413562328792424e-08, "loss": 0.0008, "step": 33885 }, { "epoch": 8.631462772731131, "grad_norm": 0.05771024897694588, "learning_rate": 8.356114119908665e-08, "loss": 0.0003, "step": 33890 }, { "epoch": 8.632736225486035, "grad_norm": 0.07817961275577545, "learning_rate": 8.298861890177922e-08, "loss": 0.0003, "step": 33895 }, { "epoch": 8.634009678240938, "grad_norm": 0.026538966223597527, "learning_rate": 8.241805650914703e-08, "loss": 0.0003, "step": 33900 }, { "epoch": 8.63528313099584, "grad_norm": 0.08192047476768494, "learning_rate": 8.184945413395207e-08, "loss": 0.0011, "step": 33905 }, { "epoch": 8.636556583750743, "grad_norm": 0.026978803798556328, "learning_rate": 8.128281188856668e-08, "loss": 0.0005, "step": 33910 }, { "epoch": 8.637830036505646, "grad_norm": 0.030749240890145302, "learning_rate": 8.071812988497907e-08, "loss": 0.0003, "step": 33915 }, { "epoch": 8.639103489260549, "grad_norm": 0.06293671578168869, "learning_rate": 8.01554082347833e-08, "loss": 0.0002, "step": 33920 }, { "epoch": 8.64037694201545, "grad_norm": 0.08075267821550369, "learning_rate": 7.959464704919484e-08, "loss": 0.0009, "step": 33925 }, { "epoch": 8.641650394770354, "grad_norm": 0.00636645033955574, "learning_rate": 7.903584643903506e-08, "loss": 0.0003, "step": 33930 }, { "epoch": 8.642923847525257, "grad_norm": 0.0315297432243824, "learning_rate": 7.847900651474005e-08, "loss": 0.0009, "step": 33935 }, { "epoch": 8.64419730028016, "grad_norm": 0.05151808261871338, "learning_rate": 7.792412738635846e-08, "loss": 0.0004, "step": 33940 }, { "epoch": 8.645470753035063, "grad_norm": 0.09170547127723694, "learning_rate": 7.737120916355256e-08, "loss": 0.0015, "step": 33945 }, { "epoch": 8.646744205789965, "grad_norm": 0.035889748483896255, "learning_rate": 7.682025195559384e-08, "loss": 0.0006, "step": 33950 }, { "epoch": 8.648017658544868, "grad_norm": 0.006326430011540651, "learning_rate": 7.627125587137074e-08, "loss": 0.0009, "step": 33955 }, { "epoch": 8.649291111299771, "grad_norm": 0.00635235570371151, "learning_rate": 7.57242210193787e-08, "loss": 0.0002, "step": 33960 }, { "epoch": 8.650564564054674, "grad_norm": 0.009161241352558136, "learning_rate": 7.517914750773126e-08, "loss": 0.0006, "step": 33965 }, { "epoch": 8.651838016809576, "grad_norm": 0.08008871972560883, "learning_rate": 7.463603544414999e-08, "loss": 0.0005, "step": 33970 }, { "epoch": 8.653111469564479, "grad_norm": 0.03800499811768532, "learning_rate": 7.409488493597128e-08, "loss": 0.0004, "step": 33975 }, { "epoch": 8.654384922319382, "grad_norm": 0.03402039781212807, "learning_rate": 7.35556960901429e-08, "loss": 0.0003, "step": 33980 }, { "epoch": 8.655658375074285, "grad_norm": 0.0480465292930603, "learning_rate": 7.301846901322518e-08, "loss": 0.0009, "step": 33985 }, { "epoch": 8.656931827829187, "grad_norm": 0.04238911718130112, "learning_rate": 7.248320381138874e-08, "loss": 0.0005, "step": 33990 }, { "epoch": 8.65820528058409, "grad_norm": 0.0025082614738494158, "learning_rate": 7.194990059042118e-08, "loss": 0.001, "step": 33995 }, { "epoch": 8.659478733338993, "grad_norm": 0.21512778103351593, "learning_rate": 7.141855945571818e-08, "loss": 0.0008, "step": 34000 }, { "epoch": 8.660752186093896, "grad_norm": 0.03299584984779358, "learning_rate": 7.08891805122891e-08, "loss": 0.001, "step": 34005 }, { "epoch": 8.6620256388488, "grad_norm": 0.020183676853775978, "learning_rate": 7.036176386475468e-08, "loss": 0.0009, "step": 34010 }, { "epoch": 8.663299091603701, "grad_norm": 0.028747133910655975, "learning_rate": 6.983630961735043e-08, "loss": 0.0001, "step": 34015 }, { "epoch": 8.664572544358604, "grad_norm": 0.017280815169215202, "learning_rate": 6.931281787391997e-08, "loss": 0.0007, "step": 34020 }, { "epoch": 8.665845997113507, "grad_norm": 0.023590950295329094, "learning_rate": 6.879128873792274e-08, "loss": 0.0008, "step": 34025 }, { "epoch": 8.66711944986841, "grad_norm": 0.02895371988415718, "learning_rate": 6.827172231242851e-08, "loss": 0.001, "step": 34030 }, { "epoch": 8.668392902623312, "grad_norm": 0.02690306305885315, "learning_rate": 6.775411870011961e-08, "loss": 0.0008, "step": 34035 }, { "epoch": 8.669666355378215, "grad_norm": 0.02143423818051815, "learning_rate": 6.723847800328975e-08, "loss": 0.0006, "step": 34040 }, { "epoch": 8.670939808133118, "grad_norm": 0.4342106878757477, "learning_rate": 6.67248003238452e-08, "loss": 0.0005, "step": 34045 }, { "epoch": 8.672213260888022, "grad_norm": 0.029785672202706337, "learning_rate": 6.621308576330476e-08, "loss": 0.0008, "step": 34050 }, { "epoch": 8.673486713642923, "grad_norm": 0.06510920077562332, "learning_rate": 6.570333442279975e-08, "loss": 0.0004, "step": 34055 }, { "epoch": 8.674760166397826, "grad_norm": 0.04200411215424538, "learning_rate": 6.51955464030718e-08, "loss": 0.0004, "step": 34060 }, { "epoch": 8.67603361915273, "grad_norm": 0.012675778940320015, "learning_rate": 6.468972180447618e-08, "loss": 0.0005, "step": 34065 }, { "epoch": 8.677307071907633, "grad_norm": 0.06925980746746063, "learning_rate": 6.418586072697741e-08, "loss": 0.0009, "step": 34070 }, { "epoch": 8.678580524662536, "grad_norm": 0.013804818503558636, "learning_rate": 6.36839632701558e-08, "loss": 0.0004, "step": 34075 }, { "epoch": 8.679853977417437, "grad_norm": 0.1605415642261505, "learning_rate": 6.31840295331998e-08, "loss": 0.0007, "step": 34080 }, { "epoch": 8.68112743017234, "grad_norm": 0.04023122787475586, "learning_rate": 6.268605961491259e-08, "loss": 0.0005, "step": 34085 }, { "epoch": 8.682400882927244, "grad_norm": 0.01569008082151413, "learning_rate": 6.219005361370877e-08, "loss": 0.0002, "step": 34090 }, { "epoch": 8.683674335682147, "grad_norm": 0.008604924194514751, "learning_rate": 6.16960116276133e-08, "loss": 0.0003, "step": 34095 }, { "epoch": 8.684947788437048, "grad_norm": 0.041414644569158554, "learning_rate": 6.12039337542647e-08, "loss": 0.0007, "step": 34100 }, { "epoch": 8.686221241191951, "grad_norm": 0.0207996629178524, "learning_rate": 6.071382009091298e-08, "loss": 0.0004, "step": 34105 }, { "epoch": 8.687494693946855, "grad_norm": 0.0263021532446146, "learning_rate": 6.022567073441843e-08, "loss": 0.0009, "step": 34110 }, { "epoch": 8.688768146701758, "grad_norm": 0.047482434660196304, "learning_rate": 5.973948578125388e-08, "loss": 0.0003, "step": 34115 }, { "epoch": 8.69004159945666, "grad_norm": 0.0517306812107563, "learning_rate": 5.9255265327506916e-08, "loss": 0.0002, "step": 34120 }, { "epoch": 8.691315052211563, "grad_norm": 0.03698902204632759, "learning_rate": 5.87730094688721e-08, "loss": 0.0005, "step": 34125 }, { "epoch": 8.692588504966466, "grad_norm": 0.00715353898704052, "learning_rate": 5.8292718300658746e-08, "loss": 0.0003, "step": 34130 }, { "epoch": 8.693861957721369, "grad_norm": 0.06192997097969055, "learning_rate": 5.781439191778648e-08, "loss": 0.0006, "step": 34135 }, { "epoch": 8.695135410476272, "grad_norm": 0.009418828412890434, "learning_rate": 5.733803041478747e-08, "loss": 0.0006, "step": 34140 }, { "epoch": 8.696408863231174, "grad_norm": 0.025031089782714844, "learning_rate": 5.686363388580529e-08, "loss": 0.0009, "step": 34145 }, { "epoch": 8.697682315986077, "grad_norm": 0.020564522594213486, "learning_rate": 5.639120242459606e-08, "loss": 0.0002, "step": 34150 }, { "epoch": 8.69895576874098, "grad_norm": 0.039361514151096344, "learning_rate": 5.592073612452509e-08, "loss": 0.0008, "step": 34155 }, { "epoch": 8.700229221495883, "grad_norm": 0.005337740760296583, "learning_rate": 5.5452235078571337e-08, "loss": 0.0008, "step": 34160 }, { "epoch": 8.701502674250785, "grad_norm": 0.09040229022502899, "learning_rate": 5.4985699379326295e-08, "loss": 0.0005, "step": 34165 }, { "epoch": 8.702776127005688, "grad_norm": 0.00971122458577156, "learning_rate": 5.452112911899066e-08, "loss": 0.0008, "step": 34170 }, { "epoch": 8.704049579760591, "grad_norm": 0.01943052001297474, "learning_rate": 5.405852438937764e-08, "loss": 0.0002, "step": 34175 }, { "epoch": 8.705323032515494, "grad_norm": 0.014876682311296463, "learning_rate": 5.359788528191301e-08, "loss": 0.0003, "step": 34180 }, { "epoch": 8.706596485270396, "grad_norm": 0.005945043638348579, "learning_rate": 5.3139211887631714e-08, "loss": 0.0004, "step": 34185 }, { "epoch": 8.707869938025299, "grad_norm": 0.04083392396569252, "learning_rate": 5.268250429718236e-08, "loss": 0.0005, "step": 34190 }, { "epoch": 8.709143390780202, "grad_norm": 0.040081001818180084, "learning_rate": 5.222776260082385e-08, "loss": 0.0006, "step": 34195 }, { "epoch": 8.710416843535105, "grad_norm": 0.03146931529045105, "learning_rate": 5.1774986888427634e-08, "loss": 0.0009, "step": 34200 }, { "epoch": 8.711690296290008, "grad_norm": 0.028679242357611656, "learning_rate": 5.132417724947658e-08, "loss": 0.0005, "step": 34205 }, { "epoch": 8.71296374904491, "grad_norm": 0.08783368021249771, "learning_rate": 5.0875333773063864e-08, "loss": 0.0006, "step": 34210 }, { "epoch": 8.714237201799813, "grad_norm": 0.04001512750983238, "learning_rate": 5.0428456547892965e-08, "loss": 0.001, "step": 34215 }, { "epoch": 8.715510654554716, "grad_norm": 0.012697184458374977, "learning_rate": 4.998354566228325e-08, "loss": 0.0006, "step": 34220 }, { "epoch": 8.71678410730962, "grad_norm": 0.025039061903953552, "learning_rate": 4.954060120416104e-08, "loss": 0.0007, "step": 34225 }, { "epoch": 8.718057560064521, "grad_norm": 0.028681760653853416, "learning_rate": 4.909962326106743e-08, "loss": 0.001, "step": 34230 }, { "epoch": 8.719331012819424, "grad_norm": 0.01223793625831604, "learning_rate": 4.866061192015048e-08, "loss": 0.0005, "step": 34235 }, { "epoch": 8.720604465574327, "grad_norm": 0.05098114535212517, "learning_rate": 4.822356726817523e-08, "loss": 0.0004, "step": 34240 }, { "epoch": 8.72187791832923, "grad_norm": 0.002738058799877763, "learning_rate": 4.7788489391511484e-08, "loss": 0.0007, "step": 34245 }, { "epoch": 8.723151371084132, "grad_norm": 0.06506987661123276, "learning_rate": 4.735537837614823e-08, "loss": 0.0006, "step": 34250 }, { "epoch": 8.724424823839035, "grad_norm": 0.0054189665243029594, "learning_rate": 4.692423430767812e-08, "loss": 0.0002, "step": 34255 }, { "epoch": 8.725698276593938, "grad_norm": 0.06939057260751724, "learning_rate": 4.649505727130965e-08, "loss": 0.0008, "step": 34260 }, { "epoch": 8.726971729348842, "grad_norm": 0.10325853526592255, "learning_rate": 4.606784735186054e-08, "loss": 0.0008, "step": 34265 }, { "epoch": 8.728245182103745, "grad_norm": 0.0745445266366005, "learning_rate": 4.5642604633762135e-08, "loss": 0.0008, "step": 34270 }, { "epoch": 8.729518634858646, "grad_norm": 0.007234200369566679, "learning_rate": 4.5219329201053876e-08, "loss": 0.0002, "step": 34275 }, { "epoch": 8.73079208761355, "grad_norm": 0.05063542351126671, "learning_rate": 4.479802113738885e-08, "loss": 0.001, "step": 34280 }, { "epoch": 8.732065540368453, "grad_norm": 0.0069733308628201485, "learning_rate": 4.437868052602934e-08, "loss": 0.0003, "step": 34285 }, { "epoch": 8.733338993123356, "grad_norm": 0.023947888985276222, "learning_rate": 4.396130744985128e-08, "loss": 0.0008, "step": 34290 }, { "epoch": 8.734612445878257, "grad_norm": 0.014250208623707294, "learning_rate": 4.354590199133979e-08, "loss": 0.001, "step": 34295 }, { "epoch": 8.73588589863316, "grad_norm": 0.009762278757989407, "learning_rate": 4.313246423259032e-08, "loss": 0.0002, "step": 34300 }, { "epoch": 8.737159351388064, "grad_norm": 0.04241665452718735, "learning_rate": 4.272099425531306e-08, "loss": 0.0004, "step": 34305 }, { "epoch": 8.738432804142967, "grad_norm": 0.06584763526916504, "learning_rate": 4.231149214082631e-08, "loss": 0.0004, "step": 34310 }, { "epoch": 8.739706256897868, "grad_norm": 0.03234831243753433, "learning_rate": 4.190395797005975e-08, "loss": 0.0008, "step": 34315 }, { "epoch": 8.740979709652771, "grad_norm": 0.015535928308963776, "learning_rate": 4.149839182355453e-08, "loss": 0.0003, "step": 34320 }, { "epoch": 8.742253162407675, "grad_norm": 0.01818327233195305, "learning_rate": 4.1094793781463196e-08, "loss": 0.0003, "step": 34325 }, { "epoch": 8.743526615162578, "grad_norm": 0.0794239267706871, "learning_rate": 4.0693163923548605e-08, "loss": 0.0005, "step": 34330 }, { "epoch": 8.744800067917481, "grad_norm": 0.03391960263252258, "learning_rate": 4.029350232918616e-08, "loss": 0.0003, "step": 34335 }, { "epoch": 8.746073520672383, "grad_norm": 0.0032354139257222414, "learning_rate": 3.9895809077360456e-08, "loss": 0.0006, "step": 34340 }, { "epoch": 8.747346973427286, "grad_norm": 0.04726433381438255, "learning_rate": 3.950008424666751e-08, "loss": 0.0009, "step": 34345 }, { "epoch": 8.748620426182189, "grad_norm": 0.08543380349874496, "learning_rate": 3.910632791531588e-08, "loss": 0.0008, "step": 34350 }, { "epoch": 8.749893878937092, "grad_norm": 0.04467036575078964, "learning_rate": 3.871454016112331e-08, "loss": 0.0006, "step": 34355 }, { "epoch": 8.751167331691994, "grad_norm": 0.026232751086354256, "learning_rate": 3.8324721061517853e-08, "loss": 0.0003, "step": 34360 }, { "epoch": 8.752440784446897, "grad_norm": 0.008021113462746143, "learning_rate": 3.7936870693541236e-08, "loss": 0.0003, "step": 34365 }, { "epoch": 8.7537142372018, "grad_norm": 0.04784979298710823, "learning_rate": 3.755098913384325e-08, "loss": 0.0001, "step": 34370 }, { "epoch": 8.754987689956703, "grad_norm": 0.032041266560554504, "learning_rate": 3.7167076458687335e-08, "loss": 0.0003, "step": 34375 }, { "epoch": 8.756261142711605, "grad_norm": 0.008423088118433952, "learning_rate": 3.678513274394502e-08, "loss": 0.0006, "step": 34380 }, { "epoch": 8.757534595466508, "grad_norm": 0.9999530911445618, "learning_rate": 3.6405158065101475e-08, "loss": 0.001, "step": 34385 }, { "epoch": 8.758808048221411, "grad_norm": 0.03071960061788559, "learning_rate": 3.6027152497249975e-08, "loss": 0.0003, "step": 34390 }, { "epoch": 8.760081500976314, "grad_norm": 0.017748480662703514, "learning_rate": 3.5651116115097414e-08, "loss": 0.0006, "step": 34395 }, { "epoch": 8.761354953731217, "grad_norm": 0.21815618872642517, "learning_rate": 3.527704899295769e-08, "loss": 0.001, "step": 34400 }, { "epoch": 8.762628406486119, "grad_norm": 0.00954035110771656, "learning_rate": 3.490495120476056e-08, "loss": 0.0005, "step": 34405 }, { "epoch": 8.763901859241022, "grad_norm": 0.05044221505522728, "learning_rate": 3.4534822824041635e-08, "loss": 0.0004, "step": 34410 }, { "epoch": 8.765175311995925, "grad_norm": 0.0429665744304657, "learning_rate": 3.4166663923952406e-08, "loss": 0.0009, "step": 34415 }, { "epoch": 8.766448764750828, "grad_norm": 0.012159697711467743, "learning_rate": 3.380047457724911e-08, "loss": 0.0003, "step": 34420 }, { "epoch": 8.76772221750573, "grad_norm": 0.03898788243532181, "learning_rate": 3.343625485630497e-08, "loss": 0.0007, "step": 34425 }, { "epoch": 8.768995670260633, "grad_norm": 0.008564099669456482, "learning_rate": 3.3074004833097975e-08, "loss": 0.0006, "step": 34430 }, { "epoch": 8.770269123015536, "grad_norm": 0.04485134780406952, "learning_rate": 3.271372457922195e-08, "loss": 0.0004, "step": 34435 }, { "epoch": 8.77154257577044, "grad_norm": 0.03908466920256615, "learning_rate": 3.235541416587884e-08, "loss": 0.0009, "step": 34440 }, { "epoch": 8.772816028525341, "grad_norm": 0.04196301847696304, "learning_rate": 3.1999073663882e-08, "loss": 0.0008, "step": 34445 }, { "epoch": 8.774089481280244, "grad_norm": 0.05178524926304817, "learning_rate": 3.164470314365287e-08, "loss": 0.0005, "step": 34450 }, { "epoch": 8.775362934035147, "grad_norm": 0.0818624272942543, "learning_rate": 3.129230267522987e-08, "loss": 0.0007, "step": 34455 }, { "epoch": 8.77663638679005, "grad_norm": 0.024126214906573296, "learning_rate": 3.094187232825507e-08, "loss": 0.0005, "step": 34460 }, { "epoch": 8.777909839544954, "grad_norm": 0.053314365446567535, "learning_rate": 3.05934121719853e-08, "loss": 0.0007, "step": 34465 }, { "epoch": 8.779183292299855, "grad_norm": 0.008691268041729927, "learning_rate": 3.024692227528658e-08, "loss": 0.0003, "step": 34470 }, { "epoch": 8.780456745054758, "grad_norm": 0.05424538627266884, "learning_rate": 2.990240270663636e-08, "loss": 0.0008, "step": 34475 }, { "epoch": 8.781730197809662, "grad_norm": 0.01372547633945942, "learning_rate": 2.9559853534122384e-08, "loss": 0.0004, "step": 34480 }, { "epoch": 8.783003650564565, "grad_norm": 0.018959101289510727, "learning_rate": 2.9219274825441624e-08, "loss": 0.0009, "step": 34485 }, { "epoch": 8.784277103319466, "grad_norm": 0.019274838268756866, "learning_rate": 2.8880666647904675e-08, "loss": 0.0004, "step": 34490 }, { "epoch": 8.78555055607437, "grad_norm": 0.025210054591298103, "learning_rate": 2.8544029068429125e-08, "loss": 0.0006, "step": 34495 }, { "epoch": 8.786824008829273, "grad_norm": 0.029314346611499786, "learning_rate": 2.8209362153546193e-08, "loss": 0.0005, "step": 34500 }, { "epoch": 8.788097461584176, "grad_norm": 0.02324402891099453, "learning_rate": 2.78766659693952e-08, "loss": 0.0002, "step": 34505 }, { "epoch": 8.789370914339077, "grad_norm": 0.0531771145761013, "learning_rate": 2.7545940581727992e-08, "loss": 0.0005, "step": 34510 }, { "epoch": 8.79064436709398, "grad_norm": 0.05579235404729843, "learning_rate": 2.721718605590562e-08, "loss": 0.0006, "step": 34515 }, { "epoch": 8.791917819848884, "grad_norm": 0.006084661930799484, "learning_rate": 2.6890402456899445e-08, "loss": 0.0004, "step": 34520 }, { "epoch": 8.793191272603787, "grad_norm": 0.02881833352148533, "learning_rate": 2.656558984929336e-08, "loss": 0.0008, "step": 34525 }, { "epoch": 8.79446472535869, "grad_norm": 0.032345421612262726, "learning_rate": 2.624274829727824e-08, "loss": 0.0004, "step": 34530 }, { "epoch": 8.795738178113591, "grad_norm": 0.02054433524608612, "learning_rate": 2.5921877864659715e-08, "loss": 0.0007, "step": 34535 }, { "epoch": 8.797011630868495, "grad_norm": 0.03157127648591995, "learning_rate": 2.5602978614850394e-08, "loss": 0.0005, "step": 34540 }, { "epoch": 8.798285083623398, "grad_norm": 0.002265841467306018, "learning_rate": 2.5286050610874303e-08, "loss": 0.0006, "step": 34545 }, { "epoch": 8.799558536378301, "grad_norm": 0.054330382496118546, "learning_rate": 2.4971093915366896e-08, "loss": 0.0006, "step": 34550 }, { "epoch": 8.800831989133203, "grad_norm": 0.03531631454825401, "learning_rate": 2.4658108590571716e-08, "loss": 0.0007, "step": 34555 }, { "epoch": 8.802105441888106, "grad_norm": 0.1607406884431839, "learning_rate": 2.434709469834595e-08, "loss": 0.0013, "step": 34560 }, { "epoch": 8.803378894643009, "grad_norm": 0.08319348096847534, "learning_rate": 2.403805230015488e-08, "loss": 0.0005, "step": 34565 }, { "epoch": 8.804652347397912, "grad_norm": 0.02318297140300274, "learning_rate": 2.3730981457074086e-08, "loss": 0.0003, "step": 34570 }, { "epoch": 8.805925800152814, "grad_norm": 0.0035898867063224316, "learning_rate": 2.3425882229791697e-08, "loss": 0.0006, "step": 34575 }, { "epoch": 8.807199252907717, "grad_norm": 0.07828456163406372, "learning_rate": 2.312275467860281e-08, "loss": 0.0007, "step": 34580 }, { "epoch": 8.80847270566262, "grad_norm": 0.03007545694708824, "learning_rate": 2.282159886341617e-08, "loss": 0.0007, "step": 34585 }, { "epoch": 8.809746158417523, "grad_norm": 0.0716654360294342, "learning_rate": 2.2522414843748618e-08, "loss": 0.0005, "step": 34590 }, { "epoch": 8.811019611172426, "grad_norm": 0.04275783896446228, "learning_rate": 2.2225202678728408e-08, "loss": 0.0007, "step": 34595 }, { "epoch": 8.812293063927328, "grad_norm": 0.00861804373562336, "learning_rate": 2.1929962427093e-08, "loss": 0.0006, "step": 34600 }, { "epoch": 8.813566516682231, "grad_norm": 0.04052631929516792, "learning_rate": 2.1636694147191273e-08, "loss": 0.0004, "step": 34605 }, { "epoch": 8.814839969437134, "grad_norm": 0.04161206632852554, "learning_rate": 2.134539789698242e-08, "loss": 0.0015, "step": 34610 }, { "epoch": 8.816113422192037, "grad_norm": 0.013207347132265568, "learning_rate": 2.1056073734035953e-08, "loss": 0.0003, "step": 34615 }, { "epoch": 8.817386874946939, "grad_norm": 0.02246209979057312, "learning_rate": 2.076872171552946e-08, "loss": 0.0007, "step": 34620 }, { "epoch": 8.818660327701842, "grad_norm": 0.03768365830183029, "learning_rate": 2.0483341898254183e-08, "loss": 0.0004, "step": 34625 }, { "epoch": 8.819933780456745, "grad_norm": 0.01873680204153061, "learning_rate": 2.019993433860834e-08, "loss": 0.0006, "step": 34630 }, { "epoch": 8.821207233211648, "grad_norm": 0.0434701144695282, "learning_rate": 1.9918499092603793e-08, "loss": 0.0006, "step": 34635 }, { "epoch": 8.82248068596655, "grad_norm": 0.10161380469799042, "learning_rate": 1.9639036215859385e-08, "loss": 0.0003, "step": 34640 }, { "epoch": 8.823754138721453, "grad_norm": 0.06754057854413986, "learning_rate": 1.9361545763605382e-08, "loss": 0.0006, "step": 34645 }, { "epoch": 8.825027591476356, "grad_norm": 0.024109508842229843, "learning_rate": 1.908602779068347e-08, "loss": 0.0007, "step": 34650 }, { "epoch": 8.82630104423126, "grad_norm": 0.0577273927628994, "learning_rate": 1.8812482351544536e-08, "loss": 0.0003, "step": 34655 }, { "epoch": 8.827574496986163, "grad_norm": 0.026540150865912437, "learning_rate": 1.854090950024756e-08, "loss": 0.0002, "step": 34660 }, { "epoch": 8.828847949741064, "grad_norm": 0.004888932220637798, "learning_rate": 1.8271309290465168e-08, "loss": 0.0005, "step": 34665 }, { "epoch": 8.830121402495967, "grad_norm": 0.04320458695292473, "learning_rate": 1.8003681775479175e-08, "loss": 0.0004, "step": 34670 }, { "epoch": 8.83139485525087, "grad_norm": 0.030542919412255287, "learning_rate": 1.7738027008179504e-08, "loss": 0.0007, "step": 34675 }, { "epoch": 8.832668308005774, "grad_norm": 0.13770170509815216, "learning_rate": 1.7474345041068597e-08, "loss": 0.001, "step": 34680 }, { "epoch": 8.833941760760675, "grad_norm": 0.05615519732236862, "learning_rate": 1.7212635926256994e-08, "loss": 0.0006, "step": 34685 }, { "epoch": 8.835215213515578, "grad_norm": 0.007483866065740585, "learning_rate": 1.6952899715467764e-08, "loss": 0.0004, "step": 34690 }, { "epoch": 8.836488666270482, "grad_norm": 0.06619898974895477, "learning_rate": 1.669513646003207e-08, "loss": 0.0008, "step": 34695 }, { "epoch": 8.837762119025385, "grad_norm": 0.058820419013500214, "learning_rate": 1.6439346210892493e-08, "loss": 0.0007, "step": 34700 }, { "epoch": 8.839035571780286, "grad_norm": 0.08542945981025696, "learning_rate": 1.618552901859971e-08, "loss": 0.0005, "step": 34705 }, { "epoch": 8.84030902453519, "grad_norm": 0.029075412079691887, "learning_rate": 1.593368493331693e-08, "loss": 0.0005, "step": 34710 }, { "epoch": 8.841582477290093, "grad_norm": 0.05576715245842934, "learning_rate": 1.5683814004815445e-08, "loss": 0.0003, "step": 34715 }, { "epoch": 8.842855930044996, "grad_norm": 0.046184755861759186, "learning_rate": 1.5435916282476872e-08, "loss": 0.0007, "step": 34720 }, { "epoch": 8.844129382799899, "grad_norm": 0.05681058019399643, "learning_rate": 1.5189991815295347e-08, "loss": 0.0008, "step": 34725 }, { "epoch": 8.8454028355548, "grad_norm": 0.06134103983640671, "learning_rate": 1.4946040651871997e-08, "loss": 0.0005, "step": 34730 }, { "epoch": 8.846676288309704, "grad_norm": 0.06320099532604218, "learning_rate": 1.4704062840419364e-08, "loss": 0.0006, "step": 34735 }, { "epoch": 8.847949741064607, "grad_norm": 0.055185358971357346, "learning_rate": 1.4464058428758087e-08, "loss": 0.0011, "step": 34740 }, { "epoch": 8.84922319381951, "grad_norm": 0.10714032500982285, "learning_rate": 1.4226027464322444e-08, "loss": 0.0009, "step": 34745 }, { "epoch": 8.850496646574411, "grad_norm": 0.008389062248170376, "learning_rate": 1.3989969994152586e-08, "loss": 0.0005, "step": 34750 }, { "epoch": 8.851770099329315, "grad_norm": 0.01980949565768242, "learning_rate": 1.3755886064903412e-08, "loss": 0.0006, "step": 34755 }, { "epoch": 8.853043552084218, "grad_norm": 0.0030872905626893044, "learning_rate": 1.3523775722834586e-08, "loss": 0.0004, "step": 34760 }, { "epoch": 8.854317004839121, "grad_norm": 0.03897649794816971, "learning_rate": 1.3293639013819414e-08, "loss": 0.0008, "step": 34765 }, { "epoch": 8.855590457594023, "grad_norm": 0.007685089949518442, "learning_rate": 1.3065475983339293e-08, "loss": 0.0006, "step": 34770 }, { "epoch": 8.856863910348926, "grad_norm": 0.03854946047067642, "learning_rate": 1.2839286676487039e-08, "loss": 0.0003, "step": 34775 }, { "epoch": 8.858137363103829, "grad_norm": 0.057247187942266464, "learning_rate": 1.2615071137964674e-08, "loss": 0.0004, "step": 34780 }, { "epoch": 8.859410815858732, "grad_norm": 0.03412071242928505, "learning_rate": 1.2392829412082309e-08, "loss": 0.0002, "step": 34785 }, { "epoch": 8.860684268613635, "grad_norm": 0.022228775545954704, "learning_rate": 1.2172561542763694e-08, "loss": 0.0009, "step": 34790 }, { "epoch": 8.861957721368537, "grad_norm": 0.0403500497341156, "learning_rate": 1.1954267573539569e-08, "loss": 0.0007, "step": 34795 }, { "epoch": 8.86323117412344, "grad_norm": 0.022046895697712898, "learning_rate": 1.1737947547550976e-08, "loss": 0.0006, "step": 34800 }, { "epoch": 8.864504626878343, "grad_norm": 0.00572963198646903, "learning_rate": 1.1523601507550385e-08, "loss": 0.0004, "step": 34805 }, { "epoch": 8.865778079633246, "grad_norm": 0.02141764387488365, "learning_rate": 1.1311229495899467e-08, "loss": 0.0002, "step": 34810 }, { "epoch": 8.867051532388148, "grad_norm": 0.013102331198751926, "learning_rate": 1.110083155456798e-08, "loss": 0.0004, "step": 34815 }, { "epoch": 8.868324985143051, "grad_norm": 0.008526108227670193, "learning_rate": 1.089240772513711e-08, "loss": 0.0006, "step": 34820 }, { "epoch": 8.869598437897954, "grad_norm": 0.04440516233444214, "learning_rate": 1.0685958048799461e-08, "loss": 0.0006, "step": 34825 }, { "epoch": 8.870871890652857, "grad_norm": 0.032123006880283356, "learning_rate": 1.0481482566354617e-08, "loss": 0.0004, "step": 34830 }, { "epoch": 8.872145343407759, "grad_norm": 0.0140609135851264, "learning_rate": 1.0278981318212477e-08, "loss": 0.0004, "step": 34835 }, { "epoch": 8.873418796162662, "grad_norm": 0.04731736332178116, "learning_rate": 1.0078454344395472e-08, "loss": 0.0008, "step": 34840 }, { "epoch": 8.874692248917565, "grad_norm": 0.03865736722946167, "learning_rate": 9.879901684531901e-09, "loss": 0.0007, "step": 34845 }, { "epoch": 8.875965701672468, "grad_norm": 0.2627612352371216, "learning_rate": 9.683323377862597e-09, "loss": 0.0008, "step": 34850 }, { "epoch": 8.877239154427372, "grad_norm": 0.050388745963573456, "learning_rate": 9.488719463237595e-09, "loss": 0.001, "step": 34855 }, { "epoch": 8.878512607182273, "grad_norm": 0.02326519601047039, "learning_rate": 9.29608997911724e-09, "loss": 0.0021, "step": 34860 }, { "epoch": 8.879786059937176, "grad_norm": 0.033340975642204285, "learning_rate": 9.10543496356886e-09, "loss": 0.0004, "step": 34865 }, { "epoch": 8.88105951269208, "grad_norm": 0.03164214640855789, "learning_rate": 8.916754454274534e-09, "loss": 0.0007, "step": 34870 }, { "epoch": 8.882332965446983, "grad_norm": 0.05021017789840698, "learning_rate": 8.730048488521104e-09, "loss": 0.001, "step": 34875 }, { "epoch": 8.883606418201884, "grad_norm": 0.028671180829405785, "learning_rate": 8.54531710320794e-09, "loss": 0.0006, "step": 34880 }, { "epoch": 8.884879870956787, "grad_norm": 0.054648980498313904, "learning_rate": 8.362560334843617e-09, "loss": 0.0006, "step": 34885 }, { "epoch": 8.88615332371169, "grad_norm": 0.13810838758945465, "learning_rate": 8.181778219547021e-09, "loss": 0.0006, "step": 34890 }, { "epoch": 8.887426776466594, "grad_norm": 0.0039834571070969105, "learning_rate": 8.002970793046238e-09, "loss": 0.0006, "step": 34895 }, { "epoch": 8.888700229221495, "grad_norm": 0.048476848751306534, "learning_rate": 7.826138090678559e-09, "loss": 0.0008, "step": 34900 }, { "epoch": 8.889973681976398, "grad_norm": 0.03943387791514397, "learning_rate": 7.651280147390471e-09, "loss": 0.0006, "step": 34905 }, { "epoch": 8.891247134731302, "grad_norm": 0.03703722730278969, "learning_rate": 7.47839699774211e-09, "loss": 0.0004, "step": 34910 }, { "epoch": 8.892520587486205, "grad_norm": 0.10317102074623108, "learning_rate": 7.3074886758983665e-09, "loss": 0.0004, "step": 34915 }, { "epoch": 8.893794040241108, "grad_norm": 0.05385264754295349, "learning_rate": 7.1385552156355565e-09, "loss": 0.0007, "step": 34920 }, { "epoch": 8.89506749299601, "grad_norm": 0.0911436676979065, "learning_rate": 6.971596650341417e-09, "loss": 0.0007, "step": 34925 }, { "epoch": 8.896340945750913, "grad_norm": 0.5233944058418274, "learning_rate": 6.806613013011776e-09, "loss": 0.0008, "step": 34930 }, { "epoch": 8.897614398505816, "grad_norm": 0.0043238066136837006, "learning_rate": 6.643604336251664e-09, "loss": 0.0002, "step": 34935 }, { "epoch": 8.898887851260719, "grad_norm": 0.03765815123915672, "learning_rate": 6.4825706522775315e-09, "loss": 0.0005, "step": 34940 }, { "epoch": 8.90016130401562, "grad_norm": 0.07183603197336197, "learning_rate": 6.323511992915033e-09, "loss": 0.0007, "step": 34945 }, { "epoch": 8.901434756770524, "grad_norm": 0.03153042867779732, "learning_rate": 6.166428389596801e-09, "loss": 0.0007, "step": 34950 }, { "epoch": 8.902708209525427, "grad_norm": 0.028278501704335213, "learning_rate": 6.011319873370225e-09, "loss": 0.0004, "step": 34955 }, { "epoch": 8.90398166228033, "grad_norm": 0.08362292498350143, "learning_rate": 5.85818647488634e-09, "loss": 0.0005, "step": 34960 }, { "epoch": 8.905255115035231, "grad_norm": 0.026301683858036995, "learning_rate": 5.707028224412048e-09, "loss": 0.0007, "step": 34965 }, { "epoch": 8.906528567790135, "grad_norm": 0.035924434661865234, "learning_rate": 5.557845151819008e-09, "loss": 0.0008, "step": 34970 }, { "epoch": 8.907802020545038, "grad_norm": 0.03696788474917412, "learning_rate": 5.4106372865903036e-09, "loss": 0.0009, "step": 34975 }, { "epoch": 8.909075473299941, "grad_norm": 0.03893477842211723, "learning_rate": 5.265404657819329e-09, "loss": 0.0004, "step": 34980 }, { "epoch": 8.910348926054844, "grad_norm": 0.16848455369472504, "learning_rate": 5.1221472942086795e-09, "loss": 0.0008, "step": 34985 }, { "epoch": 8.911622378809746, "grad_norm": 0.012654613703489304, "learning_rate": 4.980865224070153e-09, "loss": 0.0003, "step": 34990 }, { "epoch": 8.912895831564649, "grad_norm": 0.01454266905784607, "learning_rate": 4.841558475324748e-09, "loss": 0.0003, "step": 34995 }, { "epoch": 8.914169284319552, "grad_norm": 0.10686734318733215, "learning_rate": 4.704227075504886e-09, "loss": 0.001, "step": 35000 }, { "epoch": 8.915442737074455, "grad_norm": 0.010535025969147682, "learning_rate": 4.568871051751078e-09, "loss": 0.0002, "step": 35005 }, { "epoch": 8.916716189829357, "grad_norm": 0.04978976398706436, "learning_rate": 4.435490430813038e-09, "loss": 0.0005, "step": 35010 }, { "epoch": 8.91798964258426, "grad_norm": 0.018843388184905052, "learning_rate": 4.304085239051903e-09, "loss": 0.0004, "step": 35015 }, { "epoch": 8.919263095339163, "grad_norm": 0.006831255741417408, "learning_rate": 4.174655502435787e-09, "loss": 0.0004, "step": 35020 }, { "epoch": 8.920536548094066, "grad_norm": 0.04266691580414772, "learning_rate": 4.047201246546451e-09, "loss": 0.0003, "step": 35025 }, { "epoch": 8.921810000848968, "grad_norm": 0.007579974364489317, "learning_rate": 3.921722496570413e-09, "loss": 0.0001, "step": 35030 }, { "epoch": 8.923083453603871, "grad_norm": 0.012390379793941975, "learning_rate": 3.798219277307835e-09, "loss": 0.0001, "step": 35035 }, { "epoch": 8.924356906358774, "grad_norm": 0.03487355634570122, "learning_rate": 3.6766916131658613e-09, "loss": 0.0005, "step": 35040 }, { "epoch": 8.925630359113677, "grad_norm": 0.03388279303908348, "learning_rate": 3.5571395281619457e-09, "loss": 0.0004, "step": 35045 }, { "epoch": 8.92690381186858, "grad_norm": 0.039849478751420975, "learning_rate": 3.4395630459227447e-09, "loss": 0.0006, "step": 35050 }, { "epoch": 8.928177264623482, "grad_norm": 0.03400818631052971, "learning_rate": 3.3239621896863362e-09, "loss": 0.0005, "step": 35055 }, { "epoch": 8.929450717378385, "grad_norm": 0.03909711539745331, "learning_rate": 3.2103369822988893e-09, "loss": 0.0005, "step": 35060 }, { "epoch": 8.930724170133288, "grad_norm": 0.03491996228694916, "learning_rate": 3.0986874462146654e-09, "loss": 0.0007, "step": 35065 }, { "epoch": 8.931997622888192, "grad_norm": 0.050555210560560226, "learning_rate": 2.989013603500457e-09, "loss": 0.0008, "step": 35070 }, { "epoch": 8.933271075643093, "grad_norm": 0.00725166592746973, "learning_rate": 2.8813154758311477e-09, "loss": 0.0004, "step": 35075 }, { "epoch": 8.934544528397996, "grad_norm": 0.015884356573224068, "learning_rate": 2.775593084490824e-09, "loss": 0.0005, "step": 35080 }, { "epoch": 8.9358179811529, "grad_norm": 0.07721884548664093, "learning_rate": 2.6718464503727724e-09, "loss": 0.0007, "step": 35085 }, { "epoch": 8.937091433907803, "grad_norm": 0.07868599146604538, "learning_rate": 2.5700755939817024e-09, "loss": 0.0006, "step": 35090 }, { "epoch": 8.938364886662704, "grad_norm": 0.0418260432779789, "learning_rate": 2.4702805354293037e-09, "loss": 0.0009, "step": 35095 }, { "epoch": 8.939638339417607, "grad_norm": 0.02314821630716324, "learning_rate": 2.372461294438688e-09, "loss": 0.0009, "step": 35100 }, { "epoch": 8.94091179217251, "grad_norm": 0.05753370746970177, "learning_rate": 2.2766178903432802e-09, "loss": 0.0003, "step": 35105 }, { "epoch": 8.942185244927414, "grad_norm": 0.05685974657535553, "learning_rate": 2.182750342083484e-09, "loss": 0.0004, "step": 35110 }, { "epoch": 8.943458697682315, "grad_norm": 0.22568228840827942, "learning_rate": 2.0908586682100163e-09, "loss": 0.0007, "step": 35115 }, { "epoch": 8.944732150437218, "grad_norm": 0.026722783222794533, "learning_rate": 2.0009428868839053e-09, "loss": 0.0005, "step": 35120 }, { "epoch": 8.946005603192122, "grad_norm": 0.04664972424507141, "learning_rate": 1.9130030158742706e-09, "loss": 0.0003, "step": 35125 }, { "epoch": 8.947279055947025, "grad_norm": 0.06833118945360184, "learning_rate": 1.827039072562764e-09, "loss": 0.0007, "step": 35130 }, { "epoch": 8.948552508701928, "grad_norm": 0.06114918738603592, "learning_rate": 1.743051073938018e-09, "loss": 0.0007, "step": 35135 }, { "epoch": 8.94982596145683, "grad_norm": 0.006703065242618322, "learning_rate": 1.6772833663791787e-09, "loss": 0.0018, "step": 35140 }, { "epoch": 8.951099414211733, "grad_norm": 0.009039285592734814, "learning_rate": 1.5968521097542434e-09, "loss": 0.0004, "step": 35145 }, { "epoch": 8.952372866966636, "grad_norm": 0.05732543766498566, "learning_rate": 1.5183968433085672e-09, "loss": 0.0004, "step": 35150 }, { "epoch": 8.953646319721539, "grad_norm": 0.00963275134563446, "learning_rate": 1.4419175825453047e-09, "loss": 0.0004, "step": 35155 }, { "epoch": 8.95491977247644, "grad_norm": 0.007677728775888681, "learning_rate": 1.3674143425812525e-09, "loss": 0.0003, "step": 35160 }, { "epoch": 8.956193225231344, "grad_norm": 0.06257695704698563, "learning_rate": 1.294887138137968e-09, "loss": 0.0012, "step": 35165 }, { "epoch": 8.957466677986247, "grad_norm": 0.023568086326122284, "learning_rate": 1.224335983551761e-09, "loss": 0.0006, "step": 35170 }, { "epoch": 8.95874013074115, "grad_norm": 0.0386810265481472, "learning_rate": 1.155760892762592e-09, "loss": 0.0005, "step": 35175 }, { "epoch": 8.960013583496051, "grad_norm": 0.042098984122276306, "learning_rate": 1.0891618793262836e-09, "loss": 0.0007, "step": 35180 }, { "epoch": 8.961287036250955, "grad_norm": 0.012299562804400921, "learning_rate": 1.02453895640231e-09, "loss": 0.0002, "step": 35185 }, { "epoch": 8.962560489005858, "grad_norm": 0.057044681161642075, "learning_rate": 9.618921367637869e-10, "loss": 0.0006, "step": 35190 }, { "epoch": 8.963833941760761, "grad_norm": 0.03482894226908684, "learning_rate": 9.012214327897006e-10, "loss": 0.0008, "step": 35195 }, { "epoch": 8.965107394515664, "grad_norm": 0.003840198740363121, "learning_rate": 8.425268564726808e-10, "loss": 0.0004, "step": 35200 }, { "epoch": 8.966380847270566, "grad_norm": 0.07721539586782455, "learning_rate": 7.85808419411227e-10, "loss": 0.0005, "step": 35205 }, { "epoch": 8.967654300025469, "grad_norm": 0.04779836907982826, "learning_rate": 7.310661328152613e-10, "loss": 0.0009, "step": 35210 }, { "epoch": 8.968927752780372, "grad_norm": 0.03589698299765587, "learning_rate": 6.783000075027968e-10, "loss": 0.0007, "step": 35215 }, { "epoch": 8.970201205535275, "grad_norm": 0.02378290519118309, "learning_rate": 6.275100539032686e-10, "loss": 0.0009, "step": 35220 }, { "epoch": 8.971474658290177, "grad_norm": 0.05126126483082771, "learning_rate": 5.786962820530928e-10, "loss": 0.0006, "step": 35225 }, { "epoch": 8.97274811104508, "grad_norm": 0.021384915336966515, "learning_rate": 5.318587015989974e-10, "loss": 0.0004, "step": 35230 }, { "epoch": 8.974021563799983, "grad_norm": 0.0061913542449474335, "learning_rate": 4.869973217991319e-10, "loss": 0.0006, "step": 35235 }, { "epoch": 8.975295016554886, "grad_norm": 0.06676270067691803, "learning_rate": 4.441121515186275e-10, "loss": 0.0014, "step": 35240 }, { "epoch": 8.976568469309788, "grad_norm": 0.025339210405945778, "learning_rate": 4.032031992329266e-10, "loss": 0.0013, "step": 35245 }, { "epoch": 8.977841922064691, "grad_norm": 0.05754878371953964, "learning_rate": 3.642704730266733e-10, "loss": 0.0008, "step": 35250 }, { "epoch": 8.979115374819594, "grad_norm": 0.11134814471006393, "learning_rate": 3.2731398059482333e-10, "loss": 0.001, "step": 35255 }, { "epoch": 8.980388827574497, "grad_norm": 0.017257321625947952, "learning_rate": 2.923337292404238e-10, "loss": 0.0004, "step": 35260 }, { "epoch": 8.981662280329399, "grad_norm": 0.083762027323246, "learning_rate": 2.5932972587683346e-10, "loss": 0.001, "step": 35265 }, { "epoch": 8.982935733084302, "grad_norm": 0.03784501925110817, "learning_rate": 2.2830197702772283e-10, "loss": 0.0006, "step": 35270 }, { "epoch": 8.984209185839205, "grad_norm": 0.023514244705438614, "learning_rate": 1.992504888226332e-10, "loss": 0.0008, "step": 35275 }, { "epoch": 8.985482638594108, "grad_norm": 0.03209243714809418, "learning_rate": 1.7217526700474828e-10, "loss": 0.0003, "step": 35280 }, { "epoch": 8.986756091349012, "grad_norm": 0.02091865800321102, "learning_rate": 1.4707631692534308e-10, "loss": 0.0004, "step": 35285 }, { "epoch": 8.988029544103913, "grad_norm": 0.02709525264799595, "learning_rate": 1.2395364354378382e-10, "loss": 0.0008, "step": 35290 }, { "epoch": 8.989302996858816, "grad_norm": 0.004283325280994177, "learning_rate": 1.0280725143085869e-10, "loss": 0.0004, "step": 35295 }, { "epoch": 8.99057644961372, "grad_norm": 0.0466977059841156, "learning_rate": 8.363714476433694e-11, "loss": 0.0005, "step": 35300 }, { "epoch": 8.991849902368623, "grad_norm": 0.07799071818590164, "learning_rate": 6.644332733340974e-11, "loss": 0.001, "step": 35305 }, { "epoch": 8.993123355123524, "grad_norm": 0.011416396126151085, "learning_rate": 5.12258025364698e-11, "loss": 0.0005, "step": 35310 }, { "epoch": 8.994396807878427, "grad_norm": 0.02207428775727749, "learning_rate": 3.798457338111128e-11, "loss": 0.0005, "step": 35315 }, { "epoch": 8.99567026063333, "grad_norm": 0.027158409357070923, "learning_rate": 2.6719642483019613e-11, "loss": 0.0011, "step": 35320 }, { "epoch": 8.996943713388234, "grad_norm": 0.00947508029639721, "learning_rate": 1.7431012070412422e-11, "loss": 0.0005, "step": 35325 }, { "epoch": 8.998217166143135, "grad_norm": 0.007210324984043837, "learning_rate": 1.0118683976267918e-11, "loss": 0.0006, "step": 35330 }, { "epoch": 8.999235928347058, "step": 35334, "total_flos": 3.443448483955645e+19, "train_loss": 0.010876436484895235, "train_runtime": 116473.6715, "train_samples_per_second": 7.281, "train_steps_per_second": 0.303 } ], "logging_steps": 5, "max_steps": 35334, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 13000, "total_flos": 3.443448483955645e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }