{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999850411368736, "eval_steps": 500, "global_step": 3342, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002991772625280479, "grad_norm": 24.73757525746042, "learning_rate": 2.985074626865672e-08, "loss": 1.4467, "step": 1 }, { "epoch": 0.0014958863126402393, "grad_norm": 25.170481305263397, "learning_rate": 1.4925373134328358e-07, "loss": 1.418, "step": 5 }, { "epoch": 0.0029917726252804786, "grad_norm": 17.671227057151118, "learning_rate": 2.9850746268656716e-07, "loss": 1.4125, "step": 10 }, { "epoch": 0.004487658937920718, "grad_norm": 8.909500017199079, "learning_rate": 4.4776119402985074e-07, "loss": 1.3226, "step": 15 }, { "epoch": 0.005983545250560957, "grad_norm": 11.280714871339804, "learning_rate": 5.970149253731343e-07, "loss": 1.1982, "step": 20 }, { "epoch": 0.0074794315632011965, "grad_norm": 8.398369640505729, "learning_rate": 7.462686567164179e-07, "loss": 1.0909, "step": 25 }, { "epoch": 0.008975317875841436, "grad_norm": 2.9571179549426505, "learning_rate": 8.955223880597015e-07, "loss": 0.9951, "step": 30 }, { "epoch": 0.010471204188481676, "grad_norm": 2.8894837770977224, "learning_rate": 1.044776119402985e-06, "loss": 0.9586, "step": 35 }, { "epoch": 0.011967090501121914, "grad_norm": 2.324424480950711, "learning_rate": 1.1940298507462686e-06, "loss": 0.9305, "step": 40 }, { "epoch": 0.013462976813762155, "grad_norm": 2.2619500315834866, "learning_rate": 1.3432835820895524e-06, "loss": 0.91, "step": 45 }, { "epoch": 0.014958863126402393, "grad_norm": 2.3542199193134663, "learning_rate": 1.4925373134328358e-06, "loss": 0.8925, "step": 50 }, { "epoch": 0.016454749439042633, "grad_norm": 2.1942393280485444, "learning_rate": 1.6417910447761196e-06, "loss": 0.8768, "step": 55 }, { "epoch": 0.01795063575168287, "grad_norm": 2.1616387387297245, "learning_rate": 1.791044776119403e-06, "loss": 0.8723, "step": 60 }, { "epoch": 0.01944652206432311, "grad_norm": 2.2022102428237957, "learning_rate": 1.9402985074626867e-06, "loss": 0.8639, "step": 65 }, { "epoch": 0.020942408376963352, "grad_norm": 2.1621065249124585, "learning_rate": 2.08955223880597e-06, "loss": 0.8699, "step": 70 }, { "epoch": 0.02243829468960359, "grad_norm": 2.3233760195216147, "learning_rate": 2.238805970149254e-06, "loss": 0.8603, "step": 75 }, { "epoch": 0.02393418100224383, "grad_norm": 2.266487924942459, "learning_rate": 2.3880597014925373e-06, "loss": 0.8537, "step": 80 }, { "epoch": 0.025430067314884067, "grad_norm": 2.2873594748046076, "learning_rate": 2.537313432835821e-06, "loss": 0.8487, "step": 85 }, { "epoch": 0.02692595362752431, "grad_norm": 2.283742446960398, "learning_rate": 2.686567164179105e-06, "loss": 0.8413, "step": 90 }, { "epoch": 0.028421839940164548, "grad_norm": 2.1774038405712486, "learning_rate": 2.835820895522388e-06, "loss": 0.8281, "step": 95 }, { "epoch": 0.029917726252804786, "grad_norm": 2.342263870606748, "learning_rate": 2.9850746268656716e-06, "loss": 0.8305, "step": 100 }, { "epoch": 0.031413612565445025, "grad_norm": 2.385361324640983, "learning_rate": 3.1343283582089558e-06, "loss": 0.8168, "step": 105 }, { "epoch": 0.032909498878085267, "grad_norm": 2.500114420278517, "learning_rate": 3.283582089552239e-06, "loss": 0.8207, "step": 110 }, { "epoch": 0.0344053851907255, "grad_norm": 2.4374602854285286, "learning_rate": 3.4328358208955225e-06, "loss": 0.8244, "step": 115 }, { "epoch": 0.03590127150336574, "grad_norm": 2.44198563300893, "learning_rate": 3.582089552238806e-06, "loss": 0.8199, "step": 120 }, { "epoch": 0.037397157816005985, "grad_norm": 2.2760332882107157, "learning_rate": 3.73134328358209e-06, "loss": 0.8216, "step": 125 }, { "epoch": 0.03889304412864622, "grad_norm": 2.508508841712968, "learning_rate": 3.8805970149253735e-06, "loss": 0.8014, "step": 130 }, { "epoch": 0.04038893044128646, "grad_norm": 2.3029396530303066, "learning_rate": 4.029850746268657e-06, "loss": 0.7989, "step": 135 }, { "epoch": 0.041884816753926704, "grad_norm": 2.3322348191256594, "learning_rate": 4.17910447761194e-06, "loss": 0.7964, "step": 140 }, { "epoch": 0.04338070306656694, "grad_norm": 2.331622885776369, "learning_rate": 4.3283582089552236e-06, "loss": 0.8013, "step": 145 }, { "epoch": 0.04487658937920718, "grad_norm": 2.288860960559162, "learning_rate": 4.477611940298508e-06, "loss": 0.8045, "step": 150 }, { "epoch": 0.04637247569184742, "grad_norm": 2.4508630826235778, "learning_rate": 4.626865671641791e-06, "loss": 0.7898, "step": 155 }, { "epoch": 0.04786836200448766, "grad_norm": 2.3184684975983045, "learning_rate": 4.7761194029850745e-06, "loss": 0.7937, "step": 160 }, { "epoch": 0.0493642483171279, "grad_norm": 2.1613921165346826, "learning_rate": 4.925373134328359e-06, "loss": 0.7911, "step": 165 }, { "epoch": 0.050860134629768135, "grad_norm": 2.453275213296358, "learning_rate": 5.074626865671642e-06, "loss": 0.7857, "step": 170 }, { "epoch": 0.05235602094240838, "grad_norm": 2.3284898790399353, "learning_rate": 5.2238805970149255e-06, "loss": 0.7793, "step": 175 }, { "epoch": 0.05385190725504862, "grad_norm": 2.3201172049873686, "learning_rate": 5.37313432835821e-06, "loss": 0.7779, "step": 180 }, { "epoch": 0.055347793567688854, "grad_norm": 2.357711081062365, "learning_rate": 5.522388059701493e-06, "loss": 0.7847, "step": 185 }, { "epoch": 0.056843679880329095, "grad_norm": 2.7611374374886037, "learning_rate": 5.671641791044776e-06, "loss": 0.7706, "step": 190 }, { "epoch": 0.05833956619296934, "grad_norm": 2.6271055846372513, "learning_rate": 5.820895522388061e-06, "loss": 0.7607, "step": 195 }, { "epoch": 0.05983545250560957, "grad_norm": 2.376880891581398, "learning_rate": 5.970149253731343e-06, "loss": 0.7671, "step": 200 }, { "epoch": 0.061331338818249814, "grad_norm": 2.3835041311189022, "learning_rate": 6.119402985074627e-06, "loss": 0.7586, "step": 205 }, { "epoch": 0.06282722513089005, "grad_norm": 2.462749716678564, "learning_rate": 6.2686567164179116e-06, "loss": 0.7758, "step": 210 }, { "epoch": 0.0643231114435303, "grad_norm": 2.4077847282275866, "learning_rate": 6.417910447761194e-06, "loss": 0.7638, "step": 215 }, { "epoch": 0.06581899775617053, "grad_norm": 2.3747888389216616, "learning_rate": 6.567164179104478e-06, "loss": 0.761, "step": 220 }, { "epoch": 0.06731488406881077, "grad_norm": 2.493092964041521, "learning_rate": 6.7164179104477625e-06, "loss": 0.7574, "step": 225 }, { "epoch": 0.068810770381451, "grad_norm": 2.6477317119354984, "learning_rate": 6.865671641791045e-06, "loss": 0.7498, "step": 230 }, { "epoch": 0.07030665669409125, "grad_norm": 2.563921275934862, "learning_rate": 7.014925373134329e-06, "loss": 0.761, "step": 235 }, { "epoch": 0.07180254300673149, "grad_norm": 2.4406043810417257, "learning_rate": 7.164179104477612e-06, "loss": 0.7423, "step": 240 }, { "epoch": 0.07329842931937172, "grad_norm": 2.2639780494151034, "learning_rate": 7.313432835820896e-06, "loss": 0.7478, "step": 245 }, { "epoch": 0.07479431563201197, "grad_norm": 2.6132105098393628, "learning_rate": 7.46268656716418e-06, "loss": 0.7522, "step": 250 }, { "epoch": 0.0762902019446522, "grad_norm": 2.5688006092201365, "learning_rate": 7.611940298507463e-06, "loss": 0.7409, "step": 255 }, { "epoch": 0.07778608825729244, "grad_norm": 2.4383140799178564, "learning_rate": 7.761194029850747e-06, "loss": 0.7449, "step": 260 }, { "epoch": 0.07928197456993269, "grad_norm": 2.3404023048993365, "learning_rate": 7.91044776119403e-06, "loss": 0.7306, "step": 265 }, { "epoch": 0.08077786088257292, "grad_norm": 2.435305377918958, "learning_rate": 8.059701492537314e-06, "loss": 0.7464, "step": 270 }, { "epoch": 0.08227374719521316, "grad_norm": 2.9299430373461433, "learning_rate": 8.208955223880599e-06, "loss": 0.7279, "step": 275 }, { "epoch": 0.08376963350785341, "grad_norm": 2.387726009024013, "learning_rate": 8.35820895522388e-06, "loss": 0.7388, "step": 280 }, { "epoch": 0.08526551982049364, "grad_norm": 2.268041368580347, "learning_rate": 8.507462686567165e-06, "loss": 0.7407, "step": 285 }, { "epoch": 0.08676140613313388, "grad_norm": 2.408844552763582, "learning_rate": 8.656716417910447e-06, "loss": 0.7222, "step": 290 }, { "epoch": 0.08825729244577413, "grad_norm": 2.401282808607445, "learning_rate": 8.805970149253732e-06, "loss": 0.7265, "step": 295 }, { "epoch": 0.08975317875841436, "grad_norm": 2.4691438286047425, "learning_rate": 8.955223880597016e-06, "loss": 0.7239, "step": 300 }, { "epoch": 0.0912490650710546, "grad_norm": 2.3268484057723673, "learning_rate": 9.104477611940299e-06, "loss": 0.7227, "step": 305 }, { "epoch": 0.09274495138369485, "grad_norm": 2.402308618181149, "learning_rate": 9.253731343283582e-06, "loss": 0.7244, "step": 310 }, { "epoch": 0.09424083769633508, "grad_norm": 2.3361311735184604, "learning_rate": 9.402985074626867e-06, "loss": 0.7263, "step": 315 }, { "epoch": 0.09573672400897532, "grad_norm": 2.335173501165771, "learning_rate": 9.552238805970149e-06, "loss": 0.7215, "step": 320 }, { "epoch": 0.09723261032161555, "grad_norm": 2.730322757042367, "learning_rate": 9.701492537313434e-06, "loss": 0.7332, "step": 325 }, { "epoch": 0.0987284966342558, "grad_norm": 2.2835032077275312, "learning_rate": 9.850746268656717e-06, "loss": 0.7115, "step": 330 }, { "epoch": 0.10022438294689603, "grad_norm": 2.4782326787594338, "learning_rate": 1e-05, "loss": 0.7142, "step": 335 }, { "epoch": 0.10172026925953627, "grad_norm": 2.4128411535499454, "learning_rate": 9.999931779967976e-06, "loss": 0.7108, "step": 340 }, { "epoch": 0.10321615557217652, "grad_norm": 2.230247650641441, "learning_rate": 9.99972712173349e-06, "loss": 0.7037, "step": 345 }, { "epoch": 0.10471204188481675, "grad_norm": 2.4375274487235363, "learning_rate": 9.999386030881264e-06, "loss": 0.7052, "step": 350 }, { "epoch": 0.10620792819745699, "grad_norm": 2.497148482639146, "learning_rate": 9.998908516718984e-06, "loss": 0.723, "step": 355 }, { "epoch": 0.10770381451009724, "grad_norm": 2.7572554578379793, "learning_rate": 9.998294592277064e-06, "loss": 0.7089, "step": 360 }, { "epoch": 0.10919970082273747, "grad_norm": 2.4811541468806175, "learning_rate": 9.997544274308282e-06, "loss": 0.7049, "step": 365 }, { "epoch": 0.11069558713537771, "grad_norm": 2.3612905719078556, "learning_rate": 9.996657583287326e-06, "loss": 0.7112, "step": 370 }, { "epoch": 0.11219147344801796, "grad_norm": 2.3269094749093346, "learning_rate": 9.995634543410231e-06, "loss": 0.7091, "step": 375 }, { "epoch": 0.11368735976065819, "grad_norm": 2.1856434021067543, "learning_rate": 9.99447518259372e-06, "loss": 0.6945, "step": 380 }, { "epoch": 0.11518324607329843, "grad_norm": 2.2818233887085677, "learning_rate": 9.99317953247445e-06, "loss": 0.695, "step": 385 }, { "epoch": 0.11667913238593867, "grad_norm": 2.5644750677779267, "learning_rate": 9.991747628408138e-06, "loss": 0.6878, "step": 390 }, { "epoch": 0.11817501869857891, "grad_norm": 2.485293875776813, "learning_rate": 9.990179509468595e-06, "loss": 0.6987, "step": 395 }, { "epoch": 0.11967090501121914, "grad_norm": 2.291043542367887, "learning_rate": 9.988475218446676e-06, "loss": 0.6898, "step": 400 }, { "epoch": 0.1211667913238594, "grad_norm": 2.4232767050288726, "learning_rate": 9.986634801849093e-06, "loss": 0.6963, "step": 405 }, { "epoch": 0.12266267763649963, "grad_norm": 2.4211336275289512, "learning_rate": 9.984658309897161e-06, "loss": 0.6793, "step": 410 }, { "epoch": 0.12415856394913986, "grad_norm": 2.5029979080579734, "learning_rate": 9.982545796525416e-06, "loss": 0.6773, "step": 415 }, { "epoch": 0.1256544502617801, "grad_norm": 2.4668720551969487, "learning_rate": 9.980297319380148e-06, "loss": 0.6741, "step": 420 }, { "epoch": 0.12715033657442035, "grad_norm": 2.1324057406570796, "learning_rate": 9.977912939817833e-06, "loss": 0.6717, "step": 425 }, { "epoch": 0.1286462228870606, "grad_norm": 2.216513918034811, "learning_rate": 9.97539272290345e-06, "loss": 0.664, "step": 430 }, { "epoch": 0.13014210919970082, "grad_norm": 2.3891946944700346, "learning_rate": 9.97273673740871e-06, "loss": 0.6779, "step": 435 }, { "epoch": 0.13163799551234107, "grad_norm": 2.107297355911597, "learning_rate": 9.96994505581018e-06, "loss": 0.663, "step": 440 }, { "epoch": 0.13313388182498131, "grad_norm": 2.2741240440107666, "learning_rate": 9.967017754287303e-06, "loss": 0.6628, "step": 445 }, { "epoch": 0.13462976813762154, "grad_norm": 2.231118541487464, "learning_rate": 9.963954912720319e-06, "loss": 0.6805, "step": 450 }, { "epoch": 0.13612565445026178, "grad_norm": 2.327411278722037, "learning_rate": 9.960756614688089e-06, "loss": 0.6572, "step": 455 }, { "epoch": 0.137621540762902, "grad_norm": 2.2727646648145097, "learning_rate": 9.957422947465814e-06, "loss": 0.6682, "step": 460 }, { "epoch": 0.13911742707554225, "grad_norm": 2.43427967377174, "learning_rate": 9.953954002022643e-06, "loss": 0.658, "step": 465 }, { "epoch": 0.1406133133881825, "grad_norm": 2.203173002529278, "learning_rate": 9.950349873019204e-06, "loss": 0.6513, "step": 470 }, { "epoch": 0.14210919970082272, "grad_norm": 2.159064147239943, "learning_rate": 9.946610658805018e-06, "loss": 0.6597, "step": 475 }, { "epoch": 0.14360508601346297, "grad_norm": 2.2802374368293186, "learning_rate": 9.94273646141581e-06, "loss": 0.6642, "step": 480 }, { "epoch": 0.14510097232610322, "grad_norm": 2.321550706239028, "learning_rate": 9.938727386570727e-06, "loss": 0.6525, "step": 485 }, { "epoch": 0.14659685863874344, "grad_norm": 2.3398188402263105, "learning_rate": 9.934583543669454e-06, "loss": 0.6583, "step": 490 }, { "epoch": 0.1480927449513837, "grad_norm": 2.1439110014914524, "learning_rate": 9.93030504578923e-06, "loss": 0.6413, "step": 495 }, { "epoch": 0.14958863126402394, "grad_norm": 2.2275265346511377, "learning_rate": 9.925892009681762e-06, "loss": 0.6529, "step": 500 }, { "epoch": 0.15108451757666416, "grad_norm": 2.3496939081419637, "learning_rate": 9.921344555770033e-06, "loss": 0.6437, "step": 505 }, { "epoch": 0.1525804038893044, "grad_norm": 2.238484219281493, "learning_rate": 9.916662808145023e-06, "loss": 0.6452, "step": 510 }, { "epoch": 0.15407629020194466, "grad_norm": 2.78908558811821, "learning_rate": 9.911846894562325e-06, "loss": 0.6436, "step": 515 }, { "epoch": 0.15557217651458488, "grad_norm": 2.320928708686177, "learning_rate": 9.906896946438646e-06, "loss": 0.6336, "step": 520 }, { "epoch": 0.15706806282722513, "grad_norm": 2.2586199846671686, "learning_rate": 9.901813098848238e-06, "loss": 0.6338, "step": 525 }, { "epoch": 0.15856394913986538, "grad_norm": 2.3116521162760217, "learning_rate": 9.896595490519196e-06, "loss": 0.6414, "step": 530 }, { "epoch": 0.1600598354525056, "grad_norm": 2.1311643830360767, "learning_rate": 9.891244263829685e-06, "loss": 0.64, "step": 535 }, { "epoch": 0.16155572176514585, "grad_norm": 2.3201652793369605, "learning_rate": 9.885759564804045e-06, "loss": 0.6197, "step": 540 }, { "epoch": 0.1630516080777861, "grad_norm": 2.1802123067545134, "learning_rate": 9.880141543108816e-06, "loss": 0.6354, "step": 545 }, { "epoch": 0.16454749439042632, "grad_norm": 2.3111352831943086, "learning_rate": 9.874390352048646e-06, "loss": 0.6422, "step": 550 }, { "epoch": 0.16604338070306657, "grad_norm": 2.3857931202103524, "learning_rate": 9.868506148562107e-06, "loss": 0.6255, "step": 555 }, { "epoch": 0.16753926701570682, "grad_norm": 2.3118891681947518, "learning_rate": 9.862489093217422e-06, "loss": 0.6123, "step": 560 }, { "epoch": 0.16903515332834704, "grad_norm": 2.3891897641974165, "learning_rate": 9.856339350208073e-06, "loss": 0.6426, "step": 565 }, { "epoch": 0.1705310396409873, "grad_norm": 2.305906878734901, "learning_rate": 9.850057087348328e-06, "loss": 0.6199, "step": 570 }, { "epoch": 0.17202692595362754, "grad_norm": 2.1960382748129432, "learning_rate": 9.843642476068654e-06, "loss": 0.6095, "step": 575 }, { "epoch": 0.17352281226626776, "grad_norm": 2.0839495395902534, "learning_rate": 9.837095691411047e-06, "loss": 0.6131, "step": 580 }, { "epoch": 0.175018698578908, "grad_norm": 2.4685394970589685, "learning_rate": 9.83041691202425e-06, "loss": 0.6257, "step": 585 }, { "epoch": 0.17651458489154825, "grad_norm": 2.8548483464223957, "learning_rate": 9.82360632015888e-06, "loss": 0.5935, "step": 590 }, { "epoch": 0.17801047120418848, "grad_norm": 2.5687866778693347, "learning_rate": 9.816664101662458e-06, "loss": 0.6176, "step": 595 }, { "epoch": 0.17950635751682872, "grad_norm": 2.1643123544103497, "learning_rate": 9.809590445974328e-06, "loss": 0.6236, "step": 600 }, { "epoch": 0.18100224382946897, "grad_norm": 2.1920911452788023, "learning_rate": 9.802385546120498e-06, "loss": 0.6149, "step": 605 }, { "epoch": 0.1824981301421092, "grad_norm": 2.1719167623114046, "learning_rate": 9.795049598708369e-06, "loss": 0.6165, "step": 610 }, { "epoch": 0.18399401645474944, "grad_norm": 2.045624267196742, "learning_rate": 9.787582803921366e-06, "loss": 0.6056, "step": 615 }, { "epoch": 0.1854899027673897, "grad_norm": 2.1670193890658105, "learning_rate": 9.77998536551348e-06, "loss": 0.583, "step": 620 }, { "epoch": 0.1869857890800299, "grad_norm": 2.143005021612413, "learning_rate": 9.77225749080371e-06, "loss": 0.6025, "step": 625 }, { "epoch": 0.18848167539267016, "grad_norm": 2.2897606994593733, "learning_rate": 9.764399390670401e-06, "loss": 0.6044, "step": 630 }, { "epoch": 0.18997756170531038, "grad_norm": 2.1407407791372304, "learning_rate": 9.756411279545486e-06, "loss": 0.6028, "step": 635 }, { "epoch": 0.19147344801795063, "grad_norm": 2.1400040414477512, "learning_rate": 9.748293375408647e-06, "loss": 0.6008, "step": 640 }, { "epoch": 0.19296933433059088, "grad_norm": 2.3487555741055646, "learning_rate": 9.740045899781353e-06, "loss": 0.5905, "step": 645 }, { "epoch": 0.1944652206432311, "grad_norm": 2.211663714643132, "learning_rate": 9.731669077720828e-06, "loss": 0.5834, "step": 650 }, { "epoch": 0.19596110695587135, "grad_norm": 2.188161715718423, "learning_rate": 9.723163137813898e-06, "loss": 0.5855, "step": 655 }, { "epoch": 0.1974569932685116, "grad_norm": 2.133955120338045, "learning_rate": 9.714528312170762e-06, "loss": 0.5944, "step": 660 }, { "epoch": 0.19895287958115182, "grad_norm": 2.2340780975578527, "learning_rate": 9.705764836418648e-06, "loss": 0.583, "step": 665 }, { "epoch": 0.20044876589379207, "grad_norm": 2.3292781920189936, "learning_rate": 9.696872949695399e-06, "loss": 0.5827, "step": 670 }, { "epoch": 0.20194465220643232, "grad_norm": 2.3176955302107647, "learning_rate": 9.687852894642932e-06, "loss": 0.584, "step": 675 }, { "epoch": 0.20344053851907254, "grad_norm": 2.2410986216187863, "learning_rate": 9.678704917400628e-06, "loss": 0.5702, "step": 680 }, { "epoch": 0.2049364248317128, "grad_norm": 2.2113552696479766, "learning_rate": 9.669429267598603e-06, "loss": 0.5656, "step": 685 }, { "epoch": 0.20643231114435304, "grad_norm": 2.1894234586204613, "learning_rate": 9.660026198350906e-06, "loss": 0.5688, "step": 690 }, { "epoch": 0.20792819745699326, "grad_norm": 2.2894157314528183, "learning_rate": 9.650495966248618e-06, "loss": 0.5563, "step": 695 }, { "epoch": 0.2094240837696335, "grad_norm": 2.2231586059805863, "learning_rate": 9.64083883135283e-06, "loss": 0.5642, "step": 700 }, { "epoch": 0.21091997008227376, "grad_norm": 2.227615707267463, "learning_rate": 9.631055057187564e-06, "loss": 0.5788, "step": 705 }, { "epoch": 0.21241585639491398, "grad_norm": 2.155741018622304, "learning_rate": 9.621144910732573e-06, "loss": 0.5634, "step": 710 }, { "epoch": 0.21391174270755423, "grad_norm": 2.396343334926677, "learning_rate": 9.611108662416064e-06, "loss": 0.5655, "step": 715 }, { "epoch": 0.21540762902019447, "grad_norm": 2.331449791458783, "learning_rate": 9.600946586107306e-06, "loss": 0.5739, "step": 720 }, { "epoch": 0.2169035153328347, "grad_norm": 2.2507152546219924, "learning_rate": 9.590658959109168e-06, "loss": 0.5768, "step": 725 }, { "epoch": 0.21839940164547494, "grad_norm": 2.164980578292193, "learning_rate": 9.58024606215055e-06, "loss": 0.5517, "step": 730 }, { "epoch": 0.2198952879581152, "grad_norm": 2.2186056393230484, "learning_rate": 9.569708179378716e-06, "loss": 0.5773, "step": 735 }, { "epoch": 0.22139117427075541, "grad_norm": 2.1412265933937245, "learning_rate": 9.559045598351544e-06, "loss": 0.5597, "step": 740 }, { "epoch": 0.22288706058339566, "grad_norm": 2.113998854082962, "learning_rate": 9.548258610029684e-06, "loss": 0.5602, "step": 745 }, { "epoch": 0.2243829468960359, "grad_norm": 2.1066935794719823, "learning_rate": 9.537347508768613e-06, "loss": 0.553, "step": 750 }, { "epoch": 0.22587883320867613, "grad_norm": 2.1269652854319285, "learning_rate": 9.526312592310597e-06, "loss": 0.5462, "step": 755 }, { "epoch": 0.22737471952131638, "grad_norm": 2.1421869014604966, "learning_rate": 9.515154161776584e-06, "loss": 0.5508, "step": 760 }, { "epoch": 0.22887060583395663, "grad_norm": 2.116284198421969, "learning_rate": 9.503872521657964e-06, "loss": 0.549, "step": 765 }, { "epoch": 0.23036649214659685, "grad_norm": 2.0774732327342673, "learning_rate": 9.49246797980828e-06, "loss": 0.5485, "step": 770 }, { "epoch": 0.2318623784592371, "grad_norm": 2.276120847003367, "learning_rate": 9.480940847434814e-06, "loss": 0.5553, "step": 775 }, { "epoch": 0.23335826477187735, "grad_norm": 2.1356056201671882, "learning_rate": 9.469291439090104e-06, "loss": 0.5465, "step": 780 }, { "epoch": 0.23485415108451757, "grad_norm": 2.048373811826588, "learning_rate": 9.457520072663353e-06, "loss": 0.5396, "step": 785 }, { "epoch": 0.23635003739715782, "grad_norm": 2.2466734007706397, "learning_rate": 9.445627069371758e-06, "loss": 0.5688, "step": 790 }, { "epoch": 0.23784592370979807, "grad_norm": 2.3976619549715292, "learning_rate": 9.433612753751748e-06, "loss": 0.5496, "step": 795 }, { "epoch": 0.2393418100224383, "grad_norm": 2.0982203268057793, "learning_rate": 9.421477453650118e-06, "loss": 0.5482, "step": 800 }, { "epoch": 0.24083769633507854, "grad_norm": 2.1926594347223936, "learning_rate": 9.409221500215096e-06, "loss": 0.5281, "step": 805 }, { "epoch": 0.2423335826477188, "grad_norm": 2.046500172753204, "learning_rate": 9.396845227887295e-06, "loss": 0.5495, "step": 810 }, { "epoch": 0.243829468960359, "grad_norm": 2.116270403530158, "learning_rate": 9.38434897439059e-06, "loss": 0.5333, "step": 815 }, { "epoch": 0.24532535527299926, "grad_norm": 2.1427393113292026, "learning_rate": 9.371733080722911e-06, "loss": 0.5314, "step": 820 }, { "epoch": 0.24682124158563948, "grad_norm": 2.2287931226941766, "learning_rate": 9.358997891146924e-06, "loss": 0.5389, "step": 825 }, { "epoch": 0.24831712789827973, "grad_norm": 2.183511996335904, "learning_rate": 9.346143753180646e-06, "loss": 0.5332, "step": 830 }, { "epoch": 0.24981301421091998, "grad_norm": 2.1563125330336077, "learning_rate": 9.333171017587956e-06, "loss": 0.5278, "step": 835 }, { "epoch": 0.2513089005235602, "grad_norm": 2.384672087516804, "learning_rate": 9.320080038369032e-06, "loss": 0.5321, "step": 840 }, { "epoch": 0.25280478683620045, "grad_norm": 2.2250998536771154, "learning_rate": 9.30687117275068e-06, "loss": 0.5237, "step": 845 }, { "epoch": 0.2543006731488407, "grad_norm": 2.3295538202244237, "learning_rate": 9.293544781176598e-06, "loss": 0.5238, "step": 850 }, { "epoch": 0.25579655946148094, "grad_norm": 2.46386287871832, "learning_rate": 9.280101227297526e-06, "loss": 0.5274, "step": 855 }, { "epoch": 0.2572924457741212, "grad_norm": 2.2480305463427865, "learning_rate": 9.266540877961337e-06, "loss": 0.535, "step": 860 }, { "epoch": 0.2587883320867614, "grad_norm": 2.1850110027540826, "learning_rate": 9.252864103203015e-06, "loss": 0.5216, "step": 865 }, { "epoch": 0.26028421839940163, "grad_norm": 2.1759114077528845, "learning_rate": 9.239071276234568e-06, "loss": 0.5162, "step": 870 }, { "epoch": 0.2617801047120419, "grad_norm": 2.1338769320741515, "learning_rate": 9.225162773434831e-06, "loss": 0.5143, "step": 875 }, { "epoch": 0.26327599102468213, "grad_norm": 2.1659203361390063, "learning_rate": 9.21113897433921e-06, "loss": 0.5103, "step": 880 }, { "epoch": 0.2647718773373224, "grad_norm": 2.122282430960376, "learning_rate": 9.197000261629314e-06, "loss": 0.5081, "step": 885 }, { "epoch": 0.26626776364996263, "grad_norm": 2.056748593014802, "learning_rate": 9.182747021122516e-06, "loss": 0.5117, "step": 890 }, { "epoch": 0.2677636499626028, "grad_norm": 2.203097118962648, "learning_rate": 9.168379641761425e-06, "loss": 0.5166, "step": 895 }, { "epoch": 0.26925953627524307, "grad_norm": 2.248299702751712, "learning_rate": 9.153898515603272e-06, "loss": 0.5121, "step": 900 }, { "epoch": 0.2707554225878833, "grad_norm": 2.215308947297488, "learning_rate": 9.139304037809216e-06, "loss": 0.5151, "step": 905 }, { "epoch": 0.27225130890052357, "grad_norm": 2.115586467592, "learning_rate": 9.124596606633551e-06, "loss": 0.5083, "step": 910 }, { "epoch": 0.2737471952131638, "grad_norm": 2.2977950459018017, "learning_rate": 9.10977662341285e-06, "loss": 0.5153, "step": 915 }, { "epoch": 0.275243081525804, "grad_norm": 2.205780583800523, "learning_rate": 9.094844492555004e-06, "loss": 0.5123, "step": 920 }, { "epoch": 0.27673896783844426, "grad_norm": 2.227802917043228, "learning_rate": 9.07980062152819e-06, "loss": 0.5117, "step": 925 }, { "epoch": 0.2782348541510845, "grad_norm": 2.2359783620231632, "learning_rate": 9.064645420849754e-06, "loss": 0.5022, "step": 930 }, { "epoch": 0.27973074046372476, "grad_norm": 2.1642613110172366, "learning_rate": 9.049379304075009e-06, "loss": 0.4907, "step": 935 }, { "epoch": 0.281226626776365, "grad_norm": 2.2277389804733447, "learning_rate": 9.03400268778594e-06, "loss": 0.5011, "step": 940 }, { "epoch": 0.28272251308900526, "grad_norm": 2.1493583853918907, "learning_rate": 9.018515991579851e-06, "loss": 0.5019, "step": 945 }, { "epoch": 0.28421839940164545, "grad_norm": 2.4395894627674073, "learning_rate": 9.002919638057908e-06, "loss": 0.5033, "step": 950 }, { "epoch": 0.2857142857142857, "grad_norm": 2.2370400153506806, "learning_rate": 8.987214052813605e-06, "loss": 0.5045, "step": 955 }, { "epoch": 0.28721017202692595, "grad_norm": 2.078576437577485, "learning_rate": 8.971399664421154e-06, "loss": 0.5009, "step": 960 }, { "epoch": 0.2887060583395662, "grad_norm": 2.2142839400817937, "learning_rate": 8.955476904423785e-06, "loss": 0.5023, "step": 965 }, { "epoch": 0.29020194465220644, "grad_norm": 2.14232609513754, "learning_rate": 8.939446207321982e-06, "loss": 0.477, "step": 970 }, { "epoch": 0.2916978309648467, "grad_norm": 2.21107323554905, "learning_rate": 8.923308010561608e-06, "loss": 0.4994, "step": 975 }, { "epoch": 0.2931937172774869, "grad_norm": 2.1386395431438054, "learning_rate": 8.907062754521985e-06, "loss": 0.5023, "step": 980 }, { "epoch": 0.29468960359012714, "grad_norm": 2.1332355719651037, "learning_rate": 8.89071088250387e-06, "loss": 0.4843, "step": 985 }, { "epoch": 0.2961854899027674, "grad_norm": 2.0749503641930276, "learning_rate": 8.87425284071735e-06, "loss": 0.4942, "step": 990 }, { "epoch": 0.29768137621540763, "grad_norm": 2.159991846647922, "learning_rate": 8.857689078269688e-06, "loss": 0.5108, "step": 995 }, { "epoch": 0.2991772625280479, "grad_norm": 2.1267522505598446, "learning_rate": 8.841020047153039e-06, "loss": 0.4935, "step": 1000 }, { "epoch": 0.30067314884068813, "grad_norm": 2.1642503588715245, "learning_rate": 8.824246202232142e-06, "loss": 0.4907, "step": 1005 }, { "epoch": 0.3021690351533283, "grad_norm": 2.084991570149356, "learning_rate": 8.80736800123189e-06, "loss": 0.4781, "step": 1010 }, { "epoch": 0.3036649214659686, "grad_norm": 2.1035440822771223, "learning_rate": 8.790385904724848e-06, "loss": 0.4845, "step": 1015 }, { "epoch": 0.3051608077786088, "grad_norm": 2.1736909744601687, "learning_rate": 8.773300376118685e-06, "loss": 0.4801, "step": 1020 }, { "epoch": 0.30665669409124907, "grad_norm": 2.2520314938860815, "learning_rate": 8.75611188164352e-06, "loss": 0.4893, "step": 1025 }, { "epoch": 0.3081525804038893, "grad_norm": 2.1104641749948403, "learning_rate": 8.738820890339217e-06, "loss": 0.4938, "step": 1030 }, { "epoch": 0.30964846671652957, "grad_norm": 2.0838403753220986, "learning_rate": 8.721427874042563e-06, "loss": 0.4835, "step": 1035 }, { "epoch": 0.31114435302916976, "grad_norm": 2.0711510810184266, "learning_rate": 8.703933307374413e-06, "loss": 0.4725, "step": 1040 }, { "epoch": 0.31264023934181, "grad_norm": 2.1063779245743888, "learning_rate": 8.686337667726723e-06, "loss": 0.4892, "step": 1045 }, { "epoch": 0.31413612565445026, "grad_norm": 2.1105067703269422, "learning_rate": 8.668641435249534e-06, "loss": 0.4825, "step": 1050 }, { "epoch": 0.3156320119670905, "grad_norm": 2.102573408737706, "learning_rate": 8.650845092837867e-06, "loss": 0.4885, "step": 1055 }, { "epoch": 0.31712789827973076, "grad_norm": 2.2988609972066274, "learning_rate": 8.632949126118538e-06, "loss": 0.4752, "step": 1060 }, { "epoch": 0.318623784592371, "grad_norm": 2.122502919871484, "learning_rate": 8.61495402343692e-06, "loss": 0.4769, "step": 1065 }, { "epoch": 0.3201196709050112, "grad_norm": 2.165018274340972, "learning_rate": 8.596860275843602e-06, "loss": 0.4671, "step": 1070 }, { "epoch": 0.32161555721765145, "grad_norm": 1.9717223958070753, "learning_rate": 8.578668377081001e-06, "loss": 0.4675, "step": 1075 }, { "epoch": 0.3231114435302917, "grad_norm": 2.1031743583556803, "learning_rate": 8.560378823569886e-06, "loss": 0.4713, "step": 1080 }, { "epoch": 0.32460732984293195, "grad_norm": 2.0178473800411307, "learning_rate": 8.541992114395825e-06, "loss": 0.4715, "step": 1085 }, { "epoch": 0.3261032161555722, "grad_norm": 2.0225831073597007, "learning_rate": 8.523508751295574e-06, "loss": 0.4772, "step": 1090 }, { "epoch": 0.3275991024682124, "grad_norm": 2.087877364586164, "learning_rate": 8.504929238643381e-06, "loss": 0.4688, "step": 1095 }, { "epoch": 0.32909498878085264, "grad_norm": 2.160270876260719, "learning_rate": 8.486254083437227e-06, "loss": 0.4665, "step": 1100 }, { "epoch": 0.3305908750934929, "grad_norm": 2.060627567407879, "learning_rate": 8.467483795284987e-06, "loss": 0.4617, "step": 1105 }, { "epoch": 0.33208676140613314, "grad_norm": 2.1150833498354893, "learning_rate": 8.448618886390523e-06, "loss": 0.4676, "step": 1110 }, { "epoch": 0.3335826477187734, "grad_norm": 2.026961078510351, "learning_rate": 8.429659871539709e-06, "loss": 0.4772, "step": 1115 }, { "epoch": 0.33507853403141363, "grad_norm": 2.6128907908421852, "learning_rate": 8.410607268086388e-06, "loss": 0.4678, "step": 1120 }, { "epoch": 0.3365744203440538, "grad_norm": 2.1162204368840185, "learning_rate": 8.391461595938245e-06, "loss": 0.4728, "step": 1125 }, { "epoch": 0.3380703066566941, "grad_norm": 2.0236924938571095, "learning_rate": 8.372223377542631e-06, "loss": 0.4556, "step": 1130 }, { "epoch": 0.3395661929693343, "grad_norm": 2.0470011404134345, "learning_rate": 8.352893137872292e-06, "loss": 0.4476, "step": 1135 }, { "epoch": 0.3410620792819746, "grad_norm": 2.100726525573022, "learning_rate": 8.333471404411054e-06, "loss": 0.458, "step": 1140 }, { "epoch": 0.3425579655946148, "grad_norm": 2.162727675316811, "learning_rate": 8.313958707139434e-06, "loss": 0.4751, "step": 1145 }, { "epoch": 0.34405385190725507, "grad_norm": 2.0116970709952495, "learning_rate": 8.29435557852016e-06, "loss": 0.4647, "step": 1150 }, { "epoch": 0.34554973821989526, "grad_norm": 1.9814728402387116, "learning_rate": 8.274662553483662e-06, "loss": 0.4441, "step": 1155 }, { "epoch": 0.3470456245325355, "grad_norm": 2.0453867973962607, "learning_rate": 8.254880169413455e-06, "loss": 0.4613, "step": 1160 }, { "epoch": 0.34854151084517576, "grad_norm": 2.0854663750868268, "learning_rate": 8.235008966131492e-06, "loss": 0.456, "step": 1165 }, { "epoch": 0.350037397157816, "grad_norm": 2.1204249951123706, "learning_rate": 8.215049485883419e-06, "loss": 0.4526, "step": 1170 }, { "epoch": 0.35153328347045626, "grad_norm": 2.125080372850005, "learning_rate": 8.195002273323792e-06, "loss": 0.4442, "step": 1175 }, { "epoch": 0.3530291697830965, "grad_norm": 2.2626876387499224, "learning_rate": 8.174867875501203e-06, "loss": 0.4491, "step": 1180 }, { "epoch": 0.3545250560957367, "grad_norm": 2.17644103793076, "learning_rate": 8.154646841843358e-06, "loss": 0.449, "step": 1185 }, { "epoch": 0.35602094240837695, "grad_norm": 1.9934405786856697, "learning_rate": 8.134339724142083e-06, "loss": 0.4491, "step": 1190 }, { "epoch": 0.3575168287210172, "grad_norm": 1.9811124546772585, "learning_rate": 8.113947076538264e-06, "loss": 0.4412, "step": 1195 }, { "epoch": 0.35901271503365745, "grad_norm": 2.1197485018681785, "learning_rate": 8.093469455506731e-06, "loss": 0.4448, "step": 1200 }, { "epoch": 0.3605086013462977, "grad_norm": 2.0582968984341967, "learning_rate": 8.07290741984107e-06, "loss": 0.4397, "step": 1205 }, { "epoch": 0.36200448765893795, "grad_norm": 1.9803742197531462, "learning_rate": 8.052261530638375e-06, "loss": 0.4486, "step": 1210 }, { "epoch": 0.36350037397157814, "grad_norm": 1.9763814917893987, "learning_rate": 8.03153235128393e-06, "loss": 0.4379, "step": 1215 }, { "epoch": 0.3649962602842184, "grad_norm": 2.177684384739003, "learning_rate": 8.01072044743585e-06, "loss": 0.4448, "step": 1220 }, { "epoch": 0.36649214659685864, "grad_norm": 2.2177683069308047, "learning_rate": 7.989826387009634e-06, "loss": 0.4398, "step": 1225 }, { "epoch": 0.3679880329094989, "grad_norm": 2.0614298881537416, "learning_rate": 7.96885074016267e-06, "loss": 0.438, "step": 1230 }, { "epoch": 0.36948391922213913, "grad_norm": 2.063175118233129, "learning_rate": 7.947794079278678e-06, "loss": 0.4353, "step": 1235 }, { "epoch": 0.3709798055347794, "grad_norm": 2.0902885795644943, "learning_rate": 7.926656978952089e-06, "loss": 0.4369, "step": 1240 }, { "epoch": 0.3724756918474196, "grad_norm": 2.081819065453435, "learning_rate": 7.905440015972372e-06, "loss": 0.4392, "step": 1245 }, { "epoch": 0.3739715781600598, "grad_norm": 1.9635390617281576, "learning_rate": 7.884143769308276e-06, "loss": 0.4318, "step": 1250 }, { "epoch": 0.3754674644727001, "grad_norm": 2.010397135845292, "learning_rate": 7.862768820092061e-06, "loss": 0.4294, "step": 1255 }, { "epoch": 0.3769633507853403, "grad_norm": 2.120029095014225, "learning_rate": 7.84131575160361e-06, "loss": 0.4367, "step": 1260 }, { "epoch": 0.37845923709798057, "grad_norm": 2.047223712557703, "learning_rate": 7.819785149254534e-06, "loss": 0.4247, "step": 1265 }, { "epoch": 0.37995512341062077, "grad_norm": 2.1565665198769546, "learning_rate": 7.798177600572184e-06, "loss": 0.4545, "step": 1270 }, { "epoch": 0.381451009723261, "grad_norm": 1.9698630282226646, "learning_rate": 7.776493695183623e-06, "loss": 0.4327, "step": 1275 }, { "epoch": 0.38294689603590126, "grad_norm": 2.027501209185265, "learning_rate": 7.754734024799544e-06, "loss": 0.4378, "step": 1280 }, { "epoch": 0.3844427823485415, "grad_norm": 1.9336783003915325, "learning_rate": 7.732899183198108e-06, "loss": 0.4199, "step": 1285 }, { "epoch": 0.38593866866118176, "grad_norm": 2.074909881667748, "learning_rate": 7.710989766208751e-06, "loss": 0.431, "step": 1290 }, { "epoch": 0.387434554973822, "grad_norm": 2.08466673344805, "learning_rate": 7.689006371695928e-06, "loss": 0.436, "step": 1295 }, { "epoch": 0.3889304412864622, "grad_norm": 2.0101045976441334, "learning_rate": 7.666949599542788e-06, "loss": 0.4363, "step": 1300 }, { "epoch": 0.39042632759910245, "grad_norm": 2.1388630620219304, "learning_rate": 7.644820051634813e-06, "loss": 0.4353, "step": 1305 }, { "epoch": 0.3919222139117427, "grad_norm": 1.9897181694789714, "learning_rate": 7.62261833184339e-06, "loss": 0.4321, "step": 1310 }, { "epoch": 0.39341810022438295, "grad_norm": 2.069750404086554, "learning_rate": 7.60034504600933e-06, "loss": 0.4166, "step": 1315 }, { "epoch": 0.3949139865370232, "grad_norm": 2.0828214162126564, "learning_rate": 7.5780008019263465e-06, "loss": 0.4309, "step": 1320 }, { "epoch": 0.39640987284966345, "grad_norm": 2.1311064881304183, "learning_rate": 7.555586209324455e-06, "loss": 0.42, "step": 1325 }, { "epoch": 0.39790575916230364, "grad_norm": 2.0067032988225715, "learning_rate": 7.533101879853348e-06, "loss": 0.4247, "step": 1330 }, { "epoch": 0.3994016454749439, "grad_norm": 2.1601395941384514, "learning_rate": 7.510548427065693e-06, "loss": 0.4103, "step": 1335 }, { "epoch": 0.40089753178758414, "grad_norm": 2.0545268261654166, "learning_rate": 7.487926466400403e-06, "loss": 0.418, "step": 1340 }, { "epoch": 0.4023934181002244, "grad_norm": 2.029856636678106, "learning_rate": 7.465236615165826e-06, "loss": 0.4265, "step": 1345 }, { "epoch": 0.40388930441286464, "grad_norm": 1.9396811090214083, "learning_rate": 7.4424794925229175e-06, "loss": 0.4241, "step": 1350 }, { "epoch": 0.4053851907255049, "grad_norm": 2.073788987162284, "learning_rate": 7.4196557194683265e-06, "loss": 0.4039, "step": 1355 }, { "epoch": 0.4068810770381451, "grad_norm": 2.070263015501858, "learning_rate": 7.3967659188174676e-06, "loss": 0.4331, "step": 1360 }, { "epoch": 0.4083769633507853, "grad_norm": 1.957024406881209, "learning_rate": 7.373810715187516e-06, "loss": 0.4198, "step": 1365 }, { "epoch": 0.4098728496634256, "grad_norm": 2.0021094595131705, "learning_rate": 7.350790734980359e-06, "loss": 0.4138, "step": 1370 }, { "epoch": 0.4113687359760658, "grad_norm": 2.038893591791927, "learning_rate": 7.327706606365512e-06, "loss": 0.4099, "step": 1375 }, { "epoch": 0.4128646222887061, "grad_norm": 2.091182328954734, "learning_rate": 7.304558959262973e-06, "loss": 0.4091, "step": 1380 }, { "epoch": 0.4143605086013463, "grad_norm": 2.005484469630839, "learning_rate": 7.281348425326034e-06, "loss": 0.4071, "step": 1385 }, { "epoch": 0.4158563949139865, "grad_norm": 2.000171729890043, "learning_rate": 7.258075637924039e-06, "loss": 0.4077, "step": 1390 }, { "epoch": 0.41735228122662676, "grad_norm": 1.88335343776708, "learning_rate": 7.234741232125111e-06, "loss": 0.4106, "step": 1395 }, { "epoch": 0.418848167539267, "grad_norm": 2.041697368575073, "learning_rate": 7.211345844678816e-06, "loss": 0.4124, "step": 1400 }, { "epoch": 0.42034405385190726, "grad_norm": 2.1120074891606313, "learning_rate": 7.1878901139987826e-06, "loss": 0.414, "step": 1405 }, { "epoch": 0.4218399401645475, "grad_norm": 2.017409414749495, "learning_rate": 7.164374680145293e-06, "loss": 0.4038, "step": 1410 }, { "epoch": 0.42333582647718776, "grad_norm": 2.0432465019716144, "learning_rate": 7.140800184807805e-06, "loss": 0.4073, "step": 1415 }, { "epoch": 0.42483171278982795, "grad_norm": 2.060077990063716, "learning_rate": 7.117167271287453e-06, "loss": 0.4068, "step": 1420 }, { "epoch": 0.4263275991024682, "grad_norm": 2.027592571205212, "learning_rate": 7.09347658447948e-06, "loss": 0.4042, "step": 1425 }, { "epoch": 0.42782348541510845, "grad_norm": 2.040823863949173, "learning_rate": 7.069728770855652e-06, "loss": 0.4034, "step": 1430 }, { "epoch": 0.4293193717277487, "grad_norm": 2.1465152715010722, "learning_rate": 7.0459244784466115e-06, "loss": 0.4049, "step": 1435 }, { "epoch": 0.43081525804038895, "grad_norm": 2.017024929241199, "learning_rate": 7.022064356824196e-06, "loss": 0.4051, "step": 1440 }, { "epoch": 0.4323111443530292, "grad_norm": 1.9756966229288817, "learning_rate": 6.998149057083711e-06, "loss": 0.3991, "step": 1445 }, { "epoch": 0.4338070306656694, "grad_norm": 1.9869718270881975, "learning_rate": 6.9741792318261585e-06, "loss": 0.4029, "step": 1450 }, { "epoch": 0.43530291697830964, "grad_norm": 2.1015981628011136, "learning_rate": 6.950155535140439e-06, "loss": 0.3998, "step": 1455 }, { "epoch": 0.4367988032909499, "grad_norm": 2.1512869214406174, "learning_rate": 6.926078622585496e-06, "loss": 0.4001, "step": 1460 }, { "epoch": 0.43829468960359014, "grad_norm": 2.0152270376530677, "learning_rate": 6.901949151172427e-06, "loss": 0.4047, "step": 1465 }, { "epoch": 0.4397905759162304, "grad_norm": 2.11665136839116, "learning_rate": 6.877767779346556e-06, "loss": 0.4064, "step": 1470 }, { "epoch": 0.4412864622288706, "grad_norm": 2.231208727114714, "learning_rate": 6.8535351669694694e-06, "loss": 0.3884, "step": 1475 }, { "epoch": 0.44278234854151083, "grad_norm": 1.9444993004804072, "learning_rate": 6.829251975301003e-06, "loss": 0.3949, "step": 1480 }, { "epoch": 0.4442782348541511, "grad_norm": 1.98272069907838, "learning_rate": 6.8049188669812024e-06, "loss": 0.395, "step": 1485 }, { "epoch": 0.4457741211667913, "grad_norm": 1.9120999593676538, "learning_rate": 6.7805365060122386e-06, "loss": 0.3968, "step": 1490 }, { "epoch": 0.4472700074794316, "grad_norm": 2.0053365034386186, "learning_rate": 6.756105557740289e-06, "loss": 0.402, "step": 1495 }, { "epoch": 0.4487658937920718, "grad_norm": 1.9514629474872618, "learning_rate": 6.731626688837387e-06, "loss": 0.3836, "step": 1500 }, { "epoch": 0.450261780104712, "grad_norm": 2.1087506038221955, "learning_rate": 6.707100567283217e-06, "loss": 0.3843, "step": 1505 }, { "epoch": 0.45175766641735227, "grad_norm": 2.1300871436189306, "learning_rate": 6.682527862346898e-06, "loss": 0.3996, "step": 1510 }, { "epoch": 0.4532535527299925, "grad_norm": 1.9854434493239195, "learning_rate": 6.657909244568721e-06, "loss": 0.4011, "step": 1515 }, { "epoch": 0.45474943904263276, "grad_norm": 1.9814246083045182, "learning_rate": 6.6332453857418375e-06, "loss": 0.4012, "step": 1520 }, { "epoch": 0.456245325355273, "grad_norm": 2.023928605650618, "learning_rate": 6.608536958893948e-06, "loss": 0.3962, "step": 1525 }, { "epoch": 0.45774121166791326, "grad_norm": 2.012248063709598, "learning_rate": 6.583784638268919e-06, "loss": 0.4001, "step": 1530 }, { "epoch": 0.45923709798055345, "grad_norm": 1.9208473033828253, "learning_rate": 6.5589890993083934e-06, "loss": 0.3965, "step": 1535 }, { "epoch": 0.4607329842931937, "grad_norm": 2.1713469007968476, "learning_rate": 6.534151018633355e-06, "loss": 0.3962, "step": 1540 }, { "epoch": 0.46222887060583395, "grad_norm": 2.079357026566145, "learning_rate": 6.509271074025668e-06, "loss": 0.3913, "step": 1545 }, { "epoch": 0.4637247569184742, "grad_norm": 1.9619607602752462, "learning_rate": 6.484349944409579e-06, "loss": 0.3885, "step": 1550 }, { "epoch": 0.46522064323111445, "grad_norm": 2.025383663450042, "learning_rate": 6.459388309833193e-06, "loss": 0.396, "step": 1555 }, { "epoch": 0.4667165295437547, "grad_norm": 1.9926140480691588, "learning_rate": 6.434386851449914e-06, "loss": 0.3978, "step": 1560 }, { "epoch": 0.4682124158563949, "grad_norm": 2.0001783057698677, "learning_rate": 6.409346251499859e-06, "loss": 0.3889, "step": 1565 }, { "epoch": 0.46970830216903514, "grad_norm": 1.9977538629610117, "learning_rate": 6.384267193291238e-06, "loss": 0.3872, "step": 1570 }, { "epoch": 0.4712041884816754, "grad_norm": 1.9725560974868908, "learning_rate": 6.3591503611817155e-06, "loss": 0.39, "step": 1575 }, { "epoch": 0.47270007479431564, "grad_norm": 1.9326640130746877, "learning_rate": 6.333996440559726e-06, "loss": 0.3815, "step": 1580 }, { "epoch": 0.4741959611069559, "grad_norm": 1.9055402813860574, "learning_rate": 6.308806117825777e-06, "loss": 0.3801, "step": 1585 }, { "epoch": 0.47569184741959614, "grad_norm": 1.9722002752461958, "learning_rate": 6.283580080373721e-06, "loss": 0.3804, "step": 1590 }, { "epoch": 0.47718773373223633, "grad_norm": 2.0082373206027526, "learning_rate": 6.25831901657199e-06, "loss": 0.3775, "step": 1595 }, { "epoch": 0.4786836200448766, "grad_norm": 1.9266846313881612, "learning_rate": 6.233023615744813e-06, "loss": 0.3883, "step": 1600 }, { "epoch": 0.4801795063575168, "grad_norm": 2.088263839026747, "learning_rate": 6.207694568153418e-06, "loss": 0.389, "step": 1605 }, { "epoch": 0.4816753926701571, "grad_norm": 2.1042065332247555, "learning_rate": 6.182332564977174e-06, "loss": 0.3792, "step": 1610 }, { "epoch": 0.4831712789827973, "grad_norm": 1.9720059814432505, "learning_rate": 6.156938298294752e-06, "loss": 0.3706, "step": 1615 }, { "epoch": 0.4846671652954376, "grad_norm": 1.9180314469419848, "learning_rate": 6.131512461065227e-06, "loss": 0.377, "step": 1620 }, { "epoch": 0.48616305160807777, "grad_norm": 2.2323059057893775, "learning_rate": 6.106055747109169e-06, "loss": 0.3737, "step": 1625 }, { "epoch": 0.487658937920718, "grad_norm": 1.9094100696871863, "learning_rate": 6.080568851089717e-06, "loss": 0.381, "step": 1630 }, { "epoch": 0.48915482423335827, "grad_norm": 1.8740312923707445, "learning_rate": 6.055052468493614e-06, "loss": 0.3712, "step": 1635 }, { "epoch": 0.4906507105459985, "grad_norm": 2.2274282125289364, "learning_rate": 6.029507295612235e-06, "loss": 0.3818, "step": 1640 }, { "epoch": 0.49214659685863876, "grad_norm": 1.9574498076717952, "learning_rate": 6.0039340295225845e-06, "loss": 0.3808, "step": 1645 }, { "epoch": 0.49364248317127896, "grad_norm": 2.019106731639845, "learning_rate": 5.978333368068278e-06, "loss": 0.3739, "step": 1650 }, { "epoch": 0.4951383694839192, "grad_norm": 2.003448813298111, "learning_rate": 5.952706009840491e-06, "loss": 0.3801, "step": 1655 }, { "epoch": 0.49663425579655945, "grad_norm": 2.033251099732251, "learning_rate": 5.9270526541589025e-06, "loss": 0.3719, "step": 1660 }, { "epoch": 0.4981301421091997, "grad_norm": 1.9736607384350244, "learning_rate": 5.901374001052614e-06, "loss": 0.3647, "step": 1665 }, { "epoch": 0.49962602842183995, "grad_norm": 1.8450885851841383, "learning_rate": 5.875670751241036e-06, "loss": 0.3694, "step": 1670 }, { "epoch": 0.5011219147344802, "grad_norm": 1.9550125676086019, "learning_rate": 5.849943606114782e-06, "loss": 0.3765, "step": 1675 }, { "epoch": 0.5026178010471204, "grad_norm": 1.9088487115244133, "learning_rate": 5.824193267716517e-06, "loss": 0.3628, "step": 1680 }, { "epoch": 0.5041136873597607, "grad_norm": 2.0236760384942887, "learning_rate": 5.798420438721804e-06, "loss": 0.3681, "step": 1685 }, { "epoch": 0.5056095736724009, "grad_norm": 1.8739105083496626, "learning_rate": 5.772625822419933e-06, "loss": 0.3626, "step": 1690 }, { "epoch": 0.5071054599850411, "grad_norm": 2.0538294961575048, "learning_rate": 5.74681012269472e-06, "loss": 0.3664, "step": 1695 }, { "epoch": 0.5086013462976814, "grad_norm": 1.9510478793415906, "learning_rate": 5.720974044005314e-06, "loss": 0.3687, "step": 1700 }, { "epoch": 0.5100972326103216, "grad_norm": 2.0027682706640206, "learning_rate": 5.695118291366959e-06, "loss": 0.3791, "step": 1705 }, { "epoch": 0.5115931189229619, "grad_norm": 2.017422075178467, "learning_rate": 5.669243570331766e-06, "loss": 0.3592, "step": 1710 }, { "epoch": 0.5130890052356021, "grad_norm": 1.9298103072373924, "learning_rate": 5.643350586969453e-06, "loss": 0.3624, "step": 1715 }, { "epoch": 0.5145848915482424, "grad_norm": 1.828229384099037, "learning_rate": 5.617440047848081e-06, "loss": 0.3693, "step": 1720 }, { "epoch": 0.5160807778608826, "grad_norm": 1.9835871613164413, "learning_rate": 5.591512660014773e-06, "loss": 0.367, "step": 1725 }, { "epoch": 0.5175766641735228, "grad_norm": 1.906111097537283, "learning_rate": 5.5655691309764225e-06, "loss": 0.3698, "step": 1730 }, { "epoch": 0.5190725504861631, "grad_norm": 1.8980070836105973, "learning_rate": 5.539610168680381e-06, "loss": 0.3617, "step": 1735 }, { "epoch": 0.5205684367988033, "grad_norm": 1.929980721348062, "learning_rate": 5.513636481495143e-06, "loss": 0.3603, "step": 1740 }, { "epoch": 0.5220643231114436, "grad_norm": 1.91015401663393, "learning_rate": 5.487648778191021e-06, "loss": 0.3533, "step": 1745 }, { "epoch": 0.5235602094240838, "grad_norm": 1.9455506909545779, "learning_rate": 5.4616477679207906e-06, "loss": 0.3746, "step": 1750 }, { "epoch": 0.525056095736724, "grad_norm": 1.8552115044332138, "learning_rate": 5.435634160200355e-06, "loss": 0.3583, "step": 1755 }, { "epoch": 0.5265519820493643, "grad_norm": 1.913776238110964, "learning_rate": 5.409608664889376e-06, "loss": 0.3571, "step": 1760 }, { "epoch": 0.5280478683620045, "grad_norm": 1.9566204864416041, "learning_rate": 5.383571992171904e-06, "loss": 0.3681, "step": 1765 }, { "epoch": 0.5295437546746448, "grad_norm": 2.0484694098984813, "learning_rate": 5.357524852536996e-06, "loss": 0.3579, "step": 1770 }, { "epoch": 0.531039640987285, "grad_norm": 1.9124761975111415, "learning_rate": 5.331467956759331e-06, "loss": 0.3508, "step": 1775 }, { "epoch": 0.5325355272999253, "grad_norm": 1.9151628917936323, "learning_rate": 5.305402015879817e-06, "loss": 0.3582, "step": 1780 }, { "epoch": 0.5340314136125655, "grad_norm": 1.8760817819604374, "learning_rate": 5.279327741186179e-06, "loss": 0.3607, "step": 1785 }, { "epoch": 0.5355272999252056, "grad_norm": 1.961131431192389, "learning_rate": 5.253245844193564e-06, "loss": 0.3517, "step": 1790 }, { "epoch": 0.537023186237846, "grad_norm": 1.971571895204417, "learning_rate": 5.227157036625108e-06, "loss": 0.3456, "step": 1795 }, { "epoch": 0.5385190725504861, "grad_norm": 1.8838335367241383, "learning_rate": 5.2010620303925275e-06, "loss": 0.3519, "step": 1800 }, { "epoch": 0.5400149588631264, "grad_norm": 1.829377568323147, "learning_rate": 5.174961537576685e-06, "loss": 0.3564, "step": 1805 }, { "epoch": 0.5415108451757666, "grad_norm": 1.8522486080816014, "learning_rate": 5.148856270408163e-06, "loss": 0.3568, "step": 1810 }, { "epoch": 0.5430067314884068, "grad_norm": 1.938579795945218, "learning_rate": 5.122746941247828e-06, "loss": 0.3607, "step": 1815 }, { "epoch": 0.5445026178010471, "grad_norm": 1.8962553032833915, "learning_rate": 5.096634262567388e-06, "loss": 0.3578, "step": 1820 }, { "epoch": 0.5459985041136873, "grad_norm": 1.7953489371783111, "learning_rate": 5.070518946929954e-06, "loss": 0.3495, "step": 1825 }, { "epoch": 0.5474943904263276, "grad_norm": 1.9518780090135102, "learning_rate": 5.044401706970592e-06, "loss": 0.3558, "step": 1830 }, { "epoch": 0.5489902767389678, "grad_norm": 2.029523910683152, "learning_rate": 5.018283255376882e-06, "loss": 0.3505, "step": 1835 }, { "epoch": 0.550486163051608, "grad_norm": 1.9831397143651377, "learning_rate": 4.992164304869464e-06, "loss": 0.3569, "step": 1840 }, { "epoch": 0.5519820493642483, "grad_norm": 2.1279272506945075, "learning_rate": 4.966045568182596e-06, "loss": 0.3372, "step": 1845 }, { "epoch": 0.5534779356768885, "grad_norm": 1.9637293854690605, "learning_rate": 4.939927758044698e-06, "loss": 0.3518, "step": 1850 }, { "epoch": 0.5549738219895288, "grad_norm": 2.0828701986556695, "learning_rate": 4.913811587158908e-06, "loss": 0.3443, "step": 1855 }, { "epoch": 0.556469708302169, "grad_norm": 1.981630887644782, "learning_rate": 4.887697768183633e-06, "loss": 0.3444, "step": 1860 }, { "epoch": 0.5579655946148093, "grad_norm": 1.8743980292802156, "learning_rate": 4.861587013713096e-06, "loss": 0.346, "step": 1865 }, { "epoch": 0.5594614809274495, "grad_norm": 1.9064350126377236, "learning_rate": 4.835480036257904e-06, "loss": 0.3467, "step": 1870 }, { "epoch": 0.5609573672400897, "grad_norm": 1.8972548935569284, "learning_rate": 4.809377548225589e-06, "loss": 0.3388, "step": 1875 }, { "epoch": 0.56245325355273, "grad_norm": 1.8677668781712522, "learning_rate": 4.783280261901179e-06, "loss": 0.3442, "step": 1880 }, { "epoch": 0.5639491398653702, "grad_norm": 1.900487755648876, "learning_rate": 4.757188889427761e-06, "loss": 0.3389, "step": 1885 }, { "epoch": 0.5654450261780105, "grad_norm": 1.9420787190248043, "learning_rate": 4.731104142787035e-06, "loss": 0.3472, "step": 1890 }, { "epoch": 0.5669409124906507, "grad_norm": 1.927314753260138, "learning_rate": 4.7050267337799074e-06, "loss": 0.3481, "step": 1895 }, { "epoch": 0.5684367988032909, "grad_norm": 1.9207634340998982, "learning_rate": 4.678957374007046e-06, "loss": 0.3424, "step": 1900 }, { "epoch": 0.5699326851159312, "grad_norm": 1.9212470848002643, "learning_rate": 4.652896774849477e-06, "loss": 0.3358, "step": 1905 }, { "epoch": 0.5714285714285714, "grad_norm": 1.924739770896096, "learning_rate": 4.626845647449161e-06, "loss": 0.3353, "step": 1910 }, { "epoch": 0.5729244577412117, "grad_norm": 1.9350839334038696, "learning_rate": 4.600804702689598e-06, "loss": 0.3348, "step": 1915 }, { "epoch": 0.5744203440538519, "grad_norm": 1.8695042520523082, "learning_rate": 4.57477465117642e-06, "loss": 0.338, "step": 1920 }, { "epoch": 0.5759162303664922, "grad_norm": 1.9312558535320394, "learning_rate": 4.54875620321801e-06, "loss": 0.343, "step": 1925 }, { "epoch": 0.5774121166791324, "grad_norm": 1.8821605226871228, "learning_rate": 4.522750068806107e-06, "loss": 0.3407, "step": 1930 }, { "epoch": 0.5789080029917726, "grad_norm": 2.0267756257950906, "learning_rate": 4.496756957596438e-06, "loss": 0.332, "step": 1935 }, { "epoch": 0.5804038893044129, "grad_norm": 1.87047157167879, "learning_rate": 4.4707775788893586e-06, "loss": 0.3377, "step": 1940 }, { "epoch": 0.5818997756170531, "grad_norm": 1.8334500325846965, "learning_rate": 4.444812641610482e-06, "loss": 0.3331, "step": 1945 }, { "epoch": 0.5833956619296934, "grad_norm": 1.865940227461524, "learning_rate": 4.418862854291356e-06, "loss": 0.3336, "step": 1950 }, { "epoch": 0.5848915482423336, "grad_norm": 1.8290658280068524, "learning_rate": 4.392928925050106e-06, "loss": 0.3237, "step": 1955 }, { "epoch": 0.5863874345549738, "grad_norm": 1.8622254708709993, "learning_rate": 4.3670115615721265e-06, "loss": 0.3376, "step": 1960 }, { "epoch": 0.5878833208676141, "grad_norm": 1.9201843032013242, "learning_rate": 4.341111471090762e-06, "loss": 0.3459, "step": 1965 }, { "epoch": 0.5893792071802543, "grad_norm": 1.8796680590731187, "learning_rate": 4.315229360368014e-06, "loss": 0.3278, "step": 1970 }, { "epoch": 0.5908750934928946, "grad_norm": 1.8016030738978284, "learning_rate": 4.289365935675255e-06, "loss": 0.3268, "step": 1975 }, { "epoch": 0.5923709798055348, "grad_norm": 1.9190193806693643, "learning_rate": 4.263521902773944e-06, "loss": 0.3333, "step": 1980 }, { "epoch": 0.5938668661181751, "grad_norm": 1.8784476290504393, "learning_rate": 4.237697966896385e-06, "loss": 0.3271, "step": 1985 }, { "epoch": 0.5953627524308153, "grad_norm": 1.8712250384764961, "learning_rate": 4.211894832726471e-06, "loss": 0.3342, "step": 1990 }, { "epoch": 0.5968586387434555, "grad_norm": 1.9036575376553382, "learning_rate": 4.1861132043804555e-06, "loss": 0.335, "step": 1995 }, { "epoch": 0.5983545250560958, "grad_norm": 1.9581740636617746, "learning_rate": 4.160353785387746e-06, "loss": 0.324, "step": 2000 }, { "epoch": 0.599850411368736, "grad_norm": 1.834158258904465, "learning_rate": 4.134617278671694e-06, "loss": 0.3278, "step": 2005 }, { "epoch": 0.6013462976813763, "grad_norm": 1.864212034584157, "learning_rate": 4.108904386530429e-06, "loss": 0.3293, "step": 2010 }, { "epoch": 0.6028421839940165, "grad_norm": 1.7424366923402765, "learning_rate": 4.083215810617678e-06, "loss": 0.327, "step": 2015 }, { "epoch": 0.6043380703066566, "grad_norm": 1.8772680059539715, "learning_rate": 4.057552251923633e-06, "loss": 0.3327, "step": 2020 }, { "epoch": 0.605833956619297, "grad_norm": 1.8850386701103279, "learning_rate": 4.031914410755809e-06, "loss": 0.327, "step": 2025 }, { "epoch": 0.6073298429319371, "grad_norm": 1.8735991544459796, "learning_rate": 4.0063029867199455e-06, "loss": 0.3278, "step": 2030 }, { "epoch": 0.6088257292445775, "grad_norm": 2.0742483586745952, "learning_rate": 3.980718678700909e-06, "loss": 0.3295, "step": 2035 }, { "epoch": 0.6103216155572176, "grad_norm": 1.805997806919521, "learning_rate": 3.955162184843625e-06, "loss": 0.318, "step": 2040 }, { "epoch": 0.6118175018698578, "grad_norm": 1.9482369327485018, "learning_rate": 3.929634202534026e-06, "loss": 0.3303, "step": 2045 }, { "epoch": 0.6133133881824981, "grad_norm": 1.8643741778263954, "learning_rate": 3.904135428380019e-06, "loss": 0.3221, "step": 2050 }, { "epoch": 0.6148092744951383, "grad_norm": 1.9119914679721755, "learning_rate": 3.8786665581924805e-06, "loss": 0.3259, "step": 2055 }, { "epoch": 0.6163051608077786, "grad_norm": 2.0294178588740808, "learning_rate": 3.853228286966265e-06, "loss": 0.3114, "step": 2060 }, { "epoch": 0.6178010471204188, "grad_norm": 1.8598282314437558, "learning_rate": 3.827821308861244e-06, "loss": 0.3242, "step": 2065 }, { "epoch": 0.6192969334330591, "grad_norm": 1.9818198802388973, "learning_rate": 3.8024463171833636e-06, "loss": 0.3252, "step": 2070 }, { "epoch": 0.6207928197456993, "grad_norm": 1.9439228162479631, "learning_rate": 3.777104004365721e-06, "loss": 0.3258, "step": 2075 }, { "epoch": 0.6222887060583395, "grad_norm": 1.9406393323579751, "learning_rate": 3.7517950619496713e-06, "loss": 0.327, "step": 2080 }, { "epoch": 0.6237845923709798, "grad_norm": 1.8702196116833902, "learning_rate": 3.7265201805659618e-06, "loss": 0.3274, "step": 2085 }, { "epoch": 0.62528047868362, "grad_norm": 1.8206045884367064, "learning_rate": 3.701280049915877e-06, "loss": 0.3087, "step": 2090 }, { "epoch": 0.6267763649962603, "grad_norm": 1.8946084974764223, "learning_rate": 3.676075358752426e-06, "loss": 0.3227, "step": 2095 }, { "epoch": 0.6282722513089005, "grad_norm": 1.8425099562092453, "learning_rate": 3.6509067948615464e-06, "loss": 0.3091, "step": 2100 }, { "epoch": 0.6297681376215407, "grad_norm": 1.833988306261615, "learning_rate": 3.6257750450433284e-06, "loss": 0.3158, "step": 2105 }, { "epoch": 0.631264023934181, "grad_norm": 1.848102418513888, "learning_rate": 3.6006807950932867e-06, "loss": 0.3231, "step": 2110 }, { "epoch": 0.6327599102468212, "grad_norm": 1.8597295350064236, "learning_rate": 3.575624729783632e-06, "loss": 0.317, "step": 2115 }, { "epoch": 0.6342557965594615, "grad_norm": 1.8571116178437028, "learning_rate": 3.550607532844596e-06, "loss": 0.3185, "step": 2120 }, { "epoch": 0.6357516828721017, "grad_norm": 1.850039717310936, "learning_rate": 3.5256298869457715e-06, "loss": 0.3153, "step": 2125 }, { "epoch": 0.637247569184742, "grad_norm": 1.8517187441330423, "learning_rate": 3.5006924736774793e-06, "loss": 0.3231, "step": 2130 }, { "epoch": 0.6387434554973822, "grad_norm": 1.886804887794377, "learning_rate": 3.47579597353217e-06, "loss": 0.3132, "step": 2135 }, { "epoch": 0.6402393418100224, "grad_norm": 1.8207891498106763, "learning_rate": 3.4509410658858606e-06, "loss": 0.3239, "step": 2140 }, { "epoch": 0.6417352281226627, "grad_norm": 1.9080722925799685, "learning_rate": 3.426128428979589e-06, "loss": 0.3127, "step": 2145 }, { "epoch": 0.6432311144353029, "grad_norm": 1.7978167092374475, "learning_rate": 3.4013587399009073e-06, "loss": 0.3112, "step": 2150 }, { "epoch": 0.6447270007479432, "grad_norm": 1.8462499082396047, "learning_rate": 3.376632674565411e-06, "loss": 0.3168, "step": 2155 }, { "epoch": 0.6462228870605834, "grad_norm": 1.856553229309688, "learning_rate": 3.351950907698285e-06, "loss": 0.3065, "step": 2160 }, { "epoch": 0.6477187733732236, "grad_norm": 1.7800004213781706, "learning_rate": 3.3273141128159005e-06, "loss": 0.3132, "step": 2165 }, { "epoch": 0.6492146596858639, "grad_norm": 1.9132965188669029, "learning_rate": 3.3027229622074335e-06, "loss": 0.3179, "step": 2170 }, { "epoch": 0.6507105459985041, "grad_norm": 1.7650226022206836, "learning_rate": 3.278178126916515e-06, "loss": 0.3137, "step": 2175 }, { "epoch": 0.6522064323111444, "grad_norm": 1.951509417973989, "learning_rate": 3.2536802767229243e-06, "loss": 0.3084, "step": 2180 }, { "epoch": 0.6537023186237846, "grad_norm": 1.772116366162939, "learning_rate": 3.2292300801243133e-06, "loss": 0.3102, "step": 2185 }, { "epoch": 0.6551982049364248, "grad_norm": 1.8140401176421401, "learning_rate": 3.20482820431796e-06, "loss": 0.3056, "step": 2190 }, { "epoch": 0.6566940912490651, "grad_norm": 1.8243620700136636, "learning_rate": 3.180475315182563e-06, "loss": 0.3033, "step": 2195 }, { "epoch": 0.6581899775617053, "grad_norm": 1.8380166168759837, "learning_rate": 3.1561720772600736e-06, "loss": 0.304, "step": 2200 }, { "epoch": 0.6596858638743456, "grad_norm": 1.8336050039462124, "learning_rate": 3.1319191537375577e-06, "loss": 0.3143, "step": 2205 }, { "epoch": 0.6611817501869858, "grad_norm": 1.8667890213032734, "learning_rate": 3.107717206429105e-06, "loss": 0.3031, "step": 2210 }, { "epoch": 0.6626776364996261, "grad_norm": 1.7638159112909835, "learning_rate": 3.0835668957577636e-06, "loss": 0.3013, "step": 2215 }, { "epoch": 0.6641735228122663, "grad_norm": 1.900781665691589, "learning_rate": 3.059468880737519e-06, "loss": 0.3073, "step": 2220 }, { "epoch": 0.6656694091249065, "grad_norm": 1.943524014415726, "learning_rate": 3.035423818955316e-06, "loss": 0.3087, "step": 2225 }, { "epoch": 0.6671652954375468, "grad_norm": 1.736021065342517, "learning_rate": 3.0114323665531066e-06, "loss": 0.2979, "step": 2230 }, { "epoch": 0.668661181750187, "grad_norm": 1.746010053168365, "learning_rate": 2.987495178209951e-06, "loss": 0.307, "step": 2235 }, { "epoch": 0.6701570680628273, "grad_norm": 1.8018064213578624, "learning_rate": 2.9636129071241515e-06, "loss": 0.3126, "step": 2240 }, { "epoch": 0.6716529543754675, "grad_norm": 1.8077932770071266, "learning_rate": 2.9397862049954307e-06, "loss": 0.3004, "step": 2245 }, { "epoch": 0.6731488406881077, "grad_norm": 1.7048569088891747, "learning_rate": 2.916015722007137e-06, "loss": 0.3066, "step": 2250 }, { "epoch": 0.674644727000748, "grad_norm": 1.7988871113907166, "learning_rate": 2.892302106808519e-06, "loss": 0.3052, "step": 2255 }, { "epoch": 0.6761406133133882, "grad_norm": 1.8715481375394143, "learning_rate": 2.8686460064970078e-06, "loss": 0.3085, "step": 2260 }, { "epoch": 0.6776364996260285, "grad_norm": 1.8258948545382783, "learning_rate": 2.8450480666005743e-06, "loss": 0.3023, "step": 2265 }, { "epoch": 0.6791323859386686, "grad_norm": 1.7183769572814935, "learning_rate": 2.821508931060104e-06, "loss": 0.3169, "step": 2270 }, { "epoch": 0.680628272251309, "grad_norm": 1.8087144140013556, "learning_rate": 2.7980292422118282e-06, "loss": 0.3, "step": 2275 }, { "epoch": 0.6821241585639491, "grad_norm": 1.9721800720444596, "learning_rate": 2.7746096407698004e-06, "loss": 0.3029, "step": 2280 }, { "epoch": 0.6836200448765893, "grad_norm": 1.8344419672931702, "learning_rate": 2.7512507658083996e-06, "loss": 0.2996, "step": 2285 }, { "epoch": 0.6851159311892296, "grad_norm": 1.757267551606752, "learning_rate": 2.7279532547449083e-06, "loss": 0.3033, "step": 2290 }, { "epoch": 0.6866118175018698, "grad_norm": 1.8575093423008022, "learning_rate": 2.704717743322104e-06, "loss": 0.2873, "step": 2295 }, { "epoch": 0.6881077038145101, "grad_norm": 1.761502547654336, "learning_rate": 2.681544865590926e-06, "loss": 0.2999, "step": 2300 }, { "epoch": 0.6896035901271503, "grad_norm": 1.958074773552565, "learning_rate": 2.6584352538931523e-06, "loss": 0.3023, "step": 2305 }, { "epoch": 0.6910994764397905, "grad_norm": 1.7604780827427178, "learning_rate": 2.635389538844166e-06, "loss": 0.2923, "step": 2310 }, { "epoch": 0.6925953627524308, "grad_norm": 1.860509876291064, "learning_rate": 2.612408349315734e-06, "loss": 0.2968, "step": 2315 }, { "epoch": 0.694091249065071, "grad_norm": 1.8116830542415268, "learning_rate": 2.5894923124188498e-06, "loss": 0.2911, "step": 2320 }, { "epoch": 0.6955871353777113, "grad_norm": 1.816773761816662, "learning_rate": 2.5666420534866256e-06, "loss": 0.3017, "step": 2325 }, { "epoch": 0.6970830216903515, "grad_norm": 1.810456487051493, "learning_rate": 2.543858196057214e-06, "loss": 0.3045, "step": 2330 }, { "epoch": 0.6985789080029918, "grad_norm": 1.8462477832363797, "learning_rate": 2.5211413618568114e-06, "loss": 0.2979, "step": 2335 }, { "epoch": 0.700074794315632, "grad_norm": 1.749680469906487, "learning_rate": 2.4984921707826805e-06, "loss": 0.298, "step": 2340 }, { "epoch": 0.7015706806282722, "grad_norm": 1.7715319612256217, "learning_rate": 2.4759112408862366e-06, "loss": 0.2905, "step": 2345 }, { "epoch": 0.7030665669409125, "grad_norm": 1.9011349884243633, "learning_rate": 2.4533991883561868e-06, "loss": 0.2938, "step": 2350 }, { "epoch": 0.7045624532535527, "grad_norm": 1.7509668722553002, "learning_rate": 2.4309566275017027e-06, "loss": 0.2931, "step": 2355 }, { "epoch": 0.706058339566193, "grad_norm": 1.7463279622870067, "learning_rate": 2.4085841707356787e-06, "loss": 0.2948, "step": 2360 }, { "epoch": 0.7075542258788332, "grad_norm": 1.7457958614044327, "learning_rate": 2.386282428558001e-06, "loss": 0.2935, "step": 2365 }, { "epoch": 0.7090501121914734, "grad_norm": 1.8306487338719184, "learning_rate": 2.364052009538892e-06, "loss": 0.3029, "step": 2370 }, { "epoch": 0.7105459985041137, "grad_norm": 1.8902782477754185, "learning_rate": 2.341893520302313e-06, "loss": 0.2937, "step": 2375 }, { "epoch": 0.7120418848167539, "grad_norm": 1.7948687484011157, "learning_rate": 2.3198075655094023e-06, "loss": 0.2925, "step": 2380 }, { "epoch": 0.7135377711293942, "grad_norm": 1.8682547497864384, "learning_rate": 2.297794747841976e-06, "loss": 0.2992, "step": 2385 }, { "epoch": 0.7150336574420344, "grad_norm": 1.7985072864408282, "learning_rate": 2.275855667986086e-06, "loss": 0.2992, "step": 2390 }, { "epoch": 0.7165295437546746, "grad_norm": 1.6780824098442955, "learning_rate": 2.2539909246156257e-06, "loss": 0.2902, "step": 2395 }, { "epoch": 0.7180254300673149, "grad_norm": 1.9327685022447323, "learning_rate": 2.232201114375988e-06, "loss": 0.2879, "step": 2400 }, { "epoch": 0.7195213163799551, "grad_norm": 1.8312593750432005, "learning_rate": 2.2104868318677963e-06, "loss": 0.2967, "step": 2405 }, { "epoch": 0.7210172026925954, "grad_norm": 1.8041698028281294, "learning_rate": 2.1888486696306706e-06, "loss": 0.2849, "step": 2410 }, { "epoch": 0.7225130890052356, "grad_norm": 1.8021876820178402, "learning_rate": 2.1672872181270575e-06, "loss": 0.2918, "step": 2415 }, { "epoch": 0.7240089753178759, "grad_norm": 1.807836863115144, "learning_rate": 2.1458030657261235e-06, "loss": 0.282, "step": 2420 }, { "epoch": 0.7255048616305161, "grad_norm": 1.7515999717106407, "learning_rate": 2.1243967986876933e-06, "loss": 0.2922, "step": 2425 }, { "epoch": 0.7270007479431563, "grad_norm": 1.8149872804694056, "learning_rate": 2.1030690011462567e-06, "loss": 0.2912, "step": 2430 }, { "epoch": 0.7284966342557966, "grad_norm": 1.7878582875336215, "learning_rate": 2.081820255095028e-06, "loss": 0.2886, "step": 2435 }, { "epoch": 0.7299925205684368, "grad_norm": 1.7664930533873893, "learning_rate": 2.0606511403700575e-06, "loss": 0.2964, "step": 2440 }, { "epoch": 0.7314884068810771, "grad_norm": 1.7856577814800616, "learning_rate": 2.0395622346344213e-06, "loss": 0.2849, "step": 2445 }, { "epoch": 0.7329842931937173, "grad_norm": 1.7620387064486105, "learning_rate": 2.018554113362449e-06, "loss": 0.2811, "step": 2450 }, { "epoch": 0.7344801795063575, "grad_norm": 1.746148787119175, "learning_rate": 1.9976273498240234e-06, "loss": 0.2866, "step": 2455 }, { "epoch": 0.7359760658189978, "grad_norm": 1.759195000248038, "learning_rate": 1.976782515068938e-06, "loss": 0.294, "step": 2460 }, { "epoch": 0.737471952131638, "grad_norm": 1.6081462651916374, "learning_rate": 1.9560201779113056e-06, "loss": 0.2821, "step": 2465 }, { "epoch": 0.7389678384442783, "grad_norm": 1.8127282683936143, "learning_rate": 1.9353409049140515e-06, "loss": 0.2827, "step": 2470 }, { "epoch": 0.7404637247569185, "grad_norm": 1.7928349569557254, "learning_rate": 1.9147452603734402e-06, "loss": 0.2889, "step": 2475 }, { "epoch": 0.7419596110695588, "grad_norm": 1.7519180416889486, "learning_rate": 1.894233806303689e-06, "loss": 0.2816, "step": 2480 }, { "epoch": 0.743455497382199, "grad_norm": 1.792648064853805, "learning_rate": 1.8738071024216141e-06, "loss": 0.2843, "step": 2485 }, { "epoch": 0.7449513836948392, "grad_norm": 1.7815734013272622, "learning_rate": 1.8534657061313744e-06, "loss": 0.2742, "step": 2490 }, { "epoch": 0.7464472700074795, "grad_norm": 1.825180595387709, "learning_rate": 1.8332101725092522e-06, "loss": 0.2816, "step": 2495 }, { "epoch": 0.7479431563201197, "grad_norm": 1.8420097876440362, "learning_rate": 1.8130410542885084e-06, "loss": 0.2808, "step": 2500 }, { "epoch": 0.74943904263276, "grad_norm": 1.8442353488656769, "learning_rate": 1.7929589018443016e-06, "loss": 0.2923, "step": 2505 }, { "epoch": 0.7509349289454001, "grad_norm": 1.876793012170064, "learning_rate": 1.7729642631786613e-06, "loss": 0.2872, "step": 2510 }, { "epoch": 0.7524308152580403, "grad_norm": 1.7511287142130798, "learning_rate": 1.7530576839055453e-06, "loss": 0.2822, "step": 2515 }, { "epoch": 0.7539267015706806, "grad_norm": 1.8394555324866848, "learning_rate": 1.7332397072359435e-06, "loss": 0.2765, "step": 2520 }, { "epoch": 0.7554225878833208, "grad_norm": 1.773080627419537, "learning_rate": 1.7135108739630573e-06, "loss": 0.2772, "step": 2525 }, { "epoch": 0.7569184741959611, "grad_norm": 1.7397840701003071, "learning_rate": 1.693871722447542e-06, "loss": 0.2748, "step": 2530 }, { "epoch": 0.7584143605086013, "grad_norm": 1.8139047134561623, "learning_rate": 1.6743227886028152e-06, "loss": 0.2809, "step": 2535 }, { "epoch": 0.7599102468212415, "grad_norm": 1.723146398169513, "learning_rate": 1.6548646058804347e-06, "loss": 0.277, "step": 2540 }, { "epoch": 0.7614061331338818, "grad_norm": 1.755509982892445, "learning_rate": 1.6354977052555393e-06, "loss": 0.2845, "step": 2545 }, { "epoch": 0.762902019446522, "grad_norm": 1.7634745348399379, "learning_rate": 1.6162226152123633e-06, "loss": 0.2845, "step": 2550 }, { "epoch": 0.7643979057591623, "grad_norm": 1.8539062432851583, "learning_rate": 1.5970398617298078e-06, "loss": 0.2828, "step": 2555 }, { "epoch": 0.7658937920718025, "grad_norm": 1.8053358835812254, "learning_rate": 1.5779499682670963e-06, "loss": 0.2774, "step": 2560 }, { "epoch": 0.7673896783844428, "grad_norm": 1.8014531312640616, "learning_rate": 1.5589534557494868e-06, "loss": 0.2841, "step": 2565 }, { "epoch": 0.768885564697083, "grad_norm": 1.735571527942806, "learning_rate": 1.5400508425540562e-06, "loss": 0.2746, "step": 2570 }, { "epoch": 0.7703814510097232, "grad_norm": 1.8540824858023373, "learning_rate": 1.5212426444955569e-06, "loss": 0.2807, "step": 2575 }, { "epoch": 0.7718773373223635, "grad_norm": 1.7139393419525597, "learning_rate": 1.5025293748123354e-06, "loss": 0.2815, "step": 2580 }, { "epoch": 0.7733732236350037, "grad_norm": 1.6431033212935895, "learning_rate": 1.4839115441523355e-06, "loss": 0.2696, "step": 2585 }, { "epoch": 0.774869109947644, "grad_norm": 1.7227778483828726, "learning_rate": 1.4653896605591584e-06, "loss": 0.2732, "step": 2590 }, { "epoch": 0.7763649962602842, "grad_norm": 1.7527519060060008, "learning_rate": 1.4469642294582048e-06, "loss": 0.2748, "step": 2595 }, { "epoch": 0.7778608825729244, "grad_norm": 1.6997524796558416, "learning_rate": 1.4286357536428696e-06, "loss": 0.2729, "step": 2600 }, { "epoch": 0.7793567688855647, "grad_norm": 1.7807204337692575, "learning_rate": 1.4104047332608379e-06, "loss": 0.2755, "step": 2605 }, { "epoch": 0.7808526551982049, "grad_norm": 1.7182846099936764, "learning_rate": 1.392271665800427e-06, "loss": 0.2777, "step": 2610 }, { "epoch": 0.7823485415108452, "grad_norm": 1.7302301084436003, "learning_rate": 1.3742370460770144e-06, "loss": 0.2762, "step": 2615 }, { "epoch": 0.7838444278234854, "grad_norm": 1.711106037244554, "learning_rate": 1.3563013662195356e-06, "loss": 0.2737, "step": 2620 }, { "epoch": 0.7853403141361257, "grad_norm": 1.8191358842574659, "learning_rate": 1.3384651156570483e-06, "loss": 0.2732, "step": 2625 }, { "epoch": 0.7868362004487659, "grad_norm": 1.751260410944088, "learning_rate": 1.3207287811053893e-06, "loss": 0.2771, "step": 2630 }, { "epoch": 0.7883320867614061, "grad_norm": 1.7320253510102213, "learning_rate": 1.3030928465538822e-06, "loss": 0.27, "step": 2635 }, { "epoch": 0.7898279730740464, "grad_norm": 1.7406452518990843, "learning_rate": 1.2855577932521352e-06, "loss": 0.2703, "step": 2640 }, { "epoch": 0.7913238593866866, "grad_norm": 1.8538751789457641, "learning_rate": 1.2681240996969085e-06, "loss": 0.2776, "step": 2645 }, { "epoch": 0.7928197456993269, "grad_norm": 1.740887599672242, "learning_rate": 1.250792241619051e-06, "loss": 0.2736, "step": 2650 }, { "epoch": 0.7943156320119671, "grad_norm": 1.8281991178787242, "learning_rate": 1.233562691970533e-06, "loss": 0.2749, "step": 2655 }, { "epoch": 0.7958115183246073, "grad_norm": 1.6556477939621426, "learning_rate": 1.2164359209115235e-06, "loss": 0.2776, "step": 2660 }, { "epoch": 0.7973074046372476, "grad_norm": 1.695787778492541, "learning_rate": 1.1994123957975722e-06, "loss": 0.2702, "step": 2665 }, { "epoch": 0.7988032909498878, "grad_norm": 1.7707776645975837, "learning_rate": 1.1824925811668485e-06, "loss": 0.2627, "step": 2670 }, { "epoch": 0.8002991772625281, "grad_norm": 1.8300425136047838, "learning_rate": 1.1656769387274714e-06, "loss": 0.2688, "step": 2675 }, { "epoch": 0.8017950635751683, "grad_norm": 1.6906589157556278, "learning_rate": 1.1489659273449073e-06, "loss": 0.2672, "step": 2680 }, { "epoch": 0.8032909498878086, "grad_norm": 1.7718115103968484, "learning_rate": 1.132360003029449e-06, "loss": 0.2673, "step": 2685 }, { "epoch": 0.8047868362004488, "grad_norm": 1.7597119643475179, "learning_rate": 1.115859618923773e-06, "loss": 0.2744, "step": 2690 }, { "epoch": 0.806282722513089, "grad_norm": 1.7801333538259148, "learning_rate": 1.0994652252905695e-06, "loss": 0.2662, "step": 2695 }, { "epoch": 0.8077786088257293, "grad_norm": 1.6866429011639965, "learning_rate": 1.083177269500264e-06, "loss": 0.2675, "step": 2700 }, { "epoch": 0.8092744951383695, "grad_norm": 1.9195992948000482, "learning_rate": 1.0669961960188008e-06, "loss": 0.2739, "step": 2705 }, { "epoch": 0.8107703814510098, "grad_norm": 1.8220041781840073, "learning_rate": 1.0509224463955249e-06, "loss": 0.2604, "step": 2710 }, { "epoch": 0.81226626776365, "grad_norm": 1.7303540258737908, "learning_rate": 1.0349564592511162e-06, "loss": 0.2743, "step": 2715 }, { "epoch": 0.8137621540762902, "grad_norm": 1.6406056857804932, "learning_rate": 1.0190986702656403e-06, "loss": 0.2719, "step": 2720 }, { "epoch": 0.8152580403889305, "grad_norm": 1.8590839739169418, "learning_rate": 1.0033495121666442e-06, "loss": 0.273, "step": 2725 }, { "epoch": 0.8167539267015707, "grad_norm": 1.7341252368355093, "learning_rate": 9.877094147173566e-07, "loss": 0.2712, "step": 2730 }, { "epoch": 0.818249813014211, "grad_norm": 1.7272695337289556, "learning_rate": 9.721788047049586e-07, "loss": 0.2628, "step": 2735 }, { "epoch": 0.8197456993268512, "grad_norm": 1.7050895419647492, "learning_rate": 9.567581059289322e-07, "loss": 0.2678, "step": 2740 }, { "epoch": 0.8212415856394913, "grad_norm": 1.7258978187627068, "learning_rate": 9.414477391895044e-07, "loss": 0.2715, "step": 2745 }, { "epoch": 0.8227374719521316, "grad_norm": 1.8460755537922702, "learning_rate": 9.262481222761588e-07, "loss": 0.2716, "step": 2750 }, { "epoch": 0.8242333582647718, "grad_norm": 1.7677837124955216, "learning_rate": 9.11159669956237e-07, "loss": 0.2725, "step": 2755 }, { "epoch": 0.8257292445774121, "grad_norm": 1.7183389424616196, "learning_rate": 8.961827939636198e-07, "loss": 0.2683, "step": 2760 }, { "epoch": 0.8272251308900523, "grad_norm": 1.8851170229714924, "learning_rate": 8.813179029874874e-07, "loss": 0.2588, "step": 2765 }, { "epoch": 0.8287210172026926, "grad_norm": 1.8163919089444864, "learning_rate": 8.665654026611797e-07, "loss": 0.2631, "step": 2770 }, { "epoch": 0.8302169035153328, "grad_norm": 1.7098860990754234, "learning_rate": 8.51925695551113e-07, "loss": 0.2679, "step": 2775 }, { "epoch": 0.831712789827973, "grad_norm": 1.7663056355635183, "learning_rate": 8.373991811458027e-07, "loss": 0.2652, "step": 2780 }, { "epoch": 0.8332086761406133, "grad_norm": 1.7186868648976898, "learning_rate": 8.229862558449592e-07, "loss": 0.2661, "step": 2785 }, { "epoch": 0.8347045624532535, "grad_norm": 1.8059879215165224, "learning_rate": 8.086873129486722e-07, "loss": 0.2686, "step": 2790 }, { "epoch": 0.8362004487658938, "grad_norm": 1.7374284001547664, "learning_rate": 7.945027426466801e-07, "loss": 0.2708, "step": 2795 }, { "epoch": 0.837696335078534, "grad_norm": 1.6598096486422094, "learning_rate": 7.804329320077181e-07, "loss": 0.2653, "step": 2800 }, { "epoch": 0.8391922213911742, "grad_norm": 1.676734657625906, "learning_rate": 7.664782649689611e-07, "loss": 0.2563, "step": 2805 }, { "epoch": 0.8406881077038145, "grad_norm": 1.7941246676620155, "learning_rate": 7.526391223255386e-07, "loss": 0.2643, "step": 2810 }, { "epoch": 0.8421839940164547, "grad_norm": 1.7441327844730907, "learning_rate": 7.389158817201541e-07, "loss": 0.2663, "step": 2815 }, { "epoch": 0.843679880329095, "grad_norm": 1.6764728143369185, "learning_rate": 7.253089176327738e-07, "loss": 0.2631, "step": 2820 }, { "epoch": 0.8451757666417352, "grad_norm": 1.7090343355435693, "learning_rate": 7.118186013704065e-07, "loss": 0.2579, "step": 2825 }, { "epoch": 0.8466716529543755, "grad_norm": 1.723034589615204, "learning_rate": 6.984453010569758e-07, "loss": 0.2718, "step": 2830 }, { "epoch": 0.8481675392670157, "grad_norm": 1.7083769223090157, "learning_rate": 6.851893816232729e-07, "loss": 0.259, "step": 2835 }, { "epoch": 0.8496634255796559, "grad_norm": 1.6983173618906942, "learning_rate": 6.720512047969957e-07, "loss": 0.2655, "step": 2840 }, { "epoch": 0.8511593118922962, "grad_norm": 1.6008652695866359, "learning_rate": 6.590311290928825e-07, "loss": 0.2661, "step": 2845 }, { "epoch": 0.8526551982049364, "grad_norm": 1.723592329316595, "learning_rate": 6.461295098029269e-07, "loss": 0.2548, "step": 2850 }, { "epoch": 0.8541510845175767, "grad_norm": 1.8054575785485054, "learning_rate": 6.333466989866787e-07, "loss": 0.264, "step": 2855 }, { "epoch": 0.8556469708302169, "grad_norm": 1.7902077125134892, "learning_rate": 6.206830454616447e-07, "loss": 0.266, "step": 2860 }, { "epoch": 0.8571428571428571, "grad_norm": 1.7147769185915753, "learning_rate": 6.08138894793765e-07, "loss": 0.2654, "step": 2865 }, { "epoch": 0.8586387434554974, "grad_norm": 1.7518112730752275, "learning_rate": 5.957145892879829e-07, "loss": 0.2594, "step": 2870 }, { "epoch": 0.8601346297681376, "grad_norm": 1.811592287193994, "learning_rate": 5.834104679789077e-07, "loss": 0.2647, "step": 2875 }, { "epoch": 0.8616305160807779, "grad_norm": 1.773326433422328, "learning_rate": 5.712268666215559e-07, "loss": 0.264, "step": 2880 }, { "epoch": 0.8631264023934181, "grad_norm": 1.68178039725722, "learning_rate": 5.591641176822005e-07, "loss": 0.2614, "step": 2885 }, { "epoch": 0.8646222887060584, "grad_norm": 1.6842479909967625, "learning_rate": 5.472225503292883e-07, "loss": 0.2621, "step": 2890 }, { "epoch": 0.8661181750186986, "grad_norm": 1.7449782410599817, "learning_rate": 5.354024904244632e-07, "loss": 0.2522, "step": 2895 }, { "epoch": 0.8676140613313388, "grad_norm": 1.7259602780620398, "learning_rate": 5.237042605136689e-07, "loss": 0.2614, "step": 2900 }, { "epoch": 0.8691099476439791, "grad_norm": 1.688101500268341, "learning_rate": 5.121281798183547e-07, "loss": 0.2611, "step": 2905 }, { "epoch": 0.8706058339566193, "grad_norm": 1.7726586716734274, "learning_rate": 5.00674564226758e-07, "loss": 0.2544, "step": 2910 }, { "epoch": 0.8721017202692596, "grad_norm": 1.6935216955087868, "learning_rate": 4.893437262852885e-07, "loss": 0.2523, "step": 2915 }, { "epoch": 0.8735976065818998, "grad_norm": 1.878804856678552, "learning_rate": 4.781359751899984e-07, "loss": 0.2538, "step": 2920 }, { "epoch": 0.87509349289454, "grad_norm": 1.64770700770445, "learning_rate": 4.6705161677814024e-07, "loss": 0.2569, "step": 2925 }, { "epoch": 0.8765893792071803, "grad_norm": 1.860024134107886, "learning_rate": 4.560909535198299e-07, "loss": 0.2576, "step": 2930 }, { "epoch": 0.8780852655198205, "grad_norm": 1.675929796569693, "learning_rate": 4.4525428450978627e-07, "loss": 0.2539, "step": 2935 }, { "epoch": 0.8795811518324608, "grad_norm": 1.6649509488101208, "learning_rate": 4.3454190545917317e-07, "loss": 0.2654, "step": 2940 }, { "epoch": 0.881077038145101, "grad_norm": 1.7894352860083609, "learning_rate": 4.239541086875265e-07, "loss": 0.2647, "step": 2945 }, { "epoch": 0.8825729244577412, "grad_norm": 1.688537215035147, "learning_rate": 4.134911831147798e-07, "loss": 0.2563, "step": 2950 }, { "epoch": 0.8840688107703815, "grad_norm": 1.7422422459372517, "learning_rate": 4.031534142533816e-07, "loss": 0.2517, "step": 2955 }, { "epoch": 0.8855646970830217, "grad_norm": 1.832964243427611, "learning_rate": 3.9294108420049935e-07, "loss": 0.2664, "step": 2960 }, { "epoch": 0.887060583395662, "grad_norm": 1.7932779810454953, "learning_rate": 3.828544716303284e-07, "loss": 0.2543, "step": 2965 }, { "epoch": 0.8885564697083022, "grad_norm": 1.8073243004592312, "learning_rate": 3.728938517864794e-07, "loss": 0.2601, "step": 2970 }, { "epoch": 0.8900523560209425, "grad_norm": 1.714561248097055, "learning_rate": 3.6305949647447545e-07, "loss": 0.2564, "step": 2975 }, { "epoch": 0.8915482423335827, "grad_norm": 1.647575871046988, "learning_rate": 3.5335167405433024e-07, "loss": 0.2607, "step": 2980 }, { "epoch": 0.8930441286462228, "grad_norm": 1.6977984176077578, "learning_rate": 3.437706494332266e-07, "loss": 0.2522, "step": 2985 }, { "epoch": 0.8945400149588631, "grad_norm": 1.7141499596339997, "learning_rate": 3.3431668405828675e-07, "loss": 0.2558, "step": 2990 }, { "epoch": 0.8960359012715033, "grad_norm": 1.6494105719449952, "learning_rate": 3.249900359094388e-07, "loss": 0.256, "step": 2995 }, { "epoch": 0.8975317875841436, "grad_norm": 1.6630293618544516, "learning_rate": 3.1579095949237584e-07, "loss": 0.2508, "step": 3000 }, { "epoch": 0.8990276738967838, "grad_norm": 1.7346655505039537, "learning_rate": 3.067197058316157e-07, "loss": 0.2614, "step": 3005 }, { "epoch": 0.900523560209424, "grad_norm": 1.7107296935219805, "learning_rate": 2.9777652246364306e-07, "loss": 0.2538, "step": 3010 }, { "epoch": 0.9020194465220643, "grad_norm": 1.6491436991741326, "learning_rate": 2.889616534301598e-07, "loss": 0.2521, "step": 3015 }, { "epoch": 0.9035153328347045, "grad_norm": 1.7323747022001885, "learning_rate": 2.8027533927142525e-07, "loss": 0.2593, "step": 3020 }, { "epoch": 0.9050112191473448, "grad_norm": 1.7534706658955106, "learning_rate": 2.717178170196916e-07, "loss": 0.249, "step": 3025 }, { "epoch": 0.906507105459985, "grad_norm": 1.7068949519667596, "learning_rate": 2.6328932019273556e-07, "loss": 0.2625, "step": 3030 }, { "epoch": 0.9080029917726253, "grad_norm": 1.7466561136363379, "learning_rate": 2.549900787874876e-07, "loss": 0.2572, "step": 3035 }, { "epoch": 0.9094988780852655, "grad_norm": 1.6487218463492848, "learning_rate": 2.468203192737512e-07, "loss": 0.2618, "step": 3040 }, { "epoch": 0.9109947643979057, "grad_norm": 1.5699982289102938, "learning_rate": 2.3878026458803047e-07, "loss": 0.2559, "step": 3045 }, { "epoch": 0.912490650710546, "grad_norm": 1.678827851691801, "learning_rate": 2.3087013412743998e-07, "loss": 0.2504, "step": 3050 }, { "epoch": 0.9139865370231862, "grad_norm": 1.732344143690627, "learning_rate": 2.2309014374372106e-07, "loss": 0.2556, "step": 3055 }, { "epoch": 0.9154824233358265, "grad_norm": 1.6563146141875156, "learning_rate": 2.1544050573735153e-07, "loss": 0.2555, "step": 3060 }, { "epoch": 0.9169783096484667, "grad_norm": 1.8096543479163172, "learning_rate": 2.079214288517506e-07, "loss": 0.2553, "step": 3065 }, { "epoch": 0.9184741959611069, "grad_norm": 1.690741820888644, "learning_rate": 2.0053311826758458e-07, "loss": 0.256, "step": 3070 }, { "epoch": 0.9199700822737472, "grad_norm": 1.7615351195511213, "learning_rate": 1.9327577559716815e-07, "loss": 0.2562, "step": 3075 }, { "epoch": 0.9214659685863874, "grad_norm": 1.8449608271118088, "learning_rate": 1.8614959887896078e-07, "loss": 0.2549, "step": 3080 }, { "epoch": 0.9229618548990277, "grad_norm": 1.7775694545753302, "learning_rate": 1.79154782572164e-07, "loss": 0.247, "step": 3085 }, { "epoch": 0.9244577412116679, "grad_norm": 1.6956013013917148, "learning_rate": 1.7229151755141394e-07, "loss": 0.2571, "step": 3090 }, { "epoch": 0.9259536275243081, "grad_norm": 1.6373082200013647, "learning_rate": 1.655599911015754e-07, "loss": 0.2547, "step": 3095 }, { "epoch": 0.9274495138369484, "grad_norm": 1.7078162984487721, "learning_rate": 1.5896038691262772e-07, "loss": 0.2592, "step": 3100 }, { "epoch": 0.9289454001495886, "grad_norm": 1.6367425145666301, "learning_rate": 1.52492885074656e-07, "loss": 0.2561, "step": 3105 }, { "epoch": 0.9304412864622289, "grad_norm": 1.5872236691558035, "learning_rate": 1.4615766207293157e-07, "loss": 0.2518, "step": 3110 }, { "epoch": 0.9319371727748691, "grad_norm": 1.714675291765629, "learning_rate": 1.3995489078310055e-07, "loss": 0.2633, "step": 3115 }, { "epoch": 0.9334330590875094, "grad_norm": 1.619406307330865, "learning_rate": 1.338847404664667e-07, "loss": 0.2548, "step": 3120 }, { "epoch": 0.9349289454001496, "grad_norm": 1.5539413237386495, "learning_rate": 1.2794737676536993e-07, "loss": 0.2527, "step": 3125 }, { "epoch": 0.9364248317127898, "grad_norm": 1.7495161399355714, "learning_rate": 1.2214296169866578e-07, "loss": 0.2515, "step": 3130 }, { "epoch": 0.9379207180254301, "grad_norm": 1.641652885536429, "learning_rate": 1.164716536573074e-07, "loss": 0.2501, "step": 3135 }, { "epoch": 0.9394166043380703, "grad_norm": 1.753141085715687, "learning_rate": 1.1093360740002057e-07, "loss": 0.2515, "step": 3140 }, { "epoch": 0.9409124906507106, "grad_norm": 1.7530034719134988, "learning_rate": 1.0552897404908391e-07, "loss": 0.2559, "step": 3145 }, { "epoch": 0.9424083769633508, "grad_norm": 1.5804220071987112, "learning_rate": 1.0025790108620092e-07, "loss": 0.2483, "step": 3150 }, { "epoch": 0.943904263275991, "grad_norm": 1.822783043661551, "learning_rate": 9.512053234847774e-08, "loss": 0.258, "step": 3155 }, { "epoch": 0.9454001495886313, "grad_norm": 1.7776638404626903, "learning_rate": 9.01170080244984e-08, "loss": 0.2463, "step": 3160 }, { "epoch": 0.9468960359012715, "grad_norm": 1.7244463932526486, "learning_rate": 8.52474646504986e-08, "loss": 0.2506, "step": 3165 }, { "epoch": 0.9483919222139118, "grad_norm": 1.7184065717867174, "learning_rate": 8.05120351066413e-08, "loss": 0.2605, "step": 3170 }, { "epoch": 0.949887808526552, "grad_norm": 1.7978606844090408, "learning_rate": 7.591084861338749e-08, "loss": 0.2503, "step": 3175 }, { "epoch": 0.9513836948391923, "grad_norm": 1.6764242072572402, "learning_rate": 7.144403072797346e-08, "loss": 0.2523, "step": 3180 }, { "epoch": 0.9528795811518325, "grad_norm": 1.6752659598734612, "learning_rate": 6.711170334098294e-08, "loss": 0.2566, "step": 3185 }, { "epoch": 0.9543754674644727, "grad_norm": 1.7696006414428376, "learning_rate": 6.291398467302146e-08, "loss": 0.2579, "step": 3190 }, { "epoch": 0.955871353777113, "grad_norm": 1.6541063626129755, "learning_rate": 5.885098927148947e-08, "loss": 0.2505, "step": 3195 }, { "epoch": 0.9573672400897532, "grad_norm": 1.791951476550907, "learning_rate": 5.492282800745707e-08, "loss": 0.252, "step": 3200 }, { "epoch": 0.9588631264023935, "grad_norm": 1.6998940151846391, "learning_rate": 5.112960807263978e-08, "loss": 0.2602, "step": 3205 }, { "epoch": 0.9603590127150337, "grad_norm": 1.739892053817991, "learning_rate": 4.7471432976471944e-08, "loss": 0.264, "step": 3210 }, { "epoch": 0.9618548990276738, "grad_norm": 1.5429992279839573, "learning_rate": 4.3948402543282366e-08, "loss": 0.2543, "step": 3215 }, { "epoch": 0.9633507853403142, "grad_norm": 1.772813294024904, "learning_rate": 4.056061290956981e-08, "loss": 0.2524, "step": 3220 }, { "epoch": 0.9648466716529543, "grad_norm": 1.5751929247313246, "learning_rate": 3.730815652138231e-08, "loss": 0.2525, "step": 3225 }, { "epoch": 0.9663425579655947, "grad_norm": 1.7360588122089868, "learning_rate": 3.4191122131790324e-08, "loss": 0.255, "step": 3230 }, { "epoch": 0.9678384442782348, "grad_norm": 1.7743122424766984, "learning_rate": 3.120959479846919e-08, "loss": 0.2584, "step": 3235 }, { "epoch": 0.9693343305908751, "grad_norm": 1.574467195657007, "learning_rate": 2.8363655881374906e-08, "loss": 0.2558, "step": 3240 }, { "epoch": 0.9708302169035153, "grad_norm": 1.80180199036063, "learning_rate": 2.5653383040524228e-08, "loss": 0.2568, "step": 3245 }, { "epoch": 0.9723261032161555, "grad_norm": 1.6886241273143858, "learning_rate": 2.3078850233878015e-08, "loss": 0.2466, "step": 3250 }, { "epoch": 0.9738219895287958, "grad_norm": 1.7815633691396229, "learning_rate": 2.064012771532009e-08, "loss": 0.2536, "step": 3255 }, { "epoch": 0.975317875841436, "grad_norm": 1.6956109134529065, "learning_rate": 1.83372820327421e-08, "loss": 0.2592, "step": 3260 }, { "epoch": 0.9768137621540763, "grad_norm": 1.7584432907260417, "learning_rate": 1.6170376026226065e-08, "loss": 0.2647, "step": 3265 }, { "epoch": 0.9783096484667165, "grad_norm": 1.687889717075937, "learning_rate": 1.4139468826331327e-08, "loss": 0.2529, "step": 3270 }, { "epoch": 0.9798055347793567, "grad_norm": 1.6973842345080912, "learning_rate": 1.2244615852479158e-08, "loss": 0.2586, "step": 3275 }, { "epoch": 0.981301421091997, "grad_norm": 1.7860998582475756, "learning_rate": 1.0485868811441757e-08, "loss": 0.2596, "step": 3280 }, { "epoch": 0.9827973074046372, "grad_norm": 1.7444036807918029, "learning_rate": 8.86327569593115e-09, "loss": 0.253, "step": 3285 }, { "epoch": 0.9842931937172775, "grad_norm": 1.7876798673093501, "learning_rate": 7.376880783289131e-09, "loss": 0.2551, "step": 3290 }, { "epoch": 0.9857890800299177, "grad_norm": 1.66622892909602, "learning_rate": 6.026724634279335e-09, "loss": 0.2557, "step": 3295 }, { "epoch": 0.9872849663425579, "grad_norm": 1.7386422804846284, "learning_rate": 4.8128440919792405e-09, "loss": 0.253, "step": 3300 }, { "epoch": 0.9887808526551982, "grad_norm": 1.5376218727713236, "learning_rate": 3.73527228077708e-09, "loss": 0.2501, "step": 3305 }, { "epoch": 0.9902767389678384, "grad_norm": 1.7638583930274379, "learning_rate": 2.7940386054664537e-09, "loss": 0.262, "step": 3310 }, { "epoch": 0.9917726252804787, "grad_norm": 1.9162749151140541, "learning_rate": 1.9891687504436373e-09, "loss": 0.2575, "step": 3315 }, { "epoch": 0.9932685115931189, "grad_norm": 1.880864088528354, "learning_rate": 1.320684679008144e-09, "loss": 0.2602, "step": 3320 }, { "epoch": 0.9947643979057592, "grad_norm": 1.7803280986620529, "learning_rate": 7.886046327609809e-10, "loss": 0.2543, "step": 3325 }, { "epoch": 0.9962602842183994, "grad_norm": 1.5859942056150071, "learning_rate": 3.929431311094911e-10, "loss": 0.2563, "step": 3330 }, { "epoch": 0.9977561705310396, "grad_norm": 1.6644206261602157, "learning_rate": 1.337109708704487e-10, "loss": 0.2515, "step": 3335 }, { "epoch": 0.9992520568436799, "grad_norm": 1.6728854762026557, "learning_rate": 1.091522597362893e-11, "loss": 0.2602, "step": 3340 }, { "epoch": 0.999850411368736, "eval_loss": 0.25460532307624817, "eval_runtime": 342.3221, "eval_samples_per_second": 3.152, "eval_steps_per_second": 0.789, "step": 3342 }, { "epoch": 0.999850411368736, "step": 3342, "total_flos": 699694464368640.0, "train_loss": 0.43199071664427796, "train_runtime": 75290.8899, "train_samples_per_second": 1.421, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 3342, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 699694464368640.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }