{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.653019447287615, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0040941658137154556, "grad_norm": 0.283203125, "learning_rate": 1.7391304347826085e-05, "loss": 1.2477, "step": 4 }, { "epoch": 0.008188331627430911, "grad_norm": 0.2734375, "learning_rate": 3.478260869565217e-05, "loss": 1.292, "step": 8 }, { "epoch": 0.012282497441146366, "grad_norm": 0.212890625, "learning_rate": 5.2173913043478256e-05, "loss": 1.201, "step": 12 }, { "epoch": 0.016376663254861822, "grad_norm": 0.208984375, "learning_rate": 6.956521739130434e-05, "loss": 1.218, "step": 16 }, { "epoch": 0.02047082906857728, "grad_norm": 0.259765625, "learning_rate": 8.695652173913043e-05, "loss": 1.2022, "step": 20 }, { "epoch": 0.02456499488229273, "grad_norm": 0.171875, "learning_rate": 0.00010434782608695651, "loss": 1.1587, "step": 24 }, { "epoch": 0.028659160696008188, "grad_norm": 0.1728515625, "learning_rate": 0.00012173913043478261, "loss": 1.1458, "step": 28 }, { "epoch": 0.032753326509723645, "grad_norm": 0.1640625, "learning_rate": 0.00013913043478260868, "loss": 1.1289, "step": 32 }, { "epoch": 0.0368474923234391, "grad_norm": 0.2060546875, "learning_rate": 0.00015652173913043477, "loss": 1.1263, "step": 36 }, { "epoch": 0.04094165813715456, "grad_norm": 0.1962890625, "learning_rate": 0.00017391304347826085, "loss": 1.1077, "step": 40 }, { "epoch": 0.04503582395087001, "grad_norm": 0.2119140625, "learning_rate": 0.0001913043478260869, "loss": 1.1116, "step": 44 }, { "epoch": 0.04912998976458546, "grad_norm": 0.208984375, "learning_rate": 0.00020869565217391303, "loss": 1.0815, "step": 48 }, { "epoch": 0.05322415557830092, "grad_norm": 0.203125, "learning_rate": 0.0002260869565217391, "loss": 1.1132, "step": 52 }, { "epoch": 0.057318321392016376, "grad_norm": 0.2109375, "learning_rate": 0.00024347826086956522, "loss": 1.1032, "step": 56 }, { "epoch": 0.06141248720573183, "grad_norm": 0.197265625, "learning_rate": 0.0002608695652173913, "loss": 1.0802, "step": 60 }, { "epoch": 0.06550665301944729, "grad_norm": 0.1962890625, "learning_rate": 0.00027826086956521737, "loss": 1.0692, "step": 64 }, { "epoch": 0.06960081883316274, "grad_norm": 0.212890625, "learning_rate": 0.00029565217391304345, "loss": 1.0476, "step": 68 }, { "epoch": 0.0736949846468782, "grad_norm": 0.1943359375, "learning_rate": 0.00029999985464629347, "loss": 1.0535, "step": 72 }, { "epoch": 0.07778915046059365, "grad_norm": 0.1787109375, "learning_rate": 0.00029999920863038815, "loss": 1.042, "step": 76 }, { "epoch": 0.08188331627430911, "grad_norm": 0.1875, "learning_rate": 0.0002999980458040957, "loss": 1.0121, "step": 80 }, { "epoch": 0.08597748208802457, "grad_norm": 0.2197265625, "learning_rate": 0.0002999963661714225, "loss": 1.0423, "step": 84 }, { "epoch": 0.09007164790174002, "grad_norm": 0.2060546875, "learning_rate": 0.0002999941697381556, "loss": 1.0429, "step": 88 }, { "epoch": 0.09416581371545547, "grad_norm": 0.193359375, "learning_rate": 0.0002999914565118627, "loss": 1.0316, "step": 92 }, { "epoch": 0.09825997952917093, "grad_norm": 0.1875, "learning_rate": 0.0002999882265018919, "loss": 1.0462, "step": 96 }, { "epoch": 0.1023541453428864, "grad_norm": 0.2138671875, "learning_rate": 0.000299984479719372, "loss": 1.0391, "step": 100 }, { "epoch": 0.10644831115660185, "grad_norm": 0.2001953125, "learning_rate": 0.00029998021617721224, "loss": 1.0167, "step": 104 }, { "epoch": 0.1105424769703173, "grad_norm": 0.1865234375, "learning_rate": 0.0002999754358901023, "loss": 1.001, "step": 108 }, { "epoch": 0.11463664278403275, "grad_norm": 0.2001953125, "learning_rate": 0.00029997013887451236, "loss": 1.0101, "step": 112 }, { "epoch": 0.1187308085977482, "grad_norm": 0.2138671875, "learning_rate": 0.0002999643251486927, "loss": 0.9859, "step": 116 }, { "epoch": 0.12282497441146366, "grad_norm": 0.205078125, "learning_rate": 0.0002999579947326742, "loss": 1.0245, "step": 120 }, { "epoch": 0.1269191402251791, "grad_norm": 0.22265625, "learning_rate": 0.0002999511476482678, "loss": 0.9762, "step": 124 }, { "epoch": 0.13101330603889458, "grad_norm": 0.2021484375, "learning_rate": 0.00029994378391906453, "loss": 0.9698, "step": 128 }, { "epoch": 0.13510747185261002, "grad_norm": 0.2001953125, "learning_rate": 0.0002999359035704355, "loss": 0.9787, "step": 132 }, { "epoch": 0.13920163766632548, "grad_norm": 0.2041015625, "learning_rate": 0.00029992750662953196, "loss": 1.0197, "step": 136 }, { "epoch": 0.14329580348004095, "grad_norm": 0.208984375, "learning_rate": 0.00029991859312528476, "loss": 0.9958, "step": 140 }, { "epoch": 0.1473899692937564, "grad_norm": 0.2216796875, "learning_rate": 0.00029990916308840476, "loss": 0.9495, "step": 144 }, { "epoch": 0.15148413510747186, "grad_norm": 0.1982421875, "learning_rate": 0.0002998992165513824, "loss": 1.0079, "step": 148 }, { "epoch": 0.1555783009211873, "grad_norm": 0.212890625, "learning_rate": 0.00029988875354848766, "loss": 0.9369, "step": 152 }, { "epoch": 0.15967246673490276, "grad_norm": 0.2119140625, "learning_rate": 0.00029987777411576996, "loss": 0.9607, "step": 156 }, { "epoch": 0.16376663254861823, "grad_norm": 0.2099609375, "learning_rate": 0.0002998662782910581, "loss": 0.9773, "step": 160 }, { "epoch": 0.16786079836233367, "grad_norm": 0.2490234375, "learning_rate": 0.0002998542661139601, "loss": 0.9218, "step": 164 }, { "epoch": 0.17195496417604914, "grad_norm": 0.203125, "learning_rate": 0.0002998417376258628, "loss": 0.9753, "step": 168 }, { "epoch": 0.17604912998976457, "grad_norm": 0.2294921875, "learning_rate": 0.00029982869286993225, "loss": 0.9777, "step": 172 }, { "epoch": 0.18014329580348004, "grad_norm": 0.265625, "learning_rate": 0.00029981513189111314, "loss": 0.9389, "step": 176 }, { "epoch": 0.1842374616171955, "grad_norm": 0.2158203125, "learning_rate": 0.00029980105473612865, "loss": 0.9107, "step": 180 }, { "epoch": 0.18833162743091095, "grad_norm": 0.205078125, "learning_rate": 0.0002997864614534805, "loss": 0.9503, "step": 184 }, { "epoch": 0.19242579324462641, "grad_norm": 0.2314453125, "learning_rate": 0.00029977135209344874, "loss": 0.9258, "step": 188 }, { "epoch": 0.19651995905834185, "grad_norm": 0.2138671875, "learning_rate": 0.0002997557267080915, "loss": 0.9276, "step": 192 }, { "epoch": 0.20061412487205732, "grad_norm": 0.216796875, "learning_rate": 0.0002997395853512448, "loss": 0.9303, "step": 196 }, { "epoch": 0.2047082906857728, "grad_norm": 0.228515625, "learning_rate": 0.00029972292807852233, "loss": 0.9378, "step": 200 }, { "epoch": 0.20880245649948823, "grad_norm": 0.2216796875, "learning_rate": 0.00029970575494731543, "loss": 0.9271, "step": 204 }, { "epoch": 0.2128966223132037, "grad_norm": 0.2119140625, "learning_rate": 0.00029968806601679283, "loss": 0.9574, "step": 208 }, { "epoch": 0.21699078812691913, "grad_norm": 0.2236328125, "learning_rate": 0.00029966986134790025, "loss": 0.9354, "step": 212 }, { "epoch": 0.2210849539406346, "grad_norm": 0.2216796875, "learning_rate": 0.0002996511410033605, "loss": 0.9185, "step": 216 }, { "epoch": 0.22517911975435004, "grad_norm": 0.2314453125, "learning_rate": 0.000299631905047673, "loss": 0.9327, "step": 220 }, { "epoch": 0.2292732855680655, "grad_norm": 0.2138671875, "learning_rate": 0.00029961215354711376, "loss": 0.9141, "step": 224 }, { "epoch": 0.23336745138178097, "grad_norm": 0.2421875, "learning_rate": 0.000299591886569735, "loss": 0.9369, "step": 228 }, { "epoch": 0.2374616171954964, "grad_norm": 0.234375, "learning_rate": 0.0002995711041853649, "loss": 0.9163, "step": 232 }, { "epoch": 0.24155578300921188, "grad_norm": 0.2255859375, "learning_rate": 0.00029954980646560765, "loss": 0.8721, "step": 236 }, { "epoch": 0.24564994882292732, "grad_norm": 0.224609375, "learning_rate": 0.0002995279934838427, "loss": 0.8767, "step": 240 }, { "epoch": 0.24974411463664278, "grad_norm": 0.240234375, "learning_rate": 0.00029950566531522515, "loss": 0.8851, "step": 244 }, { "epoch": 0.2538382804503582, "grad_norm": 0.2197265625, "learning_rate": 0.00029948282203668477, "loss": 0.9255, "step": 248 }, { "epoch": 0.2579324462640737, "grad_norm": 0.265625, "learning_rate": 0.00029945946372692635, "loss": 0.8865, "step": 252 }, { "epoch": 0.26202661207778916, "grad_norm": 0.2216796875, "learning_rate": 0.00029943559046642903, "loss": 0.8514, "step": 256 }, { "epoch": 0.2661207778915046, "grad_norm": 0.234375, "learning_rate": 0.00029941120233744625, "loss": 0.9019, "step": 260 }, { "epoch": 0.27021494370522003, "grad_norm": 0.251953125, "learning_rate": 0.00029938629942400546, "loss": 0.9104, "step": 264 }, { "epoch": 0.2743091095189355, "grad_norm": 0.265625, "learning_rate": 0.00029936088181190754, "loss": 0.8915, "step": 268 }, { "epoch": 0.27840327533265097, "grad_norm": 0.224609375, "learning_rate": 0.000299334949588727, "loss": 0.889, "step": 272 }, { "epoch": 0.28249744114636643, "grad_norm": 0.2275390625, "learning_rate": 0.00029930850284381116, "loss": 0.9146, "step": 276 }, { "epoch": 0.2865916069600819, "grad_norm": 0.23046875, "learning_rate": 0.00029928154166828025, "loss": 0.8627, "step": 280 }, { "epoch": 0.2906857727737973, "grad_norm": 0.25, "learning_rate": 0.0002992540661550268, "loss": 0.8735, "step": 284 }, { "epoch": 0.2947799385875128, "grad_norm": 0.23046875, "learning_rate": 0.00029922607639871557, "loss": 0.8819, "step": 288 }, { "epoch": 0.29887410440122825, "grad_norm": 0.244140625, "learning_rate": 0.000299197572495783, "loss": 0.8979, "step": 292 }, { "epoch": 0.3029682702149437, "grad_norm": 0.244140625, "learning_rate": 0.00029916855454443706, "loss": 0.9047, "step": 296 }, { "epoch": 0.3070624360286592, "grad_norm": 0.2412109375, "learning_rate": 0.0002991390226446568, "loss": 0.8861, "step": 300 }, { "epoch": 0.3111566018423746, "grad_norm": 0.2353515625, "learning_rate": 0.000299108976898192, "loss": 0.8957, "step": 304 }, { "epoch": 0.31525076765609006, "grad_norm": 0.26171875, "learning_rate": 0.000299078417408563, "loss": 0.8565, "step": 308 }, { "epoch": 0.3193449334698055, "grad_norm": 0.244140625, "learning_rate": 0.00029904734428105997, "loss": 0.8656, "step": 312 }, { "epoch": 0.323439099283521, "grad_norm": 0.263671875, "learning_rate": 0.000299015757622743, "loss": 0.8339, "step": 316 }, { "epoch": 0.32753326509723646, "grad_norm": 0.232421875, "learning_rate": 0.00029898365754244135, "loss": 0.813, "step": 320 }, { "epoch": 0.33162743091095187, "grad_norm": 0.2353515625, "learning_rate": 0.00029895104415075336, "loss": 0.8969, "step": 324 }, { "epoch": 0.33572159672466734, "grad_norm": 0.248046875, "learning_rate": 0.0002989179175600459, "loss": 0.7858, "step": 328 }, { "epoch": 0.3398157625383828, "grad_norm": 0.2333984375, "learning_rate": 0.0002988842778844539, "loss": 0.8451, "step": 332 }, { "epoch": 0.34390992835209827, "grad_norm": 0.244140625, "learning_rate": 0.00029885012523988034, "loss": 0.8592, "step": 336 }, { "epoch": 0.34800409416581374, "grad_norm": 0.2275390625, "learning_rate": 0.0002988154597439954, "loss": 0.8831, "step": 340 }, { "epoch": 0.35209825997952915, "grad_norm": 0.216796875, "learning_rate": 0.0002987802815162363, "loss": 0.8825, "step": 344 }, { "epoch": 0.3561924257932446, "grad_norm": 0.251953125, "learning_rate": 0.0002987445906778068, "loss": 0.817, "step": 348 }, { "epoch": 0.3602865916069601, "grad_norm": 0.2431640625, "learning_rate": 0.00029870838735167684, "loss": 0.8549, "step": 352 }, { "epoch": 0.36438075742067555, "grad_norm": 0.212890625, "learning_rate": 0.0002986716716625822, "loss": 0.8458, "step": 356 }, { "epoch": 0.368474923234391, "grad_norm": 0.26171875, "learning_rate": 0.0002986344437370238, "loss": 0.8335, "step": 360 }, { "epoch": 0.3725690890481064, "grad_norm": 0.2451171875, "learning_rate": 0.00029859670370326757, "loss": 0.8258, "step": 364 }, { "epoch": 0.3766632548618219, "grad_norm": 0.228515625, "learning_rate": 0.0002985584516913437, "loss": 0.8816, "step": 368 }, { "epoch": 0.38075742067553736, "grad_norm": 0.2265625, "learning_rate": 0.0002985196878330466, "loss": 0.8361, "step": 372 }, { "epoch": 0.38485158648925283, "grad_norm": 0.23828125, "learning_rate": 0.000298480412261934, "loss": 0.8259, "step": 376 }, { "epoch": 0.3889457523029683, "grad_norm": 0.25390625, "learning_rate": 0.0002984406251133268, "loss": 0.8277, "step": 380 }, { "epoch": 0.3930399181166837, "grad_norm": 0.248046875, "learning_rate": 0.0002984003265243084, "loss": 0.861, "step": 384 }, { "epoch": 0.3971340839303992, "grad_norm": 0.240234375, "learning_rate": 0.00029835951663372446, "loss": 0.8145, "step": 388 }, { "epoch": 0.40122824974411464, "grad_norm": 0.25390625, "learning_rate": 0.0002983181955821822, "loss": 0.8501, "step": 392 }, { "epoch": 0.4053224155578301, "grad_norm": 0.25390625, "learning_rate": 0.00029827636351205004, "loss": 0.8362, "step": 396 }, { "epoch": 0.4094165813715456, "grad_norm": 0.25, "learning_rate": 0.00029823402056745706, "loss": 0.7834, "step": 400 }, { "epoch": 0.413510747185261, "grad_norm": 0.236328125, "learning_rate": 0.0002981911668942925, "loss": 0.8486, "step": 404 }, { "epoch": 0.41760491299897645, "grad_norm": 0.26171875, "learning_rate": 0.00029814780264020535, "loss": 0.8006, "step": 408 }, { "epoch": 0.4216990788126919, "grad_norm": 0.263671875, "learning_rate": 0.00029810392795460365, "loss": 0.8553, "step": 412 }, { "epoch": 0.4257932446264074, "grad_norm": 0.2119140625, "learning_rate": 0.00029805954298865413, "loss": 0.8271, "step": 416 }, { "epoch": 0.42988741044012285, "grad_norm": 0.2451171875, "learning_rate": 0.0002980146478952818, "loss": 0.8035, "step": 420 }, { "epoch": 0.43398157625383826, "grad_norm": 0.263671875, "learning_rate": 0.000297969242829169, "loss": 0.7901, "step": 424 }, { "epoch": 0.43807574206755373, "grad_norm": 0.267578125, "learning_rate": 0.0002979233279467554, "loss": 0.7974, "step": 428 }, { "epoch": 0.4421699078812692, "grad_norm": 0.2421875, "learning_rate": 0.000297876903406237, "loss": 0.8294, "step": 432 }, { "epoch": 0.44626407369498466, "grad_norm": 0.22265625, "learning_rate": 0.000297829969367566, "loss": 0.7868, "step": 436 }, { "epoch": 0.4503582395087001, "grad_norm": 0.26171875, "learning_rate": 0.0002977825259924497, "loss": 0.842, "step": 440 }, { "epoch": 0.45445240532241554, "grad_norm": 0.25390625, "learning_rate": 0.00029773457344435067, "loss": 0.8359, "step": 444 }, { "epoch": 0.458546571136131, "grad_norm": 0.2421875, "learning_rate": 0.0002976861118884856, "loss": 0.8127, "step": 448 }, { "epoch": 0.4626407369498465, "grad_norm": 0.2392578125, "learning_rate": 0.00029763714149182483, "loss": 0.8251, "step": 452 }, { "epoch": 0.46673490276356194, "grad_norm": 0.28515625, "learning_rate": 0.0002975876624230921, "loss": 0.8145, "step": 456 }, { "epoch": 0.47082906857727735, "grad_norm": 0.26171875, "learning_rate": 0.0002975376748527636, "loss": 0.814, "step": 460 }, { "epoch": 0.4749232343909928, "grad_norm": 0.255859375, "learning_rate": 0.00029748717895306746, "loss": 0.8419, "step": 464 }, { "epoch": 0.4790174002047083, "grad_norm": 0.2431640625, "learning_rate": 0.0002974361748979834, "loss": 0.814, "step": 468 }, { "epoch": 0.48311156601842375, "grad_norm": 0.251953125, "learning_rate": 0.00029738466286324176, "loss": 0.8097, "step": 472 }, { "epoch": 0.4872057318321392, "grad_norm": 0.25390625, "learning_rate": 0.00029733264302632325, "loss": 0.7909, "step": 476 }, { "epoch": 0.49129989764585463, "grad_norm": 0.265625, "learning_rate": 0.0002972801155664581, "loss": 0.8078, "step": 480 }, { "epoch": 0.4953940634595701, "grad_norm": 0.228515625, "learning_rate": 0.00029722708066462543, "loss": 0.8108, "step": 484 }, { "epoch": 0.49948822927328557, "grad_norm": 0.271484375, "learning_rate": 0.00029717353850355286, "loss": 0.852, "step": 488 }, { "epoch": 0.503582395087001, "grad_norm": 0.2578125, "learning_rate": 0.0002971194892677157, "loss": 0.7383, "step": 492 }, { "epoch": 0.5076765609007164, "grad_norm": 0.2333984375, "learning_rate": 0.0002970649331433362, "loss": 0.7806, "step": 496 }, { "epoch": 0.5117707267144319, "grad_norm": 0.267578125, "learning_rate": 0.0002970098703183832, "loss": 0.7578, "step": 500 }, { "epoch": 0.5158648925281474, "grad_norm": 0.2314453125, "learning_rate": 0.0002969543009825713, "loss": 0.8208, "step": 504 }, { "epoch": 0.5199590583418628, "grad_norm": 0.2314453125, "learning_rate": 0.0002968982253273603, "loss": 0.8233, "step": 508 }, { "epoch": 0.5240532241555783, "grad_norm": 0.224609375, "learning_rate": 0.0002968416435459544, "loss": 0.8092, "step": 512 }, { "epoch": 0.5281473899692938, "grad_norm": 0.2578125, "learning_rate": 0.00029678455583330156, "loss": 0.8246, "step": 516 }, { "epoch": 0.5322415557830092, "grad_norm": 0.25390625, "learning_rate": 0.0002967269623860931, "loss": 0.7451, "step": 520 }, { "epoch": 0.5363357215967247, "grad_norm": 0.248046875, "learning_rate": 0.00029666886340276263, "loss": 0.759, "step": 524 }, { "epoch": 0.5404298874104401, "grad_norm": 0.23046875, "learning_rate": 0.00029661025908348556, "loss": 0.8068, "step": 528 }, { "epoch": 0.5445240532241555, "grad_norm": 0.26171875, "learning_rate": 0.0002965511496301784, "loss": 0.7771, "step": 532 }, { "epoch": 0.548618219037871, "grad_norm": 0.24609375, "learning_rate": 0.0002964915352464982, "loss": 0.8213, "step": 536 }, { "epoch": 0.5527123848515865, "grad_norm": 0.263671875, "learning_rate": 0.0002964314161378415, "loss": 0.8283, "step": 540 }, { "epoch": 0.5568065506653019, "grad_norm": 0.271484375, "learning_rate": 0.000296370792511344, "loss": 0.7587, "step": 544 }, { "epoch": 0.5609007164790174, "grad_norm": 0.2275390625, "learning_rate": 0.0002963096645758795, "loss": 0.7708, "step": 548 }, { "epoch": 0.5649948822927329, "grad_norm": 0.2578125, "learning_rate": 0.00029624803254205953, "loss": 0.8349, "step": 552 }, { "epoch": 0.5690890481064483, "grad_norm": 0.2490234375, "learning_rate": 0.0002961858966222324, "loss": 0.7805, "step": 556 }, { "epoch": 0.5731832139201638, "grad_norm": 0.244140625, "learning_rate": 0.0002961232570304824, "loss": 0.757, "step": 560 }, { "epoch": 0.5772773797338793, "grad_norm": 0.251953125, "learning_rate": 0.0002960601139826294, "loss": 0.7807, "step": 564 }, { "epoch": 0.5813715455475946, "grad_norm": 0.2431640625, "learning_rate": 0.00029599646769622775, "loss": 0.7528, "step": 568 }, { "epoch": 0.5854657113613101, "grad_norm": 0.236328125, "learning_rate": 0.00029593231839056554, "loss": 0.817, "step": 572 }, { "epoch": 0.5895598771750256, "grad_norm": 0.26953125, "learning_rate": 0.0002958676662866643, "loss": 0.8212, "step": 576 }, { "epoch": 0.593654042988741, "grad_norm": 0.2470703125, "learning_rate": 0.00029580251160727766, "loss": 0.7561, "step": 580 }, { "epoch": 0.5977482088024565, "grad_norm": 0.2353515625, "learning_rate": 0.00029573685457689086, "loss": 0.8184, "step": 584 }, { "epoch": 0.601842374616172, "grad_norm": 0.236328125, "learning_rate": 0.00029567069542172004, "loss": 0.7591, "step": 588 }, { "epoch": 0.6059365404298874, "grad_norm": 0.255859375, "learning_rate": 0.0002956040343697114, "loss": 0.7598, "step": 592 }, { "epoch": 0.6100307062436029, "grad_norm": 0.2578125, "learning_rate": 0.0002955368716505401, "loss": 0.7583, "step": 596 }, { "epoch": 0.6141248720573184, "grad_norm": 0.248046875, "learning_rate": 0.0002954692074956102, "loss": 0.7787, "step": 600 }, { "epoch": 0.6182190378710338, "grad_norm": 0.2255859375, "learning_rate": 0.00029540104213805307, "loss": 0.8317, "step": 604 }, { "epoch": 0.6223132036847492, "grad_norm": 0.25, "learning_rate": 0.00029533237581272706, "loss": 0.7833, "step": 608 }, { "epoch": 0.6264073694984647, "grad_norm": 0.248046875, "learning_rate": 0.00029526320875621656, "loss": 0.7263, "step": 612 }, { "epoch": 0.6305015353121801, "grad_norm": 0.2333984375, "learning_rate": 0.00029519354120683116, "loss": 0.8114, "step": 616 }, { "epoch": 0.6345957011258956, "grad_norm": 0.2578125, "learning_rate": 0.0002951233734046049, "loss": 0.785, "step": 620 }, { "epoch": 0.638689866939611, "grad_norm": 0.23828125, "learning_rate": 0.0002950527055912955, "loss": 0.8104, "step": 624 }, { "epoch": 0.6427840327533265, "grad_norm": 0.234375, "learning_rate": 0.00029498153801038303, "loss": 0.7885, "step": 628 }, { "epoch": 0.646878198567042, "grad_norm": 0.2373046875, "learning_rate": 0.00029490987090707, "loss": 0.7883, "step": 632 }, { "epoch": 0.6509723643807575, "grad_norm": 0.291015625, "learning_rate": 0.0002948377045282796, "loss": 0.8154, "step": 636 }, { "epoch": 0.6550665301944729, "grad_norm": 0.25, "learning_rate": 0.0002947650391226555, "loss": 0.7979, "step": 640 }, { "epoch": 0.6591606960081884, "grad_norm": 0.236328125, "learning_rate": 0.00029469187494056046, "loss": 0.7896, "step": 644 }, { "epoch": 0.6632548618219037, "grad_norm": 0.255859375, "learning_rate": 0.0002946182122340759, "loss": 0.7295, "step": 648 }, { "epoch": 0.6673490276356192, "grad_norm": 0.2353515625, "learning_rate": 0.0002945440512570009, "loss": 0.7889, "step": 652 }, { "epoch": 0.6714431934493347, "grad_norm": 0.24609375, "learning_rate": 0.00029446939226485125, "loss": 0.7931, "step": 656 }, { "epoch": 0.6755373592630501, "grad_norm": 0.2578125, "learning_rate": 0.00029439423551485844, "loss": 0.7944, "step": 660 }, { "epoch": 0.6796315250767656, "grad_norm": 0.24609375, "learning_rate": 0.0002943185812659693, "loss": 0.8238, "step": 664 }, { "epoch": 0.6837256908904811, "grad_norm": 0.263671875, "learning_rate": 0.00029424242977884436, "loss": 0.827, "step": 668 }, { "epoch": 0.6878198567041965, "grad_norm": 0.2470703125, "learning_rate": 0.00029416578131585765, "loss": 0.7974, "step": 672 }, { "epoch": 0.691914022517912, "grad_norm": 0.26171875, "learning_rate": 0.00029408863614109533, "loss": 0.8207, "step": 676 }, { "epoch": 0.6960081883316275, "grad_norm": 0.2490234375, "learning_rate": 0.000294010994520355, "loss": 0.7433, "step": 680 }, { "epoch": 0.7001023541453428, "grad_norm": 0.220703125, "learning_rate": 0.00029393285672114477, "loss": 0.8231, "step": 684 }, { "epoch": 0.7041965199590583, "grad_norm": 0.23828125, "learning_rate": 0.0002938542230126821, "loss": 0.8487, "step": 688 }, { "epoch": 0.7082906857727738, "grad_norm": 0.2490234375, "learning_rate": 0.0002937750936658933, "loss": 0.7503, "step": 692 }, { "epoch": 0.7123848515864892, "grad_norm": 0.2890625, "learning_rate": 0.00029369546895341225, "loss": 0.8059, "step": 696 }, { "epoch": 0.7164790174002047, "grad_norm": 0.2412109375, "learning_rate": 0.0002936153491495796, "loss": 0.7972, "step": 700 }, { "epoch": 0.7205731832139202, "grad_norm": 0.2470703125, "learning_rate": 0.0002935347345304417, "loss": 0.7924, "step": 704 }, { "epoch": 0.7246673490276356, "grad_norm": 0.255859375, "learning_rate": 0.00029345362537374996, "loss": 0.7478, "step": 708 }, { "epoch": 0.7287615148413511, "grad_norm": 0.2333984375, "learning_rate": 0.0002933720219589595, "loss": 0.7237, "step": 712 }, { "epoch": 0.7328556806550666, "grad_norm": 0.255859375, "learning_rate": 0.00029328992456722835, "loss": 0.7898, "step": 716 }, { "epoch": 0.736949846468782, "grad_norm": 0.2578125, "learning_rate": 0.00029320733348141666, "loss": 0.7344, "step": 720 }, { "epoch": 0.7410440122824974, "grad_norm": 0.2265625, "learning_rate": 0.00029312424898608546, "loss": 0.8047, "step": 724 }, { "epoch": 0.7451381780962129, "grad_norm": 0.251953125, "learning_rate": 0.0002930406713674957, "loss": 0.7926, "step": 728 }, { "epoch": 0.7492323439099283, "grad_norm": 0.255859375, "learning_rate": 0.00029295660091360764, "loss": 0.7631, "step": 732 }, { "epoch": 0.7533265097236438, "grad_norm": 0.234375, "learning_rate": 0.00029287203791407917, "loss": 0.7931, "step": 736 }, { "epoch": 0.7574206755373593, "grad_norm": 0.2314453125, "learning_rate": 0.00029278698266026545, "loss": 0.8366, "step": 740 }, { "epoch": 0.7615148413510747, "grad_norm": 0.267578125, "learning_rate": 0.0002927014354452177, "loss": 0.7789, "step": 744 }, { "epoch": 0.7656090071647902, "grad_norm": 0.23828125, "learning_rate": 0.000292615396563682, "loss": 0.7381, "step": 748 }, { "epoch": 0.7697031729785057, "grad_norm": 0.2451171875, "learning_rate": 0.00029252886631209846, "loss": 0.7583, "step": 752 }, { "epoch": 0.7737973387922211, "grad_norm": 0.265625, "learning_rate": 0.0002924418449886003, "loss": 0.7299, "step": 756 }, { "epoch": 0.7778915046059366, "grad_norm": 0.255859375, "learning_rate": 0.00029235433289301257, "loss": 0.753, "step": 760 }, { "epoch": 0.781985670419652, "grad_norm": 0.2373046875, "learning_rate": 0.0002922663303268512, "loss": 0.7687, "step": 764 }, { "epoch": 0.7860798362333674, "grad_norm": 0.25390625, "learning_rate": 0.00029217783759332214, "loss": 0.7792, "step": 768 }, { "epoch": 0.7901740020470829, "grad_norm": 0.263671875, "learning_rate": 0.00029208885499732004, "loss": 0.7712, "step": 772 }, { "epoch": 0.7942681678607983, "grad_norm": 0.259765625, "learning_rate": 0.0002919993828454274, "loss": 0.7814, "step": 776 }, { "epoch": 0.7983623336745138, "grad_norm": 0.2470703125, "learning_rate": 0.0002919094214459134, "loss": 0.7868, "step": 780 }, { "epoch": 0.8024564994882293, "grad_norm": 0.23046875, "learning_rate": 0.000291818971108733, "loss": 0.759, "step": 784 }, { "epoch": 0.8065506653019447, "grad_norm": 0.25390625, "learning_rate": 0.0002917280321455255, "loss": 0.722, "step": 788 }, { "epoch": 0.8106448311156602, "grad_norm": 0.2421875, "learning_rate": 0.00029163660486961404, "loss": 0.7269, "step": 792 }, { "epoch": 0.8147389969293757, "grad_norm": 0.26953125, "learning_rate": 0.0002915446895960041, "loss": 0.7571, "step": 796 }, { "epoch": 0.8188331627430911, "grad_norm": 0.26171875, "learning_rate": 0.0002914522866413823, "loss": 0.7498, "step": 800 }, { "epoch": 0.8229273285568065, "grad_norm": 0.298828125, "learning_rate": 0.00029135939632411576, "loss": 0.7437, "step": 804 }, { "epoch": 0.827021494370522, "grad_norm": 0.265625, "learning_rate": 0.00029126601896425084, "loss": 0.7524, "step": 808 }, { "epoch": 0.8311156601842374, "grad_norm": 0.255859375, "learning_rate": 0.0002911721548835116, "loss": 0.783, "step": 812 }, { "epoch": 0.8352098259979529, "grad_norm": 0.2392578125, "learning_rate": 0.0002910778044052995, "loss": 0.7625, "step": 816 }, { "epoch": 0.8393039918116684, "grad_norm": 0.2333984375, "learning_rate": 0.00029098296785469153, "loss": 0.8289, "step": 820 }, { "epoch": 0.8433981576253838, "grad_norm": 0.23828125, "learning_rate": 0.00029088764555843953, "loss": 0.7998, "step": 824 }, { "epoch": 0.8474923234390993, "grad_norm": 0.3046875, "learning_rate": 0.0002907918378449689, "loss": 0.7699, "step": 828 }, { "epoch": 0.8515864892528148, "grad_norm": 0.24609375, "learning_rate": 0.00029069554504437757, "loss": 0.7366, "step": 832 }, { "epoch": 0.8556806550665302, "grad_norm": 0.25390625, "learning_rate": 0.0002905987674884347, "loss": 0.7756, "step": 836 }, { "epoch": 0.8597748208802457, "grad_norm": 0.244140625, "learning_rate": 0.00029050150551057977, "loss": 0.8355, "step": 840 }, { "epoch": 0.8638689866939611, "grad_norm": 0.232421875, "learning_rate": 0.00029040375944592114, "loss": 0.7178, "step": 844 }, { "epoch": 0.8679631525076765, "grad_norm": 0.26171875, "learning_rate": 0.00029030552963123517, "loss": 0.7798, "step": 848 }, { "epoch": 0.872057318321392, "grad_norm": 0.240234375, "learning_rate": 0.0002902068164049649, "loss": 0.7485, "step": 852 }, { "epoch": 0.8761514841351075, "grad_norm": 0.31640625, "learning_rate": 0.0002901076201072189, "loss": 0.7272, "step": 856 }, { "epoch": 0.8802456499488229, "grad_norm": 0.2412109375, "learning_rate": 0.00029000794107977016, "loss": 0.7575, "step": 860 }, { "epoch": 0.8843398157625384, "grad_norm": 0.26171875, "learning_rate": 0.0002899077796660549, "loss": 0.7391, "step": 864 }, { "epoch": 0.8884339815762539, "grad_norm": 0.234375, "learning_rate": 0.0002898071362111713, "loss": 0.7357, "step": 868 }, { "epoch": 0.8925281473899693, "grad_norm": 0.240234375, "learning_rate": 0.00028970601106187844, "loss": 0.7975, "step": 872 }, { "epoch": 0.8966223132036848, "grad_norm": 0.263671875, "learning_rate": 0.000289604404566595, "loss": 0.7639, "step": 876 }, { "epoch": 0.9007164790174002, "grad_norm": 0.26953125, "learning_rate": 0.00028950231707539813, "loss": 0.7482, "step": 880 }, { "epoch": 0.9048106448311156, "grad_norm": 0.234375, "learning_rate": 0.0002893997489400221, "loss": 0.7782, "step": 884 }, { "epoch": 0.9089048106448311, "grad_norm": 0.25, "learning_rate": 0.00028929670051385745, "loss": 0.7509, "step": 888 }, { "epoch": 0.9129989764585466, "grad_norm": 0.251953125, "learning_rate": 0.0002891931721519492, "loss": 0.7274, "step": 892 }, { "epoch": 0.917093142272262, "grad_norm": 0.240234375, "learning_rate": 0.0002890891642109962, "loss": 0.7512, "step": 896 }, { "epoch": 0.9211873080859775, "grad_norm": 0.26953125, "learning_rate": 0.0002889846770493496, "loss": 0.7375, "step": 900 }, { "epoch": 0.925281473899693, "grad_norm": 0.25, "learning_rate": 0.00028887971102701154, "loss": 0.6872, "step": 904 }, { "epoch": 0.9293756397134084, "grad_norm": 0.26171875, "learning_rate": 0.0002887742665056342, "loss": 0.7695, "step": 908 }, { "epoch": 0.9334698055271239, "grad_norm": 0.2451171875, "learning_rate": 0.0002886683438485183, "loss": 0.7726, "step": 912 }, { "epoch": 0.9375639713408394, "grad_norm": 0.28515625, "learning_rate": 0.000288561943420612, "loss": 0.7426, "step": 916 }, { "epoch": 0.9416581371545547, "grad_norm": 0.25390625, "learning_rate": 0.0002884550655885095, "loss": 0.7311, "step": 920 }, { "epoch": 0.9457523029682702, "grad_norm": 0.259765625, "learning_rate": 0.00028834771072044994, "loss": 0.7497, "step": 924 }, { "epoch": 0.9498464687819856, "grad_norm": 0.2421875, "learning_rate": 0.00028823987918631596, "loss": 0.7134, "step": 928 }, { "epoch": 0.9539406345957011, "grad_norm": 0.24609375, "learning_rate": 0.0002881315713576326, "loss": 0.7818, "step": 932 }, { "epoch": 0.9580348004094166, "grad_norm": 0.2392578125, "learning_rate": 0.0002880227876075659, "loss": 0.7358, "step": 936 }, { "epoch": 0.962128966223132, "grad_norm": 0.271484375, "learning_rate": 0.00028791352831092164, "loss": 0.7309, "step": 940 }, { "epoch": 0.9662231320368475, "grad_norm": 0.2216796875, "learning_rate": 0.0002878037938441441, "loss": 0.7227, "step": 944 }, { "epoch": 0.970317297850563, "grad_norm": 0.255859375, "learning_rate": 0.0002876935845853146, "loss": 0.7828, "step": 948 }, { "epoch": 0.9744114636642784, "grad_norm": 0.26953125, "learning_rate": 0.0002875829009141505, "loss": 0.7156, "step": 952 }, { "epoch": 0.9785056294779939, "grad_norm": 0.255859375, "learning_rate": 0.00028747174321200363, "loss": 0.7602, "step": 956 }, { "epoch": 0.9825997952917093, "grad_norm": 0.248046875, "learning_rate": 0.000287360111861859, "loss": 0.7051, "step": 960 }, { "epoch": 0.9866939611054247, "grad_norm": 0.267578125, "learning_rate": 0.00028724800724833354, "loss": 0.6861, "step": 964 }, { "epoch": 0.9907881269191402, "grad_norm": 0.251953125, "learning_rate": 0.00028713542975767486, "loss": 0.7947, "step": 968 }, { "epoch": 0.9948822927328557, "grad_norm": 0.25, "learning_rate": 0.0002870223797777598, "loss": 0.7499, "step": 972 }, { "epoch": 0.9989764585465711, "grad_norm": 0.25390625, "learning_rate": 0.0002869088576980931, "loss": 0.7674, "step": 976 }, { "epoch": 1.0030706243602865, "grad_norm": 0.25, "learning_rate": 0.0002867948639098061, "loss": 0.6806, "step": 980 }, { "epoch": 1.007164790174002, "grad_norm": 0.27734375, "learning_rate": 0.00028668039880565526, "loss": 0.7057, "step": 984 }, { "epoch": 1.0112589559877174, "grad_norm": 0.2255859375, "learning_rate": 0.0002865654627800212, "loss": 0.6704, "step": 988 }, { "epoch": 1.015353121801433, "grad_norm": 0.2421875, "learning_rate": 0.00028645005622890673, "loss": 0.708, "step": 992 }, { "epoch": 1.0194472876151484, "grad_norm": 0.275390625, "learning_rate": 0.0002863341795499361, "loss": 0.6166, "step": 996 }, { "epoch": 1.0235414534288638, "grad_norm": 0.255859375, "learning_rate": 0.00028621783314235314, "loss": 0.6911, "step": 1000 }, { "epoch": 1.0276356192425793, "grad_norm": 0.251953125, "learning_rate": 0.0002861010174070202, "loss": 0.6379, "step": 1004 }, { "epoch": 1.0317297850562948, "grad_norm": 0.259765625, "learning_rate": 0.0002859837327464167, "loss": 0.691, "step": 1008 }, { "epoch": 1.0358239508700102, "grad_norm": 0.232421875, "learning_rate": 0.0002858659795646375, "loss": 0.6792, "step": 1012 }, { "epoch": 1.0399181166837257, "grad_norm": 0.26953125, "learning_rate": 0.000285747758267392, "loss": 0.6846, "step": 1016 }, { "epoch": 1.0440122824974412, "grad_norm": 0.23828125, "learning_rate": 0.00028562906926200225, "loss": 0.7281, "step": 1020 }, { "epoch": 1.0481064483111566, "grad_norm": 0.267578125, "learning_rate": 0.0002855099129574018, "loss": 0.6851, "step": 1024 }, { "epoch": 1.052200614124872, "grad_norm": 0.265625, "learning_rate": 0.00028539028976413435, "loss": 0.7062, "step": 1028 }, { "epoch": 1.0562947799385876, "grad_norm": 0.2373046875, "learning_rate": 0.000285270200094352, "loss": 0.6651, "step": 1032 }, { "epoch": 1.060388945752303, "grad_norm": 0.236328125, "learning_rate": 0.0002851496443618143, "loss": 0.6657, "step": 1036 }, { "epoch": 1.0644831115660185, "grad_norm": 0.2470703125, "learning_rate": 0.00028502862298188634, "loss": 0.6647, "step": 1040 }, { "epoch": 1.068577277379734, "grad_norm": 0.2578125, "learning_rate": 0.00028490713637153786, "loss": 0.7025, "step": 1044 }, { "epoch": 1.0726714431934494, "grad_norm": 0.23828125, "learning_rate": 0.00028478518494934123, "loss": 0.6967, "step": 1048 }, { "epoch": 1.076765609007165, "grad_norm": 0.251953125, "learning_rate": 0.0002846627691354705, "loss": 0.7205, "step": 1052 }, { "epoch": 1.0808597748208801, "grad_norm": 0.26171875, "learning_rate": 0.00028453988935169954, "loss": 0.6407, "step": 1056 }, { "epoch": 1.0849539406345956, "grad_norm": 0.2421875, "learning_rate": 0.000284416546021401, "loss": 0.6704, "step": 1060 }, { "epoch": 1.089048106448311, "grad_norm": 0.2314453125, "learning_rate": 0.0002842927395695446, "loss": 0.692, "step": 1064 }, { "epoch": 1.0931422722620265, "grad_norm": 0.236328125, "learning_rate": 0.0002841684704226955, "loss": 0.6846, "step": 1068 }, { "epoch": 1.097236438075742, "grad_norm": 0.2490234375, "learning_rate": 0.0002840437390090133, "loss": 0.651, "step": 1072 }, { "epoch": 1.1013306038894575, "grad_norm": 0.267578125, "learning_rate": 0.0002839185457582502, "loss": 0.7293, "step": 1076 }, { "epoch": 1.105424769703173, "grad_norm": 0.248046875, "learning_rate": 0.0002837928911017496, "loss": 0.6636, "step": 1080 }, { "epoch": 1.1095189355168884, "grad_norm": 0.25, "learning_rate": 0.0002836667754724447, "loss": 0.6393, "step": 1084 }, { "epoch": 1.1136131013306039, "grad_norm": 0.251953125, "learning_rate": 0.0002835401993048568, "loss": 0.7271, "step": 1088 }, { "epoch": 1.1177072671443193, "grad_norm": 0.25, "learning_rate": 0.0002834131630350942, "loss": 0.6788, "step": 1092 }, { "epoch": 1.1218014329580348, "grad_norm": 0.267578125, "learning_rate": 0.00028328566710085024, "loss": 0.6806, "step": 1096 }, { "epoch": 1.1258955987717503, "grad_norm": 0.2294921875, "learning_rate": 0.000283157711941402, "loss": 0.7403, "step": 1100 }, { "epoch": 1.1299897645854657, "grad_norm": 0.248046875, "learning_rate": 0.000283029297997609, "loss": 0.6585, "step": 1104 }, { "epoch": 1.1340839303991812, "grad_norm": 0.267578125, "learning_rate": 0.00028290042571191114, "loss": 0.6178, "step": 1108 }, { "epoch": 1.1381780962128967, "grad_norm": 0.26171875, "learning_rate": 0.0002827710955283277, "loss": 0.688, "step": 1112 }, { "epoch": 1.1422722620266121, "grad_norm": 0.283203125, "learning_rate": 0.00028264130789245565, "loss": 0.6886, "step": 1116 }, { "epoch": 1.1463664278403276, "grad_norm": 0.25390625, "learning_rate": 0.00028251106325146797, "loss": 0.6451, "step": 1120 }, { "epoch": 1.150460593654043, "grad_norm": 0.25, "learning_rate": 0.0002823803620541122, "loss": 0.7498, "step": 1124 }, { "epoch": 1.1545547594677585, "grad_norm": 0.26171875, "learning_rate": 0.00028224920475070905, "loss": 0.6605, "step": 1128 }, { "epoch": 1.158648925281474, "grad_norm": 0.2431640625, "learning_rate": 0.00028211759179315053, "loss": 0.6827, "step": 1132 }, { "epoch": 1.1627430910951895, "grad_norm": 0.267578125, "learning_rate": 0.00028198552363489874, "loss": 0.6841, "step": 1136 }, { "epoch": 1.1668372569089047, "grad_norm": 0.27734375, "learning_rate": 0.000281853000730984, "loss": 0.6681, "step": 1140 }, { "epoch": 1.1709314227226202, "grad_norm": 0.2578125, "learning_rate": 0.0002817200235380035, "loss": 0.6497, "step": 1144 }, { "epoch": 1.1750255885363357, "grad_norm": 0.25, "learning_rate": 0.00028158659251411954, "loss": 0.6667, "step": 1148 }, { "epoch": 1.1791197543500511, "grad_norm": 0.2578125, "learning_rate": 0.0002814527081190583, "loss": 0.6781, "step": 1152 }, { "epoch": 1.1832139201637666, "grad_norm": 0.28125, "learning_rate": 0.0002813183708141077, "loss": 0.6829, "step": 1156 }, { "epoch": 1.187308085977482, "grad_norm": 0.25390625, "learning_rate": 0.00028118358106211635, "loss": 0.6888, "step": 1160 }, { "epoch": 1.1914022517911975, "grad_norm": 0.267578125, "learning_rate": 0.0002810483393274916, "loss": 0.7212, "step": 1164 }, { "epoch": 1.195496417604913, "grad_norm": 0.275390625, "learning_rate": 0.00028091264607619826, "loss": 0.7186, "step": 1168 }, { "epoch": 1.1995905834186285, "grad_norm": 0.267578125, "learning_rate": 0.0002807765017757565, "loss": 0.6889, "step": 1172 }, { "epoch": 1.203684749232344, "grad_norm": 0.2578125, "learning_rate": 0.00028063990689524093, "loss": 0.7395, "step": 1176 }, { "epoch": 1.2077789150460594, "grad_norm": 0.25, "learning_rate": 0.00028050286190527823, "loss": 0.6695, "step": 1180 }, { "epoch": 1.2118730808597749, "grad_norm": 0.265625, "learning_rate": 0.00028036536727804606, "loss": 0.6742, "step": 1184 }, { "epoch": 1.2159672466734903, "grad_norm": 0.267578125, "learning_rate": 0.0002802274234872713, "loss": 0.6568, "step": 1188 }, { "epoch": 1.2200614124872058, "grad_norm": 0.267578125, "learning_rate": 0.00028008903100822834, "loss": 0.7136, "step": 1192 }, { "epoch": 1.2241555783009213, "grad_norm": 0.2734375, "learning_rate": 0.0002799501903177375, "loss": 0.631, "step": 1196 }, { "epoch": 1.2282497441146367, "grad_norm": 0.2490234375, "learning_rate": 0.00027981090189416343, "loss": 0.7038, "step": 1200 }, { "epoch": 1.2323439099283522, "grad_norm": 0.271484375, "learning_rate": 0.00027967116621741326, "loss": 0.6591, "step": 1204 }, { "epoch": 1.2364380757420674, "grad_norm": 0.271484375, "learning_rate": 0.0002795309837689352, "loss": 0.7088, "step": 1208 }, { "epoch": 1.240532241555783, "grad_norm": 0.267578125, "learning_rate": 0.0002793903550317169, "loss": 0.6818, "step": 1212 }, { "epoch": 1.2446264073694984, "grad_norm": 0.23828125, "learning_rate": 0.00027924928049028337, "loss": 0.7158, "step": 1216 }, { "epoch": 1.2487205731832138, "grad_norm": 0.25, "learning_rate": 0.00027910776063069586, "loss": 0.6467, "step": 1220 }, { "epoch": 1.2528147389969293, "grad_norm": 0.296875, "learning_rate": 0.0002789657959405498, "loss": 0.6674, "step": 1224 }, { "epoch": 1.2569089048106448, "grad_norm": 0.23828125, "learning_rate": 0.00027882338690897327, "loss": 0.6747, "step": 1228 }, { "epoch": 1.2610030706243602, "grad_norm": 0.2734375, "learning_rate": 0.00027868053402662534, "loss": 0.6999, "step": 1232 }, { "epoch": 1.2650972364380757, "grad_norm": 0.25390625, "learning_rate": 0.00027853723778569427, "loss": 0.7059, "step": 1236 }, { "epoch": 1.2691914022517912, "grad_norm": 0.28125, "learning_rate": 0.00027839349867989587, "loss": 0.714, "step": 1240 }, { "epoch": 1.2732855680655066, "grad_norm": 0.267578125, "learning_rate": 0.00027824931720447194, "loss": 0.712, "step": 1244 }, { "epoch": 1.277379733879222, "grad_norm": 0.275390625, "learning_rate": 0.0002781046938561882, "loss": 0.6872, "step": 1248 }, { "epoch": 1.2814738996929376, "grad_norm": 0.251953125, "learning_rate": 0.00027795962913333304, "loss": 0.7098, "step": 1252 }, { "epoch": 1.285568065506653, "grad_norm": 0.275390625, "learning_rate": 0.00027781412353571544, "loss": 0.6901, "step": 1256 }, { "epoch": 1.2896622313203685, "grad_norm": 0.287109375, "learning_rate": 0.00027766817756466334, "loss": 0.6867, "step": 1260 }, { "epoch": 1.293756397134084, "grad_norm": 0.25390625, "learning_rate": 0.00027752179172302213, "loss": 0.7032, "step": 1264 }, { "epoch": 1.2978505629477994, "grad_norm": 0.25390625, "learning_rate": 0.0002773749665151525, "loss": 0.6738, "step": 1268 }, { "epoch": 1.301944728761515, "grad_norm": 0.267578125, "learning_rate": 0.00027722770244692924, "loss": 0.6769, "step": 1272 }, { "epoch": 1.3060388945752304, "grad_norm": 0.283203125, "learning_rate": 0.0002770800000257388, "loss": 0.6839, "step": 1276 }, { "epoch": 1.3101330603889458, "grad_norm": 0.259765625, "learning_rate": 0.0002769318597604784, "loss": 0.6871, "step": 1280 }, { "epoch": 1.3142272262026613, "grad_norm": 0.255859375, "learning_rate": 0.0002767832821615534, "loss": 0.7099, "step": 1284 }, { "epoch": 1.3183213920163768, "grad_norm": 0.23828125, "learning_rate": 0.0002766342677408763, "loss": 0.702, "step": 1288 }, { "epoch": 1.3224155578300922, "grad_norm": 0.283203125, "learning_rate": 0.0002764848170118644, "loss": 0.6578, "step": 1292 }, { "epoch": 1.3265097236438077, "grad_norm": 0.25, "learning_rate": 0.00027633493048943825, "loss": 0.6473, "step": 1296 }, { "epoch": 1.330603889457523, "grad_norm": 0.275390625, "learning_rate": 0.00027618460869002016, "loss": 0.6512, "step": 1300 }, { "epoch": 1.3346980552712384, "grad_norm": 0.263671875, "learning_rate": 0.00027603385213153186, "loss": 0.7167, "step": 1304 }, { "epoch": 1.3387922210849539, "grad_norm": 0.259765625, "learning_rate": 0.0002758826613333932, "loss": 0.7231, "step": 1308 }, { "epoch": 1.3428863868986693, "grad_norm": 0.248046875, "learning_rate": 0.00027573103681652, "loss": 0.7133, "step": 1312 }, { "epoch": 1.3469805527123848, "grad_norm": 0.265625, "learning_rate": 0.0002755789791033227, "loss": 0.6972, "step": 1316 }, { "epoch": 1.3510747185261003, "grad_norm": 0.27734375, "learning_rate": 0.00027542648871770384, "loss": 0.7027, "step": 1320 }, { "epoch": 1.3551688843398157, "grad_norm": 0.259765625, "learning_rate": 0.00027527356618505715, "loss": 0.6562, "step": 1324 }, { "epoch": 1.3592630501535312, "grad_norm": 0.275390625, "learning_rate": 0.00027512021203226507, "loss": 0.6721, "step": 1328 }, { "epoch": 1.3633572159672467, "grad_norm": 0.29296875, "learning_rate": 0.00027496642678769717, "loss": 0.7029, "step": 1332 }, { "epoch": 1.3674513817809621, "grad_norm": 0.25390625, "learning_rate": 0.0002748122109812083, "loss": 0.7028, "step": 1336 }, { "epoch": 1.3715455475946776, "grad_norm": 0.25390625, "learning_rate": 0.00027465756514413677, "loss": 0.6865, "step": 1340 }, { "epoch": 1.375639713408393, "grad_norm": 0.275390625, "learning_rate": 0.00027450248980930264, "loss": 0.7197, "step": 1344 }, { "epoch": 1.3797338792221086, "grad_norm": 0.2578125, "learning_rate": 0.00027434698551100567, "loss": 0.6694, "step": 1348 }, { "epoch": 1.383828045035824, "grad_norm": 0.2890625, "learning_rate": 0.0002741910527850235, "loss": 0.7057, "step": 1352 }, { "epoch": 1.3879222108495395, "grad_norm": 0.279296875, "learning_rate": 0.0002740346921686101, "loss": 0.7029, "step": 1356 }, { "epoch": 1.3920163766632547, "grad_norm": 0.26953125, "learning_rate": 0.00027387790420049357, "loss": 0.6723, "step": 1360 }, { "epoch": 1.3961105424769702, "grad_norm": 0.255859375, "learning_rate": 0.0002737206894208744, "loss": 0.6895, "step": 1364 }, { "epoch": 1.4002047082906857, "grad_norm": 0.267578125, "learning_rate": 0.0002735630483714236, "loss": 0.6655, "step": 1368 }, { "epoch": 1.4042988741044011, "grad_norm": 0.27734375, "learning_rate": 0.00027340498159528106, "loss": 0.6728, "step": 1372 }, { "epoch": 1.4083930399181166, "grad_norm": 0.267578125, "learning_rate": 0.00027324648963705317, "loss": 0.7074, "step": 1376 }, { "epoch": 1.412487205731832, "grad_norm": 0.263671875, "learning_rate": 0.00027308757304281154, "loss": 0.6722, "step": 1380 }, { "epoch": 1.4165813715455475, "grad_norm": 0.2734375, "learning_rate": 0.00027292823236009056, "loss": 0.6882, "step": 1384 }, { "epoch": 1.420675537359263, "grad_norm": 0.373046875, "learning_rate": 0.0002727684681378861, "loss": 0.6971, "step": 1388 }, { "epoch": 1.4247697031729785, "grad_norm": 0.265625, "learning_rate": 0.000272608280926653, "loss": 0.6815, "step": 1392 }, { "epoch": 1.428863868986694, "grad_norm": 0.279296875, "learning_rate": 0.00027244767127830366, "loss": 0.6712, "step": 1396 }, { "epoch": 1.4329580348004094, "grad_norm": 0.322265625, "learning_rate": 0.00027228663974620583, "loss": 0.6738, "step": 1400 }, { "epoch": 1.4370522006141249, "grad_norm": 0.294921875, "learning_rate": 0.000272125186885181, "loss": 0.6815, "step": 1404 }, { "epoch": 1.4411463664278403, "grad_norm": 0.28515625, "learning_rate": 0.00027196331325150217, "loss": 0.6807, "step": 1408 }, { "epoch": 1.4452405322415558, "grad_norm": 0.26171875, "learning_rate": 0.00027180101940289206, "loss": 0.6822, "step": 1412 }, { "epoch": 1.4493346980552713, "grad_norm": 0.255859375, "learning_rate": 0.0002716383058985213, "loss": 0.6641, "step": 1416 }, { "epoch": 1.4534288638689867, "grad_norm": 0.251953125, "learning_rate": 0.00027147517329900636, "loss": 0.7067, "step": 1420 }, { "epoch": 1.4575230296827022, "grad_norm": 0.279296875, "learning_rate": 0.00027131162216640774, "loss": 0.6732, "step": 1424 }, { "epoch": 1.4616171954964177, "grad_norm": 0.2890625, "learning_rate": 0.0002711476530642279, "loss": 0.6609, "step": 1428 }, { "epoch": 1.4657113613101331, "grad_norm": 0.267578125, "learning_rate": 0.0002709832665574093, "loss": 0.7021, "step": 1432 }, { "epoch": 1.4698055271238486, "grad_norm": 0.2490234375, "learning_rate": 0.00027081846321233273, "loss": 0.6793, "step": 1436 }, { "epoch": 1.473899692937564, "grad_norm": 0.279296875, "learning_rate": 0.000270653243596815, "loss": 0.6538, "step": 1440 }, { "epoch": 1.4779938587512795, "grad_norm": 0.265625, "learning_rate": 0.00027048760828010725, "loss": 0.7086, "step": 1444 }, { "epoch": 1.482088024564995, "grad_norm": 0.251953125, "learning_rate": 0.00027032155783289274, "loss": 0.6975, "step": 1448 }, { "epoch": 1.4861821903787105, "grad_norm": 0.2578125, "learning_rate": 0.00027015509282728525, "loss": 0.7226, "step": 1452 }, { "epoch": 1.4902763561924257, "grad_norm": 0.265625, "learning_rate": 0.00026998821383682664, "loss": 0.6931, "step": 1456 }, { "epoch": 1.4943705220061412, "grad_norm": 0.248046875, "learning_rate": 0.00026982092143648537, "loss": 0.624, "step": 1460 }, { "epoch": 1.4984646878198566, "grad_norm": 0.255859375, "learning_rate": 0.00026965321620265405, "loss": 0.6446, "step": 1464 }, { "epoch": 1.5025588536335721, "grad_norm": 0.26171875, "learning_rate": 0.0002694850987131478, "loss": 0.7135, "step": 1468 }, { "epoch": 1.5066530194472876, "grad_norm": 0.26953125, "learning_rate": 0.0002693165695472022, "loss": 0.6878, "step": 1472 }, { "epoch": 1.510747185261003, "grad_norm": 0.263671875, "learning_rate": 0.00026914762928547097, "loss": 0.6612, "step": 1476 }, { "epoch": 1.5148413510747185, "grad_norm": 0.265625, "learning_rate": 0.00026897827851002457, "loss": 0.6975, "step": 1480 }, { "epoch": 1.518935516888434, "grad_norm": 0.271484375, "learning_rate": 0.0002688085178043475, "loss": 0.673, "step": 1484 }, { "epoch": 1.5230296827021494, "grad_norm": 0.263671875, "learning_rate": 0.000268638347753337, "loss": 0.6826, "step": 1488 }, { "epoch": 1.527123848515865, "grad_norm": 0.2890625, "learning_rate": 0.0002684677689433004, "loss": 0.6603, "step": 1492 }, { "epoch": 1.5312180143295804, "grad_norm": 0.2734375, "learning_rate": 0.0002682967819619535, "loss": 0.698, "step": 1496 }, { "epoch": 1.5353121801432958, "grad_norm": 0.28515625, "learning_rate": 0.00026812538739841833, "loss": 0.7188, "step": 1500 }, { "epoch": 1.5394063459570113, "grad_norm": 0.275390625, "learning_rate": 0.00026795358584322135, "loss": 0.7101, "step": 1504 }, { "epoch": 1.5435005117707266, "grad_norm": 0.255859375, "learning_rate": 0.0002677813778882911, "loss": 0.68, "step": 1508 }, { "epoch": 1.547594677584442, "grad_norm": 0.271484375, "learning_rate": 0.0002676087641269566, "loss": 0.7013, "step": 1512 }, { "epoch": 1.5516888433981575, "grad_norm": 0.25390625, "learning_rate": 0.0002674357451539448, "loss": 0.6477, "step": 1516 }, { "epoch": 1.555783009211873, "grad_norm": 0.2734375, "learning_rate": 0.00026726232156537886, "loss": 0.6855, "step": 1520 }, { "epoch": 1.5598771750255884, "grad_norm": 0.27734375, "learning_rate": 0.000267088493958776, "loss": 0.6618, "step": 1524 }, { "epoch": 1.563971340839304, "grad_norm": 0.29296875, "learning_rate": 0.0002669142629330455, "loss": 0.6477, "step": 1528 }, { "epoch": 1.5680655066530194, "grad_norm": 0.255859375, "learning_rate": 0.00026673962908848654, "loss": 0.7286, "step": 1532 }, { "epoch": 1.5721596724667348, "grad_norm": 0.255859375, "learning_rate": 0.0002665645930267862, "loss": 0.6947, "step": 1536 }, { "epoch": 1.5762538382804503, "grad_norm": 0.2734375, "learning_rate": 0.0002663891553510174, "loss": 0.6563, "step": 1540 }, { "epoch": 1.5803480040941658, "grad_norm": 0.259765625, "learning_rate": 0.00026621331666563665, "loss": 0.6722, "step": 1544 }, { "epoch": 1.5844421699078812, "grad_norm": 0.298828125, "learning_rate": 0.0002660370775764822, "loss": 0.6563, "step": 1548 }, { "epoch": 1.5885363357215967, "grad_norm": 0.267578125, "learning_rate": 0.000265860438690772, "loss": 0.678, "step": 1552 }, { "epoch": 1.5926305015353122, "grad_norm": 0.462890625, "learning_rate": 0.00026568340061710124, "loss": 0.6558, "step": 1556 }, { "epoch": 1.5967246673490276, "grad_norm": 0.26171875, "learning_rate": 0.0002655059639654406, "loss": 0.6684, "step": 1560 }, { "epoch": 1.600818833162743, "grad_norm": 0.287109375, "learning_rate": 0.000265328129347134, "loss": 0.6614, "step": 1564 }, { "epoch": 1.6049129989764586, "grad_norm": 0.271484375, "learning_rate": 0.00026514989737489646, "loss": 0.6763, "step": 1568 }, { "epoch": 1.609007164790174, "grad_norm": 0.263671875, "learning_rate": 0.00026497126866281223, "loss": 0.6667, "step": 1572 }, { "epoch": 1.6131013306038895, "grad_norm": 0.271484375, "learning_rate": 0.0002647922438263323, "loss": 0.6783, "step": 1576 }, { "epoch": 1.617195496417605, "grad_norm": 0.265625, "learning_rate": 0.00026461282348227267, "loss": 0.6843, "step": 1580 }, { "epoch": 1.6212896622313204, "grad_norm": 0.271484375, "learning_rate": 0.00026443300824881174, "loss": 0.6728, "step": 1584 }, { "epoch": 1.625383828045036, "grad_norm": 0.2490234375, "learning_rate": 0.00026425279874548883, "loss": 0.6666, "step": 1588 }, { "epoch": 1.6294779938587514, "grad_norm": 0.263671875, "learning_rate": 0.0002640721955932013, "loss": 0.6979, "step": 1592 }, { "epoch": 1.6335721596724668, "grad_norm": 0.2490234375, "learning_rate": 0.00026389119941420323, "loss": 0.6888, "step": 1596 }, { "epoch": 1.6376663254861823, "grad_norm": 0.2734375, "learning_rate": 0.0002637098108321024, "loss": 0.6875, "step": 1600 }, { "epoch": 1.6417604912998978, "grad_norm": 0.263671875, "learning_rate": 0.000263528030471859, "loss": 0.6285, "step": 1604 }, { "epoch": 1.6458546571136132, "grad_norm": 0.291015625, "learning_rate": 0.0002633458589597827, "loss": 0.6737, "step": 1608 }, { "epoch": 1.6499488229273287, "grad_norm": 0.259765625, "learning_rate": 0.0002631632969235311, "loss": 0.6747, "step": 1612 }, { "epoch": 1.6540429887410442, "grad_norm": 0.2734375, "learning_rate": 0.00026298034499210715, "loss": 0.6784, "step": 1616 }, { "epoch": 1.6581371545547596, "grad_norm": 0.259765625, "learning_rate": 0.00026279700379585724, "loss": 0.6657, "step": 1620 }, { "epoch": 1.6622313203684749, "grad_norm": 0.275390625, "learning_rate": 0.000262613273966469, "loss": 0.705, "step": 1624 }, { "epoch": 1.6663254861821903, "grad_norm": 0.27734375, "learning_rate": 0.00026242915613696897, "loss": 0.7061, "step": 1628 }, { "epoch": 1.6704196519959058, "grad_norm": 0.24609375, "learning_rate": 0.0002622446509417206, "loss": 0.6685, "step": 1632 }, { "epoch": 1.6745138178096213, "grad_norm": 0.2734375, "learning_rate": 0.00026205975901642174, "loss": 0.7255, "step": 1636 }, { "epoch": 1.6786079836233367, "grad_norm": 0.263671875, "learning_rate": 0.000261874480998103, "loss": 0.5933, "step": 1640 }, { "epoch": 1.6827021494370522, "grad_norm": 0.251953125, "learning_rate": 0.00026168881752512517, "loss": 0.7004, "step": 1644 }, { "epoch": 1.6867963152507677, "grad_norm": 0.28515625, "learning_rate": 0.00026150276923717693, "loss": 0.6795, "step": 1648 }, { "epoch": 1.6908904810644831, "grad_norm": 0.25390625, "learning_rate": 0.0002613163367752729, "loss": 0.6927, "step": 1652 }, { "epoch": 1.6949846468781986, "grad_norm": 0.302734375, "learning_rate": 0.00026112952078175146, "loss": 0.6589, "step": 1656 }, { "epoch": 1.699078812691914, "grad_norm": 0.26953125, "learning_rate": 0.0002609423219002722, "loss": 0.6693, "step": 1660 }, { "epoch": 1.7031729785056293, "grad_norm": 0.26953125, "learning_rate": 0.0002607547407758141, "loss": 0.7185, "step": 1664 }, { "epoch": 1.7072671443193448, "grad_norm": 0.275390625, "learning_rate": 0.00026056677805467304, "loss": 0.6349, "step": 1668 }, { "epoch": 1.7113613101330603, "grad_norm": 0.271484375, "learning_rate": 0.0002603784343844597, "loss": 0.7128, "step": 1672 }, { "epoch": 1.7154554759467757, "grad_norm": 0.265625, "learning_rate": 0.00026018971041409715, "loss": 0.6974, "step": 1676 }, { "epoch": 1.7195496417604912, "grad_norm": 0.265625, "learning_rate": 0.0002600006067938191, "loss": 0.6253, "step": 1680 }, { "epoch": 1.7236438075742067, "grad_norm": 0.259765625, "learning_rate": 0.00025981112417516693, "loss": 0.688, "step": 1684 }, { "epoch": 1.7277379733879221, "grad_norm": 0.2734375, "learning_rate": 0.000259621263210988, "loss": 0.6281, "step": 1688 }, { "epoch": 1.7318321392016376, "grad_norm": 0.2451171875, "learning_rate": 0.0002594310245554333, "loss": 0.7162, "step": 1692 }, { "epoch": 1.735926305015353, "grad_norm": 0.275390625, "learning_rate": 0.0002592404088639549, "loss": 0.7011, "step": 1696 }, { "epoch": 1.7400204708290685, "grad_norm": 0.25390625, "learning_rate": 0.0002590494167933042, "loss": 0.6928, "step": 1700 }, { "epoch": 1.744114636642784, "grad_norm": 0.271484375, "learning_rate": 0.0002588580490015292, "loss": 0.6628, "step": 1704 }, { "epoch": 1.7482088024564995, "grad_norm": 0.26953125, "learning_rate": 0.00025866630614797243, "loss": 0.6604, "step": 1708 }, { "epoch": 1.752302968270215, "grad_norm": 0.26953125, "learning_rate": 0.00025847418889326867, "loss": 0.7133, "step": 1712 }, { "epoch": 1.7563971340839304, "grad_norm": 0.27734375, "learning_rate": 0.0002582816978993428, "loss": 0.6795, "step": 1716 }, { "epoch": 1.7604912998976459, "grad_norm": 0.26953125, "learning_rate": 0.0002580888338294072, "loss": 0.6114, "step": 1720 }, { "epoch": 1.7645854657113613, "grad_norm": 0.2578125, "learning_rate": 0.00025789559734795984, "loss": 0.6771, "step": 1724 }, { "epoch": 1.7686796315250768, "grad_norm": 0.255859375, "learning_rate": 0.0002577019891207816, "loss": 0.6623, "step": 1728 }, { "epoch": 1.7727737973387923, "grad_norm": 0.2734375, "learning_rate": 0.00025750800981493434, "loss": 0.6542, "step": 1732 }, { "epoch": 1.7768679631525077, "grad_norm": 0.2578125, "learning_rate": 0.0002573136600987584, "loss": 0.6879, "step": 1736 }, { "epoch": 1.7809621289662232, "grad_norm": 0.298828125, "learning_rate": 0.0002571189406418702, "loss": 0.6624, "step": 1740 }, { "epoch": 1.7850562947799387, "grad_norm": 0.25390625, "learning_rate": 0.0002569238521151603, "loss": 0.7035, "step": 1744 }, { "epoch": 1.7891504605936541, "grad_norm": 0.283203125, "learning_rate": 0.0002567283951907908, "loss": 0.6695, "step": 1748 }, { "epoch": 1.7932446264073696, "grad_norm": 0.279296875, "learning_rate": 0.000256532570542193, "loss": 0.6326, "step": 1752 }, { "epoch": 1.797338792221085, "grad_norm": 0.255859375, "learning_rate": 0.0002563363788440652, "loss": 0.6895, "step": 1756 }, { "epoch": 1.8014329580348005, "grad_norm": 0.267578125, "learning_rate": 0.00025613982077237043, "loss": 0.6529, "step": 1760 }, { "epoch": 1.805527123848516, "grad_norm": 0.263671875, "learning_rate": 0.0002559428970043338, "loss": 0.6558, "step": 1764 }, { "epoch": 1.8096212896622315, "grad_norm": 0.271484375, "learning_rate": 0.00025574560821844066, "loss": 0.7197, "step": 1768 }, { "epoch": 1.813715455475947, "grad_norm": 0.2734375, "learning_rate": 0.0002555479550944338, "loss": 0.6678, "step": 1772 }, { "epoch": 1.8178096212896624, "grad_norm": 0.275390625, "learning_rate": 0.0002553499383133115, "loss": 0.6602, "step": 1776 }, { "epoch": 1.8219037871033776, "grad_norm": 0.25390625, "learning_rate": 0.0002551515585573248, "loss": 0.6831, "step": 1780 }, { "epoch": 1.825997952917093, "grad_norm": 0.255859375, "learning_rate": 0.0002549528165099755, "loss": 0.6853, "step": 1784 }, { "epoch": 1.8300921187308086, "grad_norm": 0.2734375, "learning_rate": 0.00025475371285601356, "loss": 0.6836, "step": 1788 }, { "epoch": 1.834186284544524, "grad_norm": 0.275390625, "learning_rate": 0.00025455424828143473, "loss": 0.6681, "step": 1792 }, { "epoch": 1.8382804503582395, "grad_norm": 0.275390625, "learning_rate": 0.0002543544234734786, "loss": 0.6962, "step": 1796 }, { "epoch": 1.842374616171955, "grad_norm": 0.267578125, "learning_rate": 0.00025415423912062557, "loss": 0.6646, "step": 1800 }, { "epoch": 1.8464687819856704, "grad_norm": 0.251953125, "learning_rate": 0.00025395369591259503, "loss": 0.647, "step": 1804 }, { "epoch": 1.850562947799386, "grad_norm": 0.27734375, "learning_rate": 0.00025375279454034264, "loss": 0.6682, "step": 1808 }, { "epoch": 1.8546571136131014, "grad_norm": 0.259765625, "learning_rate": 0.00025355153569605823, "loss": 0.6989, "step": 1812 }, { "epoch": 1.8587512794268168, "grad_norm": 0.25390625, "learning_rate": 0.00025334992007316315, "loss": 0.7076, "step": 1816 }, { "epoch": 1.862845445240532, "grad_norm": 0.26953125, "learning_rate": 0.00025314794836630807, "loss": 0.6588, "step": 1820 }, { "epoch": 1.8669396110542475, "grad_norm": 0.263671875, "learning_rate": 0.0002529456212713705, "loss": 0.6762, "step": 1824 }, { "epoch": 1.871033776867963, "grad_norm": 0.255859375, "learning_rate": 0.0002527429394854524, "loss": 0.7133, "step": 1828 }, { "epoch": 1.8751279426816785, "grad_norm": 0.2578125, "learning_rate": 0.0002525399037068778, "loss": 0.7536, "step": 1832 }, { "epoch": 1.879222108495394, "grad_norm": 0.259765625, "learning_rate": 0.00025233651463519045, "loss": 0.6799, "step": 1836 }, { "epoch": 1.8833162743091094, "grad_norm": 0.275390625, "learning_rate": 0.00025213277297115124, "loss": 0.6846, "step": 1840 }, { "epoch": 1.8874104401228249, "grad_norm": 0.265625, "learning_rate": 0.0002519286794167359, "loss": 0.6479, "step": 1844 }, { "epoch": 1.8915046059365404, "grad_norm": 0.279296875, "learning_rate": 0.00025172423467513267, "loss": 0.6588, "step": 1848 }, { "epoch": 1.8955987717502558, "grad_norm": 0.283203125, "learning_rate": 0.0002515194394507396, "loss": 0.6707, "step": 1852 }, { "epoch": 1.8996929375639713, "grad_norm": 0.259765625, "learning_rate": 0.00025131429444916247, "loss": 0.6688, "step": 1856 }, { "epoch": 1.9037871033776868, "grad_norm": 0.267578125, "learning_rate": 0.00025110880037721215, "loss": 0.6671, "step": 1860 }, { "epoch": 1.9078812691914022, "grad_norm": 0.26953125, "learning_rate": 0.00025090295794290214, "loss": 0.6216, "step": 1864 }, { "epoch": 1.9119754350051177, "grad_norm": 0.2578125, "learning_rate": 0.00025069676785544623, "loss": 0.6815, "step": 1868 }, { "epoch": 1.9160696008188332, "grad_norm": 0.251953125, "learning_rate": 0.00025049023082525607, "loss": 0.7354, "step": 1872 }, { "epoch": 1.9201637666325486, "grad_norm": 0.2734375, "learning_rate": 0.0002502833475639386, "loss": 0.6874, "step": 1876 }, { "epoch": 1.924257932446264, "grad_norm": 0.26171875, "learning_rate": 0.0002500761187842937, "loss": 0.6667, "step": 1880 }, { "epoch": 1.9283520982599796, "grad_norm": 0.2451171875, "learning_rate": 0.0002498685452003118, "loss": 0.6612, "step": 1884 }, { "epoch": 1.932446264073695, "grad_norm": 0.287109375, "learning_rate": 0.0002496606275271711, "loss": 0.6915, "step": 1888 }, { "epoch": 1.9365404298874105, "grad_norm": 0.267578125, "learning_rate": 0.0002494523664812355, "loss": 0.6593, "step": 1892 }, { "epoch": 1.940634595701126, "grad_norm": 0.275390625, "learning_rate": 0.00024924376278005197, "loss": 0.6741, "step": 1896 }, { "epoch": 1.9447287615148414, "grad_norm": 0.25390625, "learning_rate": 0.000249034817142348, "loss": 0.6915, "step": 1900 }, { "epoch": 1.9488229273285569, "grad_norm": 0.3125, "learning_rate": 0.0002488255302880293, "loss": 0.6599, "step": 1904 }, { "epoch": 1.9529170931422724, "grad_norm": 0.259765625, "learning_rate": 0.000248615902938177, "loss": 0.64, "step": 1908 }, { "epoch": 1.9570112589559878, "grad_norm": 0.263671875, "learning_rate": 0.00024840593581504567, "loss": 0.6631, "step": 1912 }, { "epoch": 1.9611054247697033, "grad_norm": 0.2734375, "learning_rate": 0.0002481956296420603, "loss": 0.6774, "step": 1916 }, { "epoch": 1.9651995905834188, "grad_norm": 0.275390625, "learning_rate": 0.0002479849851438142, "loss": 0.6896, "step": 1920 }, { "epoch": 1.9692937563971342, "grad_norm": 0.267578125, "learning_rate": 0.0002477740030460663, "loss": 0.6584, "step": 1924 }, { "epoch": 1.9733879222108497, "grad_norm": 0.2734375, "learning_rate": 0.0002475626840757386, "loss": 0.6339, "step": 1928 }, { "epoch": 1.9774820880245652, "grad_norm": 0.26171875, "learning_rate": 0.000247351028960914, "loss": 0.6797, "step": 1932 }, { "epoch": 1.9815762538382804, "grad_norm": 0.28515625, "learning_rate": 0.0002471390384308334, "loss": 0.6773, "step": 1936 }, { "epoch": 1.9856704196519959, "grad_norm": 0.271484375, "learning_rate": 0.00024692671321589326, "loss": 0.6526, "step": 1940 }, { "epoch": 1.9897645854657113, "grad_norm": 0.279296875, "learning_rate": 0.00024671405404764335, "loss": 0.6519, "step": 1944 }, { "epoch": 1.9938587512794268, "grad_norm": 0.28125, "learning_rate": 0.0002465010616587841, "loss": 0.6989, "step": 1948 }, { "epoch": 1.9979529170931423, "grad_norm": 0.25, "learning_rate": 0.0002462877367831637, "loss": 0.7191, "step": 1952 }, { "epoch": 2.0020470829068575, "grad_norm": 0.248046875, "learning_rate": 0.0002460740801557763, "loss": 0.6218, "step": 1956 }, { "epoch": 2.006141248720573, "grad_norm": 0.279296875, "learning_rate": 0.0002458600925127587, "loss": 0.6015, "step": 1960 }, { "epoch": 2.0102354145342884, "grad_norm": 0.27734375, "learning_rate": 0.0002456457745913885, "loss": 0.5645, "step": 1964 }, { "epoch": 2.014329580348004, "grad_norm": 0.2490234375, "learning_rate": 0.00024543112713008104, "loss": 0.6114, "step": 1968 }, { "epoch": 2.0184237461617194, "grad_norm": 0.267578125, "learning_rate": 0.00024521615086838726, "loss": 0.5825, "step": 1972 }, { "epoch": 2.022517911975435, "grad_norm": 0.26171875, "learning_rate": 0.00024500084654699065, "loss": 0.6006, "step": 1976 }, { "epoch": 2.0266120777891503, "grad_norm": 0.296875, "learning_rate": 0.0002447852149077053, "loss": 0.5932, "step": 1980 }, { "epoch": 2.030706243602866, "grad_norm": 0.263671875, "learning_rate": 0.00024456925669347294, "loss": 0.5681, "step": 1984 }, { "epoch": 2.0348004094165812, "grad_norm": 0.259765625, "learning_rate": 0.00024435297264836043, "loss": 0.5712, "step": 1988 }, { "epoch": 2.0388945752302967, "grad_norm": 0.291015625, "learning_rate": 0.00024413636351755736, "loss": 0.5174, "step": 1992 }, { "epoch": 2.042988741044012, "grad_norm": 0.28515625, "learning_rate": 0.00024391943004737333, "loss": 0.6106, "step": 1996 }, { "epoch": 2.0470829068577276, "grad_norm": 0.2734375, "learning_rate": 0.00024370217298523534, "loss": 0.5328, "step": 2000 }, { "epoch": 2.051177072671443, "grad_norm": 0.267578125, "learning_rate": 0.00024348459307968546, "loss": 0.6033, "step": 2004 }, { "epoch": 2.0552712384851586, "grad_norm": 0.279296875, "learning_rate": 0.00024326669108037802, "loss": 0.5893, "step": 2008 }, { "epoch": 2.059365404298874, "grad_norm": 0.267578125, "learning_rate": 0.00024304846773807708, "loss": 0.6343, "step": 2012 }, { "epoch": 2.0634595701125895, "grad_norm": 0.302734375, "learning_rate": 0.0002428299238046538, "loss": 0.5784, "step": 2016 }, { "epoch": 2.067553735926305, "grad_norm": 0.271484375, "learning_rate": 0.00024261106003308408, "loss": 0.562, "step": 2020 }, { "epoch": 2.0716479017400204, "grad_norm": 0.279296875, "learning_rate": 0.00024239187717744567, "loss": 0.5716, "step": 2024 }, { "epoch": 2.075742067553736, "grad_norm": 0.25390625, "learning_rate": 0.0002421723759929157, "loss": 0.6365, "step": 2028 }, { "epoch": 2.0798362333674514, "grad_norm": 0.296875, "learning_rate": 0.0002419525572357682, "loss": 0.6373, "step": 2032 }, { "epoch": 2.083930399181167, "grad_norm": 0.27734375, "learning_rate": 0.00024173242166337114, "loss": 0.628, "step": 2036 }, { "epoch": 2.0880245649948823, "grad_norm": 0.263671875, "learning_rate": 0.00024151197003418427, "loss": 0.5754, "step": 2040 }, { "epoch": 2.092118730808598, "grad_norm": 0.27734375, "learning_rate": 0.0002412912031077562, "loss": 0.584, "step": 2044 }, { "epoch": 2.0962128966223132, "grad_norm": 0.291015625, "learning_rate": 0.0002410701216447219, "loss": 0.6191, "step": 2048 }, { "epoch": 2.1003070624360287, "grad_norm": 0.265625, "learning_rate": 0.0002408487264068, "loss": 0.657, "step": 2052 }, { "epoch": 2.104401228249744, "grad_norm": 0.275390625, "learning_rate": 0.00024062701815679032, "loss": 0.595, "step": 2056 }, { "epoch": 2.1084953940634596, "grad_norm": 0.287109375, "learning_rate": 0.00024040499765857093, "loss": 0.6026, "step": 2060 }, { "epoch": 2.112589559877175, "grad_norm": 0.302734375, "learning_rate": 0.000240182665677096, "loss": 0.5219, "step": 2064 }, { "epoch": 2.1166837256908906, "grad_norm": 0.29296875, "learning_rate": 0.00023996002297839268, "loss": 0.6293, "step": 2068 }, { "epoch": 2.120777891504606, "grad_norm": 0.28515625, "learning_rate": 0.00023973707032955879, "loss": 0.6043, "step": 2072 }, { "epoch": 2.1248720573183215, "grad_norm": 0.291015625, "learning_rate": 0.00023951380849875995, "loss": 0.6114, "step": 2076 }, { "epoch": 2.128966223132037, "grad_norm": 0.271484375, "learning_rate": 0.00023929023825522715, "loss": 0.5865, "step": 2080 }, { "epoch": 2.1330603889457525, "grad_norm": 0.29296875, "learning_rate": 0.00023906636036925396, "loss": 0.6042, "step": 2084 }, { "epoch": 2.137154554759468, "grad_norm": 0.3046875, "learning_rate": 0.00023884217561219386, "loss": 0.6083, "step": 2088 }, { "epoch": 2.1412487205731834, "grad_norm": 0.28125, "learning_rate": 0.00023861768475645772, "loss": 0.6202, "step": 2092 }, { "epoch": 2.145342886386899, "grad_norm": 0.279296875, "learning_rate": 0.00023839288857551095, "loss": 0.5945, "step": 2096 }, { "epoch": 2.1494370522006143, "grad_norm": 0.2734375, "learning_rate": 0.00023816778784387094, "loss": 0.6022, "step": 2100 }, { "epoch": 2.15353121801433, "grad_norm": 0.271484375, "learning_rate": 0.00023794238333710454, "loss": 0.5561, "step": 2104 }, { "epoch": 2.1576253838280453, "grad_norm": 0.27734375, "learning_rate": 0.00023771667583182498, "loss": 0.5638, "step": 2108 }, { "epoch": 2.1617195496417603, "grad_norm": 0.28125, "learning_rate": 0.00023749066610568968, "loss": 0.5325, "step": 2112 }, { "epoch": 2.1658137154554757, "grad_norm": 0.28125, "learning_rate": 0.00023726435493739726, "loss": 0.5708, "step": 2116 }, { "epoch": 2.169907881269191, "grad_norm": 0.302734375, "learning_rate": 0.00023703774310668483, "loss": 0.6038, "step": 2120 }, { "epoch": 2.1740020470829067, "grad_norm": 0.2734375, "learning_rate": 0.00023681083139432549, "loss": 0.5861, "step": 2124 }, { "epoch": 2.178096212896622, "grad_norm": 0.26953125, "learning_rate": 0.00023658362058212568, "loss": 0.6176, "step": 2128 }, { "epoch": 2.1821903787103376, "grad_norm": 0.28125, "learning_rate": 0.00023635611145292213, "loss": 0.5727, "step": 2132 }, { "epoch": 2.186284544524053, "grad_norm": 0.294921875, "learning_rate": 0.00023612830479057957, "loss": 0.5516, "step": 2136 }, { "epoch": 2.1903787103377685, "grad_norm": 0.298828125, "learning_rate": 0.00023590020137998787, "loss": 0.6297, "step": 2140 }, { "epoch": 2.194472876151484, "grad_norm": 0.2734375, "learning_rate": 0.0002356718020070591, "loss": 0.5994, "step": 2144 }, { "epoch": 2.1985670419651995, "grad_norm": 0.271484375, "learning_rate": 0.00023544310745872532, "loss": 0.591, "step": 2148 }, { "epoch": 2.202661207778915, "grad_norm": 0.2890625, "learning_rate": 0.00023521411852293545, "loss": 0.6033, "step": 2152 }, { "epoch": 2.2067553735926304, "grad_norm": 0.28515625, "learning_rate": 0.00023498483598865272, "loss": 0.5993, "step": 2156 }, { "epoch": 2.210849539406346, "grad_norm": 0.291015625, "learning_rate": 0.00023475526064585192, "loss": 0.6182, "step": 2160 }, { "epoch": 2.2149437052200613, "grad_norm": 0.275390625, "learning_rate": 0.00023452539328551664, "loss": 0.6129, "step": 2164 }, { "epoch": 2.219037871033777, "grad_norm": 0.2734375, "learning_rate": 0.00023429523469963682, "loss": 0.6295, "step": 2168 }, { "epoch": 2.2231320368474923, "grad_norm": 0.27734375, "learning_rate": 0.0002340647856812055, "loss": 0.5664, "step": 2172 }, { "epoch": 2.2272262026612077, "grad_norm": 0.275390625, "learning_rate": 0.0002338340470242165, "loss": 0.6029, "step": 2176 }, { "epoch": 2.231320368474923, "grad_norm": 0.283203125, "learning_rate": 0.00023360301952366166, "loss": 0.6056, "step": 2180 }, { "epoch": 2.2354145342886387, "grad_norm": 0.29296875, "learning_rate": 0.0002333717039755279, "loss": 0.577, "step": 2184 }, { "epoch": 2.239508700102354, "grad_norm": 0.28125, "learning_rate": 0.0002331401011767946, "loss": 0.6348, "step": 2188 }, { "epoch": 2.2436028659160696, "grad_norm": 0.3046875, "learning_rate": 0.00023290821192543083, "loss": 0.6237, "step": 2192 }, { "epoch": 2.247697031729785, "grad_norm": 0.28125, "learning_rate": 0.00023267603702039263, "loss": 0.5623, "step": 2196 }, { "epoch": 2.2517911975435005, "grad_norm": 0.2890625, "learning_rate": 0.0002324435772616203, "loss": 0.5739, "step": 2200 }, { "epoch": 2.255885363357216, "grad_norm": 0.259765625, "learning_rate": 0.00023221083345003544, "loss": 0.5992, "step": 2204 }, { "epoch": 2.2599795291709315, "grad_norm": 0.310546875, "learning_rate": 0.00023197780638753847, "loss": 0.5831, "step": 2208 }, { "epoch": 2.264073694984647, "grad_norm": 0.29296875, "learning_rate": 0.00023174449687700564, "loss": 0.6073, "step": 2212 }, { "epoch": 2.2681678607983624, "grad_norm": 0.265625, "learning_rate": 0.00023151090572228635, "loss": 0.563, "step": 2216 }, { "epoch": 2.272262026612078, "grad_norm": 0.3125, "learning_rate": 0.0002312770337282005, "loss": 0.5428, "step": 2220 }, { "epoch": 2.2763561924257933, "grad_norm": 0.279296875, "learning_rate": 0.00023104288170053543, "loss": 0.5443, "step": 2224 }, { "epoch": 2.280450358239509, "grad_norm": 0.259765625, "learning_rate": 0.0002308084504460435, "loss": 0.5721, "step": 2228 }, { "epoch": 2.2845445240532243, "grad_norm": 0.2890625, "learning_rate": 0.00023057374077243884, "loss": 0.5796, "step": 2232 }, { "epoch": 2.2886386898669397, "grad_norm": 0.29296875, "learning_rate": 0.00023033875348839526, "loss": 0.6163, "step": 2236 }, { "epoch": 2.292732855680655, "grad_norm": 0.28515625, "learning_rate": 0.00023010348940354262, "loss": 0.5588, "step": 2240 }, { "epoch": 2.2968270214943707, "grad_norm": 0.283203125, "learning_rate": 0.0002298679493284648, "loss": 0.6182, "step": 2244 }, { "epoch": 2.300921187308086, "grad_norm": 0.28515625, "learning_rate": 0.00022963213407469643, "loss": 0.6273, "step": 2248 }, { "epoch": 2.3050153531218016, "grad_norm": 0.279296875, "learning_rate": 0.00022939604445472027, "loss": 0.5913, "step": 2252 }, { "epoch": 2.309109518935517, "grad_norm": 0.28515625, "learning_rate": 0.00022915968128196443, "loss": 0.605, "step": 2256 }, { "epoch": 2.313203684749232, "grad_norm": 0.30859375, "learning_rate": 0.00022892304537079945, "loss": 0.5575, "step": 2260 }, { "epoch": 2.317297850562948, "grad_norm": 0.30078125, "learning_rate": 0.00022868613753653565, "loss": 0.6122, "step": 2264 }, { "epoch": 2.321392016376663, "grad_norm": 0.318359375, "learning_rate": 0.00022844895859542016, "loss": 0.644, "step": 2268 }, { "epoch": 2.325486182190379, "grad_norm": 0.30078125, "learning_rate": 0.00022821150936463427, "loss": 0.6069, "step": 2272 }, { "epoch": 2.329580348004094, "grad_norm": 0.275390625, "learning_rate": 0.0002279737906622905, "loss": 0.5912, "step": 2276 }, { "epoch": 2.3336745138178094, "grad_norm": 0.2890625, "learning_rate": 0.00022773580330742973, "loss": 0.5825, "step": 2280 }, { "epoch": 2.337768679631525, "grad_norm": 0.267578125, "learning_rate": 0.00022749754812001856, "loss": 0.6167, "step": 2284 }, { "epoch": 2.3418628454452404, "grad_norm": 0.27734375, "learning_rate": 0.0002272590259209464, "loss": 0.5872, "step": 2288 }, { "epoch": 2.345957011258956, "grad_norm": 0.298828125, "learning_rate": 0.00022702023753202257, "loss": 0.5571, "step": 2292 }, { "epoch": 2.3500511770726713, "grad_norm": 0.28125, "learning_rate": 0.0002267811837759735, "loss": 0.5777, "step": 2296 }, { "epoch": 2.3541453428863868, "grad_norm": 0.28515625, "learning_rate": 0.00022654186547644003, "loss": 0.5692, "step": 2300 }, { "epoch": 2.3582395087001022, "grad_norm": 0.298828125, "learning_rate": 0.00022630228345797435, "loss": 0.5789, "step": 2304 }, { "epoch": 2.3623336745138177, "grad_norm": 0.302734375, "learning_rate": 0.00022606243854603729, "loss": 0.6044, "step": 2308 }, { "epoch": 2.366427840327533, "grad_norm": 0.294921875, "learning_rate": 0.0002258223315669956, "loss": 0.6143, "step": 2312 }, { "epoch": 2.3705220061412486, "grad_norm": 0.28125, "learning_rate": 0.00022558196334811873, "loss": 0.5742, "step": 2316 }, { "epoch": 2.374616171954964, "grad_norm": 0.291015625, "learning_rate": 0.0002253413347175764, "loss": 0.6163, "step": 2320 }, { "epoch": 2.3787103377686796, "grad_norm": 0.30078125, "learning_rate": 0.00022510044650443547, "loss": 0.612, "step": 2324 }, { "epoch": 2.382804503582395, "grad_norm": 0.318359375, "learning_rate": 0.00022485929953865714, "loss": 0.5886, "step": 2328 }, { "epoch": 2.3868986693961105, "grad_norm": 0.30859375, "learning_rate": 0.00022461789465109426, "loss": 0.5864, "step": 2332 }, { "epoch": 2.390992835209826, "grad_norm": 0.29296875, "learning_rate": 0.00022437623267348823, "loss": 0.6113, "step": 2336 }, { "epoch": 2.3950870010235414, "grad_norm": 0.283203125, "learning_rate": 0.00022413431443846617, "loss": 0.6025, "step": 2340 }, { "epoch": 2.399181166837257, "grad_norm": 0.298828125, "learning_rate": 0.00022389214077953823, "loss": 0.6034, "step": 2344 }, { "epoch": 2.4032753326509724, "grad_norm": 0.298828125, "learning_rate": 0.00022364971253109462, "loss": 0.666, "step": 2348 }, { "epoch": 2.407369498464688, "grad_norm": 0.283203125, "learning_rate": 0.00022340703052840257, "loss": 0.5732, "step": 2352 }, { "epoch": 2.4114636642784033, "grad_norm": 0.298828125, "learning_rate": 0.0002231640956076037, "loss": 0.6258, "step": 2356 }, { "epoch": 2.4155578300921188, "grad_norm": 0.28515625, "learning_rate": 0.00022292090860571108, "loss": 0.5836, "step": 2360 }, { "epoch": 2.4196519959058342, "grad_norm": 0.28125, "learning_rate": 0.00022267747036060627, "loss": 0.5722, "step": 2364 }, { "epoch": 2.4237461617195497, "grad_norm": 0.287109375, "learning_rate": 0.00022243378171103636, "loss": 0.5837, "step": 2368 }, { "epoch": 2.427840327533265, "grad_norm": 0.2734375, "learning_rate": 0.00022218984349661134, "loss": 0.6311, "step": 2372 }, { "epoch": 2.4319344933469806, "grad_norm": 0.265625, "learning_rate": 0.00022194565655780102, "loss": 0.6041, "step": 2376 }, { "epoch": 2.436028659160696, "grad_norm": 0.3046875, "learning_rate": 0.00022170122173593206, "loss": 0.5853, "step": 2380 }, { "epoch": 2.4401228249744116, "grad_norm": 0.3125, "learning_rate": 0.00022145653987318536, "loss": 0.6081, "step": 2384 }, { "epoch": 2.444216990788127, "grad_norm": 0.294921875, "learning_rate": 0.0002212116118125929, "loss": 0.5846, "step": 2388 }, { "epoch": 2.4483111566018425, "grad_norm": 0.287109375, "learning_rate": 0.00022096643839803482, "loss": 0.5953, "step": 2392 }, { "epoch": 2.452405322415558, "grad_norm": 0.298828125, "learning_rate": 0.00022072102047423673, "loss": 0.554, "step": 2396 }, { "epoch": 2.4564994882292734, "grad_norm": 0.29296875, "learning_rate": 0.0002204753588867667, "loss": 0.6111, "step": 2400 }, { "epoch": 2.460593654042989, "grad_norm": 0.2890625, "learning_rate": 0.0002202294544820321, "loss": 0.6074, "step": 2404 }, { "epoch": 2.4646878198567044, "grad_norm": 0.30859375, "learning_rate": 0.00021998330810727725, "loss": 0.6092, "step": 2408 }, { "epoch": 2.46878198567042, "grad_norm": 0.3125, "learning_rate": 0.00021973692061057987, "loss": 0.5686, "step": 2412 }, { "epoch": 2.472876151484135, "grad_norm": 0.287109375, "learning_rate": 0.0002194902928408486, "loss": 0.6034, "step": 2416 }, { "epoch": 2.4769703172978508, "grad_norm": 0.28515625, "learning_rate": 0.0002192434256478199, "loss": 0.5837, "step": 2420 }, { "epoch": 2.481064483111566, "grad_norm": 0.27734375, "learning_rate": 0.00021899631988205506, "loss": 0.5938, "step": 2424 }, { "epoch": 2.4851586489252817, "grad_norm": 0.296875, "learning_rate": 0.00021874897639493745, "loss": 0.5849, "step": 2428 }, { "epoch": 2.4892528147389967, "grad_norm": 0.283203125, "learning_rate": 0.00021850139603866946, "loss": 0.6057, "step": 2432 }, { "epoch": 2.493346980552712, "grad_norm": 0.279296875, "learning_rate": 0.0002182535796662696, "loss": 0.555, "step": 2436 }, { "epoch": 2.4974411463664277, "grad_norm": 0.287109375, "learning_rate": 0.00021800552813156947, "loss": 0.5598, "step": 2440 }, { "epoch": 2.501535312180143, "grad_norm": 0.279296875, "learning_rate": 0.00021775724228921108, "loss": 0.6042, "step": 2444 }, { "epoch": 2.5056294779938586, "grad_norm": 0.314453125, "learning_rate": 0.00021750872299464358, "loss": 0.5955, "step": 2448 }, { "epoch": 2.509723643807574, "grad_norm": 0.3046875, "learning_rate": 0.00021725997110412043, "loss": 0.5756, "step": 2452 }, { "epoch": 2.5138178096212895, "grad_norm": 0.28125, "learning_rate": 0.0002170109874746967, "loss": 0.5865, "step": 2456 }, { "epoch": 2.517911975435005, "grad_norm": 0.287109375, "learning_rate": 0.00021676177296422566, "loss": 0.6014, "step": 2460 }, { "epoch": 2.5220061412487205, "grad_norm": 0.291015625, "learning_rate": 0.00021651232843135617, "loss": 0.5804, "step": 2464 }, { "epoch": 2.526100307062436, "grad_norm": 0.3046875, "learning_rate": 0.00021626265473552965, "loss": 0.6267, "step": 2468 }, { "epoch": 2.5301944728761514, "grad_norm": 0.30078125, "learning_rate": 0.00021601275273697696, "loss": 0.5653, "step": 2472 }, { "epoch": 2.534288638689867, "grad_norm": 0.296875, "learning_rate": 0.00021576262329671568, "loss": 0.5678, "step": 2476 }, { "epoch": 2.5383828045035823, "grad_norm": 0.326171875, "learning_rate": 0.00021551226727654696, "loss": 0.5694, "step": 2480 }, { "epoch": 2.542476970317298, "grad_norm": 0.287109375, "learning_rate": 0.00021526168553905265, "loss": 0.5915, "step": 2484 }, { "epoch": 2.5465711361310133, "grad_norm": 0.294921875, "learning_rate": 0.00021501087894759227, "loss": 0.5917, "step": 2488 }, { "epoch": 2.5506653019447287, "grad_norm": 0.298828125, "learning_rate": 0.00021475984836629998, "loss": 0.583, "step": 2492 }, { "epoch": 2.554759467758444, "grad_norm": 0.29296875, "learning_rate": 0.0002145085946600819, "loss": 0.6088, "step": 2496 }, { "epoch": 2.5588536335721597, "grad_norm": 0.265625, "learning_rate": 0.00021425711869461266, "loss": 0.5994, "step": 2500 }, { "epoch": 2.562947799385875, "grad_norm": 0.2890625, "learning_rate": 0.00021400542133633276, "loss": 0.5656, "step": 2504 }, { "epoch": 2.5670419651995906, "grad_norm": 0.2890625, "learning_rate": 0.00021375350345244557, "loss": 0.6544, "step": 2508 }, { "epoch": 2.571136131013306, "grad_norm": 0.2890625, "learning_rate": 0.00021350136591091415, "loss": 0.5995, "step": 2512 }, { "epoch": 2.5752302968270215, "grad_norm": 0.287109375, "learning_rate": 0.00021324900958045843, "loss": 0.6408, "step": 2516 }, { "epoch": 2.579324462640737, "grad_norm": 0.302734375, "learning_rate": 0.00021299643533055214, "loss": 0.62, "step": 2520 }, { "epoch": 2.5834186284544525, "grad_norm": 0.29296875, "learning_rate": 0.0002127436440314199, "loss": 0.5754, "step": 2524 }, { "epoch": 2.587512794268168, "grad_norm": 0.310546875, "learning_rate": 0.000212490636554034, "loss": 0.5731, "step": 2528 }, { "epoch": 2.5916069600818834, "grad_norm": 0.30078125, "learning_rate": 0.00021223741377011178, "loss": 0.582, "step": 2532 }, { "epoch": 2.595701125895599, "grad_norm": 0.287109375, "learning_rate": 0.00021198397655211216, "loss": 0.5946, "step": 2536 }, { "epoch": 2.5997952917093143, "grad_norm": 0.30859375, "learning_rate": 0.00021173032577323302, "loss": 0.6236, "step": 2540 }, { "epoch": 2.60388945752303, "grad_norm": 0.298828125, "learning_rate": 0.00021147646230740814, "loss": 0.6059, "step": 2544 }, { "epoch": 2.6079836233367453, "grad_norm": 0.298828125, "learning_rate": 0.00021122238702930377, "loss": 0.5637, "step": 2548 }, { "epoch": 2.6120777891504607, "grad_norm": 0.275390625, "learning_rate": 0.00021096810081431628, "loss": 0.5897, "step": 2552 }, { "epoch": 2.616171954964176, "grad_norm": 0.3046875, "learning_rate": 0.00021071360453856866, "loss": 0.5732, "step": 2556 }, { "epoch": 2.6202661207778917, "grad_norm": 0.28515625, "learning_rate": 0.00021045889907890763, "loss": 0.6022, "step": 2560 }, { "epoch": 2.6243602865916067, "grad_norm": 0.306640625, "learning_rate": 0.00021020398531290067, "loss": 0.6029, "step": 2564 }, { "epoch": 2.6284544524053226, "grad_norm": 0.291015625, "learning_rate": 0.00020994886411883297, "loss": 0.6171, "step": 2568 }, { "epoch": 2.6325486182190376, "grad_norm": 0.267578125, "learning_rate": 0.00020969353637570443, "loss": 0.5882, "step": 2572 }, { "epoch": 2.6366427840327535, "grad_norm": 0.30859375, "learning_rate": 0.0002094380029632265, "loss": 0.6273, "step": 2576 }, { "epoch": 2.6407369498464686, "grad_norm": 0.28125, "learning_rate": 0.00020918226476181935, "loss": 0.5672, "step": 2580 }, { "epoch": 2.6448311156601845, "grad_norm": 0.314453125, "learning_rate": 0.00020892632265260866, "loss": 0.6087, "step": 2584 }, { "epoch": 2.6489252814738995, "grad_norm": 0.298828125, "learning_rate": 0.00020867017751742266, "loss": 0.5993, "step": 2588 }, { "epoch": 2.6530194472876154, "grad_norm": 0.30078125, "learning_rate": 0.00020841383023878916, "loss": 0.5903, "step": 2592 }, { "epoch": 2.6571136131013304, "grad_norm": 0.291015625, "learning_rate": 0.00020815728169993233, "loss": 0.5986, "step": 2596 }, { "epoch": 2.661207778915046, "grad_norm": 0.28515625, "learning_rate": 0.0002079005327847699, "loss": 0.6083, "step": 2600 }, { "epoch": 2.6653019447287614, "grad_norm": 0.298828125, "learning_rate": 0.00020764358437790994, "loss": 0.5921, "step": 2604 }, { "epoch": 2.669396110542477, "grad_norm": 0.30078125, "learning_rate": 0.00020738643736464772, "loss": 0.5782, "step": 2608 }, { "epoch": 2.6734902763561923, "grad_norm": 0.322265625, "learning_rate": 0.00020712909263096297, "loss": 0.6629, "step": 2612 }, { "epoch": 2.6775844421699078, "grad_norm": 0.291015625, "learning_rate": 0.00020687155106351661, "loss": 0.6093, "step": 2616 }, { "epoch": 2.6816786079836232, "grad_norm": 0.30859375, "learning_rate": 0.00020661381354964762, "loss": 0.5907, "step": 2620 }, { "epoch": 2.6857727737973387, "grad_norm": 0.30859375, "learning_rate": 0.00020635588097737015, "loss": 0.5855, "step": 2624 }, { "epoch": 2.689866939611054, "grad_norm": 0.31640625, "learning_rate": 0.00020609775423537053, "loss": 0.6578, "step": 2628 }, { "epoch": 2.6939611054247696, "grad_norm": 0.333984375, "learning_rate": 0.00020583943421300405, "loss": 0.5752, "step": 2632 }, { "epoch": 2.698055271238485, "grad_norm": 0.306640625, "learning_rate": 0.0002055809218002917, "loss": 0.6257, "step": 2636 }, { "epoch": 2.7021494370522006, "grad_norm": 0.3046875, "learning_rate": 0.00020532221788791767, "loss": 0.6225, "step": 2640 }, { "epoch": 2.706243602865916, "grad_norm": 0.318359375, "learning_rate": 0.00020506332336722572, "loss": 0.5765, "step": 2644 }, { "epoch": 2.7103377686796315, "grad_norm": 0.3125, "learning_rate": 0.00020480423913021636, "loss": 0.6118, "step": 2648 }, { "epoch": 2.714431934493347, "grad_norm": 0.30078125, "learning_rate": 0.0002045449660695439, "loss": 0.5911, "step": 2652 }, { "epoch": 2.7185261003070624, "grad_norm": 0.298828125, "learning_rate": 0.00020428550507851313, "loss": 0.5962, "step": 2656 }, { "epoch": 2.722620266120778, "grad_norm": 0.27734375, "learning_rate": 0.00020402585705107617, "loss": 0.5941, "step": 2660 }, { "epoch": 2.7267144319344934, "grad_norm": 0.306640625, "learning_rate": 0.00020376602288182992, "loss": 0.6027, "step": 2664 }, { "epoch": 2.730808597748209, "grad_norm": 0.3125, "learning_rate": 0.0002035060034660123, "loss": 0.5518, "step": 2668 }, { "epoch": 2.7349027635619243, "grad_norm": 0.29296875, "learning_rate": 0.00020324579969949964, "loss": 0.6127, "step": 2672 }, { "epoch": 2.7389969293756398, "grad_norm": 0.30859375, "learning_rate": 0.00020298541247880343, "loss": 0.6011, "step": 2676 }, { "epoch": 2.7430910951893552, "grad_norm": 0.296875, "learning_rate": 0.00020272484270106712, "loss": 0.5692, "step": 2680 }, { "epoch": 2.7471852610030707, "grad_norm": 0.3046875, "learning_rate": 0.0002024640912640633, "loss": 0.6303, "step": 2684 }, { "epoch": 2.751279426816786, "grad_norm": 0.302734375, "learning_rate": 0.0002022031590661904, "loss": 0.6613, "step": 2688 }, { "epoch": 2.7553735926305016, "grad_norm": 0.28125, "learning_rate": 0.00020194204700646958, "loss": 0.6369, "step": 2692 }, { "epoch": 2.759467758444217, "grad_norm": 0.30078125, "learning_rate": 0.0002016807559845418, "loss": 0.6217, "step": 2696 }, { "epoch": 2.7635619242579326, "grad_norm": 0.30859375, "learning_rate": 0.00020141928690066446, "loss": 0.651, "step": 2700 }, { "epoch": 2.767656090071648, "grad_norm": 0.27734375, "learning_rate": 0.0002011576406557087, "loss": 0.5942, "step": 2704 }, { "epoch": 2.7717502558853635, "grad_norm": 0.318359375, "learning_rate": 0.0002008958181511559, "loss": 0.6106, "step": 2708 }, { "epoch": 2.775844421699079, "grad_norm": 0.3125, "learning_rate": 0.00020063382028909468, "loss": 0.5713, "step": 2712 }, { "epoch": 2.7799385875127944, "grad_norm": 0.29296875, "learning_rate": 0.00020037164797221798, "loss": 0.6318, "step": 2716 }, { "epoch": 2.7840327533265095, "grad_norm": 0.28515625, "learning_rate": 0.00020010930210381973, "loss": 0.5837, "step": 2720 }, { "epoch": 2.7881269191402254, "grad_norm": 0.294921875, "learning_rate": 0.00019984678358779182, "loss": 0.5899, "step": 2724 }, { "epoch": 2.7922210849539404, "grad_norm": 0.314453125, "learning_rate": 0.000199584093328621, "loss": 0.5779, "step": 2728 }, { "epoch": 2.7963152507676563, "grad_norm": 0.328125, "learning_rate": 0.00019932123223138573, "loss": 0.6003, "step": 2732 }, { "epoch": 2.8004094165813713, "grad_norm": 0.283203125, "learning_rate": 0.0001990582012017531, "loss": 0.5879, "step": 2736 }, { "epoch": 2.8045035823950872, "grad_norm": 0.30859375, "learning_rate": 0.00019879500114597569, "loss": 0.5881, "step": 2740 }, { "epoch": 2.8085977482088023, "grad_norm": 0.283203125, "learning_rate": 0.00019853163297088843, "loss": 0.5774, "step": 2744 }, { "epoch": 2.812691914022518, "grad_norm": 0.310546875, "learning_rate": 0.00019826809758390548, "loss": 0.6252, "step": 2748 }, { "epoch": 2.816786079836233, "grad_norm": 0.287109375, "learning_rate": 0.00019800439589301715, "loss": 0.65, "step": 2752 }, { "epoch": 2.8208802456499487, "grad_norm": 0.318359375, "learning_rate": 0.00019774052880678676, "loss": 0.6235, "step": 2756 }, { "epoch": 2.824974411463664, "grad_norm": 0.30859375, "learning_rate": 0.00019747649723434732, "loss": 0.5734, "step": 2760 }, { "epoch": 2.8290685772773796, "grad_norm": 0.27734375, "learning_rate": 0.00019721230208539882, "loss": 0.6161, "step": 2764 }, { "epoch": 2.833162743091095, "grad_norm": 0.30859375, "learning_rate": 0.00019694794427020461, "loss": 0.5959, "step": 2768 }, { "epoch": 2.8372569089048105, "grad_norm": 0.302734375, "learning_rate": 0.0001966834246995887, "loss": 0.5986, "step": 2772 }, { "epoch": 2.841351074718526, "grad_norm": 0.3046875, "learning_rate": 0.00019641874428493223, "loss": 0.612, "step": 2776 }, { "epoch": 2.8454452405322415, "grad_norm": 0.296875, "learning_rate": 0.00019615390393817067, "loss": 0.5609, "step": 2780 }, { "epoch": 2.849539406345957, "grad_norm": 0.3046875, "learning_rate": 0.00019588890457179035, "loss": 0.6219, "step": 2784 }, { "epoch": 2.8536335721596724, "grad_norm": 0.302734375, "learning_rate": 0.00019562374709882564, "loss": 0.6171, "step": 2788 }, { "epoch": 2.857727737973388, "grad_norm": 0.310546875, "learning_rate": 0.00019535843243285566, "loss": 0.6393, "step": 2792 }, { "epoch": 2.8618219037871033, "grad_norm": 0.31640625, "learning_rate": 0.00019509296148800093, "loss": 0.5619, "step": 2796 }, { "epoch": 2.865916069600819, "grad_norm": 0.298828125, "learning_rate": 0.0001948273351789207, "loss": 0.6155, "step": 2800 }, { "epoch": 2.8700102354145343, "grad_norm": 0.30859375, "learning_rate": 0.00019456155442080928, "loss": 0.6029, "step": 2804 }, { "epoch": 2.8741044012282497, "grad_norm": 0.298828125, "learning_rate": 0.00019429562012939316, "loss": 0.593, "step": 2808 }, { "epoch": 2.878198567041965, "grad_norm": 0.310546875, "learning_rate": 0.00019402953322092805, "loss": 0.6088, "step": 2812 }, { "epoch": 2.8822927328556807, "grad_norm": 0.296875, "learning_rate": 0.00019376329461219516, "loss": 0.64, "step": 2816 }, { "epoch": 2.886386898669396, "grad_norm": 0.29296875, "learning_rate": 0.00019349690522049853, "loss": 0.6028, "step": 2820 }, { "epoch": 2.8904810644831116, "grad_norm": 0.287109375, "learning_rate": 0.00019323036596366174, "loss": 0.6205, "step": 2824 }, { "epoch": 2.894575230296827, "grad_norm": 0.291015625, "learning_rate": 0.00019296367776002466, "loss": 0.6108, "step": 2828 }, { "epoch": 2.8986693961105425, "grad_norm": 0.296875, "learning_rate": 0.00019269684152844037, "loss": 0.63, "step": 2832 }, { "epoch": 2.902763561924258, "grad_norm": 0.294921875, "learning_rate": 0.00019242985818827198, "loss": 0.6686, "step": 2836 }, { "epoch": 2.9068577277379735, "grad_norm": 0.318359375, "learning_rate": 0.0001921627286593894, "loss": 0.5533, "step": 2840 }, { "epoch": 2.910951893551689, "grad_norm": 0.298828125, "learning_rate": 0.00019189545386216625, "loss": 0.5544, "step": 2844 }, { "epoch": 2.9150460593654044, "grad_norm": 0.306640625, "learning_rate": 0.00019162803471747667, "loss": 0.6254, "step": 2848 }, { "epoch": 2.91914022517912, "grad_norm": 0.3125, "learning_rate": 0.0001913604721466922, "loss": 0.5852, "step": 2852 }, { "epoch": 2.9232343909928353, "grad_norm": 0.3125, "learning_rate": 0.00019109276707167839, "loss": 0.5763, "step": 2856 }, { "epoch": 2.927328556806551, "grad_norm": 0.28515625, "learning_rate": 0.00019082492041479188, "loss": 0.624, "step": 2860 }, { "epoch": 2.9314227226202663, "grad_norm": 0.28515625, "learning_rate": 0.00019055693309887712, "loss": 0.6114, "step": 2864 }, { "epoch": 2.9355168884339817, "grad_norm": 0.296875, "learning_rate": 0.00019028880604726316, "loss": 0.6243, "step": 2868 }, { "epoch": 2.939611054247697, "grad_norm": 0.306640625, "learning_rate": 0.00019002054018376052, "loss": 0.595, "step": 2872 }, { "epoch": 2.943705220061412, "grad_norm": 0.296875, "learning_rate": 0.00018975213643265799, "loss": 0.5596, "step": 2876 }, { "epoch": 2.947799385875128, "grad_norm": 0.306640625, "learning_rate": 0.00018948359571871936, "loss": 0.5698, "step": 2880 }, { "epoch": 2.951893551688843, "grad_norm": 0.3125, "learning_rate": 0.00018921491896718048, "loss": 0.6207, "step": 2884 }, { "epoch": 2.955987717502559, "grad_norm": 0.306640625, "learning_rate": 0.00018894610710374574, "loss": 0.6277, "step": 2888 }, { "epoch": 2.960081883316274, "grad_norm": 0.33203125, "learning_rate": 0.00018867716105458506, "loss": 0.5984, "step": 2892 }, { "epoch": 2.96417604912999, "grad_norm": 0.291015625, "learning_rate": 0.00018840808174633088, "loss": 0.5292, "step": 2896 }, { "epoch": 2.968270214943705, "grad_norm": 0.29296875, "learning_rate": 0.00018813887010607456, "loss": 0.5835, "step": 2900 }, { "epoch": 2.972364380757421, "grad_norm": 0.302734375, "learning_rate": 0.00018786952706136343, "loss": 0.6388, "step": 2904 }, { "epoch": 2.976458546571136, "grad_norm": 0.298828125, "learning_rate": 0.00018760005354019765, "loss": 0.6434, "step": 2908 }, { "epoch": 2.9805527123848514, "grad_norm": 0.287109375, "learning_rate": 0.00018733045047102695, "loss": 0.5781, "step": 2912 }, { "epoch": 2.984646878198567, "grad_norm": 0.30859375, "learning_rate": 0.00018706071878274718, "loss": 0.6452, "step": 2916 }, { "epoch": 2.9887410440122824, "grad_norm": 0.283203125, "learning_rate": 0.00018679085940469763, "loss": 0.6056, "step": 2920 }, { "epoch": 2.992835209825998, "grad_norm": 0.291015625, "learning_rate": 0.00018652087326665741, "loss": 0.5462, "step": 2924 }, { "epoch": 2.9969293756397133, "grad_norm": 0.291015625, "learning_rate": 0.00018625076129884233, "loss": 0.6397, "step": 2928 }, { "epoch": 3.0010235414534288, "grad_norm": 0.2890625, "learning_rate": 0.0001859805244319017, "loss": 0.5911, "step": 2932 }, { "epoch": 3.0051177072671442, "grad_norm": 0.28125, "learning_rate": 0.00018571016359691532, "loss": 0.5014, "step": 2936 }, { "epoch": 3.0092118730808597, "grad_norm": 0.28515625, "learning_rate": 0.00018543967972539004, "loss": 0.5192, "step": 2940 }, { "epoch": 3.013306038894575, "grad_norm": 0.30859375, "learning_rate": 0.00018516907374925651, "loss": 0.5236, "step": 2944 }, { "epoch": 3.0174002047082906, "grad_norm": 0.2890625, "learning_rate": 0.00018489834660086624, "loss": 0.5035, "step": 2948 }, { "epoch": 3.021494370522006, "grad_norm": 0.29296875, "learning_rate": 0.00018462749921298817, "loss": 0.5226, "step": 2952 }, { "epoch": 3.0255885363357216, "grad_norm": 0.322265625, "learning_rate": 0.0001843565325188055, "loss": 0.5353, "step": 2956 }, { "epoch": 3.029682702149437, "grad_norm": 0.279296875, "learning_rate": 0.00018408544745191247, "loss": 0.4981, "step": 2960 }, { "epoch": 3.0337768679631525, "grad_norm": 0.29296875, "learning_rate": 0.00018381424494631128, "loss": 0.4917, "step": 2964 }, { "epoch": 3.037871033776868, "grad_norm": 0.302734375, "learning_rate": 0.00018354292593640857, "loss": 0.4919, "step": 2968 }, { "epoch": 3.0419651995905834, "grad_norm": 0.322265625, "learning_rate": 0.0001832714913570126, "loss": 0.5445, "step": 2972 }, { "epoch": 3.046059365404299, "grad_norm": 0.310546875, "learning_rate": 0.00018299994214332962, "loss": 0.5512, "step": 2976 }, { "epoch": 3.0501535312180144, "grad_norm": 0.314453125, "learning_rate": 0.00018272827923096095, "loss": 0.483, "step": 2980 }, { "epoch": 3.05424769703173, "grad_norm": 0.314453125, "learning_rate": 0.00018245650355589964, "loss": 0.4659, "step": 2984 }, { "epoch": 3.0583418628454453, "grad_norm": 0.310546875, "learning_rate": 0.00018218461605452722, "loss": 0.5279, "step": 2988 }, { "epoch": 3.0624360286591608, "grad_norm": 0.29296875, "learning_rate": 0.00018191261766361053, "loss": 0.5276, "step": 2992 }, { "epoch": 3.0665301944728762, "grad_norm": 0.306640625, "learning_rate": 0.0001816405093202985, "loss": 0.5401, "step": 2996 }, { "epoch": 3.0706243602865917, "grad_norm": 0.30859375, "learning_rate": 0.0001813682919621189, "loss": 0.5575, "step": 3000 }, { "epoch": 3.074718526100307, "grad_norm": 0.314453125, "learning_rate": 0.00018109596652697495, "loss": 0.5572, "step": 3004 }, { "epoch": 3.0788126919140226, "grad_norm": 0.32421875, "learning_rate": 0.00018082353395314243, "loss": 0.4961, "step": 3008 }, { "epoch": 3.082906857727738, "grad_norm": 0.33203125, "learning_rate": 0.00018055099517926625, "loss": 0.5249, "step": 3012 }, { "epoch": 3.0870010235414536, "grad_norm": 0.30859375, "learning_rate": 0.00018027835114435702, "loss": 0.541, "step": 3016 }, { "epoch": 3.091095189355169, "grad_norm": 0.318359375, "learning_rate": 0.00018000560278778825, "loss": 0.5231, "step": 3020 }, { "epoch": 3.0951893551688845, "grad_norm": 0.330078125, "learning_rate": 0.00017973275104929276, "loss": 0.5053, "step": 3024 }, { "epoch": 3.0992835209826, "grad_norm": 0.3203125, "learning_rate": 0.0001794597968689596, "loss": 0.4864, "step": 3028 }, { "epoch": 3.1033776867963154, "grad_norm": 0.306640625, "learning_rate": 0.0001791867411872308, "loss": 0.4932, "step": 3032 }, { "epoch": 3.107471852610031, "grad_norm": 0.3125, "learning_rate": 0.00017891358494489805, "loss": 0.5039, "step": 3036 }, { "epoch": 3.1115660184237464, "grad_norm": 0.310546875, "learning_rate": 0.00017864032908309946, "loss": 0.5224, "step": 3040 }, { "epoch": 3.115660184237462, "grad_norm": 0.30078125, "learning_rate": 0.00017836697454331658, "loss": 0.5166, "step": 3044 }, { "epoch": 3.119754350051177, "grad_norm": 0.287109375, "learning_rate": 0.00017809352226737075, "loss": 0.5291, "step": 3048 }, { "epoch": 3.1238485158648923, "grad_norm": 0.30078125, "learning_rate": 0.0001778199731974201, "loss": 0.5146, "step": 3052 }, { "epoch": 3.127942681678608, "grad_norm": 0.3203125, "learning_rate": 0.0001775463282759563, "loss": 0.5323, "step": 3056 }, { "epoch": 3.1320368474923233, "grad_norm": 0.318359375, "learning_rate": 0.00017727258844580125, "loss": 0.5511, "step": 3060 }, { "epoch": 3.1361310133060387, "grad_norm": 0.326171875, "learning_rate": 0.00017699875465010382, "loss": 0.5502, "step": 3064 }, { "epoch": 3.140225179119754, "grad_norm": 0.310546875, "learning_rate": 0.00017672482783233668, "loss": 0.5313, "step": 3068 }, { "epoch": 3.1443193449334697, "grad_norm": 0.294921875, "learning_rate": 0.00017645080893629298, "loss": 0.4961, "step": 3072 }, { "epoch": 3.148413510747185, "grad_norm": 0.310546875, "learning_rate": 0.00017617669890608305, "loss": 0.5336, "step": 3076 }, { "epoch": 3.1525076765609006, "grad_norm": 0.3125, "learning_rate": 0.00017590249868613137, "loss": 0.5259, "step": 3080 }, { "epoch": 3.156601842374616, "grad_norm": 0.318359375, "learning_rate": 0.000175628209221173, "loss": 0.5445, "step": 3084 }, { "epoch": 3.1606960081883315, "grad_norm": 0.31640625, "learning_rate": 0.00017535383145625056, "loss": 0.5006, "step": 3088 }, { "epoch": 3.164790174002047, "grad_norm": 0.349609375, "learning_rate": 0.00017507936633671093, "loss": 0.4755, "step": 3092 }, { "epoch": 3.1688843398157625, "grad_norm": 0.3125, "learning_rate": 0.00017480481480820195, "loss": 0.5028, "step": 3096 }, { "epoch": 3.172978505629478, "grad_norm": 0.32421875, "learning_rate": 0.0001745301778166691, "loss": 0.4948, "step": 3100 }, { "epoch": 3.1770726714431934, "grad_norm": 0.330078125, "learning_rate": 0.00017425545630835238, "loss": 0.489, "step": 3104 }, { "epoch": 3.181166837256909, "grad_norm": 0.369140625, "learning_rate": 0.000173980651229783, "loss": 0.518, "step": 3108 }, { "epoch": 3.1852610030706243, "grad_norm": 0.314453125, "learning_rate": 0.00017370576352778008, "loss": 0.5336, "step": 3112 }, { "epoch": 3.18935516888434, "grad_norm": 0.326171875, "learning_rate": 0.0001734307941494474, "loss": 0.5271, "step": 3116 }, { "epoch": 3.1934493346980553, "grad_norm": 0.30078125, "learning_rate": 0.00017315574404217017, "loss": 0.5106, "step": 3120 }, { "epoch": 3.1975435005117707, "grad_norm": 0.310546875, "learning_rate": 0.00017288061415361174, "loss": 0.4721, "step": 3124 }, { "epoch": 3.201637666325486, "grad_norm": 0.3359375, "learning_rate": 0.00017260540543171036, "loss": 0.5574, "step": 3128 }, { "epoch": 3.2057318321392017, "grad_norm": 0.3125, "learning_rate": 0.00017233011882467582, "loss": 0.5063, "step": 3132 }, { "epoch": 3.209825997952917, "grad_norm": 0.296875, "learning_rate": 0.00017205475528098637, "loss": 0.5305, "step": 3136 }, { "epoch": 3.2139201637666326, "grad_norm": 0.333984375, "learning_rate": 0.0001717793157493852, "loss": 0.5379, "step": 3140 }, { "epoch": 3.218014329580348, "grad_norm": 0.30859375, "learning_rate": 0.00017150380117887751, "loss": 0.5002, "step": 3144 }, { "epoch": 3.2221084953940635, "grad_norm": 0.302734375, "learning_rate": 0.00017122821251872684, "loss": 0.4996, "step": 3148 }, { "epoch": 3.226202661207779, "grad_norm": 0.3203125, "learning_rate": 0.00017095255071845206, "loss": 0.5536, "step": 3152 }, { "epoch": 3.2302968270214945, "grad_norm": 0.3125, "learning_rate": 0.00017067681672782416, "loss": 0.5189, "step": 3156 }, { "epoch": 3.23439099283521, "grad_norm": 0.34765625, "learning_rate": 0.00017040101149686264, "loss": 0.5405, "step": 3160 }, { "epoch": 3.2384851586489254, "grad_norm": 0.3125, "learning_rate": 0.0001701251359758326, "loss": 0.5288, "step": 3164 }, { "epoch": 3.242579324462641, "grad_norm": 0.31640625, "learning_rate": 0.00016984919111524136, "loss": 0.5539, "step": 3168 }, { "epoch": 3.2466734902763563, "grad_norm": 0.33203125, "learning_rate": 0.00016957317786583497, "loss": 0.5721, "step": 3172 }, { "epoch": 3.250767656090072, "grad_norm": 0.314453125, "learning_rate": 0.00016929709717859525, "loss": 0.4987, "step": 3176 }, { "epoch": 3.2548618219037873, "grad_norm": 0.306640625, "learning_rate": 0.00016902095000473637, "loss": 0.4804, "step": 3180 }, { "epoch": 3.2589559877175027, "grad_norm": 0.326171875, "learning_rate": 0.00016874473729570148, "loss": 0.5401, "step": 3184 }, { "epoch": 3.263050153531218, "grad_norm": 0.314453125, "learning_rate": 0.00016846846000315957, "loss": 0.4868, "step": 3188 }, { "epoch": 3.2671443193449337, "grad_norm": 0.3046875, "learning_rate": 0.00016819211907900225, "loss": 0.5596, "step": 3192 }, { "epoch": 3.2712384851586487, "grad_norm": 0.298828125, "learning_rate": 0.0001679157154753402, "loss": 0.5044, "step": 3196 }, { "epoch": 3.2753326509723646, "grad_norm": 0.326171875, "learning_rate": 0.00016763925014450008, "loss": 0.5377, "step": 3200 }, { "epoch": 3.2794268167860796, "grad_norm": 0.318359375, "learning_rate": 0.0001673627240390214, "loss": 0.5826, "step": 3204 }, { "epoch": 3.2835209825997955, "grad_norm": 0.32421875, "learning_rate": 0.0001670861381116529, "loss": 0.5723, "step": 3208 }, { "epoch": 3.2876151484135105, "grad_norm": 0.337890625, "learning_rate": 0.00016680949331534948, "loss": 0.4922, "step": 3212 }, { "epoch": 3.291709314227226, "grad_norm": 0.33203125, "learning_rate": 0.00016653279060326883, "loss": 0.5247, "step": 3216 }, { "epoch": 3.2958034800409415, "grad_norm": 0.333984375, "learning_rate": 0.00016625603092876824, "loss": 0.5303, "step": 3220 }, { "epoch": 3.299897645854657, "grad_norm": 0.333984375, "learning_rate": 0.00016597921524540125, "loss": 0.5386, "step": 3224 }, { "epoch": 3.3039918116683724, "grad_norm": 0.310546875, "learning_rate": 0.00016570234450691436, "loss": 0.5131, "step": 3228 }, { "epoch": 3.308085977482088, "grad_norm": 0.349609375, "learning_rate": 0.00016542541966724374, "loss": 0.473, "step": 3232 }, { "epoch": 3.3121801432958033, "grad_norm": 0.328125, "learning_rate": 0.000165148441680512, "loss": 0.5059, "step": 3236 }, { "epoch": 3.316274309109519, "grad_norm": 0.32421875, "learning_rate": 0.0001648714115010248, "loss": 0.4812, "step": 3240 }, { "epoch": 3.3203684749232343, "grad_norm": 0.32421875, "learning_rate": 0.0001645943300832678, "loss": 0.5404, "step": 3244 }, { "epoch": 3.3244626407369497, "grad_norm": 0.31640625, "learning_rate": 0.00016431719838190287, "loss": 0.5238, "step": 3248 }, { "epoch": 3.328556806550665, "grad_norm": 0.33984375, "learning_rate": 0.00016404001735176549, "loss": 0.4977, "step": 3252 }, { "epoch": 3.3326509723643807, "grad_norm": 0.337890625, "learning_rate": 0.00016376278794786087, "loss": 0.5405, "step": 3256 }, { "epoch": 3.336745138178096, "grad_norm": 0.318359375, "learning_rate": 0.00016348551112536095, "loss": 0.5508, "step": 3260 }, { "epoch": 3.3408393039918116, "grad_norm": 0.322265625, "learning_rate": 0.00016320818783960105, "loss": 0.5301, "step": 3264 }, { "epoch": 3.344933469805527, "grad_norm": 0.310546875, "learning_rate": 0.00016293081904607663, "loss": 0.4835, "step": 3268 }, { "epoch": 3.3490276356192425, "grad_norm": 0.30859375, "learning_rate": 0.00016265340570043978, "loss": 0.5369, "step": 3272 }, { "epoch": 3.353121801432958, "grad_norm": 0.3359375, "learning_rate": 0.00016237594875849628, "loss": 0.5258, "step": 3276 }, { "epoch": 3.3572159672466735, "grad_norm": 0.33203125, "learning_rate": 0.00016209844917620207, "loss": 0.579, "step": 3280 }, { "epoch": 3.361310133060389, "grad_norm": 0.353515625, "learning_rate": 0.00016182090790965988, "loss": 0.5422, "step": 3284 }, { "epoch": 3.3654042988741044, "grad_norm": 0.306640625, "learning_rate": 0.00016154332591511623, "loss": 0.5292, "step": 3288 }, { "epoch": 3.36949846468782, "grad_norm": 0.330078125, "learning_rate": 0.00016126570414895785, "loss": 0.5345, "step": 3292 }, { "epoch": 3.3735926305015353, "grad_norm": 0.3203125, "learning_rate": 0.0001609880435677085, "loss": 0.5141, "step": 3296 }, { "epoch": 3.377686796315251, "grad_norm": 0.33203125, "learning_rate": 0.00016071034512802577, "loss": 0.4843, "step": 3300 }, { "epoch": 3.3817809621289663, "grad_norm": 0.328125, "learning_rate": 0.00016043260978669763, "loss": 0.514, "step": 3304 }, { "epoch": 3.3858751279426818, "grad_norm": 0.32421875, "learning_rate": 0.00016015483850063912, "loss": 0.5275, "step": 3308 }, { "epoch": 3.389969293756397, "grad_norm": 0.2890625, "learning_rate": 0.00015987703222688926, "loss": 0.5134, "step": 3312 }, { "epoch": 3.3940634595701127, "grad_norm": 0.314453125, "learning_rate": 0.00015959919192260757, "loss": 0.5475, "step": 3316 }, { "epoch": 3.398157625383828, "grad_norm": 0.345703125, "learning_rate": 0.00015932131854507072, "loss": 0.5782, "step": 3320 }, { "epoch": 3.4022517911975436, "grad_norm": 0.322265625, "learning_rate": 0.0001590434130516695, "loss": 0.5299, "step": 3324 }, { "epoch": 3.406345957011259, "grad_norm": 0.33203125, "learning_rate": 0.00015876547639990518, "loss": 0.549, "step": 3328 }, { "epoch": 3.4104401228249746, "grad_norm": 0.3515625, "learning_rate": 0.0001584875095473865, "loss": 0.5149, "step": 3332 }, { "epoch": 3.41453428863869, "grad_norm": 0.328125, "learning_rate": 0.0001582095134518263, "loss": 0.5442, "step": 3336 }, { "epoch": 3.4186284544524055, "grad_norm": 0.3203125, "learning_rate": 0.00015793148907103802, "loss": 0.5089, "step": 3340 }, { "epoch": 3.422722620266121, "grad_norm": 0.337890625, "learning_rate": 0.00015765343736293263, "loss": 0.5063, "step": 3344 }, { "epoch": 3.4268167860798364, "grad_norm": 0.3203125, "learning_rate": 0.00015737535928551528, "loss": 0.508, "step": 3348 }, { "epoch": 3.4309109518935514, "grad_norm": 0.318359375, "learning_rate": 0.00015709725579688197, "loss": 0.5439, "step": 3352 }, { "epoch": 3.4350051177072674, "grad_norm": 0.310546875, "learning_rate": 0.00015681912785521618, "loss": 0.5304, "step": 3356 }, { "epoch": 3.4390992835209824, "grad_norm": 0.314453125, "learning_rate": 0.0001565409764187857, "loss": 0.5379, "step": 3360 }, { "epoch": 3.4431934493346983, "grad_norm": 0.333984375, "learning_rate": 0.00015626280244593937, "loss": 0.5265, "step": 3364 }, { "epoch": 3.4472876151484133, "grad_norm": 0.36328125, "learning_rate": 0.00015598460689510342, "loss": 0.5461, "step": 3368 }, { "epoch": 3.4513817809621288, "grad_norm": 0.33203125, "learning_rate": 0.00015570639072477865, "loss": 0.5123, "step": 3372 }, { "epoch": 3.4554759467758442, "grad_norm": 0.32421875, "learning_rate": 0.00015542815489353687, "loss": 0.5182, "step": 3376 }, { "epoch": 3.4595701125895597, "grad_norm": 0.32421875, "learning_rate": 0.0001551499003600175, "loss": 0.5118, "step": 3380 }, { "epoch": 3.463664278403275, "grad_norm": 0.30078125, "learning_rate": 0.00015487162808292454, "loss": 0.5201, "step": 3384 }, { "epoch": 3.4677584442169906, "grad_norm": 0.31640625, "learning_rate": 0.00015459333902102302, "loss": 0.5019, "step": 3388 }, { "epoch": 3.471852610030706, "grad_norm": 0.328125, "learning_rate": 0.00015431503413313594, "loss": 0.5194, "step": 3392 }, { "epoch": 3.4759467758444216, "grad_norm": 0.326171875, "learning_rate": 0.00015403671437814063, "loss": 0.5196, "step": 3396 }, { "epoch": 3.480040941658137, "grad_norm": 0.3125, "learning_rate": 0.00015375838071496583, "loss": 0.5081, "step": 3400 }, { "epoch": 3.4841351074718525, "grad_norm": 0.3203125, "learning_rate": 0.00015348003410258813, "loss": 0.5039, "step": 3404 }, { "epoch": 3.488229273285568, "grad_norm": 0.326171875, "learning_rate": 0.00015320167550002863, "loss": 0.536, "step": 3408 }, { "epoch": 3.4923234390992834, "grad_norm": 0.341796875, "learning_rate": 0.00015292330586634997, "loss": 0.5372, "step": 3412 }, { "epoch": 3.496417604912999, "grad_norm": 0.33203125, "learning_rate": 0.00015264492616065255, "loss": 0.4794, "step": 3416 }, { "epoch": 3.5005117707267144, "grad_norm": 0.32421875, "learning_rate": 0.00015236653734207158, "loss": 0.5065, "step": 3420 }, { "epoch": 3.50460593654043, "grad_norm": 0.3203125, "learning_rate": 0.0001520881403697738, "loss": 0.4901, "step": 3424 }, { "epoch": 3.5087001023541453, "grad_norm": 0.326171875, "learning_rate": 0.00015180973620295383, "loss": 0.5332, "step": 3428 }, { "epoch": 3.512794268167861, "grad_norm": 0.322265625, "learning_rate": 0.00015153132580083116, "loss": 0.4868, "step": 3432 }, { "epoch": 3.5168884339815762, "grad_norm": 0.322265625, "learning_rate": 0.00015125291012264684, "loss": 0.55, "step": 3436 }, { "epoch": 3.5209825997952917, "grad_norm": 0.328125, "learning_rate": 0.00015097449012765993, "loss": 0.4989, "step": 3440 }, { "epoch": 3.525076765609007, "grad_norm": 0.31640625, "learning_rate": 0.0001506960667751445, "loss": 0.4985, "step": 3444 }, { "epoch": 3.5291709314227226, "grad_norm": 0.45703125, "learning_rate": 0.00015041764102438618, "loss": 0.5499, "step": 3448 }, { "epoch": 3.533265097236438, "grad_norm": 0.328125, "learning_rate": 0.00015013921383467884, "loss": 0.5324, "step": 3452 }, { "epoch": 3.5373592630501536, "grad_norm": 0.32421875, "learning_rate": 0.0001498607861653212, "loss": 0.5165, "step": 3456 }, { "epoch": 3.541453428863869, "grad_norm": 0.3046875, "learning_rate": 0.0001495823589756138, "loss": 0.5517, "step": 3460 }, { "epoch": 3.5455475946775845, "grad_norm": 0.341796875, "learning_rate": 0.00014930393322485548, "loss": 0.548, "step": 3464 }, { "epoch": 3.5496417604913, "grad_norm": 0.318359375, "learning_rate": 0.00014902550987234004, "loss": 0.5582, "step": 3468 }, { "epoch": 3.5537359263050154, "grad_norm": 0.330078125, "learning_rate": 0.00014874708987735316, "loss": 0.5943, "step": 3472 }, { "epoch": 3.557830092118731, "grad_norm": 0.330078125, "learning_rate": 0.0001484686741991688, "loss": 0.512, "step": 3476 }, { "epoch": 3.5619242579324464, "grad_norm": 0.33984375, "learning_rate": 0.00014819026379704614, "loss": 0.527, "step": 3480 }, { "epoch": 3.566018423746162, "grad_norm": 0.328125, "learning_rate": 0.0001479118596302262, "loss": 0.5487, "step": 3484 }, { "epoch": 3.5701125895598773, "grad_norm": 0.3046875, "learning_rate": 0.0001476334626579284, "loss": 0.5161, "step": 3488 }, { "epoch": 3.574206755373593, "grad_norm": 0.333984375, "learning_rate": 0.00014735507383934748, "loss": 0.5076, "step": 3492 }, { "epoch": 3.5783009211873082, "grad_norm": 0.3203125, "learning_rate": 0.00014707669413365008, "loss": 0.5335, "step": 3496 }, { "epoch": 3.5823950870010233, "grad_norm": 0.322265625, "learning_rate": 0.0001467983244999714, "loss": 0.476, "step": 3500 }, { "epoch": 3.586489252814739, "grad_norm": 0.34765625, "learning_rate": 0.0001465199658974119, "loss": 0.5597, "step": 3504 }, { "epoch": 3.590583418628454, "grad_norm": 0.310546875, "learning_rate": 0.00014624161928503417, "loss": 0.5411, "step": 3508 }, { "epoch": 3.59467758444217, "grad_norm": 0.328125, "learning_rate": 0.0001459632856218594, "loss": 0.5302, "step": 3512 }, { "epoch": 3.598771750255885, "grad_norm": 0.34375, "learning_rate": 0.0001456849658668641, "loss": 0.5309, "step": 3516 }, { "epoch": 3.602865916069601, "grad_norm": 0.3515625, "learning_rate": 0.000145406660978977, "loss": 0.5123, "step": 3520 }, { "epoch": 3.606960081883316, "grad_norm": 0.59765625, "learning_rate": 0.00014512837191707552, "loss": 0.5087, "step": 3524 }, { "epoch": 3.611054247697032, "grad_norm": 0.33203125, "learning_rate": 0.0001448500996399825, "loss": 0.5063, "step": 3528 }, { "epoch": 3.615148413510747, "grad_norm": 0.349609375, "learning_rate": 0.0001445718451064631, "loss": 0.5133, "step": 3532 }, { "epoch": 3.619242579324463, "grad_norm": 0.328125, "learning_rate": 0.0001442936092752213, "loss": 0.5234, "step": 3536 }, { "epoch": 3.623336745138178, "grad_norm": 0.34375, "learning_rate": 0.00014401539310489656, "loss": 0.534, "step": 3540 }, { "epoch": 3.6274309109518934, "grad_norm": 0.40234375, "learning_rate": 0.0001437371975540606, "loss": 0.4773, "step": 3544 }, { "epoch": 3.631525076765609, "grad_norm": 0.33203125, "learning_rate": 0.00014345902358121423, "loss": 0.5007, "step": 3548 }, { "epoch": 3.6356192425793243, "grad_norm": 0.40625, "learning_rate": 0.0001431808721447838, "loss": 0.5236, "step": 3552 }, { "epoch": 3.63971340839304, "grad_norm": 0.30859375, "learning_rate": 0.000142902744203118, "loss": 0.5174, "step": 3556 }, { "epoch": 3.6438075742067553, "grad_norm": 0.337890625, "learning_rate": 0.0001426246407144847, "loss": 0.5517, "step": 3560 }, { "epoch": 3.6479017400204707, "grad_norm": 0.31640625, "learning_rate": 0.00014234656263706737, "loss": 0.5203, "step": 3564 }, { "epoch": 3.651995905834186, "grad_norm": 0.3359375, "learning_rate": 0.00014206851092896195, "loss": 0.515, "step": 3568 }, { "epoch": 3.6560900716479017, "grad_norm": 0.306640625, "learning_rate": 0.00014179048654817367, "loss": 0.5206, "step": 3572 }, { "epoch": 3.660184237461617, "grad_norm": 0.345703125, "learning_rate": 0.00014151249045261347, "loss": 0.5397, "step": 3576 }, { "epoch": 3.6642784032753326, "grad_norm": 0.345703125, "learning_rate": 0.0001412345236000948, "loss": 0.5156, "step": 3580 }, { "epoch": 3.668372569089048, "grad_norm": 0.408203125, "learning_rate": 0.0001409565869483305, "loss": 0.5261, "step": 3584 }, { "epoch": 3.6724667349027635, "grad_norm": 0.328125, "learning_rate": 0.00014067868145492928, "loss": 0.5615, "step": 3588 }, { "epoch": 3.676560900716479, "grad_norm": 0.3359375, "learning_rate": 0.0001404008080773924, "loss": 0.4976, "step": 3592 }, { "epoch": 3.6806550665301945, "grad_norm": 0.34375, "learning_rate": 0.0001401229677731107, "loss": 0.5772, "step": 3596 }, { "epoch": 3.68474923234391, "grad_norm": 0.33984375, "learning_rate": 0.00013984516149936088, "loss": 0.5169, "step": 3600 }, { "epoch": 3.6888433981576254, "grad_norm": 0.314453125, "learning_rate": 0.00013956739021330234, "loss": 0.5636, "step": 3604 }, { "epoch": 3.692937563971341, "grad_norm": 0.3203125, "learning_rate": 0.0001392896548719742, "loss": 0.5431, "step": 3608 }, { "epoch": 3.6970317297850563, "grad_norm": 0.345703125, "learning_rate": 0.0001390119564322915, "loss": 0.5717, "step": 3612 }, { "epoch": 3.701125895598772, "grad_norm": 0.33203125, "learning_rate": 0.00013873429585104215, "loss": 0.5165, "step": 3616 }, { "epoch": 3.7052200614124873, "grad_norm": 0.333984375, "learning_rate": 0.00013845667408488377, "loss": 0.5255, "step": 3620 }, { "epoch": 3.7093142272262027, "grad_norm": 0.318359375, "learning_rate": 0.0001381790920903401, "loss": 0.5074, "step": 3624 }, { "epoch": 3.713408393039918, "grad_norm": 0.328125, "learning_rate": 0.00013790155082379793, "loss": 0.5742, "step": 3628 }, { "epoch": 3.7175025588536337, "grad_norm": 0.314453125, "learning_rate": 0.0001376240512415037, "loss": 0.5069, "step": 3632 }, { "epoch": 3.721596724667349, "grad_norm": 0.3203125, "learning_rate": 0.00013734659429956024, "loss": 0.5224, "step": 3636 }, { "epoch": 3.7256908904810646, "grad_norm": 0.35546875, "learning_rate": 0.0001370691809539234, "loss": 0.6127, "step": 3640 }, { "epoch": 3.72978505629478, "grad_norm": 0.32421875, "learning_rate": 0.00013679181216039898, "loss": 0.5235, "step": 3644 }, { "epoch": 3.7338792221084955, "grad_norm": 0.330078125, "learning_rate": 0.00013651448887463905, "loss": 0.4979, "step": 3648 }, { "epoch": 3.737973387922211, "grad_norm": 0.326171875, "learning_rate": 0.00013623721205213916, "loss": 0.5628, "step": 3652 }, { "epoch": 3.742067553735926, "grad_norm": 0.3203125, "learning_rate": 0.00013595998264823454, "loss": 0.5107, "step": 3656 }, { "epoch": 3.746161719549642, "grad_norm": 0.333984375, "learning_rate": 0.00013568280161809713, "loss": 0.543, "step": 3660 }, { "epoch": 3.750255885363357, "grad_norm": 0.33203125, "learning_rate": 0.00013540566991673226, "loss": 0.4928, "step": 3664 }, { "epoch": 3.754350051177073, "grad_norm": 0.328125, "learning_rate": 0.0001351285884989752, "loss": 0.4854, "step": 3668 }, { "epoch": 3.758444216990788, "grad_norm": 0.3203125, "learning_rate": 0.000134851558319488, "loss": 0.5305, "step": 3672 }, { "epoch": 3.762538382804504, "grad_norm": 0.33984375, "learning_rate": 0.00013457458033275623, "loss": 0.5603, "step": 3676 }, { "epoch": 3.766632548618219, "grad_norm": 0.33203125, "learning_rate": 0.00013429765549308558, "loss": 0.5247, "step": 3680 }, { "epoch": 3.7707267144319347, "grad_norm": 0.326171875, "learning_rate": 0.0001340207847545987, "loss": 0.5073, "step": 3684 }, { "epoch": 3.7748208802456498, "grad_norm": 0.33203125, "learning_rate": 0.00013374396907123173, "loss": 0.5365, "step": 3688 }, { "epoch": 3.7789150460593657, "grad_norm": 0.33984375, "learning_rate": 0.00013346720939673112, "loss": 0.5316, "step": 3692 }, { "epoch": 3.7830092118730807, "grad_norm": 0.33203125, "learning_rate": 0.0001331905066846505, "loss": 0.5118, "step": 3696 }, { "epoch": 3.787103377686796, "grad_norm": 0.337890625, "learning_rate": 0.0001329138618883471, "loss": 0.5699, "step": 3700 }, { "epoch": 3.7911975435005116, "grad_norm": 0.330078125, "learning_rate": 0.00013263727596097855, "loss": 0.5433, "step": 3704 }, { "epoch": 3.795291709314227, "grad_norm": 0.341796875, "learning_rate": 0.0001323607498554999, "loss": 0.4993, "step": 3708 }, { "epoch": 3.7993858751279426, "grad_norm": 0.349609375, "learning_rate": 0.0001320842845246598, "loss": 0.4852, "step": 3712 }, { "epoch": 3.803480040941658, "grad_norm": 0.333984375, "learning_rate": 0.00013180788092099775, "loss": 0.5193, "step": 3716 }, { "epoch": 3.8075742067553735, "grad_norm": 0.34375, "learning_rate": 0.0001315315399968404, "loss": 0.5161, "step": 3720 }, { "epoch": 3.811668372569089, "grad_norm": 0.3125, "learning_rate": 0.0001312552627042985, "loss": 0.4898, "step": 3724 }, { "epoch": 3.8157625383828044, "grad_norm": 0.337890625, "learning_rate": 0.00013097904999526363, "loss": 0.5347, "step": 3728 }, { "epoch": 3.81985670419652, "grad_norm": 0.33203125, "learning_rate": 0.00013070290282140472, "loss": 0.5471, "step": 3732 }, { "epoch": 3.8239508700102354, "grad_norm": 0.31640625, "learning_rate": 0.00013042682213416503, "loss": 0.4795, "step": 3736 }, { "epoch": 3.828045035823951, "grad_norm": 0.333984375, "learning_rate": 0.00013015080888475865, "loss": 0.5442, "step": 3740 }, { "epoch": 3.8321392016376663, "grad_norm": 0.330078125, "learning_rate": 0.00012987486402416738, "loss": 0.4921, "step": 3744 }, { "epoch": 3.8362333674513818, "grad_norm": 0.3125, "learning_rate": 0.00012959898850313736, "loss": 0.5354, "step": 3748 }, { "epoch": 3.8403275332650972, "grad_norm": 0.294921875, "learning_rate": 0.00012932318327217585, "loss": 0.4821, "step": 3752 }, { "epoch": 3.8444216990788127, "grad_norm": 0.3125, "learning_rate": 0.00012904744928154792, "loss": 0.5164, "step": 3756 }, { "epoch": 3.848515864892528, "grad_norm": 0.333984375, "learning_rate": 0.00012877178748127313, "loss": 0.529, "step": 3760 }, { "epoch": 3.8526100307062436, "grad_norm": 0.328125, "learning_rate": 0.0001284961988211225, "loss": 0.5858, "step": 3764 }, { "epoch": 3.856704196519959, "grad_norm": 0.3359375, "learning_rate": 0.00012822068425061476, "loss": 0.5239, "step": 3768 }, { "epoch": 3.8607983623336746, "grad_norm": 0.330078125, "learning_rate": 0.00012794524471901363, "loss": 0.5538, "step": 3772 }, { "epoch": 3.86489252814739, "grad_norm": 0.322265625, "learning_rate": 0.00012766988117532418, "loss": 0.5425, "step": 3776 }, { "epoch": 3.8689866939611055, "grad_norm": 0.3359375, "learning_rate": 0.00012739459456828967, "loss": 0.5078, "step": 3780 }, { "epoch": 3.873080859774821, "grad_norm": 0.330078125, "learning_rate": 0.00012711938584638823, "loss": 0.4744, "step": 3784 }, { "epoch": 3.8771750255885364, "grad_norm": 0.341796875, "learning_rate": 0.00012684425595782984, "loss": 0.5257, "step": 3788 }, { "epoch": 3.881269191402252, "grad_norm": 0.326171875, "learning_rate": 0.00012656920585055263, "loss": 0.4963, "step": 3792 }, { "epoch": 3.8853633572159674, "grad_norm": 0.33203125, "learning_rate": 0.00012629423647221992, "loss": 0.5248, "step": 3796 }, { "epoch": 3.889457523029683, "grad_norm": 0.326171875, "learning_rate": 0.000126019348770217, "loss": 0.5464, "step": 3800 }, { "epoch": 3.8935516888433983, "grad_norm": 0.349609375, "learning_rate": 0.00012574454369164762, "loss": 0.5515, "step": 3804 }, { "epoch": 3.8976458546571138, "grad_norm": 0.32421875, "learning_rate": 0.0001254698221833309, "loss": 0.5136, "step": 3808 }, { "epoch": 3.901740020470829, "grad_norm": 0.337890625, "learning_rate": 0.00012519518519179805, "loss": 0.5189, "step": 3812 }, { "epoch": 3.9058341862845447, "grad_norm": 0.322265625, "learning_rate": 0.00012492063366328899, "loss": 0.561, "step": 3816 }, { "epoch": 3.9099283520982597, "grad_norm": 0.30859375, "learning_rate": 0.0001246461685437494, "loss": 0.4895, "step": 3820 }, { "epoch": 3.9140225179119756, "grad_norm": 0.32421875, "learning_rate": 0.00012437179077882693, "loss": 0.5158, "step": 3824 }, { "epoch": 3.9181166837256907, "grad_norm": 0.326171875, "learning_rate": 0.00012409750131386858, "loss": 0.5381, "step": 3828 }, { "epoch": 3.9222108495394066, "grad_norm": 0.326171875, "learning_rate": 0.0001238233010939169, "loss": 0.5346, "step": 3832 }, { "epoch": 3.9263050153531216, "grad_norm": 0.330078125, "learning_rate": 0.000123549191063707, "loss": 0.5082, "step": 3836 }, { "epoch": 3.9303991811668375, "grad_norm": 0.333984375, "learning_rate": 0.00012327517216766327, "loss": 0.4927, "step": 3840 }, { "epoch": 3.9344933469805525, "grad_norm": 0.328125, "learning_rate": 0.00012300124534989616, "loss": 0.5242, "step": 3844 }, { "epoch": 3.9385875127942684, "grad_norm": 0.318359375, "learning_rate": 0.0001227274115541987, "loss": 0.5478, "step": 3848 }, { "epoch": 3.9426816786079835, "grad_norm": 0.328125, "learning_rate": 0.00012245367172404367, "loss": 0.5329, "step": 3852 }, { "epoch": 3.946775844421699, "grad_norm": 0.33203125, "learning_rate": 0.0001221800268025799, "loss": 0.5041, "step": 3856 }, { "epoch": 3.9508700102354144, "grad_norm": 0.330078125, "learning_rate": 0.00012190647773262923, "loss": 0.5311, "step": 3860 }, { "epoch": 3.95496417604913, "grad_norm": 0.328125, "learning_rate": 0.0001216330254566834, "loss": 0.5264, "step": 3864 }, { "epoch": 3.9590583418628453, "grad_norm": 0.314453125, "learning_rate": 0.00012135967091690053, "loss": 0.4574, "step": 3868 }, { "epoch": 3.963152507676561, "grad_norm": 0.326171875, "learning_rate": 0.00012108641505510195, "loss": 0.5259, "step": 3872 }, { "epoch": 3.9672466734902763, "grad_norm": 0.337890625, "learning_rate": 0.00012081325881276917, "loss": 0.5375, "step": 3876 }, { "epoch": 3.9713408393039917, "grad_norm": 0.3203125, "learning_rate": 0.00012054020313104037, "loss": 0.5233, "step": 3880 }, { "epoch": 3.975435005117707, "grad_norm": 0.3203125, "learning_rate": 0.0001202672489507072, "loss": 0.5807, "step": 3884 }, { "epoch": 3.9795291709314227, "grad_norm": 0.349609375, "learning_rate": 0.00011999439721221173, "loss": 0.493, "step": 3888 }, { "epoch": 3.983623336745138, "grad_norm": 0.37109375, "learning_rate": 0.00011972164885564298, "loss": 0.5157, "step": 3892 }, { "epoch": 3.9877175025588536, "grad_norm": 0.34765625, "learning_rate": 0.00011944900482073375, "loss": 0.4709, "step": 3896 }, { "epoch": 3.991811668372569, "grad_norm": 0.330078125, "learning_rate": 0.00011917646604685753, "loss": 0.5762, "step": 3900 }, { "epoch": 3.9959058341862845, "grad_norm": 0.322265625, "learning_rate": 0.00011890403347302505, "loss": 0.4861, "step": 3904 }, { "epoch": 4.0, "grad_norm": 0.3359375, "learning_rate": 0.00011863170803788112, "loss": 0.495, "step": 3908 }, { "epoch": 4.004094165813715, "grad_norm": 0.306640625, "learning_rate": 0.00011835949067970147, "loss": 0.4461, "step": 3912 }, { "epoch": 4.008188331627431, "grad_norm": 0.2890625, "learning_rate": 0.00011808738233638947, "loss": 0.4565, "step": 3916 }, { "epoch": 4.012282497441146, "grad_norm": 0.33203125, "learning_rate": 0.00011781538394547278, "loss": 0.4521, "step": 3920 }, { "epoch": 4.016376663254862, "grad_norm": 0.29296875, "learning_rate": 0.00011754349644410038, "loss": 0.4518, "step": 3924 }, { "epoch": 4.020470829068577, "grad_norm": 0.3359375, "learning_rate": 0.00011727172076903907, "loss": 0.5021, "step": 3928 }, { "epoch": 4.024564994882293, "grad_norm": 0.322265625, "learning_rate": 0.00011700005785667038, "loss": 0.4439, "step": 3932 }, { "epoch": 4.028659160696008, "grad_norm": 0.3203125, "learning_rate": 0.0001167285086429874, "loss": 0.4796, "step": 3936 }, { "epoch": 4.032753326509724, "grad_norm": 0.328125, "learning_rate": 0.00011645707406359143, "loss": 0.4299, "step": 3940 }, { "epoch": 4.036847492323439, "grad_norm": 0.318359375, "learning_rate": 0.00011618575505368872, "loss": 0.4309, "step": 3944 }, { "epoch": 4.040941658137155, "grad_norm": 0.349609375, "learning_rate": 0.00011591455254808753, "loss": 0.4322, "step": 3948 }, { "epoch": 4.04503582395087, "grad_norm": 0.318359375, "learning_rate": 0.00011564346748119453, "loss": 0.4437, "step": 3952 }, { "epoch": 4.049129989764586, "grad_norm": 0.30859375, "learning_rate": 0.00011537250078701184, "loss": 0.4282, "step": 3956 }, { "epoch": 4.053224155578301, "grad_norm": 0.345703125, "learning_rate": 0.0001151016533991337, "loss": 0.4951, "step": 3960 }, { "epoch": 4.0573183213920165, "grad_norm": 0.3359375, "learning_rate": 0.00011483092625074347, "loss": 0.4729, "step": 3964 }, { "epoch": 4.061412487205732, "grad_norm": 0.345703125, "learning_rate": 0.00011456032027460996, "loss": 0.4921, "step": 3968 }, { "epoch": 4.0655066530194475, "grad_norm": 0.337890625, "learning_rate": 0.00011428983640308463, "loss": 0.4696, "step": 3972 }, { "epoch": 4.0696008188331625, "grad_norm": 0.333984375, "learning_rate": 0.00011401947556809827, "loss": 0.4852, "step": 3976 }, { "epoch": 4.073694984646878, "grad_norm": 0.357421875, "learning_rate": 0.00011374923870115769, "loss": 0.4644, "step": 3980 }, { "epoch": 4.077789150460593, "grad_norm": 0.318359375, "learning_rate": 0.00011347912673334255, "loss": 0.46, "step": 3984 }, { "epoch": 4.081883316274309, "grad_norm": 0.3359375, "learning_rate": 0.00011320914059530232, "loss": 0.48, "step": 3988 }, { "epoch": 4.085977482088024, "grad_norm": 0.32421875, "learning_rate": 0.00011293928121725278, "loss": 0.4841, "step": 3992 }, { "epoch": 4.09007164790174, "grad_norm": 0.353515625, "learning_rate": 0.00011266954952897305, "loss": 0.4967, "step": 3996 }, { "epoch": 4.094165813715455, "grad_norm": 0.33203125, "learning_rate": 0.00011239994645980233, "loss": 0.4563, "step": 4000 }, { "epoch": 4.098259979529171, "grad_norm": 0.359375, "learning_rate": 0.00011213047293863659, "loss": 0.4613, "step": 4004 }, { "epoch": 4.102354145342886, "grad_norm": 0.322265625, "learning_rate": 0.00011186112989392545, "loss": 0.454, "step": 4008 }, { "epoch": 4.106448311156602, "grad_norm": 0.330078125, "learning_rate": 0.00011159191825366912, "loss": 0.4905, "step": 4012 }, { "epoch": 4.110542476970317, "grad_norm": 0.33984375, "learning_rate": 0.00011132283894541492, "loss": 0.4467, "step": 4016 }, { "epoch": 4.114636642784033, "grad_norm": 0.3203125, "learning_rate": 0.00011105389289625427, "loss": 0.4868, "step": 4020 }, { "epoch": 4.118730808597748, "grad_norm": 0.373046875, "learning_rate": 0.00011078508103281952, "loss": 0.4499, "step": 4024 }, { "epoch": 4.122824974411464, "grad_norm": 0.357421875, "learning_rate": 0.00011051640428128062, "loss": 0.4711, "step": 4028 }, { "epoch": 4.126919140225179, "grad_norm": 0.35546875, "learning_rate": 0.00011024786356734199, "loss": 0.458, "step": 4032 }, { "epoch": 4.131013306038895, "grad_norm": 0.357421875, "learning_rate": 0.00010997945981623944, "loss": 0.5013, "step": 4036 }, { "epoch": 4.13510747185261, "grad_norm": 0.349609375, "learning_rate": 0.00010971119395273683, "loss": 0.449, "step": 4040 }, { "epoch": 4.139201637666326, "grad_norm": 0.337890625, "learning_rate": 0.00010944306690112285, "loss": 0.4601, "step": 4044 }, { "epoch": 4.143295803480041, "grad_norm": 0.35546875, "learning_rate": 0.00010917507958520812, "loss": 0.4678, "step": 4048 }, { "epoch": 4.147389969293757, "grad_norm": 0.333984375, "learning_rate": 0.00010890723292832163, "loss": 0.4787, "step": 4052 }, { "epoch": 4.151484135107472, "grad_norm": 0.3203125, "learning_rate": 0.00010863952785330779, "loss": 0.4761, "step": 4056 }, { "epoch": 4.155578300921187, "grad_norm": 0.353515625, "learning_rate": 0.0001083719652825233, "loss": 0.4994, "step": 4060 }, { "epoch": 4.159672466734903, "grad_norm": 0.3671875, "learning_rate": 0.00010810454613783376, "loss": 0.5027, "step": 4064 }, { "epoch": 4.163766632548619, "grad_norm": 0.337890625, "learning_rate": 0.0001078372713406106, "loss": 0.4376, "step": 4068 }, { "epoch": 4.167860798362334, "grad_norm": 0.34765625, "learning_rate": 0.00010757014181172803, "loss": 0.4762, "step": 4072 }, { "epoch": 4.171954964176049, "grad_norm": 0.3203125, "learning_rate": 0.00010730315847155966, "loss": 0.4696, "step": 4076 }, { "epoch": 4.176049129989765, "grad_norm": 0.345703125, "learning_rate": 0.00010703632223997534, "loss": 0.4367, "step": 4080 }, { "epoch": 4.18014329580348, "grad_norm": 0.349609375, "learning_rate": 0.00010676963403633828, "loss": 0.4768, "step": 4084 }, { "epoch": 4.184237461617196, "grad_norm": 0.326171875, "learning_rate": 0.00010650309477950149, "loss": 0.4386, "step": 4088 }, { "epoch": 4.188331627430911, "grad_norm": 0.33203125, "learning_rate": 0.00010623670538780487, "loss": 0.4399, "step": 4092 }, { "epoch": 4.1924257932446265, "grad_norm": 0.341796875, "learning_rate": 0.00010597046677907198, "loss": 0.4919, "step": 4096 }, { "epoch": 4.1965199590583415, "grad_norm": 0.36328125, "learning_rate": 0.0001057043798706068, "loss": 0.4441, "step": 4100 }, { "epoch": 4.200614124872057, "grad_norm": 0.32421875, "learning_rate": 0.00010543844557919073, "loss": 0.4588, "step": 4104 }, { "epoch": 4.2047082906857725, "grad_norm": 0.35546875, "learning_rate": 0.00010517266482107927, "loss": 0.4188, "step": 4108 }, { "epoch": 4.208802456499488, "grad_norm": 0.333984375, "learning_rate": 0.00010490703851199903, "loss": 0.5049, "step": 4112 }, { "epoch": 4.212896622313203, "grad_norm": 0.345703125, "learning_rate": 0.00010464156756714434, "loss": 0.4469, "step": 4116 }, { "epoch": 4.216990788126919, "grad_norm": 0.328125, "learning_rate": 0.00010437625290117429, "loss": 0.5029, "step": 4120 }, { "epoch": 4.221084953940634, "grad_norm": 0.3515625, "learning_rate": 0.00010411109542820963, "loss": 0.4443, "step": 4124 }, { "epoch": 4.22517911975435, "grad_norm": 0.353515625, "learning_rate": 0.00010384609606182933, "loss": 0.4548, "step": 4128 }, { "epoch": 4.229273285568065, "grad_norm": 0.33203125, "learning_rate": 0.00010358125571506772, "loss": 0.427, "step": 4132 }, { "epoch": 4.233367451381781, "grad_norm": 0.337890625, "learning_rate": 0.00010331657530041128, "loss": 0.4632, "step": 4136 }, { "epoch": 4.237461617195496, "grad_norm": 0.35546875, "learning_rate": 0.00010305205572979536, "loss": 0.48, "step": 4140 }, { "epoch": 4.241555783009212, "grad_norm": 0.33984375, "learning_rate": 0.00010278769791460118, "loss": 0.4348, "step": 4144 }, { "epoch": 4.245649948822927, "grad_norm": 0.35546875, "learning_rate": 0.00010252350276565269, "loss": 0.4493, "step": 4148 }, { "epoch": 4.249744114636643, "grad_norm": 0.3359375, "learning_rate": 0.00010225947119321326, "loss": 0.4539, "step": 4152 }, { "epoch": 4.253838280450358, "grad_norm": 0.35546875, "learning_rate": 0.00010199560410698284, "loss": 0.4563, "step": 4156 }, { "epoch": 4.257932446264074, "grad_norm": 0.333984375, "learning_rate": 0.00010173190241609452, "loss": 0.4621, "step": 4160 }, { "epoch": 4.262026612077789, "grad_norm": 0.328125, "learning_rate": 0.00010146836702911154, "loss": 0.445, "step": 4164 }, { "epoch": 4.266120777891505, "grad_norm": 0.328125, "learning_rate": 0.00010120499885402429, "loss": 0.4708, "step": 4168 }, { "epoch": 4.27021494370522, "grad_norm": 0.361328125, "learning_rate": 0.00010094179879824689, "loss": 0.4508, "step": 4172 }, { "epoch": 4.274309109518936, "grad_norm": 0.345703125, "learning_rate": 0.00010067876776861423, "loss": 0.4957, "step": 4176 }, { "epoch": 4.278403275332651, "grad_norm": 0.34375, "learning_rate": 0.00010041590667137899, "loss": 0.4882, "step": 4180 }, { "epoch": 4.282497441146367, "grad_norm": 0.337890625, "learning_rate": 0.00010015321641220816, "loss": 0.4791, "step": 4184 }, { "epoch": 4.286591606960082, "grad_norm": 0.34375, "learning_rate": 9.989069789618023e-05, "loss": 0.4626, "step": 4188 }, { "epoch": 4.290685772773798, "grad_norm": 0.330078125, "learning_rate": 9.9628352027782e-05, "loss": 0.4212, "step": 4192 }, { "epoch": 4.294779938587513, "grad_norm": 0.34765625, "learning_rate": 9.93661797109053e-05, "loss": 0.4547, "step": 4196 }, { "epoch": 4.298874104401229, "grad_norm": 0.330078125, "learning_rate": 9.910418184884408e-05, "loss": 0.457, "step": 4200 }, { "epoch": 4.302968270214944, "grad_norm": 0.326171875, "learning_rate": 9.884235934429126e-05, "loss": 0.424, "step": 4204 }, { "epoch": 4.30706243602866, "grad_norm": 0.34765625, "learning_rate": 9.858071309933554e-05, "loss": 0.4666, "step": 4208 }, { "epoch": 4.311156601842375, "grad_norm": 0.330078125, "learning_rate": 9.831924401545822e-05, "loss": 0.4658, "step": 4212 }, { "epoch": 4.3152507676560905, "grad_norm": 0.353515625, "learning_rate": 9.805795299353042e-05, "loss": 0.4881, "step": 4216 }, { "epoch": 4.3193449334698055, "grad_norm": 0.326171875, "learning_rate": 9.77968409338096e-05, "loss": 0.4483, "step": 4220 }, { "epoch": 4.3234390992835205, "grad_norm": 0.359375, "learning_rate": 9.753590873593667e-05, "loss": 0.4651, "step": 4224 }, { "epoch": 4.3275332650972365, "grad_norm": 0.345703125, "learning_rate": 9.727515729893288e-05, "loss": 0.4345, "step": 4228 }, { "epoch": 4.3316274309109515, "grad_norm": 0.34375, "learning_rate": 9.701458752119661e-05, "loss": 0.4293, "step": 4232 }, { "epoch": 4.335721596724667, "grad_norm": 0.34375, "learning_rate": 9.675420030050035e-05, "loss": 0.4462, "step": 4236 }, { "epoch": 4.339815762538382, "grad_norm": 0.333984375, "learning_rate": 9.649399653398771e-05, "loss": 0.4796, "step": 4240 }, { "epoch": 4.343909928352098, "grad_norm": 0.337890625, "learning_rate": 9.623397711817012e-05, "loss": 0.4458, "step": 4244 }, { "epoch": 4.348004094165813, "grad_norm": 0.341796875, "learning_rate": 9.597414294892379e-05, "loss": 0.4275, "step": 4248 }, { "epoch": 4.352098259979529, "grad_norm": 0.365234375, "learning_rate": 9.571449492148686e-05, "loss": 0.5074, "step": 4252 }, { "epoch": 4.356192425793244, "grad_norm": 0.357421875, "learning_rate": 9.545503393045605e-05, "loss": 0.4946, "step": 4256 }, { "epoch": 4.36028659160696, "grad_norm": 0.375, "learning_rate": 9.519576086978357e-05, "loss": 0.5084, "step": 4260 }, { "epoch": 4.364380757420675, "grad_norm": 0.34375, "learning_rate": 9.493667663277424e-05, "loss": 0.4977, "step": 4264 }, { "epoch": 4.368474923234391, "grad_norm": 0.3515625, "learning_rate": 9.467778211208231e-05, "loss": 0.4551, "step": 4268 }, { "epoch": 4.372569089048106, "grad_norm": 0.3359375, "learning_rate": 9.441907819970826e-05, "loss": 0.4583, "step": 4272 }, { "epoch": 4.376663254861822, "grad_norm": 0.33984375, "learning_rate": 9.416056578699593e-05, "loss": 0.4288, "step": 4276 }, { "epoch": 4.380757420675537, "grad_norm": 0.349609375, "learning_rate": 9.39022457646294e-05, "loss": 0.4798, "step": 4280 }, { "epoch": 4.384851586489253, "grad_norm": 0.341796875, "learning_rate": 9.364411902262982e-05, "loss": 0.4408, "step": 4284 }, { "epoch": 4.388945752302968, "grad_norm": 0.349609375, "learning_rate": 9.33861864503524e-05, "loss": 0.4388, "step": 4288 }, { "epoch": 4.393039918116684, "grad_norm": 0.353515625, "learning_rate": 9.31284489364834e-05, "loss": 0.48, "step": 4292 }, { "epoch": 4.397134083930399, "grad_norm": 0.349609375, "learning_rate": 9.287090736903701e-05, "loss": 0.4625, "step": 4296 }, { "epoch": 4.401228249744115, "grad_norm": 0.33984375, "learning_rate": 9.261356263535225e-05, "loss": 0.483, "step": 4300 }, { "epoch": 4.40532241555783, "grad_norm": 0.33984375, "learning_rate": 9.235641562209006e-05, "loss": 0.4827, "step": 4304 }, { "epoch": 4.409416581371546, "grad_norm": 0.328125, "learning_rate": 9.209946721523007e-05, "loss": 0.4702, "step": 4308 }, { "epoch": 4.413510747185261, "grad_norm": 0.33984375, "learning_rate": 9.184271830006764e-05, "loss": 0.479, "step": 4312 }, { "epoch": 4.417604912998977, "grad_norm": 0.35546875, "learning_rate": 9.158616976121085e-05, "loss": 0.4666, "step": 4316 }, { "epoch": 4.421699078812692, "grad_norm": 0.353515625, "learning_rate": 9.132982248257736e-05, "loss": 0.4342, "step": 4320 }, { "epoch": 4.425793244626408, "grad_norm": 0.34375, "learning_rate": 9.107367734739135e-05, "loss": 0.4567, "step": 4324 }, { "epoch": 4.429887410440123, "grad_norm": 0.3515625, "learning_rate": 9.081773523818064e-05, "loss": 0.4731, "step": 4328 }, { "epoch": 4.433981576253839, "grad_norm": 0.337890625, "learning_rate": 9.05619970367735e-05, "loss": 0.4324, "step": 4332 }, { "epoch": 4.438075742067554, "grad_norm": 0.3359375, "learning_rate": 9.030646362429553e-05, "loss": 0.4642, "step": 4336 }, { "epoch": 4.4421699078812695, "grad_norm": 0.376953125, "learning_rate": 9.005113588116699e-05, "loss": 0.5286, "step": 4340 }, { "epoch": 4.4462640736949846, "grad_norm": 0.375, "learning_rate": 8.979601468709933e-05, "loss": 0.4724, "step": 4344 }, { "epoch": 4.4503582395087005, "grad_norm": 0.3203125, "learning_rate": 8.954110092109238e-05, "loss": 0.4447, "step": 4348 }, { "epoch": 4.4544524053224155, "grad_norm": 0.357421875, "learning_rate": 8.928639546143135e-05, "loss": 0.4771, "step": 4352 }, { "epoch": 4.458546571136131, "grad_norm": 0.353515625, "learning_rate": 8.903189918568372e-05, "loss": 0.4557, "step": 4356 }, { "epoch": 4.462640736949846, "grad_norm": 0.3515625, "learning_rate": 8.877761297069622e-05, "loss": 0.4514, "step": 4360 }, { "epoch": 4.466734902763562, "grad_norm": 0.349609375, "learning_rate": 8.85235376925919e-05, "loss": 0.455, "step": 4364 }, { "epoch": 4.470829068577277, "grad_norm": 0.33203125, "learning_rate": 8.826967422676698e-05, "loss": 0.4382, "step": 4368 }, { "epoch": 4.474923234390992, "grad_norm": 0.359375, "learning_rate": 8.801602344788783e-05, "loss": 0.4872, "step": 4372 }, { "epoch": 4.479017400204708, "grad_norm": 0.365234375, "learning_rate": 8.776258622988823e-05, "loss": 0.4487, "step": 4376 }, { "epoch": 4.483111566018424, "grad_norm": 0.375, "learning_rate": 8.750936344596597e-05, "loss": 0.4817, "step": 4380 }, { "epoch": 4.487205731832139, "grad_norm": 0.34375, "learning_rate": 8.725635596858014e-05, "loss": 0.4282, "step": 4384 }, { "epoch": 4.491299897645854, "grad_norm": 0.353515625, "learning_rate": 8.700356466944786e-05, "loss": 0.4675, "step": 4388 }, { "epoch": 4.49539406345957, "grad_norm": 0.34375, "learning_rate": 8.675099041954158e-05, "loss": 0.4866, "step": 4392 }, { "epoch": 4.499488229273285, "grad_norm": 0.353515625, "learning_rate": 8.649863408908586e-05, "loss": 0.5158, "step": 4396 }, { "epoch": 4.503582395087001, "grad_norm": 0.375, "learning_rate": 8.62464965475544e-05, "loss": 0.427, "step": 4400 }, { "epoch": 4.507676560900716, "grad_norm": 0.3515625, "learning_rate": 8.599457866366725e-05, "loss": 0.4479, "step": 4404 }, { "epoch": 4.511770726714432, "grad_norm": 0.33984375, "learning_rate": 8.574288130538736e-05, "loss": 0.4869, "step": 4408 }, { "epoch": 4.515864892528147, "grad_norm": 0.337890625, "learning_rate": 8.549140533991807e-05, "loss": 0.4635, "step": 4412 }, { "epoch": 4.519959058341863, "grad_norm": 0.359375, "learning_rate": 8.524015163369993e-05, "loss": 0.4592, "step": 4416 }, { "epoch": 4.524053224155578, "grad_norm": 0.330078125, "learning_rate": 8.498912105240773e-05, "loss": 0.4623, "step": 4420 }, { "epoch": 4.528147389969294, "grad_norm": 0.359375, "learning_rate": 8.473831446094733e-05, "loss": 0.4546, "step": 4424 }, { "epoch": 4.532241555783009, "grad_norm": 0.34765625, "learning_rate": 8.448773272345298e-05, "loss": 0.5183, "step": 4428 }, { "epoch": 4.536335721596725, "grad_norm": 0.361328125, "learning_rate": 8.423737670328432e-05, "loss": 0.4457, "step": 4432 }, { "epoch": 4.54042988741044, "grad_norm": 0.3203125, "learning_rate": 8.398724726302301e-05, "loss": 0.4337, "step": 4436 }, { "epoch": 4.544524053224156, "grad_norm": 0.359375, "learning_rate": 8.373734526447032e-05, "loss": 0.4435, "step": 4440 }, { "epoch": 4.548618219037871, "grad_norm": 0.34375, "learning_rate": 8.348767156864382e-05, "loss": 0.4753, "step": 4444 }, { "epoch": 4.552712384851587, "grad_norm": 0.35546875, "learning_rate": 8.323822703577431e-05, "loss": 0.4746, "step": 4448 }, { "epoch": 4.556806550665302, "grad_norm": 0.37109375, "learning_rate": 8.298901252530326e-05, "loss": 0.44, "step": 4452 }, { "epoch": 4.560900716479018, "grad_norm": 0.32421875, "learning_rate": 8.274002889587954e-05, "loss": 0.4366, "step": 4456 }, { "epoch": 4.564994882292733, "grad_norm": 0.33984375, "learning_rate": 8.249127700535643e-05, "loss": 0.4715, "step": 4460 }, { "epoch": 4.569089048106449, "grad_norm": 0.333984375, "learning_rate": 8.224275771078889e-05, "loss": 0.437, "step": 4464 }, { "epoch": 4.573183213920164, "grad_norm": 0.34765625, "learning_rate": 8.199447186843051e-05, "loss": 0.4556, "step": 4468 }, { "epoch": 4.5772773797338795, "grad_norm": 0.3671875, "learning_rate": 8.174642033373037e-05, "loss": 0.438, "step": 4472 }, { "epoch": 4.5813715455475945, "grad_norm": 0.3515625, "learning_rate": 8.149860396133048e-05, "loss": 0.4682, "step": 4476 }, { "epoch": 4.58546571136131, "grad_norm": 0.349609375, "learning_rate": 8.125102360506255e-05, "loss": 0.4744, "step": 4480 }, { "epoch": 4.5895598771750254, "grad_norm": 0.361328125, "learning_rate": 8.100368011794491e-05, "loss": 0.5337, "step": 4484 }, { "epoch": 4.593654042988741, "grad_norm": 0.337890625, "learning_rate": 8.075657435218008e-05, "loss": 0.4905, "step": 4488 }, { "epoch": 4.597748208802456, "grad_norm": 0.349609375, "learning_rate": 8.050970715915138e-05, "loss": 0.4834, "step": 4492 }, { "epoch": 4.601842374616172, "grad_norm": 0.361328125, "learning_rate": 8.02630793894201e-05, "loss": 0.4572, "step": 4496 }, { "epoch": 4.605936540429887, "grad_norm": 0.333984375, "learning_rate": 8.001669189272272e-05, "loss": 0.441, "step": 4500 }, { "epoch": 4.610030706243603, "grad_norm": 0.34375, "learning_rate": 7.977054551796792e-05, "loss": 0.429, "step": 4504 }, { "epoch": 4.614124872057318, "grad_norm": 0.337890625, "learning_rate": 7.952464111323335e-05, "loss": 0.458, "step": 4508 }, { "epoch": 4.618219037871034, "grad_norm": 0.3515625, "learning_rate": 7.927897952576326e-05, "loss": 0.4573, "step": 4512 }, { "epoch": 4.622313203684749, "grad_norm": 0.3515625, "learning_rate": 7.903356160196522e-05, "loss": 0.4269, "step": 4516 }, { "epoch": 4.626407369498464, "grad_norm": 0.3515625, "learning_rate": 7.878838818740711e-05, "loss": 0.4751, "step": 4520 }, { "epoch": 4.63050153531218, "grad_norm": 0.38671875, "learning_rate": 7.85434601268146e-05, "loss": 0.4122, "step": 4524 }, { "epoch": 4.634595701125896, "grad_norm": 0.34375, "learning_rate": 7.829877826406794e-05, "loss": 0.4739, "step": 4528 }, { "epoch": 4.638689866939611, "grad_norm": 0.361328125, "learning_rate": 7.805434344219902e-05, "loss": 0.4686, "step": 4532 }, { "epoch": 4.642784032753326, "grad_norm": 0.34375, "learning_rate": 7.781015650338865e-05, "loss": 0.4247, "step": 4536 }, { "epoch": 4.646878198567042, "grad_norm": 0.421875, "learning_rate": 7.756621828896363e-05, "loss": 0.4744, "step": 4540 }, { "epoch": 4.650972364380758, "grad_norm": 0.357421875, "learning_rate": 7.732252963939369e-05, "loss": 0.4481, "step": 4544 }, { "epoch": 4.655066530194473, "grad_norm": 0.3359375, "learning_rate": 7.707909139428889e-05, "loss": 0.4948, "step": 4548 }, { "epoch": 4.659160696008188, "grad_norm": 0.33984375, "learning_rate": 7.683590439239626e-05, "loss": 0.5113, "step": 4552 }, { "epoch": 4.663254861821904, "grad_norm": 0.3515625, "learning_rate": 7.65929694715974e-05, "loss": 0.4743, "step": 4556 }, { "epoch": 4.667349027635619, "grad_norm": 0.34765625, "learning_rate": 7.63502874689054e-05, "loss": 0.4563, "step": 4560 }, { "epoch": 4.671443193449335, "grad_norm": 0.361328125, "learning_rate": 7.610785922046173e-05, "loss": 0.4818, "step": 4564 }, { "epoch": 4.67553735926305, "grad_norm": 0.330078125, "learning_rate": 7.586568556153378e-05, "loss": 0.4595, "step": 4568 }, { "epoch": 4.679631525076766, "grad_norm": 0.357421875, "learning_rate": 7.562376732651177e-05, "loss": 0.4212, "step": 4572 }, { "epoch": 4.683725690890481, "grad_norm": 0.345703125, "learning_rate": 7.53821053489057e-05, "loss": 0.4709, "step": 4576 }, { "epoch": 4.687819856704197, "grad_norm": 0.37109375, "learning_rate": 7.514070046134281e-05, "loss": 0.5149, "step": 4580 }, { "epoch": 4.691914022517912, "grad_norm": 0.337890625, "learning_rate": 7.489955349556457e-05, "loss": 0.4564, "step": 4584 }, { "epoch": 4.696008188331628, "grad_norm": 0.365234375, "learning_rate": 7.465866528242361e-05, "loss": 0.4646, "step": 4588 }, { "epoch": 4.700102354145343, "grad_norm": 0.35546875, "learning_rate": 7.441803665188124e-05, "loss": 0.4564, "step": 4592 }, { "epoch": 4.7041965199590585, "grad_norm": 0.361328125, "learning_rate": 7.417766843300443e-05, "loss": 0.4589, "step": 4596 }, { "epoch": 4.7082906857727735, "grad_norm": 0.322265625, "learning_rate": 7.393756145396267e-05, "loss": 0.4152, "step": 4600 }, { "epoch": 4.7123848515864895, "grad_norm": 0.35546875, "learning_rate": 7.369771654202563e-05, "loss": 0.4353, "step": 4604 }, { "epoch": 4.7164790174002045, "grad_norm": 0.3359375, "learning_rate": 7.345813452355999e-05, "loss": 0.4508, "step": 4608 }, { "epoch": 4.72057318321392, "grad_norm": 0.37109375, "learning_rate": 7.321881622402648e-05, "loss": 0.4222, "step": 4612 }, { "epoch": 4.724667349027635, "grad_norm": 0.353515625, "learning_rate": 7.297976246797742e-05, "loss": 0.4815, "step": 4616 }, { "epoch": 4.728761514841351, "grad_norm": 0.373046875, "learning_rate": 7.274097407905361e-05, "loss": 0.5069, "step": 4620 }, { "epoch": 4.732855680655066, "grad_norm": 0.357421875, "learning_rate": 7.250245187998141e-05, "loss": 0.5212, "step": 4624 }, { "epoch": 4.736949846468782, "grad_norm": 0.361328125, "learning_rate": 7.226419669257027e-05, "loss": 0.4838, "step": 4628 }, { "epoch": 4.741044012282497, "grad_norm": 0.34375, "learning_rate": 7.202620933770954e-05, "loss": 0.5086, "step": 4632 }, { "epoch": 4.745138178096213, "grad_norm": 0.349609375, "learning_rate": 7.178849063536572e-05, "loss": 0.4991, "step": 4636 }, { "epoch": 4.749232343909928, "grad_norm": 0.35546875, "learning_rate": 7.155104140457982e-05, "loss": 0.4491, "step": 4640 }, { "epoch": 4.753326509723644, "grad_norm": 0.375, "learning_rate": 7.131386246346439e-05, "loss": 0.4606, "step": 4644 }, { "epoch": 4.757420675537359, "grad_norm": 0.326171875, "learning_rate": 7.107695462920057e-05, "loss": 0.4968, "step": 4648 }, { "epoch": 4.761514841351075, "grad_norm": 0.35546875, "learning_rate": 7.084031871803559e-05, "loss": 0.4662, "step": 4652 }, { "epoch": 4.76560900716479, "grad_norm": 0.36328125, "learning_rate": 7.060395554527977e-05, "loss": 0.505, "step": 4656 }, { "epoch": 4.769703172978506, "grad_norm": 0.34375, "learning_rate": 7.03678659253036e-05, "loss": 0.476, "step": 4660 }, { "epoch": 4.773797338792221, "grad_norm": 0.359375, "learning_rate": 7.013205067153522e-05, "loss": 0.4489, "step": 4664 }, { "epoch": 4.777891504605937, "grad_norm": 0.341796875, "learning_rate": 6.989651059645743e-05, "loss": 0.4559, "step": 4668 }, { "epoch": 4.781985670419652, "grad_norm": 0.349609375, "learning_rate": 6.966124651160479e-05, "loss": 0.4769, "step": 4672 }, { "epoch": 4.786079836233368, "grad_norm": 0.3359375, "learning_rate": 6.942625922756114e-05, "loss": 0.4543, "step": 4676 }, { "epoch": 4.790174002047083, "grad_norm": 0.3671875, "learning_rate": 6.91915495539565e-05, "loss": 0.4677, "step": 4680 }, { "epoch": 4.794268167860798, "grad_norm": 0.34375, "learning_rate": 6.89571182994645e-05, "loss": 0.4448, "step": 4684 }, { "epoch": 4.798362333674514, "grad_norm": 0.373046875, "learning_rate": 6.872296627179943e-05, "loss": 0.4758, "step": 4688 }, { "epoch": 4.80245649948823, "grad_norm": 0.3515625, "learning_rate": 6.848909427771361e-05, "loss": 0.463, "step": 4692 }, { "epoch": 4.806550665301945, "grad_norm": 0.35546875, "learning_rate": 6.825550312299432e-05, "loss": 0.4602, "step": 4696 }, { "epoch": 4.81064483111566, "grad_norm": 0.357421875, "learning_rate": 6.802219361246149e-05, "loss": 0.4589, "step": 4700 }, { "epoch": 4.814738996929376, "grad_norm": 0.3359375, "learning_rate": 6.778916654996455e-05, "loss": 0.4715, "step": 4704 }, { "epoch": 4.818833162743092, "grad_norm": 0.36328125, "learning_rate": 6.755642273837969e-05, "loss": 0.4401, "step": 4708 }, { "epoch": 4.822927328556807, "grad_norm": 0.328125, "learning_rate": 6.732396297960732e-05, "loss": 0.4417, "step": 4712 }, { "epoch": 4.827021494370522, "grad_norm": 0.333984375, "learning_rate": 6.70917880745692e-05, "loss": 0.4187, "step": 4716 }, { "epoch": 4.8311156601842375, "grad_norm": 0.36328125, "learning_rate": 6.68598988232054e-05, "loss": 0.4683, "step": 4720 }, { "epoch": 4.835209825997953, "grad_norm": 0.33203125, "learning_rate": 6.662829602447207e-05, "loss": 0.4654, "step": 4724 }, { "epoch": 4.8393039918116685, "grad_norm": 0.341796875, "learning_rate": 6.639698047633834e-05, "loss": 0.4565, "step": 4728 }, { "epoch": 4.8433981576253835, "grad_norm": 0.357421875, "learning_rate": 6.616595297578346e-05, "loss": 0.4464, "step": 4732 }, { "epoch": 4.847492323439099, "grad_norm": 0.361328125, "learning_rate": 6.59352143187945e-05, "loss": 0.4892, "step": 4736 }, { "epoch": 4.851586489252814, "grad_norm": 0.361328125, "learning_rate": 6.57047653003632e-05, "loss": 0.4535, "step": 4740 }, { "epoch": 4.85568065506653, "grad_norm": 0.361328125, "learning_rate": 6.547460671448333e-05, "loss": 0.4681, "step": 4744 }, { "epoch": 4.859774820880245, "grad_norm": 0.3515625, "learning_rate": 6.524473935414807e-05, "loss": 0.4673, "step": 4748 }, { "epoch": 4.863868986693961, "grad_norm": 0.37109375, "learning_rate": 6.50151640113473e-05, "loss": 0.4545, "step": 4752 }, { "epoch": 4.867963152507676, "grad_norm": 0.353515625, "learning_rate": 6.478588147706454e-05, "loss": 0.4575, "step": 4756 }, { "epoch": 4.872057318321392, "grad_norm": 0.353515625, "learning_rate": 6.455689254127464e-05, "loss": 0.4646, "step": 4760 }, { "epoch": 4.876151484135107, "grad_norm": 0.345703125, "learning_rate": 6.43281979929409e-05, "loss": 0.4741, "step": 4764 }, { "epoch": 4.880245649948823, "grad_norm": 0.37890625, "learning_rate": 6.409979862001215e-05, "loss": 0.4658, "step": 4768 }, { "epoch": 4.884339815762538, "grad_norm": 0.34375, "learning_rate": 6.387169520942037e-05, "loss": 0.5014, "step": 4772 }, { "epoch": 4.888433981576254, "grad_norm": 0.33984375, "learning_rate": 6.364388854707786e-05, "loss": 0.4532, "step": 4776 }, { "epoch": 4.892528147389969, "grad_norm": 0.4296875, "learning_rate": 6.341637941787433e-05, "loss": 0.4745, "step": 4780 }, { "epoch": 4.896622313203685, "grad_norm": 0.33984375, "learning_rate": 6.318916860567447e-05, "loss": 0.4742, "step": 4784 }, { "epoch": 4.9007164790174, "grad_norm": 0.3359375, "learning_rate": 6.296225689331523e-05, "loss": 0.4955, "step": 4788 }, { "epoch": 4.904810644831116, "grad_norm": 0.345703125, "learning_rate": 6.273564506260277e-05, "loss": 0.4486, "step": 4792 }, { "epoch": 4.908904810644831, "grad_norm": 0.33203125, "learning_rate": 6.250933389431029e-05, "loss": 0.4609, "step": 4796 }, { "epoch": 4.912998976458547, "grad_norm": 0.361328125, "learning_rate": 6.228332416817504e-05, "loss": 0.4645, "step": 4800 }, { "epoch": 4.917093142272262, "grad_norm": 0.376953125, "learning_rate": 6.205761666289548e-05, "loss": 0.4939, "step": 4804 }, { "epoch": 4.921187308085978, "grad_norm": 0.333984375, "learning_rate": 6.183221215612904e-05, "loss": 0.4646, "step": 4808 }, { "epoch": 4.925281473899693, "grad_norm": 0.369140625, "learning_rate": 6.16071114244891e-05, "loss": 0.4811, "step": 4812 }, { "epoch": 4.929375639713409, "grad_norm": 0.365234375, "learning_rate": 6.138231524354229e-05, "loss": 0.4903, "step": 4816 }, { "epoch": 4.933469805527124, "grad_norm": 0.373046875, "learning_rate": 6.115782438780612e-05, "loss": 0.4538, "step": 4820 }, { "epoch": 4.93756397134084, "grad_norm": 0.365234375, "learning_rate": 6.093363963074602e-05, "loss": 0.4825, "step": 4824 }, { "epoch": 4.941658137154555, "grad_norm": 0.32421875, "learning_rate": 6.070976174477281e-05, "loss": 0.4081, "step": 4828 }, { "epoch": 4.94575230296827, "grad_norm": 0.328125, "learning_rate": 6.048619150124005e-05, "loss": 0.4719, "step": 4832 }, { "epoch": 4.949846468781986, "grad_norm": 0.3671875, "learning_rate": 6.026292967044121e-05, "loss": 0.4626, "step": 4836 }, { "epoch": 4.9539406345957016, "grad_norm": 0.36328125, "learning_rate": 6.003997702160727e-05, "loss": 0.4362, "step": 4840 }, { "epoch": 4.958034800409417, "grad_norm": 0.353515625, "learning_rate": 5.981733432290399e-05, "loss": 0.4964, "step": 4844 }, { "epoch": 4.962128966223132, "grad_norm": 0.3515625, "learning_rate": 5.959500234142904e-05, "loss": 0.464, "step": 4848 }, { "epoch": 4.9662231320368475, "grad_norm": 0.353515625, "learning_rate": 5.937298184320967e-05, "loss": 0.4479, "step": 4852 }, { "epoch": 4.970317297850563, "grad_norm": 0.3515625, "learning_rate": 5.9151273593199924e-05, "loss": 0.468, "step": 4856 }, { "epoch": 4.974411463664278, "grad_norm": 0.373046875, "learning_rate": 5.892987835527809e-05, "loss": 0.4376, "step": 4860 }, { "epoch": 4.9785056294779935, "grad_norm": 0.34765625, "learning_rate": 5.870879689224377e-05, "loss": 0.441, "step": 4864 }, { "epoch": 4.982599795291709, "grad_norm": 0.333984375, "learning_rate": 5.84880299658157e-05, "loss": 0.4317, "step": 4868 }, { "epoch": 4.986693961105424, "grad_norm": 0.357421875, "learning_rate": 5.8267578336628875e-05, "loss": 0.4447, "step": 4872 }, { "epoch": 4.99078812691914, "grad_norm": 0.357421875, "learning_rate": 5.804744276423181e-05, "loss": 0.4438, "step": 4876 }, { "epoch": 4.994882292732855, "grad_norm": 0.330078125, "learning_rate": 5.782762400708424e-05, "loss": 0.4339, "step": 4880 }, { "epoch": 4.998976458546571, "grad_norm": 0.3515625, "learning_rate": 5.760812282255433e-05, "loss": 0.4601, "step": 4884 }, { "epoch": 5.003070624360286, "grad_norm": 0.33984375, "learning_rate": 5.7388939966915894e-05, "loss": 0.4073, "step": 4888 }, { "epoch": 5.007164790174002, "grad_norm": 0.3203125, "learning_rate": 5.7170076195346144e-05, "loss": 0.4749, "step": 4892 }, { "epoch": 5.011258955987717, "grad_norm": 0.33203125, "learning_rate": 5.695153226192293e-05, "loss": 0.4516, "step": 4896 }, { "epoch": 5.015353121801433, "grad_norm": 0.328125, "learning_rate": 5.6733308919621946e-05, "loss": 0.4908, "step": 4900 }, { "epoch": 5.019447287615148, "grad_norm": 0.33203125, "learning_rate": 5.651540692031448e-05, "loss": 0.4131, "step": 4904 }, { "epoch": 5.023541453428864, "grad_norm": 0.33984375, "learning_rate": 5.629782701476464e-05, "loss": 0.4205, "step": 4908 }, { "epoch": 5.027635619242579, "grad_norm": 0.34765625, "learning_rate": 5.608056995262668e-05, "loss": 0.4144, "step": 4912 }, { "epoch": 5.031729785056295, "grad_norm": 0.3515625, "learning_rate": 5.586363648244261e-05, "loss": 0.4081, "step": 4916 }, { "epoch": 5.03582395087001, "grad_norm": 0.34765625, "learning_rate": 5.564702735163956e-05, "loss": 0.4058, "step": 4920 }, { "epoch": 5.039918116683726, "grad_norm": 0.35546875, "learning_rate": 5.543074330652706e-05, "loss": 0.4239, "step": 4924 }, { "epoch": 5.044012282497441, "grad_norm": 0.337890625, "learning_rate": 5.521478509229468e-05, "loss": 0.4397, "step": 4928 }, { "epoch": 5.048106448311157, "grad_norm": 0.349609375, "learning_rate": 5.499915345300936e-05, "loss": 0.4284, "step": 4932 }, { "epoch": 5.052200614124872, "grad_norm": 0.35546875, "learning_rate": 5.478384913161277e-05, "loss": 0.4047, "step": 4936 }, { "epoch": 5.056294779938588, "grad_norm": 0.33203125, "learning_rate": 5.456887286991891e-05, "loss": 0.4198, "step": 4940 }, { "epoch": 5.060388945752303, "grad_norm": 0.3203125, "learning_rate": 5.435422540861151e-05, "loss": 0.3665, "step": 4944 }, { "epoch": 5.064483111566019, "grad_norm": 0.375, "learning_rate": 5.413990748724129e-05, "loss": 0.4719, "step": 4948 }, { "epoch": 5.068577277379734, "grad_norm": 0.341796875, "learning_rate": 5.392591984422371e-05, "loss": 0.4073, "step": 4952 }, { "epoch": 5.07267144319345, "grad_norm": 0.34765625, "learning_rate": 5.37122632168363e-05, "loss": 0.3911, "step": 4956 }, { "epoch": 5.076765609007165, "grad_norm": 0.3828125, "learning_rate": 5.349893834121593e-05, "loss": 0.4298, "step": 4960 }, { "epoch": 5.080859774820881, "grad_norm": 0.3359375, "learning_rate": 5.32859459523566e-05, "loss": 0.4137, "step": 4964 }, { "epoch": 5.084953940634596, "grad_norm": 0.375, "learning_rate": 5.3073286784106714e-05, "loss": 0.439, "step": 4968 }, { "epoch": 5.0890481064483115, "grad_norm": 0.32421875, "learning_rate": 5.2860961569166595e-05, "loss": 0.3966, "step": 4972 }, { "epoch": 5.0931422722620265, "grad_norm": 0.33984375, "learning_rate": 5.264897103908599e-05, "loss": 0.4607, "step": 4976 }, { "epoch": 5.0972364380757424, "grad_norm": 0.3515625, "learning_rate": 5.243731592426135e-05, "loss": 0.4216, "step": 4980 }, { "epoch": 5.1013306038894575, "grad_norm": 0.328125, "learning_rate": 5.222599695393368e-05, "loss": 0.4576, "step": 4984 }, { "epoch": 5.105424769703173, "grad_norm": 0.357421875, "learning_rate": 5.2015014856185796e-05, "loss": 0.4706, "step": 4988 }, { "epoch": 5.109518935516888, "grad_norm": 0.3359375, "learning_rate": 5.1804370357939663e-05, "loss": 0.4304, "step": 4992 }, { "epoch": 5.113613101330604, "grad_norm": 0.359375, "learning_rate": 5.15940641849543e-05, "loss": 0.4476, "step": 4996 }, { "epoch": 5.117707267144319, "grad_norm": 0.349609375, "learning_rate": 5.138409706182299e-05, "loss": 0.4314, "step": 5000 }, { "epoch": 5.121801432958035, "grad_norm": 0.353515625, "learning_rate": 5.1174469711970716e-05, "loss": 0.4327, "step": 5004 }, { "epoch": 5.12589559877175, "grad_norm": 0.349609375, "learning_rate": 5.0965182857651964e-05, "loss": 0.4191, "step": 5008 }, { "epoch": 5.129989764585465, "grad_norm": 0.3671875, "learning_rate": 5.075623721994806e-05, "loss": 0.4486, "step": 5012 }, { "epoch": 5.134083930399181, "grad_norm": 0.34765625, "learning_rate": 5.05476335187645e-05, "loss": 0.4605, "step": 5016 }, { "epoch": 5.138178096212896, "grad_norm": 0.353515625, "learning_rate": 5.033937247282891e-05, "loss": 0.4839, "step": 5020 }, { "epoch": 5.142272262026612, "grad_norm": 0.34765625, "learning_rate": 5.013145479968824e-05, "loss": 0.4389, "step": 5024 }, { "epoch": 5.146366427840327, "grad_norm": 0.349609375, "learning_rate": 4.992388121570625e-05, "loss": 0.4528, "step": 5028 }, { "epoch": 5.150460593654043, "grad_norm": 0.349609375, "learning_rate": 4.9716652436061364e-05, "loss": 0.4562, "step": 5032 }, { "epoch": 5.154554759467758, "grad_norm": 0.353515625, "learning_rate": 4.950976917474393e-05, "loss": 0.4521, "step": 5036 }, { "epoch": 5.158648925281474, "grad_norm": 0.337890625, "learning_rate": 4.930323214455374e-05, "loss": 0.3968, "step": 5040 }, { "epoch": 5.162743091095189, "grad_norm": 0.3515625, "learning_rate": 4.909704205709785e-05, "loss": 0.4224, "step": 5044 }, { "epoch": 5.166837256908905, "grad_norm": 0.34375, "learning_rate": 4.889119962278786e-05, "loss": 0.4055, "step": 5048 }, { "epoch": 5.17093142272262, "grad_norm": 0.3515625, "learning_rate": 4.868570555083752e-05, "loss": 0.4874, "step": 5052 }, { "epoch": 5.175025588536336, "grad_norm": 0.345703125, "learning_rate": 4.8480560549260394e-05, "loss": 0.4371, "step": 5056 }, { "epoch": 5.179119754350051, "grad_norm": 0.34765625, "learning_rate": 4.827576532486737e-05, "loss": 0.4315, "step": 5060 }, { "epoch": 5.183213920163767, "grad_norm": 0.3515625, "learning_rate": 4.807132058326409e-05, "loss": 0.4226, "step": 5064 }, { "epoch": 5.187308085977482, "grad_norm": 0.369140625, "learning_rate": 4.786722702884874e-05, "loss": 0.4251, "step": 5068 }, { "epoch": 5.191402251791198, "grad_norm": 0.34765625, "learning_rate": 4.766348536480954e-05, "loss": 0.4411, "step": 5072 }, { "epoch": 5.195496417604913, "grad_norm": 0.33984375, "learning_rate": 4.7460096293122174e-05, "loss": 0.3946, "step": 5076 }, { "epoch": 5.199590583418629, "grad_norm": 0.337890625, "learning_rate": 4.725706051454759e-05, "loss": 0.4129, "step": 5080 }, { "epoch": 5.203684749232344, "grad_norm": 0.34375, "learning_rate": 4.705437872862955e-05, "loss": 0.4273, "step": 5084 }, { "epoch": 5.20777891504606, "grad_norm": 0.34765625, "learning_rate": 4.685205163369197e-05, "loss": 0.4656, "step": 5088 }, { "epoch": 5.211873080859775, "grad_norm": 0.345703125, "learning_rate": 4.665007992683687e-05, "loss": 0.4251, "step": 5092 }, { "epoch": 5.2159672466734905, "grad_norm": 0.33984375, "learning_rate": 4.6448464303941824e-05, "loss": 0.4032, "step": 5096 }, { "epoch": 5.220061412487206, "grad_norm": 0.357421875, "learning_rate": 4.6247205459657364e-05, "loss": 0.4569, "step": 5100 }, { "epoch": 5.2241555783009215, "grad_norm": 0.33984375, "learning_rate": 4.604630408740498e-05, "loss": 0.4433, "step": 5104 }, { "epoch": 5.2282497441146365, "grad_norm": 0.353515625, "learning_rate": 4.584576087937445e-05, "loss": 0.4305, "step": 5108 }, { "epoch": 5.232343909928352, "grad_norm": 0.3515625, "learning_rate": 4.5645576526521355e-05, "loss": 0.3824, "step": 5112 }, { "epoch": 5.236438075742067, "grad_norm": 0.341796875, "learning_rate": 4.5445751718565165e-05, "loss": 0.4235, "step": 5116 }, { "epoch": 5.240532241555783, "grad_norm": 0.3671875, "learning_rate": 4.524628714398645e-05, "loss": 0.4215, "step": 5120 }, { "epoch": 5.244626407369498, "grad_norm": 0.369140625, "learning_rate": 4.504718349002447e-05, "loss": 0.4246, "step": 5124 }, { "epoch": 5.248720573183214, "grad_norm": 0.35546875, "learning_rate": 4.4848441442675154e-05, "loss": 0.4044, "step": 5128 }, { "epoch": 5.252814738996929, "grad_norm": 0.34765625, "learning_rate": 4.4650061686688514e-05, "loss": 0.4406, "step": 5132 }, { "epoch": 5.256908904810645, "grad_norm": 0.34765625, "learning_rate": 4.445204490556618e-05, "loss": 0.4196, "step": 5136 }, { "epoch": 5.26100307062436, "grad_norm": 0.345703125, "learning_rate": 4.4254391781559336e-05, "loss": 0.4163, "step": 5140 }, { "epoch": 5.265097236438076, "grad_norm": 0.341796875, "learning_rate": 4.405710299566622e-05, "loss": 0.454, "step": 5144 }, { "epoch": 5.269191402251791, "grad_norm": 0.359375, "learning_rate": 4.386017922762958e-05, "loss": 0.441, "step": 5148 }, { "epoch": 5.273285568065507, "grad_norm": 0.3515625, "learning_rate": 4.3663621155934724e-05, "loss": 0.4363, "step": 5152 }, { "epoch": 5.277379733879222, "grad_norm": 0.361328125, "learning_rate": 4.3467429457806965e-05, "loss": 0.4327, "step": 5156 }, { "epoch": 5.281473899692937, "grad_norm": 0.349609375, "learning_rate": 4.327160480920915e-05, "loss": 0.4212, "step": 5160 }, { "epoch": 5.285568065506653, "grad_norm": 0.36328125, "learning_rate": 4.307614788483963e-05, "loss": 0.4076, "step": 5164 }, { "epoch": 5.289662231320369, "grad_norm": 0.353515625, "learning_rate": 4.2881059358129806e-05, "loss": 0.4527, "step": 5168 }, { "epoch": 5.293756397134084, "grad_norm": 0.330078125, "learning_rate": 4.268633990124163e-05, "loss": 0.4017, "step": 5172 }, { "epoch": 5.297850562947799, "grad_norm": 0.328125, "learning_rate": 4.2491990185065625e-05, "loss": 0.392, "step": 5176 }, { "epoch": 5.301944728761515, "grad_norm": 0.361328125, "learning_rate": 4.22980108792184e-05, "loss": 0.4646, "step": 5180 }, { "epoch": 5.30603889457523, "grad_norm": 0.333984375, "learning_rate": 4.2104402652040144e-05, "loss": 0.4128, "step": 5184 }, { "epoch": 5.310133060388946, "grad_norm": 0.3671875, "learning_rate": 4.191116617059272e-05, "loss": 0.4126, "step": 5188 }, { "epoch": 5.314227226202661, "grad_norm": 0.3359375, "learning_rate": 4.1718302100657176e-05, "loss": 0.4295, "step": 5192 }, { "epoch": 5.318321392016377, "grad_norm": 0.33984375, "learning_rate": 4.15258111067313e-05, "loss": 0.4273, "step": 5196 }, { "epoch": 5.322415557830092, "grad_norm": 0.34375, "learning_rate": 4.133369385202756e-05, "loss": 0.42, "step": 5200 }, { "epoch": 5.326509723643808, "grad_norm": 0.3515625, "learning_rate": 4.114195099847083e-05, "loss": 0.4329, "step": 5204 }, { "epoch": 5.330603889457523, "grad_norm": 0.341796875, "learning_rate": 4.0950583206695786e-05, "loss": 0.4637, "step": 5208 }, { "epoch": 5.334698055271239, "grad_norm": 0.361328125, "learning_rate": 4.075959113604506e-05, "loss": 0.4401, "step": 5212 }, { "epoch": 5.338792221084954, "grad_norm": 0.353515625, "learning_rate": 4.056897544456673e-05, "loss": 0.4225, "step": 5216 }, { "epoch": 5.34288638689867, "grad_norm": 0.361328125, "learning_rate": 4.037873678901198e-05, "loss": 0.4322, "step": 5220 }, { "epoch": 5.346980552712385, "grad_norm": 0.34765625, "learning_rate": 4.018887582483306e-05, "loss": 0.4224, "step": 5224 }, { "epoch": 5.3510747185261005, "grad_norm": 0.375, "learning_rate": 3.9999393206180914e-05, "loss": 0.4694, "step": 5228 }, { "epoch": 5.3551688843398155, "grad_norm": 0.34765625, "learning_rate": 3.98102895859028e-05, "loss": 0.4165, "step": 5232 }, { "epoch": 5.359263050153531, "grad_norm": 0.34375, "learning_rate": 3.9621565615540325e-05, "loss": 0.4271, "step": 5236 }, { "epoch": 5.3633572159672465, "grad_norm": 0.365234375, "learning_rate": 3.9433221945326985e-05, "loss": 0.4569, "step": 5240 }, { "epoch": 5.367451381780962, "grad_norm": 0.361328125, "learning_rate": 3.924525922418591e-05, "loss": 0.4756, "step": 5244 }, { "epoch": 5.371545547594677, "grad_norm": 0.359375, "learning_rate": 3.905767809972779e-05, "loss": 0.4734, "step": 5248 }, { "epoch": 5.375639713408393, "grad_norm": 0.35546875, "learning_rate": 3.887047921824858e-05, "loss": 0.3928, "step": 5252 }, { "epoch": 5.379733879222108, "grad_norm": 0.353515625, "learning_rate": 3.868366322472704e-05, "loss": 0.4826, "step": 5256 }, { "epoch": 5.383828045035824, "grad_norm": 0.37890625, "learning_rate": 3.849723076282308e-05, "loss": 0.4614, "step": 5260 }, { "epoch": 5.387922210849539, "grad_norm": 0.359375, "learning_rate": 3.831118247487481e-05, "loss": 0.4696, "step": 5264 }, { "epoch": 5.392016376663255, "grad_norm": 0.3515625, "learning_rate": 3.812551900189694e-05, "loss": 0.4323, "step": 5268 }, { "epoch": 5.39611054247697, "grad_norm": 0.3359375, "learning_rate": 3.794024098357826e-05, "loss": 0.3972, "step": 5272 }, { "epoch": 5.400204708290686, "grad_norm": 0.345703125, "learning_rate": 3.775534905827943e-05, "loss": 0.4397, "step": 5276 }, { "epoch": 5.404298874104401, "grad_norm": 0.359375, "learning_rate": 3.7570843863030995e-05, "loss": 0.426, "step": 5280 }, { "epoch": 5.408393039918117, "grad_norm": 0.3359375, "learning_rate": 3.7386726033530995e-05, "loss": 0.4399, "step": 5284 }, { "epoch": 5.412487205731832, "grad_norm": 0.35546875, "learning_rate": 3.720299620414274e-05, "loss": 0.4422, "step": 5288 }, { "epoch": 5.416581371545548, "grad_norm": 0.365234375, "learning_rate": 3.701965500789287e-05, "loss": 0.4196, "step": 5292 }, { "epoch": 5.420675537359263, "grad_norm": 0.37890625, "learning_rate": 3.68367030764689e-05, "loss": 0.432, "step": 5296 }, { "epoch": 5.424769703172979, "grad_norm": 0.3515625, "learning_rate": 3.665414104021729e-05, "loss": 0.448, "step": 5300 }, { "epoch": 5.428863868986694, "grad_norm": 0.359375, "learning_rate": 3.647196952814099e-05, "loss": 0.435, "step": 5304 }, { "epoch": 5.432958034800409, "grad_norm": 0.36328125, "learning_rate": 3.6290189167897526e-05, "loss": 0.4299, "step": 5308 }, { "epoch": 5.437052200614125, "grad_norm": 0.36328125, "learning_rate": 3.6108800585796774e-05, "loss": 0.4619, "step": 5312 }, { "epoch": 5.441146366427841, "grad_norm": 0.353515625, "learning_rate": 3.5927804406798655e-05, "loss": 0.4072, "step": 5316 }, { "epoch": 5.445240532241556, "grad_norm": 0.361328125, "learning_rate": 3.574720125451118e-05, "loss": 0.425, "step": 5320 }, { "epoch": 5.449334698055271, "grad_norm": 0.3515625, "learning_rate": 3.556699175118824e-05, "loss": 0.3989, "step": 5324 }, { "epoch": 5.453428863868987, "grad_norm": 0.361328125, "learning_rate": 3.538717651772733e-05, "loss": 0.4356, "step": 5328 }, { "epoch": 5.457523029682702, "grad_norm": 0.373046875, "learning_rate": 3.520775617366763e-05, "loss": 0.4129, "step": 5332 }, { "epoch": 5.461617195496418, "grad_norm": 0.349609375, "learning_rate": 3.502873133718775e-05, "loss": 0.4549, "step": 5336 }, { "epoch": 5.465711361310133, "grad_norm": 0.345703125, "learning_rate": 3.4850102625103504e-05, "loss": 0.436, "step": 5340 }, { "epoch": 5.469805527123849, "grad_norm": 0.365234375, "learning_rate": 3.4671870652866e-05, "loss": 0.4197, "step": 5344 }, { "epoch": 5.473899692937564, "grad_norm": 0.341796875, "learning_rate": 3.449403603455941e-05, "loss": 0.4587, "step": 5348 }, { "epoch": 5.4779938587512795, "grad_norm": 0.361328125, "learning_rate": 3.431659938289875e-05, "loss": 0.409, "step": 5352 }, { "epoch": 5.4820880245649946, "grad_norm": 0.36328125, "learning_rate": 3.413956130922797e-05, "loss": 0.3965, "step": 5356 }, { "epoch": 5.4861821903787105, "grad_norm": 0.369140625, "learning_rate": 3.396292242351779e-05, "loss": 0.4204, "step": 5360 }, { "epoch": 5.4902763561924255, "grad_norm": 0.330078125, "learning_rate": 3.3786683334363376e-05, "loss": 0.4489, "step": 5364 }, { "epoch": 5.494370522006141, "grad_norm": 0.35546875, "learning_rate": 3.36108446489826e-05, "loss": 0.4325, "step": 5368 }, { "epoch": 5.498464687819856, "grad_norm": 0.33984375, "learning_rate": 3.3435406973213777e-05, "loss": 0.4531, "step": 5372 }, { "epoch": 5.502558853633572, "grad_norm": 0.3359375, "learning_rate": 3.326037091151342e-05, "loss": 0.4556, "step": 5376 }, { "epoch": 5.506653019447287, "grad_norm": 0.365234375, "learning_rate": 3.308573706695445e-05, "loss": 0.4505, "step": 5380 }, { "epoch": 5.510747185261003, "grad_norm": 0.337890625, "learning_rate": 3.2911506041223984e-05, "loss": 0.4272, "step": 5384 }, { "epoch": 5.514841351074718, "grad_norm": 0.359375, "learning_rate": 3.273767843462113e-05, "loss": 0.4392, "step": 5388 }, { "epoch": 5.518935516888434, "grad_norm": 0.3515625, "learning_rate": 3.2564254846055177e-05, "loss": 0.4276, "step": 5392 }, { "epoch": 5.523029682702149, "grad_norm": 0.3671875, "learning_rate": 3.2391235873043396e-05, "loss": 0.4145, "step": 5396 }, { "epoch": 5.527123848515865, "grad_norm": 0.34765625, "learning_rate": 3.221862211170883e-05, "loss": 0.3965, "step": 5400 }, { "epoch": 5.53121801432958, "grad_norm": 0.361328125, "learning_rate": 3.2046414156778674e-05, "loss": 0.4354, "step": 5404 }, { "epoch": 5.535312180143296, "grad_norm": 0.345703125, "learning_rate": 3.187461260158166e-05, "loss": 0.4275, "step": 5408 }, { "epoch": 5.539406345957011, "grad_norm": 0.37890625, "learning_rate": 3.170321803804649e-05, "loss": 0.4575, "step": 5412 }, { "epoch": 5.543500511770727, "grad_norm": 0.3515625, "learning_rate": 3.153223105669957e-05, "loss": 0.3868, "step": 5416 }, { "epoch": 5.547594677584442, "grad_norm": 0.361328125, "learning_rate": 3.1361652246662944e-05, "loss": 0.4461, "step": 5420 }, { "epoch": 5.551688843398158, "grad_norm": 0.37109375, "learning_rate": 3.1191482195652405e-05, "loss": 0.4191, "step": 5424 }, { "epoch": 5.555783009211873, "grad_norm": 0.369140625, "learning_rate": 3.102172148997543e-05, "loss": 0.4842, "step": 5428 }, { "epoch": 5.559877175025589, "grad_norm": 0.333984375, "learning_rate": 3.085237071452898e-05, "loss": 0.439, "step": 5432 }, { "epoch": 5.563971340839304, "grad_norm": 0.365234375, "learning_rate": 3.068343045279779e-05, "loss": 0.4481, "step": 5436 }, { "epoch": 5.56806550665302, "grad_norm": 0.326171875, "learning_rate": 3.0514901286852177e-05, "loss": 0.3919, "step": 5440 }, { "epoch": 5.572159672466735, "grad_norm": 0.345703125, "learning_rate": 3.0346783797345936e-05, "loss": 0.4635, "step": 5444 }, { "epoch": 5.576253838280451, "grad_norm": 0.337890625, "learning_rate": 3.0179078563514625e-05, "loss": 0.3848, "step": 5448 }, { "epoch": 5.580348004094166, "grad_norm": 0.353515625, "learning_rate": 3.0011786163173357e-05, "loss": 0.4465, "step": 5452 }, { "epoch": 5.584442169907881, "grad_norm": 0.349609375, "learning_rate": 2.9844907172714767e-05, "loss": 0.4399, "step": 5456 }, { "epoch": 5.588536335721597, "grad_norm": 0.380859375, "learning_rate": 2.9678442167107242e-05, "loss": 0.475, "step": 5460 }, { "epoch": 5.592630501535313, "grad_norm": 0.357421875, "learning_rate": 2.951239171989278e-05, "loss": 0.4141, "step": 5464 }, { "epoch": 5.596724667349028, "grad_norm": 0.373046875, "learning_rate": 2.9346756403184974e-05, "loss": 0.4536, "step": 5468 }, { "epoch": 5.600818833162743, "grad_norm": 0.349609375, "learning_rate": 2.9181536787667237e-05, "loss": 0.4386, "step": 5472 }, { "epoch": 5.604912998976459, "grad_norm": 0.337890625, "learning_rate": 2.9016733442590683e-05, "loss": 0.4425, "step": 5476 }, { "epoch": 5.6090071647901745, "grad_norm": 0.345703125, "learning_rate": 2.8852346935772107e-05, "loss": 0.4275, "step": 5480 }, { "epoch": 5.6131013306038895, "grad_norm": 0.392578125, "learning_rate": 2.868837783359222e-05, "loss": 0.4468, "step": 5484 }, { "epoch": 5.6171954964176045, "grad_norm": 0.34375, "learning_rate": 2.8524826700993625e-05, "loss": 0.4208, "step": 5488 }, { "epoch": 5.62128966223132, "grad_norm": 0.34375, "learning_rate": 2.8361694101478704e-05, "loss": 0.4079, "step": 5492 }, { "epoch": 5.6253838280450355, "grad_norm": 0.345703125, "learning_rate": 2.8198980597107956e-05, "loss": 0.4446, "step": 5496 }, { "epoch": 5.629477993858751, "grad_norm": 0.39453125, "learning_rate": 2.8036686748497883e-05, "loss": 0.4611, "step": 5500 }, { "epoch": 5.633572159672466, "grad_norm": 0.337890625, "learning_rate": 2.7874813114818994e-05, "loss": 0.4355, "step": 5504 }, { "epoch": 5.637666325486182, "grad_norm": 0.365234375, "learning_rate": 2.7713360253794143e-05, "loss": 0.4381, "step": 5508 }, { "epoch": 5.641760491299897, "grad_norm": 0.3359375, "learning_rate": 2.7552328721696388e-05, "loss": 0.4526, "step": 5512 }, { "epoch": 5.645854657113613, "grad_norm": 0.35546875, "learning_rate": 2.7391719073347018e-05, "loss": 0.4705, "step": 5516 }, { "epoch": 5.649948822927328, "grad_norm": 0.36328125, "learning_rate": 2.723153186211392e-05, "loss": 0.4437, "step": 5520 }, { "epoch": 5.654042988741044, "grad_norm": 0.375, "learning_rate": 2.707176763990944e-05, "loss": 0.438, "step": 5524 }, { "epoch": 5.658137154554759, "grad_norm": 0.3515625, "learning_rate": 2.69124269571885e-05, "loss": 0.4156, "step": 5528 }, { "epoch": 5.662231320368475, "grad_norm": 0.33984375, "learning_rate": 2.6753510362946823e-05, "loss": 0.4496, "step": 5532 }, { "epoch": 5.66632548618219, "grad_norm": 0.3515625, "learning_rate": 2.659501840471898e-05, "loss": 0.4761, "step": 5536 }, { "epoch": 5.670419651995906, "grad_norm": 0.37109375, "learning_rate": 2.643695162857638e-05, "loss": 0.4506, "step": 5540 }, { "epoch": 5.674513817809621, "grad_norm": 0.359375, "learning_rate": 2.6279310579125562e-05, "loss": 0.4432, "step": 5544 }, { "epoch": 5.678607983623337, "grad_norm": 0.34765625, "learning_rate": 2.6122095799506394e-05, "loss": 0.4281, "step": 5548 }, { "epoch": 5.682702149437052, "grad_norm": 0.37109375, "learning_rate": 2.596530783138983e-05, "loss": 0.4595, "step": 5552 }, { "epoch": 5.686796315250768, "grad_norm": 0.36328125, "learning_rate": 2.5808947214976428e-05, "loss": 0.4131, "step": 5556 }, { "epoch": 5.690890481064483, "grad_norm": 0.361328125, "learning_rate": 2.5653014488994328e-05, "loss": 0.4694, "step": 5560 }, { "epoch": 5.694984646878199, "grad_norm": 0.369140625, "learning_rate": 2.5497510190697323e-05, "loss": 0.4834, "step": 5564 }, { "epoch": 5.699078812691914, "grad_norm": 0.353515625, "learning_rate": 2.5342434855863187e-05, "loss": 0.4144, "step": 5568 }, { "epoch": 5.70317297850563, "grad_norm": 0.3203125, "learning_rate": 2.5187789018791703e-05, "loss": 0.429, "step": 5572 }, { "epoch": 5.707267144319345, "grad_norm": 0.361328125, "learning_rate": 2.5033573212302814e-05, "loss": 0.4901, "step": 5576 }, { "epoch": 5.711361310133061, "grad_norm": 0.337890625, "learning_rate": 2.4879787967734878e-05, "loss": 0.39, "step": 5580 }, { "epoch": 5.715455475946776, "grad_norm": 0.345703125, "learning_rate": 2.4726433814942814e-05, "loss": 0.3819, "step": 5584 }, { "epoch": 5.719549641760492, "grad_norm": 0.349609375, "learning_rate": 2.4573511282296126e-05, "loss": 0.4443, "step": 5588 }, { "epoch": 5.723643807574207, "grad_norm": 0.34375, "learning_rate": 2.4421020896677318e-05, "loss": 0.4127, "step": 5592 }, { "epoch": 5.727737973387923, "grad_norm": 0.34375, "learning_rate": 2.4268963183479967e-05, "loss": 0.488, "step": 5596 }, { "epoch": 5.731832139201638, "grad_norm": 0.36328125, "learning_rate": 2.4117338666606796e-05, "loss": 0.4206, "step": 5600 }, { "epoch": 5.7359263050153535, "grad_norm": 0.345703125, "learning_rate": 2.39661478684681e-05, "loss": 0.3882, "step": 5604 }, { "epoch": 5.7400204708290685, "grad_norm": 0.353515625, "learning_rate": 2.3815391309979843e-05, "loss": 0.3997, "step": 5608 }, { "epoch": 5.744114636642784, "grad_norm": 0.3359375, "learning_rate": 2.366506951056173e-05, "loss": 0.4571, "step": 5612 }, { "epoch": 5.7482088024564995, "grad_norm": 0.3359375, "learning_rate": 2.3515182988135618e-05, "loss": 0.4476, "step": 5616 }, { "epoch": 5.7523029682702145, "grad_norm": 0.365234375, "learning_rate": 2.336573225912371e-05, "loss": 0.4467, "step": 5620 }, { "epoch": 5.75639713408393, "grad_norm": 0.341796875, "learning_rate": 2.3216717838446565e-05, "loss": 0.3617, "step": 5624 }, { "epoch": 5.760491299897646, "grad_norm": 0.341796875, "learning_rate": 2.3068140239521588e-05, "loss": 0.4339, "step": 5628 }, { "epoch": 5.764585465711361, "grad_norm": 0.357421875, "learning_rate": 2.2919999974261177e-05, "loss": 0.4, "step": 5632 }, { "epoch": 5.768679631525076, "grad_norm": 0.3359375, "learning_rate": 2.2772297553070784e-05, "loss": 0.4217, "step": 5636 }, { "epoch": 5.772773797338792, "grad_norm": 0.33984375, "learning_rate": 2.262503348484745e-05, "loss": 0.4739, "step": 5640 }, { "epoch": 5.776867963152508, "grad_norm": 0.373046875, "learning_rate": 2.247820827697789e-05, "loss": 0.4524, "step": 5644 }, { "epoch": 5.780962128966223, "grad_norm": 0.349609375, "learning_rate": 2.2331822435336644e-05, "loss": 0.418, "step": 5648 }, { "epoch": 5.785056294779938, "grad_norm": 0.345703125, "learning_rate": 2.2185876464284554e-05, "loss": 0.4336, "step": 5652 }, { "epoch": 5.789150460593654, "grad_norm": 0.3671875, "learning_rate": 2.2040370866666945e-05, "loss": 0.4555, "step": 5656 }, { "epoch": 5.793244626407369, "grad_norm": 0.3671875, "learning_rate": 2.1895306143811768e-05, "loss": 0.4541, "step": 5660 }, { "epoch": 5.797338792221085, "grad_norm": 0.369140625, "learning_rate": 2.175068279552805e-05, "loss": 0.4081, "step": 5664 }, { "epoch": 5.8014329580348, "grad_norm": 0.328125, "learning_rate": 2.16065013201041e-05, "loss": 0.4147, "step": 5668 }, { "epoch": 5.805527123848516, "grad_norm": 0.349609375, "learning_rate": 2.146276221430572e-05, "loss": 0.423, "step": 5672 }, { "epoch": 5.809621289662231, "grad_norm": 0.31640625, "learning_rate": 2.131946597337463e-05, "loss": 0.4226, "step": 5676 }, { "epoch": 5.813715455475947, "grad_norm": 0.34765625, "learning_rate": 2.1176613091026716e-05, "loss": 0.4253, "step": 5680 }, { "epoch": 5.817809621289662, "grad_norm": 0.359375, "learning_rate": 2.1034204059450193e-05, "loss": 0.4547, "step": 5684 }, { "epoch": 5.821903787103378, "grad_norm": 0.341796875, "learning_rate": 2.0892239369304122e-05, "loss": 0.4063, "step": 5688 }, { "epoch": 5.825997952917093, "grad_norm": 0.357421875, "learning_rate": 2.0750719509716584e-05, "loss": 0.4275, "step": 5692 }, { "epoch": 5.830092118730809, "grad_norm": 0.376953125, "learning_rate": 2.0609644968283068e-05, "loss": 0.4478, "step": 5696 }, { "epoch": 5.834186284544524, "grad_norm": 0.375, "learning_rate": 2.046901623106476e-05, "loss": 0.3858, "step": 5700 }, { "epoch": 5.83828045035824, "grad_norm": 0.345703125, "learning_rate": 2.0328833782586735e-05, "loss": 0.4287, "step": 5704 }, { "epoch": 5.842374616171955, "grad_norm": 0.419921875, "learning_rate": 2.018909810583657e-05, "loss": 0.4537, "step": 5708 }, { "epoch": 5.846468781985671, "grad_norm": 0.369140625, "learning_rate": 2.0049809682262485e-05, "loss": 0.4189, "step": 5712 }, { "epoch": 5.850562947799386, "grad_norm": 0.3515625, "learning_rate": 1.9910968991771642e-05, "loss": 0.4401, "step": 5716 }, { "epoch": 5.854657113613102, "grad_norm": 0.3515625, "learning_rate": 1.977257651272869e-05, "loss": 0.4498, "step": 5720 }, { "epoch": 5.858751279426817, "grad_norm": 0.357421875, "learning_rate": 1.963463272195394e-05, "loss": 0.4612, "step": 5724 }, { "epoch": 5.8628454452405325, "grad_norm": 0.353515625, "learning_rate": 1.949713809472177e-05, "loss": 0.4135, "step": 5728 }, { "epoch": 5.8669396110542475, "grad_norm": 0.359375, "learning_rate": 1.9360093104759035e-05, "loss": 0.4503, "step": 5732 }, { "epoch": 5.8710337768679635, "grad_norm": 0.37890625, "learning_rate": 1.9223498224243395e-05, "loss": 0.4107, "step": 5736 }, { "epoch": 5.8751279426816785, "grad_norm": 0.3671875, "learning_rate": 1.9087353923801723e-05, "loss": 0.4364, "step": 5740 }, { "epoch": 5.879222108495394, "grad_norm": 0.369140625, "learning_rate": 1.8951660672508335e-05, "loss": 0.4805, "step": 5744 }, { "epoch": 5.883316274309109, "grad_norm": 0.3828125, "learning_rate": 1.8816418937883615e-05, "loss": 0.4737, "step": 5748 }, { "epoch": 5.887410440122825, "grad_norm": 0.3359375, "learning_rate": 1.86816291858923e-05, "loss": 0.4202, "step": 5752 }, { "epoch": 5.89150460593654, "grad_norm": 0.34765625, "learning_rate": 1.8547291880941702e-05, "loss": 0.4629, "step": 5756 }, { "epoch": 5.895598771750256, "grad_norm": 0.353515625, "learning_rate": 1.8413407485880394e-05, "loss": 0.4427, "step": 5760 }, { "epoch": 5.899692937563971, "grad_norm": 0.33203125, "learning_rate": 1.8279976461996505e-05, "loss": 0.4057, "step": 5764 }, { "epoch": 5.903787103377686, "grad_norm": 0.36328125, "learning_rate": 1.814699926901597e-05, "loss": 0.4321, "step": 5768 }, { "epoch": 5.907881269191402, "grad_norm": 0.33984375, "learning_rate": 1.8014476365101222e-05, "loss": 0.4007, "step": 5772 }, { "epoch": 5.911975435005118, "grad_norm": 0.3515625, "learning_rate": 1.7882408206849446e-05, "loss": 0.423, "step": 5776 }, { "epoch": 5.916069600818833, "grad_norm": 0.36328125, "learning_rate": 1.7750795249290944e-05, "loss": 0.4179, "step": 5780 }, { "epoch": 5.920163766632548, "grad_norm": 0.35546875, "learning_rate": 1.7619637945887765e-05, "loss": 0.4201, "step": 5784 }, { "epoch": 5.924257932446264, "grad_norm": 0.333984375, "learning_rate": 1.748893674853205e-05, "loss": 0.4133, "step": 5788 }, { "epoch": 5.92835209825998, "grad_norm": 0.341796875, "learning_rate": 1.7358692107544363e-05, "loss": 0.4251, "step": 5792 }, { "epoch": 5.932446264073695, "grad_norm": 0.376953125, "learning_rate": 1.7228904471672294e-05, "loss": 0.4342, "step": 5796 }, { "epoch": 5.93654042988741, "grad_norm": 0.4609375, "learning_rate": 1.7099574288088906e-05, "loss": 0.4283, "step": 5800 }, { "epoch": 5.940634595701126, "grad_norm": 0.376953125, "learning_rate": 1.697070200239103e-05, "loss": 0.4336, "step": 5804 }, { "epoch": 5.944728761514841, "grad_norm": 0.326171875, "learning_rate": 1.6842288058597946e-05, "loss": 0.3919, "step": 5808 }, { "epoch": 5.948822927328557, "grad_norm": 0.326171875, "learning_rate": 1.6714332899149764e-05, "loss": 0.4539, "step": 5812 }, { "epoch": 5.952917093142272, "grad_norm": 0.373046875, "learning_rate": 1.6586836964905775e-05, "loss": 0.447, "step": 5816 }, { "epoch": 5.957011258955988, "grad_norm": 0.36328125, "learning_rate": 1.6459800695143166e-05, "loss": 0.4306, "step": 5820 }, { "epoch": 5.961105424769703, "grad_norm": 0.33984375, "learning_rate": 1.6333224527555332e-05, "loss": 0.3874, "step": 5824 }, { "epoch": 5.965199590583419, "grad_norm": 0.3359375, "learning_rate": 1.620710889825039e-05, "loss": 0.3592, "step": 5828 }, { "epoch": 5.969293756397134, "grad_norm": 0.359375, "learning_rate": 1.6081454241749782e-05, "loss": 0.4402, "step": 5832 }, { "epoch": 5.97338792221085, "grad_norm": 0.357421875, "learning_rate": 1.595626099098667e-05, "loss": 0.4694, "step": 5836 }, { "epoch": 5.977482088024565, "grad_norm": 0.375, "learning_rate": 1.583152957730447e-05, "loss": 0.4011, "step": 5840 }, { "epoch": 5.981576253838281, "grad_norm": 0.369140625, "learning_rate": 1.5707260430455413e-05, "loss": 0.4348, "step": 5844 }, { "epoch": 5.985670419651996, "grad_norm": 0.333984375, "learning_rate": 1.558345397859893e-05, "loss": 0.3799, "step": 5848 }, { "epoch": 5.9897645854657116, "grad_norm": 0.357421875, "learning_rate": 1.54601106483004e-05, "loss": 0.4204, "step": 5852 }, { "epoch": 5.993858751279427, "grad_norm": 0.369140625, "learning_rate": 1.53372308645295e-05, "loss": 0.4476, "step": 5856 }, { "epoch": 5.9979529170931425, "grad_norm": 0.36328125, "learning_rate": 1.521481505065873e-05, "loss": 0.4125, "step": 5860 }, { "epoch": 6.0020470829068575, "grad_norm": 0.341796875, "learning_rate": 1.5092863628462093e-05, "loss": 0.4496, "step": 5864 }, { "epoch": 6.006141248720573, "grad_norm": 0.357421875, "learning_rate": 1.4971377018113617e-05, "loss": 0.4461, "step": 5868 }, { "epoch": 6.0102354145342884, "grad_norm": 0.341796875, "learning_rate": 1.4850355638185713e-05, "loss": 0.3945, "step": 5872 }, { "epoch": 6.014329580348004, "grad_norm": 0.328125, "learning_rate": 1.472979990564797e-05, "loss": 0.4214, "step": 5876 }, { "epoch": 6.018423746161719, "grad_norm": 0.33203125, "learning_rate": 1.460971023586565e-05, "loss": 0.4268, "step": 5880 }, { "epoch": 6.022517911975435, "grad_norm": 0.337890625, "learning_rate": 1.4490087042598147e-05, "loss": 0.4359, "step": 5884 }, { "epoch": 6.02661207778915, "grad_norm": 0.353515625, "learning_rate": 1.4370930737997722e-05, "loss": 0.4153, "step": 5888 }, { "epoch": 6.030706243602866, "grad_norm": 0.337890625, "learning_rate": 1.4252241732608004e-05, "loss": 0.4195, "step": 5892 }, { "epoch": 6.034800409416581, "grad_norm": 0.337890625, "learning_rate": 1.4134020435362487e-05, "loss": 0.3509, "step": 5896 }, { "epoch": 6.038894575230297, "grad_norm": 0.359375, "learning_rate": 1.4016267253583324e-05, "loss": 0.4191, "step": 5900 }, { "epoch": 6.042988741044012, "grad_norm": 0.35546875, "learning_rate": 1.3898982592979802e-05, "loss": 0.4477, "step": 5904 }, { "epoch": 6.047082906857728, "grad_norm": 0.330078125, "learning_rate": 1.378216685764686e-05, "loss": 0.4107, "step": 5908 }, { "epoch": 6.051177072671443, "grad_norm": 0.3671875, "learning_rate": 1.3665820450063898e-05, "loss": 0.3961, "step": 5912 }, { "epoch": 6.055271238485159, "grad_norm": 0.369140625, "learning_rate": 1.3549943771093258e-05, "loss": 0.4052, "step": 5916 }, { "epoch": 6.059365404298874, "grad_norm": 0.328125, "learning_rate": 1.3434537219978813e-05, "loss": 0.389, "step": 5920 }, { "epoch": 6.06345957011259, "grad_norm": 0.333984375, "learning_rate": 1.3319601194344698e-05, "loss": 0.4063, "step": 5924 }, { "epoch": 6.067553735926305, "grad_norm": 0.341796875, "learning_rate": 1.3205136090193923e-05, "loss": 0.436, "step": 5928 }, { "epoch": 6.071647901740021, "grad_norm": 0.37109375, "learning_rate": 1.3091142301906887e-05, "loss": 0.4485, "step": 5932 }, { "epoch": 6.075742067553736, "grad_norm": 0.353515625, "learning_rate": 1.2977620222240165e-05, "loss": 0.4521, "step": 5936 }, { "epoch": 6.079836233367452, "grad_norm": 0.34375, "learning_rate": 1.2864570242325133e-05, "loss": 0.4712, "step": 5940 }, { "epoch": 6.083930399181167, "grad_norm": 0.3515625, "learning_rate": 1.2751992751666457e-05, "loss": 0.4428, "step": 5944 }, { "epoch": 6.088024564994882, "grad_norm": 0.359375, "learning_rate": 1.2639888138141014e-05, "loss": 0.4535, "step": 5948 }, { "epoch": 6.092118730808598, "grad_norm": 0.337890625, "learning_rate": 1.2528256787996372e-05, "loss": 0.4473, "step": 5952 }, { "epoch": 6.096212896622313, "grad_norm": 0.373046875, "learning_rate": 1.2417099085849468e-05, "loss": 0.3956, "step": 5956 }, { "epoch": 6.100307062436029, "grad_norm": 0.3359375, "learning_rate": 1.2306415414685366e-05, "loss": 0.4278, "step": 5960 }, { "epoch": 6.104401228249744, "grad_norm": 0.392578125, "learning_rate": 1.219620615585593e-05, "loss": 0.445, "step": 5964 }, { "epoch": 6.10849539406346, "grad_norm": 0.353515625, "learning_rate": 1.2086471689078353e-05, "loss": 0.4198, "step": 5968 }, { "epoch": 6.112589559877175, "grad_norm": 0.37890625, "learning_rate": 1.1977212392434082e-05, "loss": 0.4577, "step": 5972 }, { "epoch": 6.116683725690891, "grad_norm": 0.3359375, "learning_rate": 1.1868428642367378e-05, "loss": 0.4793, "step": 5976 }, { "epoch": 6.120777891504606, "grad_norm": 0.3515625, "learning_rate": 1.1760120813684009e-05, "loss": 0.4424, "step": 5980 }, { "epoch": 6.1248720573183215, "grad_norm": 0.349609375, "learning_rate": 1.1652289279550026e-05, "loss": 0.4027, "step": 5984 }, { "epoch": 6.1289662231320365, "grad_norm": 0.34765625, "learning_rate": 1.1544934411490469e-05, "loss": 0.3947, "step": 5988 }, { "epoch": 6.1330603889457525, "grad_norm": 0.353515625, "learning_rate": 1.1438056579387966e-05, "loss": 0.4369, "step": 5992 }, { "epoch": 6.1371545547594675, "grad_norm": 0.349609375, "learning_rate": 1.1331656151481654e-05, "loss": 0.4243, "step": 5996 }, { "epoch": 6.141248720573183, "grad_norm": 0.34765625, "learning_rate": 1.1225733494365791e-05, "loss": 0.4472, "step": 6000 }, { "epoch": 6.145342886386898, "grad_norm": 0.3515625, "learning_rate": 1.1120288972988445e-05, "loss": 0.3859, "step": 6004 }, { "epoch": 6.149437052200614, "grad_norm": 0.330078125, "learning_rate": 1.1015322950650408e-05, "loss": 0.4321, "step": 6008 }, { "epoch": 6.153531218014329, "grad_norm": 0.34375, "learning_rate": 1.0910835789003785e-05, "loss": 0.4198, "step": 6012 }, { "epoch": 6.157625383828045, "grad_norm": 0.35546875, "learning_rate": 1.0806827848050791e-05, "loss": 0.4539, "step": 6016 }, { "epoch": 6.16171954964176, "grad_norm": 0.341796875, "learning_rate": 1.0703299486142541e-05, "loss": 0.4371, "step": 6020 }, { "epoch": 6.165813715455476, "grad_norm": 0.337890625, "learning_rate": 1.0600251059977854e-05, "loss": 0.3967, "step": 6024 }, { "epoch": 6.169907881269191, "grad_norm": 0.341796875, "learning_rate": 1.0497682924601841e-05, "loss": 0.4088, "step": 6028 }, { "epoch": 6.174002047082907, "grad_norm": 0.34765625, "learning_rate": 1.0395595433404935e-05, "loss": 0.4137, "step": 6032 }, { "epoch": 6.178096212896622, "grad_norm": 0.3671875, "learning_rate": 1.029398893812151e-05, "loss": 0.4145, "step": 6036 }, { "epoch": 6.182190378710338, "grad_norm": 0.357421875, "learning_rate": 1.0192863788828654e-05, "loss": 0.4265, "step": 6040 }, { "epoch": 6.186284544524053, "grad_norm": 0.349609375, "learning_rate": 1.0092220333945073e-05, "loss": 0.4127, "step": 6044 }, { "epoch": 6.190378710337769, "grad_norm": 0.359375, "learning_rate": 9.992058920229823e-06, "loss": 0.4384, "step": 6048 }, { "epoch": 6.194472876151484, "grad_norm": 0.369140625, "learning_rate": 9.892379892781088e-06, "loss": 0.4406, "step": 6052 }, { "epoch": 6.1985670419652, "grad_norm": 0.35546875, "learning_rate": 9.793183595035082e-06, "loss": 0.4378, "step": 6056 }, { "epoch": 6.202661207778915, "grad_norm": 0.353515625, "learning_rate": 9.694470368764812e-06, "loss": 0.3789, "step": 6060 }, { "epoch": 6.206755373592631, "grad_norm": 0.345703125, "learning_rate": 9.596240554078838e-06, "loss": 0.3834, "step": 6064 }, { "epoch": 6.210849539406346, "grad_norm": 0.361328125, "learning_rate": 9.4984944894202e-06, "loss": 0.4451, "step": 6068 }, { "epoch": 6.214943705220062, "grad_norm": 0.373046875, "learning_rate": 9.40123251156527e-06, "loss": 0.4477, "step": 6072 }, { "epoch": 6.219037871033777, "grad_norm": 0.345703125, "learning_rate": 9.304454955622425e-06, "loss": 0.401, "step": 6076 }, { "epoch": 6.223132036847493, "grad_norm": 0.375, "learning_rate": 9.208162155031074e-06, "loss": 0.4438, "step": 6080 }, { "epoch": 6.227226202661208, "grad_norm": 0.33984375, "learning_rate": 9.112354441560476e-06, "loss": 0.4212, "step": 6084 }, { "epoch": 6.231320368474924, "grad_norm": 0.3125, "learning_rate": 9.017032145308483e-06, "loss": 0.4082, "step": 6088 }, { "epoch": 6.235414534288639, "grad_norm": 0.34765625, "learning_rate": 8.9221955947005e-06, "loss": 0.4046, "step": 6092 }, { "epoch": 6.239508700102354, "grad_norm": 0.37890625, "learning_rate": 8.82784511648838e-06, "loss": 0.418, "step": 6096 }, { "epoch": 6.24360286591607, "grad_norm": 0.35546875, "learning_rate": 8.733981035749193e-06, "loss": 0.4293, "step": 6100 }, { "epoch": 6.247697031729785, "grad_norm": 0.34375, "learning_rate": 8.640603675884194e-06, "loss": 0.4244, "step": 6104 }, { "epoch": 6.2517911975435005, "grad_norm": 0.3671875, "learning_rate": 8.547713358617714e-06, "loss": 0.4109, "step": 6108 }, { "epoch": 6.255885363357216, "grad_norm": 0.357421875, "learning_rate": 8.455310403995924e-06, "loss": 0.4285, "step": 6112 }, { "epoch": 6.2599795291709315, "grad_norm": 0.36328125, "learning_rate": 8.363395130385908e-06, "loss": 0.4134, "step": 6116 }, { "epoch": 6.2640736949846465, "grad_norm": 0.353515625, "learning_rate": 8.27196785447446e-06, "loss": 0.438, "step": 6120 }, { "epoch": 6.268167860798362, "grad_norm": 0.3515625, "learning_rate": 8.181028891267017e-06, "loss": 0.4141, "step": 6124 }, { "epoch": 6.272262026612077, "grad_norm": 0.36328125, "learning_rate": 8.09057855408658e-06, "loss": 0.4089, "step": 6128 }, { "epoch": 6.276356192425793, "grad_norm": 0.34765625, "learning_rate": 8.000617154572597e-06, "loss": 0.4153, "step": 6132 }, { "epoch": 6.280450358239508, "grad_norm": 0.359375, "learning_rate": 7.91114500267993e-06, "loss": 0.4158, "step": 6136 }, { "epoch": 6.284544524053224, "grad_norm": 0.3515625, "learning_rate": 7.82216240667784e-06, "loss": 0.4268, "step": 6140 }, { "epoch": 6.288638689866939, "grad_norm": 0.353515625, "learning_rate": 7.733669673148768e-06, "loss": 0.4038, "step": 6144 }, { "epoch": 6.292732855680655, "grad_norm": 0.359375, "learning_rate": 7.645667106987407e-06, "loss": 0.4577, "step": 6148 }, { "epoch": 6.29682702149437, "grad_norm": 0.34765625, "learning_rate": 7.558155011399669e-06, "loss": 0.412, "step": 6152 }, { "epoch": 6.300921187308086, "grad_norm": 0.33984375, "learning_rate": 7.471133687901498e-06, "loss": 0.3985, "step": 6156 }, { "epoch": 6.305015353121801, "grad_norm": 0.33984375, "learning_rate": 7.384603436317993e-06, "loss": 0.4508, "step": 6160 }, { "epoch": 6.309109518935517, "grad_norm": 0.3515625, "learning_rate": 7.298564554782288e-06, "loss": 0.4141, "step": 6164 }, { "epoch": 6.313203684749232, "grad_norm": 0.357421875, "learning_rate": 7.213017339734506e-06, "loss": 0.4444, "step": 6168 }, { "epoch": 6.317297850562948, "grad_norm": 0.34375, "learning_rate": 7.127962085920808e-06, "loss": 0.4156, "step": 6172 }, { "epoch": 6.321392016376663, "grad_norm": 0.341796875, "learning_rate": 7.043399086392343e-06, "loss": 0.4466, "step": 6176 }, { "epoch": 6.325486182190379, "grad_norm": 0.345703125, "learning_rate": 6.9593286325042185e-06, "loss": 0.4325, "step": 6180 }, { "epoch": 6.329580348004094, "grad_norm": 0.375, "learning_rate": 6.875751013914516e-06, "loss": 0.431, "step": 6184 }, { "epoch": 6.33367451381781, "grad_norm": 0.341796875, "learning_rate": 6.79266651858329e-06, "loss": 0.4705, "step": 6188 }, { "epoch": 6.337768679631525, "grad_norm": 0.3671875, "learning_rate": 6.710075432771606e-06, "loss": 0.4316, "step": 6192 }, { "epoch": 6.341862845445241, "grad_norm": 0.369140625, "learning_rate": 6.627978041040488e-06, "loss": 0.4479, "step": 6196 }, { "epoch": 6.345957011258956, "grad_norm": 0.345703125, "learning_rate": 6.54637462624999e-06, "loss": 0.3946, "step": 6200 }, { "epoch": 6.350051177072672, "grad_norm": 0.34765625, "learning_rate": 6.465265469558256e-06, "loss": 0.4489, "step": 6204 }, { "epoch": 6.354145342886387, "grad_norm": 0.328125, "learning_rate": 6.384650850420397e-06, "loss": 0.4443, "step": 6208 }, { "epoch": 6.358239508700103, "grad_norm": 0.35546875, "learning_rate": 6.304531046587719e-06, "loss": 0.4218, "step": 6212 }, { "epoch": 6.362333674513818, "grad_norm": 0.34765625, "learning_rate": 6.224906334106689e-06, "loss": 0.4614, "step": 6216 }, { "epoch": 6.366427840327534, "grad_norm": 0.345703125, "learning_rate": 6.145776987317891e-06, "loss": 0.4615, "step": 6220 }, { "epoch": 6.370522006141249, "grad_norm": 0.35546875, "learning_rate": 6.067143278855241e-06, "loss": 0.4737, "step": 6224 }, { "epoch": 6.3746161719549645, "grad_norm": 0.341796875, "learning_rate": 5.9890054796449875e-06, "loss": 0.4364, "step": 6228 }, { "epoch": 6.37871033776868, "grad_norm": 0.349609375, "learning_rate": 5.911363858904661e-06, "loss": 0.4312, "step": 6232 }, { "epoch": 6.3828045035823955, "grad_norm": 0.375, "learning_rate": 5.834218684142344e-06, "loss": 0.4062, "step": 6236 }, { "epoch": 6.3868986693961105, "grad_norm": 0.3359375, "learning_rate": 5.757570221155638e-06, "loss": 0.4327, "step": 6240 }, { "epoch": 6.3909928352098255, "grad_norm": 0.361328125, "learning_rate": 5.6814187340307125e-06, "loss": 0.4012, "step": 6244 }, { "epoch": 6.395087001023541, "grad_norm": 0.349609375, "learning_rate": 5.605764485141507e-06, "loss": 0.3789, "step": 6248 }, { "epoch": 6.399181166837257, "grad_norm": 0.345703125, "learning_rate": 5.530607735148762e-06, "loss": 0.4342, "step": 6252 }, { "epoch": 6.403275332650972, "grad_norm": 0.37890625, "learning_rate": 5.4559487429990615e-06, "loss": 0.4502, "step": 6256 }, { "epoch": 6.407369498464687, "grad_norm": 0.353515625, "learning_rate": 5.381787765924056e-06, "loss": 0.4161, "step": 6260 }, { "epoch": 6.411463664278403, "grad_norm": 0.35546875, "learning_rate": 5.308125059439522e-06, "loss": 0.4442, "step": 6264 }, { "epoch": 6.415557830092118, "grad_norm": 0.3359375, "learning_rate": 5.234960877344491e-06, "loss": 0.4584, "step": 6268 }, { "epoch": 6.419651995905834, "grad_norm": 0.357421875, "learning_rate": 5.1622954717203514e-06, "loss": 0.4413, "step": 6272 }, { "epoch": 6.423746161719549, "grad_norm": 0.3828125, "learning_rate": 5.090129092929984e-06, "loss": 0.396, "step": 6276 }, { "epoch": 6.427840327533265, "grad_norm": 0.326171875, "learning_rate": 5.018461989616928e-06, "loss": 0.4253, "step": 6280 }, { "epoch": 6.43193449334698, "grad_norm": 0.34375, "learning_rate": 4.947294408704533e-06, "loss": 0.4244, "step": 6284 }, { "epoch": 6.436028659160696, "grad_norm": 0.337890625, "learning_rate": 4.876626595395039e-06, "loss": 0.434, "step": 6288 }, { "epoch": 6.440122824974411, "grad_norm": 0.326171875, "learning_rate": 4.806458793168799e-06, "loss": 0.4057, "step": 6292 }, { "epoch": 6.444216990788127, "grad_norm": 0.34375, "learning_rate": 4.736791243783427e-06, "loss": 0.4292, "step": 6296 }, { "epoch": 6.448311156601842, "grad_norm": 0.36328125, "learning_rate": 4.667624187272917e-06, "loss": 0.4297, "step": 6300 }, { "epoch": 6.452405322415558, "grad_norm": 0.349609375, "learning_rate": 4.598957861946906e-06, "loss": 0.4292, "step": 6304 }, { "epoch": 6.456499488229273, "grad_norm": 0.349609375, "learning_rate": 4.53079250438978e-06, "loss": 0.4307, "step": 6308 }, { "epoch": 6.460593654042989, "grad_norm": 0.35546875, "learning_rate": 4.463128349459855e-06, "loss": 0.4114, "step": 6312 }, { "epoch": 6.464687819856704, "grad_norm": 0.390625, "learning_rate": 4.395965630288628e-06, "loss": 0.4695, "step": 6316 }, { "epoch": 6.46878198567042, "grad_norm": 0.353515625, "learning_rate": 4.32930457827993e-06, "loss": 0.4417, "step": 6320 }, { "epoch": 6.472876151484135, "grad_norm": 0.357421875, "learning_rate": 4.263145423109121e-06, "loss": 0.4577, "step": 6324 }, { "epoch": 6.476970317297851, "grad_norm": 0.333984375, "learning_rate": 4.197488392722348e-06, "loss": 0.3964, "step": 6328 }, { "epoch": 6.481064483111566, "grad_norm": 0.322265625, "learning_rate": 4.132333713335689e-06, "loss": 0.4551, "step": 6332 }, { "epoch": 6.485158648925282, "grad_norm": 0.388671875, "learning_rate": 4.067681609434426e-06, "loss": 0.432, "step": 6336 }, { "epoch": 6.489252814738997, "grad_norm": 0.35546875, "learning_rate": 4.003532303772256e-06, "loss": 0.4323, "step": 6340 }, { "epoch": 6.493346980552713, "grad_norm": 0.35546875, "learning_rate": 3.939886017370564e-06, "loss": 0.3916, "step": 6344 }, { "epoch": 6.497441146366428, "grad_norm": 0.396484375, "learning_rate": 3.876742969517538e-06, "loss": 0.4087, "step": 6348 }, { "epoch": 6.501535312180144, "grad_norm": 0.359375, "learning_rate": 3.8141033777675854e-06, "loss": 0.4148, "step": 6352 }, { "epoch": 6.505629477993859, "grad_norm": 0.341796875, "learning_rate": 3.751967457940436e-06, "loss": 0.4203, "step": 6356 }, { "epoch": 6.5097236438075745, "grad_norm": 0.357421875, "learning_rate": 3.6903354241204886e-06, "loss": 0.4405, "step": 6360 }, { "epoch": 6.5138178096212895, "grad_norm": 0.353515625, "learning_rate": 3.6292074886559995e-06, "loss": 0.4195, "step": 6364 }, { "epoch": 6.5179119754350054, "grad_norm": 0.361328125, "learning_rate": 3.5685838621584804e-06, "loss": 0.4599, "step": 6368 }, { "epoch": 6.5220061412487205, "grad_norm": 0.34765625, "learning_rate": 3.5084647535017996e-06, "loss": 0.4015, "step": 6372 }, { "epoch": 6.526100307062436, "grad_norm": 0.349609375, "learning_rate": 3.448850369821565e-06, "loss": 0.3875, "step": 6376 }, { "epoch": 6.530194472876151, "grad_norm": 0.36328125, "learning_rate": 3.389740916514461e-06, "loss": 0.4491, "step": 6380 }, { "epoch": 6.534288638689867, "grad_norm": 0.34765625, "learning_rate": 3.331136597237377e-06, "loss": 0.4337, "step": 6384 }, { "epoch": 6.538382804503582, "grad_norm": 0.357421875, "learning_rate": 3.2730376139068816e-06, "loss": 0.4579, "step": 6388 }, { "epoch": 6.542476970317297, "grad_norm": 0.359375, "learning_rate": 3.2154441666984164e-06, "loss": 0.3983, "step": 6392 }, { "epoch": 6.546571136131013, "grad_norm": 0.34375, "learning_rate": 3.158356454045602e-06, "loss": 0.4155, "step": 6396 }, { "epoch": 6.550665301944729, "grad_norm": 0.349609375, "learning_rate": 3.101774672639684e-06, "loss": 0.4145, "step": 6400 }, { "epoch": 6.554759467758444, "grad_norm": 0.33984375, "learning_rate": 3.045699017428671e-06, "loss": 0.4317, "step": 6404 }, { "epoch": 6.558853633572159, "grad_norm": 0.369140625, "learning_rate": 2.990129681616782e-06, "loss": 0.4389, "step": 6408 }, { "epoch": 6.562947799385875, "grad_norm": 0.34375, "learning_rate": 2.9350668566637982e-06, "loss": 0.4404, "step": 6412 }, { "epoch": 6.567041965199591, "grad_norm": 0.333984375, "learning_rate": 2.880510732284297e-06, "loss": 0.4248, "step": 6416 }, { "epoch": 6.571136131013306, "grad_norm": 0.365234375, "learning_rate": 2.8264614964470856e-06, "loss": 0.4753, "step": 6420 }, { "epoch": 6.575230296827021, "grad_norm": 0.359375, "learning_rate": 2.7729193353745017e-06, "loss": 0.4326, "step": 6424 }, { "epoch": 6.579324462640737, "grad_norm": 0.35546875, "learning_rate": 2.7198844335418637e-06, "loss": 0.4478, "step": 6428 }, { "epoch": 6.583418628454452, "grad_norm": 0.345703125, "learning_rate": 2.6673569736766876e-06, "loss": 0.4379, "step": 6432 }, { "epoch": 6.587512794268168, "grad_norm": 0.36328125, "learning_rate": 2.615337136758172e-06, "loss": 0.4266, "step": 6436 }, { "epoch": 6.591606960081883, "grad_norm": 0.3515625, "learning_rate": 2.5638251020165812e-06, "loss": 0.3951, "step": 6440 }, { "epoch": 6.595701125895599, "grad_norm": 0.34375, "learning_rate": 2.512821046932495e-06, "loss": 0.4134, "step": 6444 }, { "epoch": 6.599795291709314, "grad_norm": 0.33984375, "learning_rate": 2.4623251472363937e-06, "loss": 0.4396, "step": 6448 }, { "epoch": 6.60388945752303, "grad_norm": 0.359375, "learning_rate": 2.412337576907858e-06, "loss": 0.4173, "step": 6452 }, { "epoch": 6.607983623336745, "grad_norm": 0.36328125, "learning_rate": 2.362858508175136e-06, "loss": 0.4338, "step": 6456 }, { "epoch": 6.612077789150461, "grad_norm": 0.361328125, "learning_rate": 2.313888111514395e-06, "loss": 0.4044, "step": 6460 }, { "epoch": 6.616171954964176, "grad_norm": 0.34765625, "learning_rate": 2.2654265556493022e-06, "loss": 0.4584, "step": 6464 }, { "epoch": 6.620266120777892, "grad_norm": 0.333984375, "learning_rate": 2.2174740075502794e-06, "loss": 0.4268, "step": 6468 }, { "epoch": 6.624360286591607, "grad_norm": 0.34375, "learning_rate": 2.1700306324340334e-06, "loss": 0.4085, "step": 6472 }, { "epoch": 6.628454452405323, "grad_norm": 0.361328125, "learning_rate": 2.123096593762974e-06, "loss": 0.4266, "step": 6476 }, { "epoch": 6.632548618219038, "grad_norm": 0.345703125, "learning_rate": 2.076672053244599e-06, "loss": 0.4212, "step": 6480 }, { "epoch": 6.6366427840327535, "grad_norm": 0.337890625, "learning_rate": 2.03075717083096e-06, "loss": 0.4263, "step": 6484 }, { "epoch": 6.640736949846469, "grad_norm": 0.353515625, "learning_rate": 1.9853521047181963e-06, "loss": 0.4296, "step": 6488 }, { "epoch": 6.6448311156601845, "grad_norm": 0.33984375, "learning_rate": 1.9404570113458197e-06, "loss": 0.3664, "step": 6492 }, { "epoch": 6.6489252814738995, "grad_norm": 0.3359375, "learning_rate": 1.8960720453963295e-06, "loss": 0.4485, "step": 6496 }, { "epoch": 6.653019447287615, "grad_norm": 0.3828125, "learning_rate": 1.8521973597946326e-06, "loss": 0.4253, "step": 6500 } ], "logging_steps": 4, "max_steps": 6839, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.253726566678266e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }