diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18076 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.656641960144043, + "learning_rate": 4.998060888113244e-05, + "loss": 9.8019, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 3.3737926483154297, + "learning_rate": 4.9961217762264883e-05, + "loss": 9.12, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 3.5734024047851562, + "learning_rate": 4.9941826643397324e-05, + "loss": 8.8002, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.0836071968078613, + "learning_rate": 4.992243552452977e-05, + "loss": 8.5483, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.126896619796753, + "learning_rate": 4.9903044405662205e-05, + "loss": 8.376, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.795313835144043, + "learning_rate": 4.988365328679465e-05, + "loss": 8.2286, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.847642183303833, + "learning_rate": 4.9864262167927086e-05, + "loss": 8.0428, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.6489531993865967, + "learning_rate": 4.9844871049059533e-05, + "loss": 7.8429, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.569993495941162, + "learning_rate": 4.9825479930191974e-05, + "loss": 7.6876, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.3502109050750732, + "learning_rate": 4.9806088811324415e-05, + "loss": 7.5277, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.2252399921417236, + "learning_rate": 4.9786697692456855e-05, + "loss": 7.3008, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.2069296836853027, + "learning_rate": 4.97673065735893e-05, + "loss": 7.1233, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 2.133172035217285, + "learning_rate": 4.9747915454721736e-05, + "loss": 6.9925, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.830748438835144, + "learning_rate": 4.9728524335854183e-05, + "loss": 6.8808, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.7571477890014648, + "learning_rate": 4.970913321698662e-05, + "loss": 6.7121, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.6238043308258057, + "learning_rate": 4.9689742098119065e-05, + "loss": 6.6879, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.460631012916565, + "learning_rate": 4.9670350979251505e-05, + "loss": 6.4892, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.399665355682373, + "learning_rate": 4.9650959860383946e-05, + "loss": 6.4278, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.2306325435638428, + "learning_rate": 4.9631568741516386e-05, + "loss": 6.43, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.1850312948226929, + "learning_rate": 4.9612177622648833e-05, + "loss": 6.3628, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.3839452266693115, + "learning_rate": 4.959278650378127e-05, + "loss": 6.1944, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.2195193767547607, + "learning_rate": 4.9573395384913715e-05, + "loss": 6.2424, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.1323491334915161, + "learning_rate": 4.9554004266046155e-05, + "loss": 6.112, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.2749617099761963, + "learning_rate": 4.9534613147178596e-05, + "loss": 6.13, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.2026777267456055, + "learning_rate": 4.9515222028311036e-05, + "loss": 6.0626, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.4500185251235962, + "learning_rate": 4.949583090944348e-05, + "loss": 6.0918, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.2848747968673706, + "learning_rate": 4.947643979057592e-05, + "loss": 6.1688, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.3187015056610107, + "learning_rate": 4.9457048671708365e-05, + "loss": 6.0728, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.487337589263916, + "learning_rate": 4.94376575528408e-05, + "loss": 6.0801, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.191052794456482, + "learning_rate": 4.9418266433973246e-05, + "loss": 6.1361, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.5774741172790527, + "learning_rate": 4.9398875315105686e-05, + "loss": 6.055, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.0442208051681519, + "learning_rate": 4.937948419623813e-05, + "loss": 6.0708, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.4346356391906738, + "learning_rate": 4.936009307737057e-05, + "loss": 6.0169, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.2307249307632446, + "learning_rate": 4.934070195850301e-05, + "loss": 6.0547, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.1539560556411743, + "learning_rate": 4.932131083963545e-05, + "loss": 6.0166, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.1614229679107666, + "learning_rate": 4.930191972076789e-05, + "loss": 6.1866, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.0596864223480225, + "learning_rate": 4.928252860190033e-05, + "loss": 5.9715, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.111221194267273, + "learning_rate": 4.926313748303277e-05, + "loss": 6.0272, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 0.9217305183410645, + "learning_rate": 4.924374636416522e-05, + "loss": 6.0135, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.968235969543457, + "learning_rate": 4.922435524529765e-05, + "loss": 5.9969, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 1.1550750732421875, + "learning_rate": 4.92049641264301e-05, + "loss": 6.0566, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.5655795335769653, + "learning_rate": 4.918557300756254e-05, + "loss": 5.9619, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 1.2823691368103027, + "learning_rate": 4.916618188869498e-05, + "loss": 5.876, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.913962721824646, + "learning_rate": 4.914679076982742e-05, + "loss": 6.0023, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 1.1342990398406982, + "learning_rate": 4.912739965095986e-05, + "loss": 5.9118, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 1.1372032165527344, + "learning_rate": 4.91080085320923e-05, + "loss": 5.9543, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.1001873016357422, + "learning_rate": 4.908861741322475e-05, + "loss": 6.0478, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.163060188293457, + "learning_rate": 4.906922629435718e-05, + "loss": 5.9354, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.022900938987732, + "learning_rate": 4.904983517548963e-05, + "loss": 5.8227, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.2557566165924072, + "learning_rate": 4.903044405662207e-05, + "loss": 5.898, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.260396122932434, + "learning_rate": 4.901105293775451e-05, + "loss": 5.9742, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.0883902311325073, + "learning_rate": 4.899166181888695e-05, + "loss": 5.8642, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.07965886592865, + "learning_rate": 4.89722707000194e-05, + "loss": 5.9836, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 1.12454092502594, + "learning_rate": 4.895287958115183e-05, + "loss": 5.9383, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.1025103330612183, + "learning_rate": 4.893348846228428e-05, + "loss": 5.8433, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.1046481132507324, + "learning_rate": 4.891409734341671e-05, + "loss": 5.8649, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.356872320175171, + "learning_rate": 4.889470622454916e-05, + "loss": 5.9069, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 1.1604045629501343, + "learning_rate": 4.88753151056816e-05, + "loss": 5.8704, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 1.1360890865325928, + "learning_rate": 4.885592398681404e-05, + "loss": 5.8723, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.8874178528785706, + "learning_rate": 4.883653286794648e-05, + "loss": 5.8548, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.0104761123657227, + "learning_rate": 4.881714174907893e-05, + "loss": 5.8931, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.1234338283538818, + "learning_rate": 4.879775063021136e-05, + "loss": 5.8915, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.2352111339569092, + "learning_rate": 4.877835951134381e-05, + "loss": 5.9084, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 0.9969356656074524, + "learning_rate": 4.875896839247625e-05, + "loss": 5.9788, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 1.4352229833602905, + "learning_rate": 4.873957727360869e-05, + "loss": 5.8121, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 1.3081138134002686, + "learning_rate": 4.872018615474113e-05, + "loss": 5.8714, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 1.1553027629852295, + "learning_rate": 4.870079503587357e-05, + "loss": 5.9062, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 1.328942060470581, + "learning_rate": 4.868140391700601e-05, + "loss": 5.9659, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.092142939567566, + "learning_rate": 4.8662012798138454e-05, + "loss": 5.9436, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 0.8468914031982422, + "learning_rate": 4.8642621679270894e-05, + "loss": 5.9176, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 1.1835949420928955, + "learning_rate": 4.8623230560403335e-05, + "loss": 5.8968, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 1.401023507118225, + "learning_rate": 4.860383944153578e-05, + "loss": 5.8126, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 1.2995976209640503, + "learning_rate": 4.8584448322668216e-05, + "loss": 5.9769, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 1.0936148166656494, + "learning_rate": 4.856505720380066e-05, + "loss": 5.8341, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 1.121054768562317, + "learning_rate": 4.85456660849331e-05, + "loss": 5.8082, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.011224389076233, + "learning_rate": 4.8526274966065544e-05, + "loss": 5.8437, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 0.909989058971405, + "learning_rate": 4.8506883847197985e-05, + "loss": 5.8217, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 0.9056807160377502, + "learning_rate": 4.8487492728330425e-05, + "loss": 5.8791, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 0.990393340587616, + "learning_rate": 4.8468101609462866e-05, + "loss": 5.9895, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.252443790435791, + "learning_rate": 4.844871049059531e-05, + "loss": 5.9232, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 0.9842864274978638, + "learning_rate": 4.842931937172775e-05, + "loss": 5.8068, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.050066351890564, + "learning_rate": 4.8409928252860194e-05, + "loss": 5.8112, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 0.9735043048858643, + "learning_rate": 4.8390537133992635e-05, + "loss": 5.8329, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 0.948833703994751, + "learning_rate": 4.8371146015125075e-05, + "loss": 5.8564, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.0226508378982544, + "learning_rate": 4.8351754896257516e-05, + "loss": 5.7493, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.09911048412323, + "learning_rate": 4.8332363777389956e-05, + "loss": 5.7015, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 1.1777935028076172, + "learning_rate": 4.83129726585224e-05, + "loss": 5.7534, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.1616647243499756, + "learning_rate": 4.8293581539654844e-05, + "loss": 5.8506, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.0601903200149536, + "learning_rate": 4.827419042078728e-05, + "loss": 5.8917, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.9961209893226624, + "learning_rate": 4.8254799301919725e-05, + "loss": 5.797, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 1.0286229848861694, + "learning_rate": 4.8235408183052166e-05, + "loss": 5.888, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 1.0672661066055298, + "learning_rate": 4.8216017064184606e-05, + "loss": 5.823, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 0.9099803566932678, + "learning_rate": 4.819662594531705e-05, + "loss": 5.7411, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 1.1643503904342651, + "learning_rate": 4.8177234826449494e-05, + "loss": 5.9417, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 0.9987436532974243, + "learning_rate": 4.815784370758193e-05, + "loss": 5.8176, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 1.1946054697036743, + "learning_rate": 4.8138452588714375e-05, + "loss": 5.9005, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 0.9317137002944946, + "learning_rate": 4.811906146984681e-05, + "loss": 5.8017, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 1.0581262111663818, + "learning_rate": 4.8099670350979256e-05, + "loss": 5.794, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 1.1224080324172974, + "learning_rate": 4.80802792321117e-05, + "loss": 5.8161, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 1.2604461908340454, + "learning_rate": 4.806088811324414e-05, + "loss": 5.8109, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 1.1275701522827148, + "learning_rate": 4.804149699437658e-05, + "loss": 5.8104, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 1.012634038925171, + "learning_rate": 4.802210587550902e-05, + "loss": 5.8702, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 0.8785901665687561, + "learning_rate": 4.800271475664146e-05, + "loss": 5.8498, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 0.8808034062385559, + "learning_rate": 4.79833236377739e-05, + "loss": 5.7474, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 1.1132124662399292, + "learning_rate": 4.796393251890634e-05, + "loss": 5.7462, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 0.9468419551849365, + "learning_rate": 4.794454140003878e-05, + "loss": 5.7767, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 0.9365801215171814, + "learning_rate": 4.792515028117123e-05, + "loss": 5.8072, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 1.0467591285705566, + "learning_rate": 4.790575916230366e-05, + "loss": 5.7773, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 1.1843376159667969, + "learning_rate": 4.788636804343611e-05, + "loss": 5.9357, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 1.139872431755066, + "learning_rate": 4.786697692456855e-05, + "loss": 5.7744, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 0.9096530079841614, + "learning_rate": 4.784758580570099e-05, + "loss": 5.9122, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 1.0682241916656494, + "learning_rate": 4.782819468683343e-05, + "loss": 5.8587, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 0.99188232421875, + "learning_rate": 4.780880356796588e-05, + "loss": 5.8096, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 1.0947283506393433, + "learning_rate": 4.778941244909831e-05, + "loss": 5.6552, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 1.0802547931671143, + "learning_rate": 4.777002133023076e-05, + "loss": 5.9245, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.164602518081665, + "learning_rate": 4.775063021136319e-05, + "loss": 5.8362, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 1.1340216398239136, + "learning_rate": 4.773123909249564e-05, + "loss": 5.8477, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 1.0476304292678833, + "learning_rate": 4.771184797362808e-05, + "loss": 5.8336, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 0.9962639808654785, + "learning_rate": 4.769245685476052e-05, + "loss": 5.7876, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 0.9998891949653625, + "learning_rate": 4.767306573589296e-05, + "loss": 5.6897, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 1.054436445236206, + "learning_rate": 4.765367461702541e-05, + "loss": 5.6461, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 0.9963980913162231, + "learning_rate": 4.763428349815784e-05, + "loss": 5.7391, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 1.077802062034607, + "learning_rate": 4.761489237929029e-05, + "loss": 5.772, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 1.3317667245864868, + "learning_rate": 4.759550126042273e-05, + "loss": 5.7657, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 1.028090000152588, + "learning_rate": 4.757611014155517e-05, + "loss": 5.7417, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 0.9719061255455017, + "learning_rate": 4.755671902268761e-05, + "loss": 5.8042, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 1.067749261856079, + "learning_rate": 4.753732790382005e-05, + "loss": 5.7923, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 0.9830943942070007, + "learning_rate": 4.751793678495249e-05, + "loss": 5.7911, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 1.0226807594299316, + "learning_rate": 4.749854566608494e-05, + "loss": 5.7696, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 1.3311954736709595, + "learning_rate": 4.7479154547217374e-05, + "loss": 5.8689, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 0.9809809923171997, + "learning_rate": 4.745976342834982e-05, + "loss": 5.8023, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 1.1488884687423706, + "learning_rate": 4.744037230948226e-05, + "loss": 5.7116, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 1.0571842193603516, + "learning_rate": 4.74209811906147e-05, + "loss": 5.7105, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 0.9185091257095337, + "learning_rate": 4.740159007174714e-05, + "loss": 5.8675, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 0.9164991974830627, + "learning_rate": 4.738219895287958e-05, + "loss": 5.7231, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 1.0493993759155273, + "learning_rate": 4.7362807834012024e-05, + "loss": 5.8437, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 0.8653802275657654, + "learning_rate": 4.7343416715144464e-05, + "loss": 5.652, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 0.975098192691803, + "learning_rate": 4.7324025596276905e-05, + "loss": 5.7353, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 0.9267313480377197, + "learning_rate": 4.7304634477409345e-05, + "loss": 5.7886, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 0.9583150148391724, + "learning_rate": 4.728524335854179e-05, + "loss": 5.6864, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 0.9664693474769592, + "learning_rate": 4.7265852239674227e-05, + "loss": 5.8314, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 0.8994282484054565, + "learning_rate": 4.7246461120806674e-05, + "loss": 5.7865, + "step": 568 + }, + { + "epoch": 0.06, + "grad_norm": 1.0095504522323608, + "learning_rate": 4.7227070001939114e-05, + "loss": 5.7627, + "step": 572 + }, + { + "epoch": 0.06, + "grad_norm": 1.0120139122009277, + "learning_rate": 4.7207678883071555e-05, + "loss": 5.6769, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 0.9483413100242615, + "learning_rate": 4.7188287764203995e-05, + "loss": 5.7615, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 0.9208195209503174, + "learning_rate": 4.7168896645336436e-05, + "loss": 5.7178, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 1.0568493604660034, + "learning_rate": 4.7149505526468876e-05, + "loss": 5.6305, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 0.9036898016929626, + "learning_rate": 4.7130114407601324e-05, + "loss": 5.7088, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 1.0219190120697021, + "learning_rate": 4.711072328873376e-05, + "loss": 5.7359, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 1.1148089170455933, + "learning_rate": 4.7091332169866205e-05, + "loss": 5.7653, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 1.0402499437332153, + "learning_rate": 4.7071941050998645e-05, + "loss": 5.7291, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 1.2846848964691162, + "learning_rate": 4.7052549932131086e-05, + "loss": 5.8432, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 1.0562716722488403, + "learning_rate": 4.7033158813263526e-05, + "loss": 5.7242, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 0.9270595908164978, + "learning_rate": 4.7013767694395974e-05, + "loss": 5.735, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 0.9591754674911499, + "learning_rate": 4.699437657552841e-05, + "loss": 5.6973, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 1.020099401473999, + "learning_rate": 4.6974985456660855e-05, + "loss": 5.7211, + "step": 624 + }, + { + "epoch": 0.06, + "grad_norm": 0.9728394150733948, + "learning_rate": 4.695559433779329e-05, + "loss": 5.7558, + "step": 628 + }, + { + "epoch": 0.06, + "grad_norm": 0.9725939631462097, + "learning_rate": 4.6936203218925736e-05, + "loss": 5.8508, + "step": 632 + }, + { + "epoch": 0.06, + "grad_norm": 1.0429859161376953, + "learning_rate": 4.6916812100058176e-05, + "loss": 5.7557, + "step": 636 + }, + { + "epoch": 0.06, + "grad_norm": 1.1550756692886353, + "learning_rate": 4.689742098119062e-05, + "loss": 5.7864, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 1.277512788772583, + "learning_rate": 4.687802986232306e-05, + "loss": 5.731, + "step": 644 + }, + { + "epoch": 0.06, + "grad_norm": 1.170222282409668, + "learning_rate": 4.6858638743455505e-05, + "loss": 5.6716, + "step": 648 + }, + { + "epoch": 0.06, + "grad_norm": 1.0293292999267578, + "learning_rate": 4.683924762458794e-05, + "loss": 5.7762, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 1.0924861431121826, + "learning_rate": 4.6819856505720386e-05, + "loss": 5.7223, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 0.949212908744812, + "learning_rate": 4.680046538685282e-05, + "loss": 5.7425, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 1.0186933279037476, + "learning_rate": 4.678107426798527e-05, + "loss": 5.8361, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 1.0421589612960815, + "learning_rate": 4.676168314911771e-05, + "loss": 5.7093, + "step": 668 + }, + { + "epoch": 0.07, + "grad_norm": 1.061271071434021, + "learning_rate": 4.674229203025015e-05, + "loss": 5.6382, + "step": 672 + }, + { + "epoch": 0.07, + "grad_norm": 1.0228848457336426, + "learning_rate": 4.672290091138259e-05, + "loss": 5.6845, + "step": 676 + }, + { + "epoch": 0.07, + "grad_norm": 1.0488477945327759, + "learning_rate": 4.670350979251503e-05, + "loss": 5.6403, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 0.9394490718841553, + "learning_rate": 4.668411867364747e-05, + "loss": 5.6733, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 1.140890121459961, + "learning_rate": 4.666472755477991e-05, + "loss": 5.7066, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 0.9432203769683838, + "learning_rate": 4.664533643591236e-05, + "loss": 5.7846, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 1.0256586074829102, + "learning_rate": 4.662594531704479e-05, + "loss": 5.766, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 1.0383509397506714, + "learning_rate": 4.660655419817724e-05, + "loss": 5.8138, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 0.9286032319068909, + "learning_rate": 4.658716307930968e-05, + "loss": 5.7233, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 0.9339187145233154, + "learning_rate": 4.656777196044212e-05, + "loss": 5.803, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 0.9535828828811646, + "learning_rate": 4.654838084157456e-05, + "loss": 5.8002, + "step": 712 + }, + { + "epoch": 0.07, + "grad_norm": 1.2881035804748535, + "learning_rate": 4.6528989722707e-05, + "loss": 5.5501, + "step": 716 + }, + { + "epoch": 0.07, + "grad_norm": 0.9991912841796875, + "learning_rate": 4.650959860383944e-05, + "loss": 5.7732, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 1.071502923965454, + "learning_rate": 4.649020748497189e-05, + "loss": 5.6494, + "step": 724 + }, + { + "epoch": 0.07, + "grad_norm": 0.9422402381896973, + "learning_rate": 4.647081636610432e-05, + "loss": 5.7539, + "step": 728 + }, + { + "epoch": 0.07, + "grad_norm": 1.2175371646881104, + "learning_rate": 4.645142524723677e-05, + "loss": 5.6747, + "step": 732 + }, + { + "epoch": 0.07, + "grad_norm": 1.1459147930145264, + "learning_rate": 4.643203412836921e-05, + "loss": 5.6895, + "step": 736 + }, + { + "epoch": 0.07, + "grad_norm": 0.9659698009490967, + "learning_rate": 4.641264300950165e-05, + "loss": 5.6517, + "step": 740 + }, + { + "epoch": 0.07, + "grad_norm": 1.071323275566101, + "learning_rate": 4.639325189063409e-05, + "loss": 5.653, + "step": 744 + }, + { + "epoch": 0.07, + "grad_norm": 1.1298364400863647, + "learning_rate": 4.637386077176653e-05, + "loss": 5.6975, + "step": 748 + }, + { + "epoch": 0.07, + "grad_norm": 0.9889796376228333, + "learning_rate": 4.635446965289897e-05, + "loss": 5.8007, + "step": 752 + }, + { + "epoch": 0.07, + "grad_norm": 0.9490829110145569, + "learning_rate": 4.633507853403142e-05, + "loss": 5.6969, + "step": 756 + }, + { + "epoch": 0.07, + "grad_norm": 1.0920511484146118, + "learning_rate": 4.6315687415163853e-05, + "loss": 5.74, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 1.039368748664856, + "learning_rate": 4.62962962962963e-05, + "loss": 5.7002, + "step": 764 + }, + { + "epoch": 0.07, + "grad_norm": 0.9749051928520203, + "learning_rate": 4.627690517742874e-05, + "loss": 5.6879, + "step": 768 + }, + { + "epoch": 0.07, + "grad_norm": 0.9410939812660217, + "learning_rate": 4.625751405856118e-05, + "loss": 5.7138, + "step": 772 + }, + { + "epoch": 0.08, + "grad_norm": 0.9284390807151794, + "learning_rate": 4.623812293969362e-05, + "loss": 5.6772, + "step": 776 + }, + { + "epoch": 0.08, + "grad_norm": 1.220140814781189, + "learning_rate": 4.621873182082606e-05, + "loss": 5.6187, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 0.9217903017997742, + "learning_rate": 4.6199340701958503e-05, + "loss": 5.7051, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 0.9127422571182251, + "learning_rate": 4.617994958309095e-05, + "loss": 5.6821, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 0.9815553426742554, + "learning_rate": 4.6160558464223385e-05, + "loss": 5.7365, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 0.8972638249397278, + "learning_rate": 4.614116734535583e-05, + "loss": 5.6835, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 0.9297725558280945, + "learning_rate": 4.612177622648827e-05, + "loss": 5.6304, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 0.9928802847862244, + "learning_rate": 4.610238510762071e-05, + "loss": 5.8121, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 0.9002383351325989, + "learning_rate": 4.6082993988753153e-05, + "loss": 5.6457, + "step": 808 + }, + { + "epoch": 0.08, + "grad_norm": 0.8902143239974976, + "learning_rate": 4.6063602869885594e-05, + "loss": 5.7276, + "step": 812 + }, + { + "epoch": 0.08, + "grad_norm": 1.069921612739563, + "learning_rate": 4.6044211751018035e-05, + "loss": 5.6827, + "step": 816 + }, + { + "epoch": 0.08, + "grad_norm": 0.9564573764801025, + "learning_rate": 4.6024820632150475e-05, + "loss": 5.694, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 0.9404289126396179, + "learning_rate": 4.6005429513282916e-05, + "loss": 5.585, + "step": 824 + }, + { + "epoch": 0.08, + "grad_norm": 0.9606718420982361, + "learning_rate": 4.5986038394415356e-05, + "loss": 5.7258, + "step": 828 + }, + { + "epoch": 0.08, + "grad_norm": 0.893195629119873, + "learning_rate": 4.5966647275547803e-05, + "loss": 5.6816, + "step": 832 + }, + { + "epoch": 0.08, + "grad_norm": 0.9530772566795349, + "learning_rate": 4.5947256156680244e-05, + "loss": 5.7424, + "step": 836 + }, + { + "epoch": 0.08, + "grad_norm": 1.1925373077392578, + "learning_rate": 4.5927865037812685e-05, + "loss": 5.6958, + "step": 840 + }, + { + "epoch": 0.08, + "grad_norm": 1.0818183422088623, + "learning_rate": 4.5908473918945125e-05, + "loss": 5.6273, + "step": 844 + }, + { + "epoch": 0.08, + "grad_norm": 1.1746087074279785, + "learning_rate": 4.5889082800077566e-05, + "loss": 5.6699, + "step": 848 + }, + { + "epoch": 0.08, + "grad_norm": 1.069509744644165, + "learning_rate": 4.5869691681210006e-05, + "loss": 5.677, + "step": 852 + }, + { + "epoch": 0.08, + "grad_norm": 1.0898882150650024, + "learning_rate": 4.5850300562342453e-05, + "loss": 5.723, + "step": 856 + }, + { + "epoch": 0.08, + "grad_norm": 0.9786805510520935, + "learning_rate": 4.583090944347489e-05, + "loss": 5.7109, + "step": 860 + }, + { + "epoch": 0.08, + "grad_norm": 1.0218449831008911, + "learning_rate": 4.5811518324607335e-05, + "loss": 5.7927, + "step": 864 + }, + { + "epoch": 0.08, + "grad_norm": 1.0597094297409058, + "learning_rate": 4.579212720573977e-05, + "loss": 5.6456, + "step": 868 + }, + { + "epoch": 0.08, + "grad_norm": 1.015222430229187, + "learning_rate": 4.5772736086872216e-05, + "loss": 5.7023, + "step": 872 + }, + { + "epoch": 0.08, + "grad_norm": 1.0093826055526733, + "learning_rate": 4.5753344968004656e-05, + "loss": 5.673, + "step": 876 + }, + { + "epoch": 0.09, + "grad_norm": 1.1618988513946533, + "learning_rate": 4.57339538491371e-05, + "loss": 5.5492, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 0.9958293437957764, + "learning_rate": 4.571456273026954e-05, + "loss": 5.5945, + "step": 884 + }, + { + "epoch": 0.09, + "grad_norm": 0.9605300426483154, + "learning_rate": 4.5695171611401985e-05, + "loss": 5.6439, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 1.0881812572479248, + "learning_rate": 4.567578049253442e-05, + "loss": 5.7346, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 1.1047258377075195, + "learning_rate": 4.5656389373666866e-05, + "loss": 5.6754, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 1.0597559213638306, + "learning_rate": 4.56369982547993e-05, + "loss": 5.6655, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 1.0080434083938599, + "learning_rate": 4.561760713593175e-05, + "loss": 5.637, + "step": 904 + }, + { + "epoch": 0.09, + "grad_norm": 1.1175477504730225, + "learning_rate": 4.559821601706419e-05, + "loss": 5.6114, + "step": 908 + }, + { + "epoch": 0.09, + "grad_norm": 1.309441089630127, + "learning_rate": 4.557882489819663e-05, + "loss": 5.64, + "step": 912 + }, + { + "epoch": 0.09, + "grad_norm": 1.0713026523590088, + "learning_rate": 4.555943377932907e-05, + "loss": 5.6433, + "step": 916 + }, + { + "epoch": 0.09, + "grad_norm": 1.1929994821548462, + "learning_rate": 4.5540042660461516e-05, + "loss": 5.6503, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 1.0933693647384644, + "learning_rate": 4.552065154159395e-05, + "loss": 5.68, + "step": 924 + }, + { + "epoch": 0.09, + "grad_norm": 0.9823185801506042, + "learning_rate": 4.55012604227264e-05, + "loss": 5.6941, + "step": 928 + }, + { + "epoch": 0.09, + "grad_norm": 0.9883749485015869, + "learning_rate": 4.548186930385884e-05, + "loss": 5.5973, + "step": 932 + }, + { + "epoch": 0.09, + "grad_norm": 0.9289098381996155, + "learning_rate": 4.546247818499128e-05, + "loss": 5.7444, + "step": 936 + }, + { + "epoch": 0.09, + "grad_norm": 1.0262410640716553, + "learning_rate": 4.544308706612372e-05, + "loss": 5.6235, + "step": 940 + }, + { + "epoch": 0.09, + "grad_norm": 1.0129894018173218, + "learning_rate": 4.542369594725616e-05, + "loss": 5.6469, + "step": 944 + }, + { + "epoch": 0.09, + "grad_norm": 1.0500363111495972, + "learning_rate": 4.54043048283886e-05, + "loss": 5.6281, + "step": 948 + }, + { + "epoch": 0.09, + "grad_norm": 0.9239895343780518, + "learning_rate": 4.538491370952104e-05, + "loss": 5.6151, + "step": 952 + }, + { + "epoch": 0.09, + "grad_norm": 0.9726962447166443, + "learning_rate": 4.536552259065348e-05, + "loss": 5.6146, + "step": 956 + }, + { + "epoch": 0.09, + "grad_norm": 0.9822578430175781, + "learning_rate": 4.534613147178592e-05, + "loss": 5.5852, + "step": 960 + }, + { + "epoch": 0.09, + "grad_norm": 0.9551820755004883, + "learning_rate": 4.532674035291837e-05, + "loss": 5.7069, + "step": 964 + }, + { + "epoch": 0.09, + "grad_norm": 0.9644606709480286, + "learning_rate": 4.530734923405081e-05, + "loss": 5.6178, + "step": 968 + }, + { + "epoch": 0.09, + "grad_norm": 1.1014901399612427, + "learning_rate": 4.528795811518325e-05, + "loss": 5.6131, + "step": 972 + }, + { + "epoch": 0.09, + "grad_norm": 1.0390726327896118, + "learning_rate": 4.526856699631569e-05, + "loss": 5.5601, + "step": 976 + }, + { + "epoch": 0.1, + "grad_norm": 0.9854323863983154, + "learning_rate": 4.524917587744813e-05, + "loss": 5.6175, + "step": 980 + }, + { + "epoch": 0.1, + "grad_norm": 0.9083386659622192, + "learning_rate": 4.522978475858057e-05, + "loss": 5.6013, + "step": 984 + }, + { + "epoch": 0.1, + "grad_norm": 1.246685266494751, + "learning_rate": 4.521039363971301e-05, + "loss": 5.7068, + "step": 988 + }, + { + "epoch": 0.1, + "grad_norm": 1.0527753829956055, + "learning_rate": 4.519100252084545e-05, + "loss": 5.6358, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 1.0731735229492188, + "learning_rate": 4.51716114019779e-05, + "loss": 5.6828, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 1.0552845001220703, + "learning_rate": 4.515222028311033e-05, + "loss": 5.6828, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 0.990972101688385, + "learning_rate": 4.513282916424278e-05, + "loss": 5.7003, + "step": 1004 + }, + { + "epoch": 0.1, + "grad_norm": 1.1035810708999634, + "learning_rate": 4.511343804537522e-05, + "loss": 5.7133, + "step": 1008 + }, + { + "epoch": 0.1, + "grad_norm": 1.2226297855377197, + "learning_rate": 4.509404692650766e-05, + "loss": 5.6895, + "step": 1012 + }, + { + "epoch": 0.1, + "grad_norm": 1.1234639883041382, + "learning_rate": 4.50746558076401e-05, + "loss": 5.7874, + "step": 1016 + }, + { + "epoch": 0.1, + "grad_norm": 1.1130845546722412, + "learning_rate": 4.505526468877254e-05, + "loss": 5.7322, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 0.8997195363044739, + "learning_rate": 4.503587356990498e-05, + "loss": 5.5914, + "step": 1024 + }, + { + "epoch": 0.1, + "grad_norm": 0.9451141953468323, + "learning_rate": 4.501648245103743e-05, + "loss": 5.68, + "step": 1028 + }, + { + "epoch": 0.1, + "grad_norm": 1.0043590068817139, + "learning_rate": 4.4997091332169864e-05, + "loss": 5.7082, + "step": 1032 + }, + { + "epoch": 0.1, + "grad_norm": 0.8845294713973999, + "learning_rate": 4.497770021330231e-05, + "loss": 5.6198, + "step": 1036 + }, + { + "epoch": 0.1, + "grad_norm": 1.0413445234298706, + "learning_rate": 4.495830909443475e-05, + "loss": 5.6253, + "step": 1040 + }, + { + "epoch": 0.1, + "grad_norm": 0.9856407642364502, + "learning_rate": 4.493891797556719e-05, + "loss": 5.7266, + "step": 1044 + }, + { + "epoch": 0.1, + "grad_norm": 1.0194454193115234, + "learning_rate": 4.491952685669963e-05, + "loss": 5.6567, + "step": 1048 + }, + { + "epoch": 0.1, + "grad_norm": 0.9935641884803772, + "learning_rate": 4.490013573783208e-05, + "loss": 5.639, + "step": 1052 + }, + { + "epoch": 0.1, + "grad_norm": 1.0133986473083496, + "learning_rate": 4.4880744618964514e-05, + "loss": 5.6575, + "step": 1056 + }, + { + "epoch": 0.1, + "grad_norm": 0.9149471521377563, + "learning_rate": 4.486135350009696e-05, + "loss": 5.7549, + "step": 1060 + }, + { + "epoch": 0.1, + "grad_norm": 1.1448979377746582, + "learning_rate": 4.4841962381229395e-05, + "loss": 5.6149, + "step": 1064 + }, + { + "epoch": 0.1, + "grad_norm": 0.9014776349067688, + "learning_rate": 4.482257126236184e-05, + "loss": 5.6667, + "step": 1068 + }, + { + "epoch": 0.1, + "grad_norm": 0.9877203702926636, + "learning_rate": 4.480318014349428e-05, + "loss": 5.6292, + "step": 1072 + }, + { + "epoch": 0.1, + "grad_norm": 1.15706467628479, + "learning_rate": 4.4783789024626724e-05, + "loss": 5.7073, + "step": 1076 + }, + { + "epoch": 0.1, + "grad_norm": 1.0951610803604126, + "learning_rate": 4.4764397905759164e-05, + "loss": 5.6028, + "step": 1080 + }, + { + "epoch": 0.11, + "grad_norm": 0.9684280157089233, + "learning_rate": 4.4745006786891605e-05, + "loss": 5.6946, + "step": 1084 + }, + { + "epoch": 0.11, + "grad_norm": 0.9964583516120911, + "learning_rate": 4.4725615668024045e-05, + "loss": 5.6245, + "step": 1088 + }, + { + "epoch": 0.11, + "grad_norm": 0.9633233547210693, + "learning_rate": 4.4706224549156486e-05, + "loss": 5.6683, + "step": 1092 + }, + { + "epoch": 0.11, + "grad_norm": 1.0245460271835327, + "learning_rate": 4.468683343028893e-05, + "loss": 5.644, + "step": 1096 + }, + { + "epoch": 0.11, + "grad_norm": 1.0242762565612793, + "learning_rate": 4.4667442311421374e-05, + "loss": 5.6365, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 0.982917070388794, + "learning_rate": 4.4648051192553814e-05, + "loss": 5.6458, + "step": 1104 + }, + { + "epoch": 0.11, + "grad_norm": 0.949592113494873, + "learning_rate": 4.4628660073686255e-05, + "loss": 5.6253, + "step": 1108 + }, + { + "epoch": 0.11, + "grad_norm": 1.0238159894943237, + "learning_rate": 4.4609268954818695e-05, + "loss": 5.6323, + "step": 1112 + }, + { + "epoch": 0.11, + "grad_norm": 0.9112494587898254, + "learning_rate": 4.4589877835951136e-05, + "loss": 5.4746, + "step": 1116 + }, + { + "epoch": 0.11, + "grad_norm": 0.9587331414222717, + "learning_rate": 4.4570486717083576e-05, + "loss": 5.6729, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 0.8947247862815857, + "learning_rate": 4.455109559821602e-05, + "loss": 5.7247, + "step": 1124 + }, + { + "epoch": 0.11, + "grad_norm": 0.9927013516426086, + "learning_rate": 4.4531704479348464e-05, + "loss": 5.6287, + "step": 1128 + }, + { + "epoch": 0.11, + "grad_norm": 1.0964155197143555, + "learning_rate": 4.45123133604809e-05, + "loss": 5.611, + "step": 1132 + }, + { + "epoch": 0.11, + "grad_norm": 0.9409870505332947, + "learning_rate": 4.4492922241613345e-05, + "loss": 5.6107, + "step": 1136 + }, + { + "epoch": 0.11, + "grad_norm": 1.1042007207870483, + "learning_rate": 4.447353112274578e-05, + "loss": 5.7283, + "step": 1140 + }, + { + "epoch": 0.11, + "grad_norm": 1.0457091331481934, + "learning_rate": 4.4454140003878226e-05, + "loss": 5.6, + "step": 1144 + }, + { + "epoch": 0.11, + "grad_norm": 1.037097692489624, + "learning_rate": 4.443474888501067e-05, + "loss": 5.7081, + "step": 1148 + }, + { + "epoch": 0.11, + "grad_norm": 0.8815637230873108, + "learning_rate": 4.441535776614311e-05, + "loss": 5.6222, + "step": 1152 + }, + { + "epoch": 0.11, + "grad_norm": 0.9820300936698914, + "learning_rate": 4.439596664727555e-05, + "loss": 5.639, + "step": 1156 + }, + { + "epoch": 0.11, + "grad_norm": 1.1689587831497192, + "learning_rate": 4.4376575528407995e-05, + "loss": 5.6293, + "step": 1160 + }, + { + "epoch": 0.11, + "grad_norm": 0.9346803426742554, + "learning_rate": 4.435718440954043e-05, + "loss": 5.5408, + "step": 1164 + }, + { + "epoch": 0.11, + "grad_norm": 0.8906845450401306, + "learning_rate": 4.4337793290672876e-05, + "loss": 5.5729, + "step": 1168 + }, + { + "epoch": 0.11, + "grad_norm": 1.0249162912368774, + "learning_rate": 4.431840217180532e-05, + "loss": 5.6928, + "step": 1172 + }, + { + "epoch": 0.11, + "grad_norm": 1.0330933332443237, + "learning_rate": 4.429901105293776e-05, + "loss": 5.6583, + "step": 1176 + }, + { + "epoch": 0.11, + "grad_norm": 1.2164433002471924, + "learning_rate": 4.42796199340702e-05, + "loss": 5.6906, + "step": 1180 + }, + { + "epoch": 0.11, + "grad_norm": 1.061458706855774, + "learning_rate": 4.426022881520264e-05, + "loss": 5.6325, + "step": 1184 + }, + { + "epoch": 0.12, + "grad_norm": 1.0236989259719849, + "learning_rate": 4.424083769633508e-05, + "loss": 5.6806, + "step": 1188 + }, + { + "epoch": 0.12, + "grad_norm": 1.125806212425232, + "learning_rate": 4.4221446577467526e-05, + "loss": 5.6773, + "step": 1192 + }, + { + "epoch": 0.12, + "grad_norm": 1.0632579326629639, + "learning_rate": 4.420205545859996e-05, + "loss": 5.7194, + "step": 1196 + }, + { + "epoch": 0.12, + "grad_norm": 1.0620874166488647, + "learning_rate": 4.418266433973241e-05, + "loss": 5.5184, + "step": 1200 + }, + { + "epoch": 0.12, + "grad_norm": 1.0390667915344238, + "learning_rate": 4.416327322086485e-05, + "loss": 5.6573, + "step": 1204 + }, + { + "epoch": 0.12, + "grad_norm": 0.9894878268241882, + "learning_rate": 4.414388210199729e-05, + "loss": 5.765, + "step": 1208 + }, + { + "epoch": 0.12, + "grad_norm": 0.9994860887527466, + "learning_rate": 4.412449098312973e-05, + "loss": 5.661, + "step": 1212 + }, + { + "epoch": 0.12, + "grad_norm": 0.9990851283073425, + "learning_rate": 4.410509986426217e-05, + "loss": 5.6168, + "step": 1216 + }, + { + "epoch": 0.12, + "grad_norm": 1.0815703868865967, + "learning_rate": 4.408570874539461e-05, + "loss": 5.6433, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 0.9334216117858887, + "learning_rate": 4.406631762652705e-05, + "loss": 5.5035, + "step": 1224 + }, + { + "epoch": 0.12, + "grad_norm": 0.9055896401405334, + "learning_rate": 4.404692650765949e-05, + "loss": 5.632, + "step": 1228 + }, + { + "epoch": 0.12, + "grad_norm": 1.0294831991195679, + "learning_rate": 4.402753538879194e-05, + "loss": 5.5739, + "step": 1232 + }, + { + "epoch": 0.12, + "grad_norm": 1.100758671760559, + "learning_rate": 4.400814426992438e-05, + "loss": 5.6159, + "step": 1236 + }, + { + "epoch": 0.12, + "grad_norm": 0.9790968894958496, + "learning_rate": 4.398875315105682e-05, + "loss": 5.6637, + "step": 1240 + }, + { + "epoch": 0.12, + "grad_norm": 0.9665990471839905, + "learning_rate": 4.396936203218926e-05, + "loss": 5.6361, + "step": 1244 + }, + { + "epoch": 0.12, + "grad_norm": 1.0019501447677612, + "learning_rate": 4.39499709133217e-05, + "loss": 5.5965, + "step": 1248 + }, + { + "epoch": 0.12, + "grad_norm": 0.9785341620445251, + "learning_rate": 4.393057979445414e-05, + "loss": 5.6697, + "step": 1252 + }, + { + "epoch": 0.12, + "grad_norm": 1.0229973793029785, + "learning_rate": 4.391118867558658e-05, + "loss": 5.6391, + "step": 1256 + }, + { + "epoch": 0.12, + "grad_norm": 0.9716441035270691, + "learning_rate": 4.389179755671902e-05, + "loss": 5.6513, + "step": 1260 + }, + { + "epoch": 0.12, + "grad_norm": 0.8981690406799316, + "learning_rate": 4.387240643785146e-05, + "loss": 5.6567, + "step": 1264 + }, + { + "epoch": 0.12, + "grad_norm": 0.9904807806015015, + "learning_rate": 4.385301531898391e-05, + "loss": 5.6761, + "step": 1268 + }, + { + "epoch": 0.12, + "grad_norm": 0.9603332877159119, + "learning_rate": 4.3833624200116344e-05, + "loss": 5.7039, + "step": 1272 + }, + { + "epoch": 0.12, + "grad_norm": 1.044404149055481, + "learning_rate": 4.381423308124879e-05, + "loss": 5.5329, + "step": 1276 + }, + { + "epoch": 0.12, + "grad_norm": 1.1209489107131958, + "learning_rate": 4.379484196238123e-05, + "loss": 5.6461, + "step": 1280 + }, + { + "epoch": 0.12, + "grad_norm": 1.1430855989456177, + "learning_rate": 4.377545084351367e-05, + "loss": 5.5406, + "step": 1284 + }, + { + "epoch": 0.12, + "grad_norm": 1.004539966583252, + "learning_rate": 4.375605972464611e-05, + "loss": 5.5827, + "step": 1288 + }, + { + "epoch": 0.13, + "grad_norm": 1.0721603631973267, + "learning_rate": 4.373666860577856e-05, + "loss": 5.5553, + "step": 1292 + }, + { + "epoch": 0.13, + "grad_norm": 0.9446938037872314, + "learning_rate": 4.3717277486910994e-05, + "loss": 5.6481, + "step": 1296 + }, + { + "epoch": 0.13, + "grad_norm": 0.8731529116630554, + "learning_rate": 4.369788636804344e-05, + "loss": 5.6374, + "step": 1300 + }, + { + "epoch": 0.13, + "grad_norm": 1.0640937089920044, + "learning_rate": 4.3678495249175875e-05, + "loss": 5.5943, + "step": 1304 + }, + { + "epoch": 0.13, + "grad_norm": 1.0243881940841675, + "learning_rate": 4.365910413030832e-05, + "loss": 5.5065, + "step": 1308 + }, + { + "epoch": 0.13, + "grad_norm": 1.1701568365097046, + "learning_rate": 4.363971301144076e-05, + "loss": 5.6329, + "step": 1312 + }, + { + "epoch": 0.13, + "grad_norm": 0.9212052822113037, + "learning_rate": 4.36203218925732e-05, + "loss": 5.7735, + "step": 1316 + }, + { + "epoch": 0.13, + "grad_norm": 0.9392960667610168, + "learning_rate": 4.3600930773705644e-05, + "loss": 5.5992, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 1.0432089567184448, + "learning_rate": 4.358153965483809e-05, + "loss": 5.5948, + "step": 1324 + }, + { + "epoch": 0.13, + "grad_norm": 0.9817702770233154, + "learning_rate": 4.3562148535970525e-05, + "loss": 5.6496, + "step": 1328 + }, + { + "epoch": 0.13, + "grad_norm": 1.0641746520996094, + "learning_rate": 4.354275741710297e-05, + "loss": 5.6238, + "step": 1332 + }, + { + "epoch": 0.13, + "grad_norm": 1.0068002939224243, + "learning_rate": 4.352336629823541e-05, + "loss": 5.6316, + "step": 1336 + }, + { + "epoch": 0.13, + "grad_norm": 1.031501054763794, + "learning_rate": 4.350397517936785e-05, + "loss": 5.6462, + "step": 1340 + }, + { + "epoch": 0.13, + "grad_norm": 0.980787992477417, + "learning_rate": 4.3484584060500294e-05, + "loss": 5.6535, + "step": 1344 + }, + { + "epoch": 0.13, + "grad_norm": 0.9462695717811584, + "learning_rate": 4.3465192941632734e-05, + "loss": 5.6083, + "step": 1348 + }, + { + "epoch": 0.13, + "grad_norm": 0.8907430768013, + "learning_rate": 4.3445801822765175e-05, + "loss": 5.615, + "step": 1352 + }, + { + "epoch": 0.13, + "grad_norm": 1.2333685159683228, + "learning_rate": 4.3426410703897615e-05, + "loss": 5.6374, + "step": 1356 + }, + { + "epoch": 0.13, + "grad_norm": 1.165974497795105, + "learning_rate": 4.3407019585030056e-05, + "loss": 5.5617, + "step": 1360 + }, + { + "epoch": 0.13, + "grad_norm": 1.1213206052780151, + "learning_rate": 4.33876284661625e-05, + "loss": 5.6201, + "step": 1364 + }, + { + "epoch": 0.13, + "grad_norm": 1.0781583786010742, + "learning_rate": 4.3368237347294944e-05, + "loss": 5.639, + "step": 1368 + }, + { + "epoch": 0.13, + "grad_norm": 1.0337659120559692, + "learning_rate": 4.3348846228427384e-05, + "loss": 5.736, + "step": 1372 + }, + { + "epoch": 0.13, + "grad_norm": 0.9833229184150696, + "learning_rate": 4.3329455109559825e-05, + "loss": 5.6153, + "step": 1376 + }, + { + "epoch": 0.13, + "grad_norm": 1.0340036153793335, + "learning_rate": 4.3310063990692265e-05, + "loss": 5.6146, + "step": 1380 + }, + { + "epoch": 0.13, + "grad_norm": 0.9522514939308167, + "learning_rate": 4.3290672871824706e-05, + "loss": 5.646, + "step": 1384 + }, + { + "epoch": 0.13, + "grad_norm": 1.040779948234558, + "learning_rate": 4.3271281752957146e-05, + "loss": 5.6411, + "step": 1388 + }, + { + "epoch": 0.13, + "grad_norm": 0.9653657674789429, + "learning_rate": 4.325189063408959e-05, + "loss": 5.634, + "step": 1392 + }, + { + "epoch": 0.14, + "grad_norm": 0.9610472321510315, + "learning_rate": 4.323249951522203e-05, + "loss": 5.5112, + "step": 1396 + }, + { + "epoch": 0.14, + "grad_norm": 1.0222139358520508, + "learning_rate": 4.3213108396354475e-05, + "loss": 5.4966, + "step": 1400 + }, + { + "epoch": 0.14, + "grad_norm": 1.0731512308120728, + "learning_rate": 4.319371727748691e-05, + "loss": 5.5193, + "step": 1404 + }, + { + "epoch": 0.14, + "grad_norm": 1.075119972229004, + "learning_rate": 4.3174326158619356e-05, + "loss": 5.573, + "step": 1408 + }, + { + "epoch": 0.14, + "grad_norm": 0.9324813485145569, + "learning_rate": 4.3154935039751796e-05, + "loss": 5.5821, + "step": 1412 + }, + { + "epoch": 0.14, + "grad_norm": 0.8965750336647034, + "learning_rate": 4.313554392088424e-05, + "loss": 5.4723, + "step": 1416 + }, + { + "epoch": 0.14, + "grad_norm": 0.902160108089447, + "learning_rate": 4.311615280201668e-05, + "loss": 5.5979, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 0.9165207743644714, + "learning_rate": 4.309676168314912e-05, + "loss": 5.5698, + "step": 1424 + }, + { + "epoch": 0.14, + "grad_norm": 1.0775911808013916, + "learning_rate": 4.307737056428156e-05, + "loss": 5.4091, + "step": 1428 + }, + { + "epoch": 0.14, + "grad_norm": 0.9935070276260376, + "learning_rate": 4.3057979445414006e-05, + "loss": 5.6084, + "step": 1432 + }, + { + "epoch": 0.14, + "grad_norm": 1.105722188949585, + "learning_rate": 4.303858832654644e-05, + "loss": 5.6961, + "step": 1436 + }, + { + "epoch": 0.14, + "grad_norm": 1.0309725999832153, + "learning_rate": 4.301919720767889e-05, + "loss": 5.6465, + "step": 1440 + }, + { + "epoch": 0.14, + "grad_norm": 0.9571665525436401, + "learning_rate": 4.299980608881133e-05, + "loss": 5.5844, + "step": 1444 + }, + { + "epoch": 0.14, + "grad_norm": 1.0037786960601807, + "learning_rate": 4.298041496994377e-05, + "loss": 5.7001, + "step": 1448 + }, + { + "epoch": 0.14, + "grad_norm": 0.9573729634284973, + "learning_rate": 4.296102385107621e-05, + "loss": 5.5545, + "step": 1452 + }, + { + "epoch": 0.14, + "grad_norm": 1.0113383531570435, + "learning_rate": 4.2941632732208656e-05, + "loss": 5.6047, + "step": 1456 + }, + { + "epoch": 0.14, + "grad_norm": 0.9878535866737366, + "learning_rate": 4.292224161334109e-05, + "loss": 5.5582, + "step": 1460 + }, + { + "epoch": 0.14, + "grad_norm": 0.9363996386528015, + "learning_rate": 4.290285049447354e-05, + "loss": 5.5544, + "step": 1464 + }, + { + "epoch": 0.14, + "grad_norm": 0.8429636359214783, + "learning_rate": 4.288345937560597e-05, + "loss": 5.5561, + "step": 1468 + }, + { + "epoch": 0.14, + "grad_norm": 1.0051556825637817, + "learning_rate": 4.286406825673842e-05, + "loss": 5.5236, + "step": 1472 + }, + { + "epoch": 0.14, + "grad_norm": 0.9440407156944275, + "learning_rate": 4.284467713787086e-05, + "loss": 5.6561, + "step": 1476 + }, + { + "epoch": 0.14, + "grad_norm": 0.9376720786094666, + "learning_rate": 4.28252860190033e-05, + "loss": 5.6352, + "step": 1480 + }, + { + "epoch": 0.14, + "grad_norm": 1.0096479654312134, + "learning_rate": 4.280589490013574e-05, + "loss": 5.5187, + "step": 1484 + }, + { + "epoch": 0.14, + "grad_norm": 1.003917932510376, + "learning_rate": 4.278650378126818e-05, + "loss": 5.5664, + "step": 1488 + }, + { + "epoch": 0.14, + "grad_norm": 0.9462191462516785, + "learning_rate": 4.276711266240062e-05, + "loss": 5.553, + "step": 1492 + }, + { + "epoch": 0.15, + "grad_norm": 0.9691956043243408, + "learning_rate": 4.274772154353306e-05, + "loss": 5.6481, + "step": 1496 + }, + { + "epoch": 0.15, + "grad_norm": 0.8949794769287109, + "learning_rate": 4.27283304246655e-05, + "loss": 5.5607, + "step": 1500 + }, + { + "epoch": 0.15, + "grad_norm": 0.9408190250396729, + "learning_rate": 4.270893930579795e-05, + "loss": 5.5379, + "step": 1504 + }, + { + "epoch": 0.15, + "grad_norm": 0.9462267756462097, + "learning_rate": 4.268954818693039e-05, + "loss": 5.628, + "step": 1508 + }, + { + "epoch": 0.15, + "grad_norm": 0.9751488566398621, + "learning_rate": 4.267015706806283e-05, + "loss": 5.5709, + "step": 1512 + }, + { + "epoch": 0.15, + "grad_norm": 1.0487922430038452, + "learning_rate": 4.265076594919527e-05, + "loss": 5.5999, + "step": 1516 + }, + { + "epoch": 0.15, + "grad_norm": 1.0256918668746948, + "learning_rate": 4.263137483032771e-05, + "loss": 5.5375, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 0.9403053522109985, + "learning_rate": 4.261198371146015e-05, + "loss": 5.539, + "step": 1524 + }, + { + "epoch": 0.15, + "grad_norm": 1.2121844291687012, + "learning_rate": 4.259259259259259e-05, + "loss": 5.5828, + "step": 1528 + }, + { + "epoch": 0.15, + "grad_norm": 1.0676014423370361, + "learning_rate": 4.257320147372504e-05, + "loss": 5.6215, + "step": 1532 + }, + { + "epoch": 0.15, + "grad_norm": 0.9083504676818848, + "learning_rate": 4.2553810354857473e-05, + "loss": 5.7206, + "step": 1536 + }, + { + "epoch": 0.15, + "grad_norm": 0.8858234882354736, + "learning_rate": 4.253441923598992e-05, + "loss": 5.5742, + "step": 1540 + }, + { + "epoch": 0.15, + "grad_norm": 0.9309298396110535, + "learning_rate": 4.2515028117122355e-05, + "loss": 5.5078, + "step": 1544 + }, + { + "epoch": 0.15, + "grad_norm": 0.9594176411628723, + "learning_rate": 4.24956369982548e-05, + "loss": 5.5863, + "step": 1548 + }, + { + "epoch": 0.15, + "grad_norm": 0.8728674650192261, + "learning_rate": 4.247624587938724e-05, + "loss": 5.5323, + "step": 1552 + }, + { + "epoch": 0.15, + "grad_norm": 0.9947747588157654, + "learning_rate": 4.245685476051968e-05, + "loss": 5.5641, + "step": 1556 + }, + { + "epoch": 0.15, + "grad_norm": 0.9186658263206482, + "learning_rate": 4.2437463641652123e-05, + "loss": 5.6258, + "step": 1560 + }, + { + "epoch": 0.15, + "grad_norm": 0.9086698889732361, + "learning_rate": 4.241807252278457e-05, + "loss": 5.6391, + "step": 1564 + }, + { + "epoch": 0.15, + "grad_norm": 0.954474925994873, + "learning_rate": 4.2398681403917005e-05, + "loss": 5.6077, + "step": 1568 + }, + { + "epoch": 0.15, + "grad_norm": 0.9422994256019592, + "learning_rate": 4.237929028504945e-05, + "loss": 5.4322, + "step": 1572 + }, + { + "epoch": 0.15, + "grad_norm": 0.9581114649772644, + "learning_rate": 4.235989916618189e-05, + "loss": 5.5757, + "step": 1576 + }, + { + "epoch": 0.15, + "grad_norm": 0.9948291778564453, + "learning_rate": 4.234050804731433e-05, + "loss": 5.5409, + "step": 1580 + }, + { + "epoch": 0.15, + "grad_norm": 1.0310508012771606, + "learning_rate": 4.2321116928446773e-05, + "loss": 5.5628, + "step": 1584 + }, + { + "epoch": 0.15, + "grad_norm": 1.053040862083435, + "learning_rate": 4.2301725809579214e-05, + "loss": 5.6698, + "step": 1588 + }, + { + "epoch": 0.15, + "grad_norm": 1.093757152557373, + "learning_rate": 4.2282334690711655e-05, + "loss": 5.5696, + "step": 1592 + }, + { + "epoch": 0.15, + "grad_norm": 1.0216928720474243, + "learning_rate": 4.22629435718441e-05, + "loss": 5.6469, + "step": 1596 + }, + { + "epoch": 0.16, + "grad_norm": 1.0343660116195679, + "learning_rate": 4.2243552452976536e-05, + "loss": 5.6364, + "step": 1600 + }, + { + "epoch": 0.16, + "grad_norm": 0.9620201587677002, + "learning_rate": 4.222416133410898e-05, + "loss": 5.6311, + "step": 1604 + }, + { + "epoch": 0.16, + "grad_norm": 1.032458782196045, + "learning_rate": 4.2204770215241423e-05, + "loss": 5.523, + "step": 1608 + }, + { + "epoch": 0.16, + "grad_norm": 0.9948311448097229, + "learning_rate": 4.2185379096373864e-05, + "loss": 5.5698, + "step": 1612 + }, + { + "epoch": 0.16, + "grad_norm": 1.0208816528320312, + "learning_rate": 4.2165987977506305e-05, + "loss": 5.6042, + "step": 1616 + }, + { + "epoch": 0.16, + "grad_norm": 1.1074092388153076, + "learning_rate": 4.2146596858638745e-05, + "loss": 5.542, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 0.9410656690597534, + "learning_rate": 4.2127205739771186e-05, + "loss": 5.5188, + "step": 1624 + }, + { + "epoch": 0.16, + "grad_norm": 1.0261708498001099, + "learning_rate": 4.2107814620903626e-05, + "loss": 5.5377, + "step": 1628 + }, + { + "epoch": 0.16, + "grad_norm": 0.9042761325836182, + "learning_rate": 4.208842350203607e-05, + "loss": 5.5023, + "step": 1632 + }, + { + "epoch": 0.16, + "grad_norm": 1.0243982076644897, + "learning_rate": 4.2069032383168514e-05, + "loss": 5.5674, + "step": 1636 + }, + { + "epoch": 0.16, + "grad_norm": 1.0028672218322754, + "learning_rate": 4.2049641264300955e-05, + "loss": 5.5574, + "step": 1640 + }, + { + "epoch": 0.16, + "grad_norm": 0.9813360571861267, + "learning_rate": 4.2030250145433395e-05, + "loss": 5.5664, + "step": 1644 + }, + { + "epoch": 0.16, + "grad_norm": 0.9174672365188599, + "learning_rate": 4.2010859026565836e-05, + "loss": 5.5837, + "step": 1648 + }, + { + "epoch": 0.16, + "grad_norm": 1.1121405363082886, + "learning_rate": 4.1991467907698276e-05, + "loss": 5.5619, + "step": 1652 + }, + { + "epoch": 0.16, + "grad_norm": 1.0016354322433472, + "learning_rate": 4.197207678883072e-05, + "loss": 5.5928, + "step": 1656 + }, + { + "epoch": 0.16, + "grad_norm": 0.9536789655685425, + "learning_rate": 4.195268566996316e-05, + "loss": 5.5211, + "step": 1660 + }, + { + "epoch": 0.16, + "grad_norm": 1.0609054565429688, + "learning_rate": 4.19332945510956e-05, + "loss": 5.5809, + "step": 1664 + }, + { + "epoch": 0.16, + "grad_norm": 0.9688783288002014, + "learning_rate": 4.191390343222804e-05, + "loss": 5.576, + "step": 1668 + }, + { + "epoch": 0.16, + "grad_norm": 1.0187081098556519, + "learning_rate": 4.1894512313360486e-05, + "loss": 5.569, + "step": 1672 + }, + { + "epoch": 0.16, + "grad_norm": 0.9904341101646423, + "learning_rate": 4.187512119449292e-05, + "loss": 5.5725, + "step": 1676 + }, + { + "epoch": 0.16, + "grad_norm": 1.1697142124176025, + "learning_rate": 4.185573007562537e-05, + "loss": 5.566, + "step": 1680 + }, + { + "epoch": 0.16, + "grad_norm": 0.9842191934585571, + "learning_rate": 4.183633895675781e-05, + "loss": 5.5687, + "step": 1684 + }, + { + "epoch": 0.16, + "grad_norm": 1.0618923902511597, + "learning_rate": 4.181694783789025e-05, + "loss": 5.5165, + "step": 1688 + }, + { + "epoch": 0.16, + "grad_norm": 0.9923197031021118, + "learning_rate": 4.179755671902269e-05, + "loss": 5.5625, + "step": 1692 + }, + { + "epoch": 0.16, + "grad_norm": 1.0109176635742188, + "learning_rate": 4.1778165600155136e-05, + "loss": 5.601, + "step": 1696 + }, + { + "epoch": 0.16, + "grad_norm": 0.8870441913604736, + "learning_rate": 4.175877448128757e-05, + "loss": 5.4932, + "step": 1700 + }, + { + "epoch": 0.17, + "grad_norm": 1.004470705986023, + "learning_rate": 4.173938336242002e-05, + "loss": 5.5546, + "step": 1704 + }, + { + "epoch": 0.17, + "grad_norm": 0.9421555399894714, + "learning_rate": 4.171999224355245e-05, + "loss": 5.4712, + "step": 1708 + }, + { + "epoch": 0.17, + "grad_norm": 0.9490504264831543, + "learning_rate": 4.17006011246849e-05, + "loss": 5.493, + "step": 1712 + }, + { + "epoch": 0.17, + "grad_norm": 0.9385312795639038, + "learning_rate": 4.168121000581734e-05, + "loss": 5.5997, + "step": 1716 + }, + { + "epoch": 0.17, + "grad_norm": 0.9013298153877258, + "learning_rate": 4.166181888694978e-05, + "loss": 5.4883, + "step": 1720 + }, + { + "epoch": 0.17, + "grad_norm": 0.9516580104827881, + "learning_rate": 4.164242776808222e-05, + "loss": 5.5971, + "step": 1724 + }, + { + "epoch": 0.17, + "grad_norm": 1.033234715461731, + "learning_rate": 4.162303664921467e-05, + "loss": 5.5807, + "step": 1728 + }, + { + "epoch": 0.17, + "grad_norm": 1.042651653289795, + "learning_rate": 4.16036455303471e-05, + "loss": 5.5076, + "step": 1732 + }, + { + "epoch": 0.17, + "grad_norm": 0.9568614959716797, + "learning_rate": 4.158425441147955e-05, + "loss": 5.5909, + "step": 1736 + }, + { + "epoch": 0.17, + "grad_norm": 0.9669156074523926, + "learning_rate": 4.156486329261198e-05, + "loss": 5.5952, + "step": 1740 + }, + { + "epoch": 0.17, + "grad_norm": 0.9101834893226624, + "learning_rate": 4.154547217374443e-05, + "loss": 5.5554, + "step": 1744 + }, + { + "epoch": 0.17, + "grad_norm": 0.9412431716918945, + "learning_rate": 4.152608105487687e-05, + "loss": 5.5461, + "step": 1748 + }, + { + "epoch": 0.17, + "grad_norm": 1.0211936235427856, + "learning_rate": 4.150668993600931e-05, + "loss": 5.6353, + "step": 1752 + }, + { + "epoch": 0.17, + "grad_norm": 1.0581713914871216, + "learning_rate": 4.148729881714175e-05, + "loss": 5.5807, + "step": 1756 + }, + { + "epoch": 0.17, + "grad_norm": 0.9403213858604431, + "learning_rate": 4.146790769827419e-05, + "loss": 5.5369, + "step": 1760 + }, + { + "epoch": 0.17, + "grad_norm": 0.9232079386711121, + "learning_rate": 4.144851657940663e-05, + "loss": 5.5559, + "step": 1764 + }, + { + "epoch": 0.17, + "grad_norm": 0.9828022122383118, + "learning_rate": 4.142912546053908e-05, + "loss": 5.4692, + "step": 1768 + }, + { + "epoch": 0.17, + "grad_norm": 0.9320650696754456, + "learning_rate": 4.140973434167152e-05, + "loss": 5.5443, + "step": 1772 + }, + { + "epoch": 0.17, + "grad_norm": 0.9043223261833191, + "learning_rate": 4.139034322280396e-05, + "loss": 5.4807, + "step": 1776 + }, + { + "epoch": 0.17, + "grad_norm": 0.9783521294593811, + "learning_rate": 4.13709521039364e-05, + "loss": 5.575, + "step": 1780 + }, + { + "epoch": 0.17, + "grad_norm": 1.0388356447219849, + "learning_rate": 4.135156098506884e-05, + "loss": 5.5553, + "step": 1784 + }, + { + "epoch": 0.17, + "grad_norm": 0.9247937202453613, + "learning_rate": 4.133216986620128e-05, + "loss": 5.5511, + "step": 1788 + }, + { + "epoch": 0.17, + "grad_norm": 1.0111491680145264, + "learning_rate": 4.131277874733372e-05, + "loss": 5.6378, + "step": 1792 + }, + { + "epoch": 0.17, + "grad_norm": 0.9572594165802002, + "learning_rate": 4.129338762846616e-05, + "loss": 5.4827, + "step": 1796 + }, + { + "epoch": 0.17, + "grad_norm": 0.9328434467315674, + "learning_rate": 4.12739965095986e-05, + "loss": 5.5489, + "step": 1800 + }, + { + "epoch": 0.17, + "grad_norm": 1.000696063041687, + "learning_rate": 4.125460539073105e-05, + "loss": 5.6207, + "step": 1804 + }, + { + "epoch": 0.18, + "grad_norm": 0.891006350517273, + "learning_rate": 4.1235214271863484e-05, + "loss": 5.525, + "step": 1808 + }, + { + "epoch": 0.18, + "grad_norm": 1.0198568105697632, + "learning_rate": 4.121582315299593e-05, + "loss": 5.5186, + "step": 1812 + }, + { + "epoch": 0.18, + "grad_norm": 1.0678110122680664, + "learning_rate": 4.119643203412837e-05, + "loss": 5.565, + "step": 1816 + }, + { + "epoch": 0.18, + "grad_norm": 0.9433650970458984, + "learning_rate": 4.117704091526081e-05, + "loss": 5.5038, + "step": 1820 + }, + { + "epoch": 0.18, + "grad_norm": 0.982768714427948, + "learning_rate": 4.115764979639325e-05, + "loss": 5.4981, + "step": 1824 + }, + { + "epoch": 0.18, + "grad_norm": 0.9159711003303528, + "learning_rate": 4.1138258677525694e-05, + "loss": 5.4945, + "step": 1828 + }, + { + "epoch": 0.18, + "grad_norm": 1.0521996021270752, + "learning_rate": 4.1118867558658134e-05, + "loss": 5.5498, + "step": 1832 + }, + { + "epoch": 0.18, + "grad_norm": 0.9633331894874573, + "learning_rate": 4.109947643979058e-05, + "loss": 5.4915, + "step": 1836 + }, + { + "epoch": 0.18, + "grad_norm": 0.9132066965103149, + "learning_rate": 4.1080085320923015e-05, + "loss": 5.5192, + "step": 1840 + }, + { + "epoch": 0.18, + "grad_norm": 0.947010338306427, + "learning_rate": 4.106069420205546e-05, + "loss": 5.5928, + "step": 1844 + }, + { + "epoch": 0.18, + "grad_norm": 1.1576569080352783, + "learning_rate": 4.10413030831879e-05, + "loss": 5.6224, + "step": 1848 + }, + { + "epoch": 0.18, + "grad_norm": 0.913221538066864, + "learning_rate": 4.1021911964320344e-05, + "loss": 5.6261, + "step": 1852 + }, + { + "epoch": 0.18, + "grad_norm": 0.9803014397621155, + "learning_rate": 4.1002520845452784e-05, + "loss": 5.4529, + "step": 1856 + }, + { + "epoch": 0.18, + "grad_norm": 1.0535297393798828, + "learning_rate": 4.0983129726585225e-05, + "loss": 5.5471, + "step": 1860 + }, + { + "epoch": 0.18, + "grad_norm": 0.9755203723907471, + "learning_rate": 4.0963738607717665e-05, + "loss": 5.6235, + "step": 1864 + }, + { + "epoch": 0.18, + "grad_norm": 0.8834216594696045, + "learning_rate": 4.094434748885011e-05, + "loss": 5.5566, + "step": 1868 + }, + { + "epoch": 0.18, + "grad_norm": 0.9767342209815979, + "learning_rate": 4.0924956369982546e-05, + "loss": 5.6874, + "step": 1872 + }, + { + "epoch": 0.18, + "grad_norm": 1.018384575843811, + "learning_rate": 4.0905565251114994e-05, + "loss": 5.5437, + "step": 1876 + }, + { + "epoch": 0.18, + "grad_norm": 1.0440962314605713, + "learning_rate": 4.0886174132247434e-05, + "loss": 5.594, + "step": 1880 + }, + { + "epoch": 0.18, + "grad_norm": 1.0711756944656372, + "learning_rate": 4.0866783013379875e-05, + "loss": 5.55, + "step": 1884 + }, + { + "epoch": 0.18, + "grad_norm": 0.9786011576652527, + "learning_rate": 4.0847391894512315e-05, + "loss": 5.5315, + "step": 1888 + }, + { + "epoch": 0.18, + "grad_norm": 1.1673699617385864, + "learning_rate": 4.0828000775644756e-05, + "loss": 5.596, + "step": 1892 + }, + { + "epoch": 0.18, + "grad_norm": 1.0179039239883423, + "learning_rate": 4.0808609656777196e-05, + "loss": 5.5635, + "step": 1896 + }, + { + "epoch": 0.18, + "grad_norm": 1.0204113721847534, + "learning_rate": 4.0789218537909644e-05, + "loss": 5.5419, + "step": 1900 + }, + { + "epoch": 0.18, + "grad_norm": 0.919965386390686, + "learning_rate": 4.076982741904208e-05, + "loss": 5.5283, + "step": 1904 + }, + { + "epoch": 0.18, + "grad_norm": 0.911108136177063, + "learning_rate": 4.0750436300174525e-05, + "loss": 5.4592, + "step": 1908 + }, + { + "epoch": 0.19, + "grad_norm": 0.9294359087944031, + "learning_rate": 4.0731045181306965e-05, + "loss": 5.5676, + "step": 1912 + }, + { + "epoch": 0.19, + "grad_norm": 0.928774893283844, + "learning_rate": 4.0711654062439406e-05, + "loss": 5.5844, + "step": 1916 + }, + { + "epoch": 0.19, + "grad_norm": 1.0778926610946655, + "learning_rate": 4.0692262943571846e-05, + "loss": 5.5877, + "step": 1920 + }, + { + "epoch": 0.19, + "grad_norm": 1.0779755115509033, + "learning_rate": 4.067287182470429e-05, + "loss": 5.5362, + "step": 1924 + }, + { + "epoch": 0.19, + "grad_norm": 0.9737743139266968, + "learning_rate": 4.065348070583673e-05, + "loss": 5.584, + "step": 1928 + }, + { + "epoch": 0.19, + "grad_norm": 0.9039328098297119, + "learning_rate": 4.063408958696917e-05, + "loss": 5.441, + "step": 1932 + }, + { + "epoch": 0.19, + "grad_norm": 1.0125256776809692, + "learning_rate": 4.0614698468101615e-05, + "loss": 5.5987, + "step": 1936 + }, + { + "epoch": 0.19, + "grad_norm": 0.9862051010131836, + "learning_rate": 4.059530734923405e-05, + "loss": 5.5512, + "step": 1940 + }, + { + "epoch": 0.19, + "grad_norm": 0.8931455016136169, + "learning_rate": 4.0575916230366496e-05, + "loss": 5.5184, + "step": 1944 + }, + { + "epoch": 0.19, + "grad_norm": 0.9782811403274536, + "learning_rate": 4.055652511149893e-05, + "loss": 5.6231, + "step": 1948 + }, + { + "epoch": 0.19, + "grad_norm": 0.9657939076423645, + "learning_rate": 4.053713399263138e-05, + "loss": 5.5536, + "step": 1952 + }, + { + "epoch": 0.19, + "grad_norm": 0.9090112447738647, + "learning_rate": 4.051774287376382e-05, + "loss": 5.51, + "step": 1956 + }, + { + "epoch": 0.19, + "grad_norm": 0.9079639911651611, + "learning_rate": 4.049835175489626e-05, + "loss": 5.4939, + "step": 1960 + }, + { + "epoch": 0.19, + "grad_norm": 1.0238642692565918, + "learning_rate": 4.04789606360287e-05, + "loss": 5.626, + "step": 1964 + }, + { + "epoch": 0.19, + "grad_norm": 0.9550356268882751, + "learning_rate": 4.0459569517161146e-05, + "loss": 5.5831, + "step": 1968 + }, + { + "epoch": 0.19, + "grad_norm": 1.0468617677688599, + "learning_rate": 4.044017839829358e-05, + "loss": 5.5789, + "step": 1972 + }, + { + "epoch": 0.19, + "grad_norm": 0.9686053395271301, + "learning_rate": 4.042078727942603e-05, + "loss": 5.5384, + "step": 1976 + }, + { + "epoch": 0.19, + "grad_norm": 1.0532784461975098, + "learning_rate": 4.040139616055846e-05, + "loss": 5.617, + "step": 1980 + }, + { + "epoch": 0.19, + "grad_norm": 0.9046121835708618, + "learning_rate": 4.038200504169091e-05, + "loss": 5.6226, + "step": 1984 + }, + { + "epoch": 0.19, + "grad_norm": 0.9807924628257751, + "learning_rate": 4.036261392282335e-05, + "loss": 5.5259, + "step": 1988 + }, + { + "epoch": 0.19, + "grad_norm": 0.957099199295044, + "learning_rate": 4.034322280395579e-05, + "loss": 5.568, + "step": 1992 + }, + { + "epoch": 0.19, + "grad_norm": 0.9218006134033203, + "learning_rate": 4.032383168508823e-05, + "loss": 5.4665, + "step": 1996 + }, + { + "epoch": 0.19, + "grad_norm": 1.190796971321106, + "learning_rate": 4.030444056622068e-05, + "loss": 5.4891, + "step": 2000 + }, + { + "epoch": 0.19, + "grad_norm": 0.9437822699546814, + "learning_rate": 4.028504944735311e-05, + "loss": 5.4209, + "step": 2004 + }, + { + "epoch": 0.19, + "grad_norm": 0.8980192542076111, + "learning_rate": 4.026565832848556e-05, + "loss": 5.5624, + "step": 2008 + }, + { + "epoch": 0.2, + "grad_norm": 1.1388368606567383, + "learning_rate": 4.0246267209618e-05, + "loss": 5.5448, + "step": 2012 + }, + { + "epoch": 0.2, + "grad_norm": 0.9411901235580444, + "learning_rate": 4.022687609075044e-05, + "loss": 5.552, + "step": 2016 + }, + { + "epoch": 0.2, + "grad_norm": 0.9226595163345337, + "learning_rate": 4.020748497188288e-05, + "loss": 5.4518, + "step": 2020 + }, + { + "epoch": 0.2, + "grad_norm": 1.0351731777191162, + "learning_rate": 4.018809385301532e-05, + "loss": 5.5449, + "step": 2024 + }, + { + "epoch": 0.2, + "grad_norm": 0.89235919713974, + "learning_rate": 4.016870273414776e-05, + "loss": 5.563, + "step": 2028 + }, + { + "epoch": 0.2, + "grad_norm": 0.9486913084983826, + "learning_rate": 4.014931161528021e-05, + "loss": 5.4925, + "step": 2032 + }, + { + "epoch": 0.2, + "grad_norm": 0.9577587842941284, + "learning_rate": 4.012992049641264e-05, + "loss": 5.4697, + "step": 2036 + }, + { + "epoch": 0.2, + "grad_norm": 0.9649032950401306, + "learning_rate": 4.011052937754509e-05, + "loss": 5.4694, + "step": 2040 + }, + { + "epoch": 0.2, + "grad_norm": 0.8921785950660706, + "learning_rate": 4.009113825867753e-05, + "loss": 5.6328, + "step": 2044 + }, + { + "epoch": 0.2, + "grad_norm": 0.9974868893623352, + "learning_rate": 4.007174713980997e-05, + "loss": 5.572, + "step": 2048 + }, + { + "epoch": 0.2, + "grad_norm": 0.9336390495300293, + "learning_rate": 4.005235602094241e-05, + "loss": 5.5147, + "step": 2052 + }, + { + "epoch": 0.2, + "grad_norm": 0.942483127117157, + "learning_rate": 4.003296490207485e-05, + "loss": 5.4991, + "step": 2056 + }, + { + "epoch": 0.2, + "grad_norm": 0.9286855459213257, + "learning_rate": 4.001357378320729e-05, + "loss": 5.5299, + "step": 2060 + }, + { + "epoch": 0.2, + "grad_norm": 0.9835842251777649, + "learning_rate": 3.999418266433973e-05, + "loss": 5.6657, + "step": 2064 + }, + { + "epoch": 0.2, + "grad_norm": 0.9683178663253784, + "learning_rate": 3.997479154547217e-05, + "loss": 5.5365, + "step": 2068 + }, + { + "epoch": 0.2, + "grad_norm": 1.0575816631317139, + "learning_rate": 3.9955400426604614e-05, + "loss": 5.6907, + "step": 2072 + }, + { + "epoch": 0.2, + "grad_norm": 0.9691389799118042, + "learning_rate": 3.993600930773706e-05, + "loss": 5.5166, + "step": 2076 + }, + { + "epoch": 0.2, + "grad_norm": 1.0539683103561401, + "learning_rate": 3.9916618188869495e-05, + "loss": 5.6106, + "step": 2080 + }, + { + "epoch": 0.2, + "grad_norm": 1.010002851486206, + "learning_rate": 3.989722707000194e-05, + "loss": 5.4635, + "step": 2084 + }, + { + "epoch": 0.2, + "grad_norm": 0.9025498032569885, + "learning_rate": 3.987783595113438e-05, + "loss": 5.5538, + "step": 2088 + }, + { + "epoch": 0.2, + "grad_norm": 0.9283170700073242, + "learning_rate": 3.985844483226682e-05, + "loss": 5.5295, + "step": 2092 + }, + { + "epoch": 0.2, + "grad_norm": 1.0095292329788208, + "learning_rate": 3.9839053713399264e-05, + "loss": 5.4738, + "step": 2096 + }, + { + "epoch": 0.2, + "grad_norm": 1.009535312652588, + "learning_rate": 3.9819662594531704e-05, + "loss": 5.4439, + "step": 2100 + }, + { + "epoch": 0.2, + "grad_norm": 0.9733148217201233, + "learning_rate": 3.9800271475664145e-05, + "loss": 5.5443, + "step": 2104 + }, + { + "epoch": 0.2, + "grad_norm": 1.0330760478973389, + "learning_rate": 3.978088035679659e-05, + "loss": 5.58, + "step": 2108 + }, + { + "epoch": 0.2, + "grad_norm": 1.0041800737380981, + "learning_rate": 3.9761489237929026e-05, + "loss": 5.5228, + "step": 2112 + }, + { + "epoch": 0.21, + "grad_norm": 0.967785120010376, + "learning_rate": 3.974209811906147e-05, + "loss": 5.6853, + "step": 2116 + }, + { + "epoch": 0.21, + "grad_norm": 1.0202077627182007, + "learning_rate": 3.9722707000193914e-05, + "loss": 5.5428, + "step": 2120 + }, + { + "epoch": 0.21, + "grad_norm": 0.9732391834259033, + "learning_rate": 3.9703315881326354e-05, + "loss": 5.5364, + "step": 2124 + }, + { + "epoch": 0.21, + "grad_norm": 0.9813392162322998, + "learning_rate": 3.9683924762458795e-05, + "loss": 5.5904, + "step": 2128 + }, + { + "epoch": 0.21, + "grad_norm": 0.9656361937522888, + "learning_rate": 3.966453364359124e-05, + "loss": 5.4623, + "step": 2132 + }, + { + "epoch": 0.21, + "grad_norm": 0.956072211265564, + "learning_rate": 3.9645142524723676e-05, + "loss": 5.5391, + "step": 2136 + }, + { + "epoch": 0.21, + "grad_norm": 1.0806626081466675, + "learning_rate": 3.962575140585612e-05, + "loss": 5.5746, + "step": 2140 + }, + { + "epoch": 0.21, + "grad_norm": 1.0280499458312988, + "learning_rate": 3.960636028698856e-05, + "loss": 5.5504, + "step": 2144 + }, + { + "epoch": 0.21, + "grad_norm": 1.0023061037063599, + "learning_rate": 3.9586969168121004e-05, + "loss": 5.5687, + "step": 2148 + }, + { + "epoch": 0.21, + "grad_norm": 0.9010854363441467, + "learning_rate": 3.9567578049253445e-05, + "loss": 5.5546, + "step": 2152 + }, + { + "epoch": 0.21, + "grad_norm": 1.0272430181503296, + "learning_rate": 3.9548186930385885e-05, + "loss": 5.4934, + "step": 2156 + }, + { + "epoch": 0.21, + "grad_norm": 1.0653831958770752, + "learning_rate": 3.9528795811518326e-05, + "loss": 5.496, + "step": 2160 + }, + { + "epoch": 0.21, + "grad_norm": 1.0314921140670776, + "learning_rate": 3.950940469265077e-05, + "loss": 5.5062, + "step": 2164 + }, + { + "epoch": 0.21, + "grad_norm": 0.9908810257911682, + "learning_rate": 3.949001357378321e-05, + "loss": 5.6055, + "step": 2168 + }, + { + "epoch": 0.21, + "grad_norm": 0.9573884606361389, + "learning_rate": 3.9470622454915654e-05, + "loss": 5.5008, + "step": 2172 + }, + { + "epoch": 0.21, + "grad_norm": 0.9359253644943237, + "learning_rate": 3.9451231336048095e-05, + "loss": 5.6236, + "step": 2176 + }, + { + "epoch": 0.21, + "grad_norm": 1.0594229698181152, + "learning_rate": 3.9431840217180535e-05, + "loss": 5.4301, + "step": 2180 + }, + { + "epoch": 0.21, + "grad_norm": 0.994006872177124, + "learning_rate": 3.9412449098312976e-05, + "loss": 5.5638, + "step": 2184 + }, + { + "epoch": 0.21, + "grad_norm": 0.9278011322021484, + "learning_rate": 3.9393057979445416e-05, + "loss": 5.5056, + "step": 2188 + }, + { + "epoch": 0.21, + "grad_norm": 1.0667142868041992, + "learning_rate": 3.937366686057786e-05, + "loss": 5.6599, + "step": 2192 + }, + { + "epoch": 0.21, + "grad_norm": 0.9456555843353271, + "learning_rate": 3.93542757417103e-05, + "loss": 5.5152, + "step": 2196 + }, + { + "epoch": 0.21, + "grad_norm": 0.9990763068199158, + "learning_rate": 3.933488462284274e-05, + "loss": 5.4061, + "step": 2200 + }, + { + "epoch": 0.21, + "grad_norm": 0.9991575479507446, + "learning_rate": 3.931549350397518e-05, + "loss": 5.5047, + "step": 2204 + }, + { + "epoch": 0.21, + "grad_norm": 1.0041279792785645, + "learning_rate": 3.9296102385107626e-05, + "loss": 5.5836, + "step": 2208 + }, + { + "epoch": 0.21, + "grad_norm": 1.0032880306243896, + "learning_rate": 3.927671126624006e-05, + "loss": 5.5682, + "step": 2212 + }, + { + "epoch": 0.21, + "grad_norm": 1.0610923767089844, + "learning_rate": 3.925732014737251e-05, + "loss": 5.597, + "step": 2216 + }, + { + "epoch": 0.22, + "grad_norm": 0.9000992178916931, + "learning_rate": 3.923792902850494e-05, + "loss": 5.54, + "step": 2220 + }, + { + "epoch": 0.22, + "grad_norm": 0.9666118621826172, + "learning_rate": 3.921853790963739e-05, + "loss": 5.568, + "step": 2224 + }, + { + "epoch": 0.22, + "grad_norm": 0.963789701461792, + "learning_rate": 3.919914679076983e-05, + "loss": 5.5229, + "step": 2228 + }, + { + "epoch": 0.22, + "grad_norm": 1.030738353729248, + "learning_rate": 3.917975567190227e-05, + "loss": 5.6063, + "step": 2232 + }, + { + "epoch": 0.22, + "grad_norm": 1.0029246807098389, + "learning_rate": 3.916036455303471e-05, + "loss": 5.5573, + "step": 2236 + }, + { + "epoch": 0.22, + "grad_norm": 1.0004112720489502, + "learning_rate": 3.914097343416716e-05, + "loss": 5.4992, + "step": 2240 + }, + { + "epoch": 0.22, + "grad_norm": 0.9665903449058533, + "learning_rate": 3.912158231529959e-05, + "loss": 5.475, + "step": 2244 + }, + { + "epoch": 0.22, + "grad_norm": 0.927628219127655, + "learning_rate": 3.910219119643204e-05, + "loss": 5.4589, + "step": 2248 + }, + { + "epoch": 0.22, + "grad_norm": 1.0179831981658936, + "learning_rate": 3.908280007756448e-05, + "loss": 5.5458, + "step": 2252 + }, + { + "epoch": 0.22, + "grad_norm": 0.8914494514465332, + "learning_rate": 3.906340895869692e-05, + "loss": 5.4927, + "step": 2256 + }, + { + "epoch": 0.22, + "grad_norm": 1.001958966255188, + "learning_rate": 3.904401783982936e-05, + "loss": 5.4334, + "step": 2260 + }, + { + "epoch": 0.22, + "grad_norm": 0.9571743011474609, + "learning_rate": 3.90246267209618e-05, + "loss": 5.5728, + "step": 2264 + }, + { + "epoch": 0.22, + "grad_norm": 1.0079842805862427, + "learning_rate": 3.900523560209424e-05, + "loss": 5.5844, + "step": 2268 + }, + { + "epoch": 0.22, + "grad_norm": 0.9021591544151306, + "learning_rate": 3.898584448322669e-05, + "loss": 5.5276, + "step": 2272 + }, + { + "epoch": 0.22, + "grad_norm": 0.9413024187088013, + "learning_rate": 3.896645336435912e-05, + "loss": 5.4693, + "step": 2276 + }, + { + "epoch": 0.22, + "grad_norm": 0.9716333150863647, + "learning_rate": 3.894706224549157e-05, + "loss": 5.434, + "step": 2280 + }, + { + "epoch": 0.22, + "grad_norm": 1.020964503288269, + "learning_rate": 3.892767112662401e-05, + "loss": 5.4197, + "step": 2284 + }, + { + "epoch": 0.22, + "grad_norm": 0.9597110748291016, + "learning_rate": 3.890828000775645e-05, + "loss": 5.4174, + "step": 2288 + }, + { + "epoch": 0.22, + "grad_norm": 1.0164399147033691, + "learning_rate": 3.888888888888889e-05, + "loss": 5.4698, + "step": 2292 + }, + { + "epoch": 0.22, + "grad_norm": 1.0590660572052002, + "learning_rate": 3.886949777002134e-05, + "loss": 5.4815, + "step": 2296 + }, + { + "epoch": 0.22, + "grad_norm": 0.9020886421203613, + "learning_rate": 3.885010665115377e-05, + "loss": 5.4197, + "step": 2300 + }, + { + "epoch": 0.22, + "grad_norm": 0.9899044632911682, + "learning_rate": 3.883071553228622e-05, + "loss": 5.5061, + "step": 2304 + }, + { + "epoch": 0.22, + "grad_norm": 0.9730533957481384, + "learning_rate": 3.881132441341865e-05, + "loss": 5.4804, + "step": 2308 + }, + { + "epoch": 0.22, + "grad_norm": 0.9274543523788452, + "learning_rate": 3.87919332945511e-05, + "loss": 5.5486, + "step": 2312 + }, + { + "epoch": 0.22, + "grad_norm": 0.9086050391197205, + "learning_rate": 3.877254217568354e-05, + "loss": 5.3997, + "step": 2316 + }, + { + "epoch": 0.22, + "grad_norm": 0.9170548915863037, + "learning_rate": 3.875315105681598e-05, + "loss": 5.3701, + "step": 2320 + }, + { + "epoch": 0.23, + "grad_norm": 0.9133801460266113, + "learning_rate": 3.873375993794842e-05, + "loss": 5.4241, + "step": 2324 + }, + { + "epoch": 0.23, + "grad_norm": 0.9891861081123352, + "learning_rate": 3.871436881908086e-05, + "loss": 5.5119, + "step": 2328 + }, + { + "epoch": 0.23, + "grad_norm": 1.0900509357452393, + "learning_rate": 3.86949777002133e-05, + "loss": 5.6089, + "step": 2332 + }, + { + "epoch": 0.23, + "grad_norm": 0.9711102843284607, + "learning_rate": 3.8675586581345743e-05, + "loss": 5.5975, + "step": 2336 + }, + { + "epoch": 0.23, + "grad_norm": 0.9177804589271545, + "learning_rate": 3.8656195462478184e-05, + "loss": 5.4848, + "step": 2340 + }, + { + "epoch": 0.23, + "grad_norm": 0.9340533018112183, + "learning_rate": 3.8636804343610625e-05, + "loss": 5.5705, + "step": 2344 + }, + { + "epoch": 0.23, + "grad_norm": 0.9871985912322998, + "learning_rate": 3.861741322474307e-05, + "loss": 5.5552, + "step": 2348 + }, + { + "epoch": 0.23, + "grad_norm": 0.9477519392967224, + "learning_rate": 3.8598022105875506e-05, + "loss": 5.3622, + "step": 2352 + }, + { + "epoch": 0.23, + "grad_norm": 0.9251902103424072, + "learning_rate": 3.857863098700795e-05, + "loss": 5.3964, + "step": 2356 + }, + { + "epoch": 0.23, + "grad_norm": 0.9803330898284912, + "learning_rate": 3.8559239868140393e-05, + "loss": 5.4716, + "step": 2360 + }, + { + "epoch": 0.23, + "grad_norm": 0.9429686069488525, + "learning_rate": 3.8539848749272834e-05, + "loss": 5.4319, + "step": 2364 + }, + { + "epoch": 0.23, + "grad_norm": 1.0370640754699707, + "learning_rate": 3.8520457630405275e-05, + "loss": 5.4297, + "step": 2368 + }, + { + "epoch": 0.23, + "grad_norm": 0.9429317712783813, + "learning_rate": 3.850106651153772e-05, + "loss": 5.4068, + "step": 2372 + }, + { + "epoch": 0.23, + "grad_norm": 1.0374614000320435, + "learning_rate": 3.8481675392670156e-05, + "loss": 5.4975, + "step": 2376 + }, + { + "epoch": 0.23, + "grad_norm": 0.9564975500106812, + "learning_rate": 3.84622842738026e-05, + "loss": 5.5221, + "step": 2380 + }, + { + "epoch": 0.23, + "grad_norm": 0.9388477802276611, + "learning_rate": 3.844289315493504e-05, + "loss": 5.4748, + "step": 2384 + }, + { + "epoch": 0.23, + "grad_norm": 0.9292894601821899, + "learning_rate": 3.8423502036067484e-05, + "loss": 5.6158, + "step": 2388 + }, + { + "epoch": 0.23, + "grad_norm": 0.9442563056945801, + "learning_rate": 3.8404110917199925e-05, + "loss": 5.5371, + "step": 2392 + }, + { + "epoch": 0.23, + "grad_norm": 1.0346298217773438, + "learning_rate": 3.8384719798332365e-05, + "loss": 5.4546, + "step": 2396 + }, + { + "epoch": 0.23, + "grad_norm": 0.9618560671806335, + "learning_rate": 3.8365328679464806e-05, + "loss": 5.5663, + "step": 2400 + }, + { + "epoch": 0.23, + "grad_norm": 0.9630410671234131, + "learning_rate": 3.834593756059725e-05, + "loss": 5.5669, + "step": 2404 + }, + { + "epoch": 0.23, + "grad_norm": 1.0021862983703613, + "learning_rate": 3.832654644172969e-05, + "loss": 5.5155, + "step": 2408 + }, + { + "epoch": 0.23, + "grad_norm": 1.015215277671814, + "learning_rate": 3.8307155322862134e-05, + "loss": 5.4772, + "step": 2412 + }, + { + "epoch": 0.23, + "grad_norm": 0.9358635544776917, + "learning_rate": 3.8287764203994574e-05, + "loss": 5.4375, + "step": 2416 + }, + { + "epoch": 0.23, + "grad_norm": 1.1098734140396118, + "learning_rate": 3.8268373085127015e-05, + "loss": 5.5493, + "step": 2420 + }, + { + "epoch": 0.24, + "grad_norm": 0.9704833030700684, + "learning_rate": 3.8248981966259456e-05, + "loss": 5.5974, + "step": 2424 + }, + { + "epoch": 0.24, + "grad_norm": 1.1319690942764282, + "learning_rate": 3.8229590847391896e-05, + "loss": 5.5381, + "step": 2428 + }, + { + "epoch": 0.24, + "grad_norm": 0.9214887619018555, + "learning_rate": 3.821019972852434e-05, + "loss": 5.4689, + "step": 2432 + }, + { + "epoch": 0.24, + "grad_norm": 0.9483059048652649, + "learning_rate": 3.8190808609656784e-05, + "loss": 5.447, + "step": 2436 + }, + { + "epoch": 0.24, + "grad_norm": 0.936059832572937, + "learning_rate": 3.817141749078922e-05, + "loss": 5.5256, + "step": 2440 + }, + { + "epoch": 0.24, + "grad_norm": 0.8800360560417175, + "learning_rate": 3.8152026371921665e-05, + "loss": 5.4862, + "step": 2444 + }, + { + "epoch": 0.24, + "grad_norm": 0.9923036098480225, + "learning_rate": 3.8132635253054106e-05, + "loss": 5.5005, + "step": 2448 + }, + { + "epoch": 0.24, + "grad_norm": 0.9370976090431213, + "learning_rate": 3.8113244134186546e-05, + "loss": 5.5008, + "step": 2452 + }, + { + "epoch": 0.24, + "grad_norm": 0.9325810670852661, + "learning_rate": 3.809385301531899e-05, + "loss": 5.5375, + "step": 2456 + }, + { + "epoch": 0.24, + "grad_norm": 1.0524775981903076, + "learning_rate": 3.807446189645143e-05, + "loss": 5.5869, + "step": 2460 + }, + { + "epoch": 0.24, + "grad_norm": 0.9688572287559509, + "learning_rate": 3.805507077758387e-05, + "loss": 5.5657, + "step": 2464 + }, + { + "epoch": 0.24, + "grad_norm": 0.9577921628952026, + "learning_rate": 3.803567965871631e-05, + "loss": 5.4082, + "step": 2468 + }, + { + "epoch": 0.24, + "grad_norm": 0.977051317691803, + "learning_rate": 3.801628853984875e-05, + "loss": 5.4773, + "step": 2472 + }, + { + "epoch": 0.24, + "grad_norm": 0.9854933023452759, + "learning_rate": 3.799689742098119e-05, + "loss": 5.487, + "step": 2476 + }, + { + "epoch": 0.24, + "grad_norm": 1.0686819553375244, + "learning_rate": 3.797750630211364e-05, + "loss": 5.3842, + "step": 2480 + }, + { + "epoch": 0.24, + "grad_norm": 0.9736838936805725, + "learning_rate": 3.795811518324607e-05, + "loss": 5.5956, + "step": 2484 + }, + { + "epoch": 0.24, + "grad_norm": 0.9340422749519348, + "learning_rate": 3.793872406437852e-05, + "loss": 5.5148, + "step": 2488 + }, + { + "epoch": 0.24, + "grad_norm": 1.0714948177337646, + "learning_rate": 3.791933294551096e-05, + "loss": 5.495, + "step": 2492 + }, + { + "epoch": 0.24, + "grad_norm": 1.037858486175537, + "learning_rate": 3.78999418266434e-05, + "loss": 5.5312, + "step": 2496 + }, + { + "epoch": 0.24, + "grad_norm": 1.1142346858978271, + "learning_rate": 3.788055070777584e-05, + "loss": 5.6109, + "step": 2500 + }, + { + "epoch": 0.24, + "grad_norm": 1.0405195951461792, + "learning_rate": 3.786115958890828e-05, + "loss": 5.4659, + "step": 2504 + }, + { + "epoch": 0.24, + "grad_norm": 1.106404423713684, + "learning_rate": 3.784176847004072e-05, + "loss": 5.5187, + "step": 2508 + }, + { + "epoch": 0.24, + "grad_norm": 0.9529224634170532, + "learning_rate": 3.782237735117317e-05, + "loss": 5.441, + "step": 2512 + }, + { + "epoch": 0.24, + "grad_norm": 1.0962753295898438, + "learning_rate": 3.78029862323056e-05, + "loss": 5.5503, + "step": 2516 + }, + { + "epoch": 0.24, + "grad_norm": 0.9510455131530762, + "learning_rate": 3.778359511343805e-05, + "loss": 5.4773, + "step": 2520 + }, + { + "epoch": 0.24, + "grad_norm": 1.0108531713485718, + "learning_rate": 3.776420399457049e-05, + "loss": 5.5567, + "step": 2524 + }, + { + "epoch": 0.25, + "grad_norm": 0.9611102938652039, + "learning_rate": 3.774481287570293e-05, + "loss": 5.4825, + "step": 2528 + }, + { + "epoch": 0.25, + "grad_norm": 1.0297412872314453, + "learning_rate": 3.772542175683537e-05, + "loss": 5.4075, + "step": 2532 + }, + { + "epoch": 0.25, + "grad_norm": 1.0067005157470703, + "learning_rate": 3.770603063796782e-05, + "loss": 5.5345, + "step": 2536 + }, + { + "epoch": 0.25, + "grad_norm": 1.0144344568252563, + "learning_rate": 3.768663951910025e-05, + "loss": 5.5108, + "step": 2540 + }, + { + "epoch": 0.25, + "grad_norm": 0.9989475607872009, + "learning_rate": 3.76672484002327e-05, + "loss": 5.4782, + "step": 2544 + }, + { + "epoch": 0.25, + "grad_norm": 1.0596572160720825, + "learning_rate": 3.764785728136513e-05, + "loss": 5.5088, + "step": 2548 + }, + { + "epoch": 0.25, + "grad_norm": 1.022268533706665, + "learning_rate": 3.762846616249758e-05, + "loss": 5.4595, + "step": 2552 + }, + { + "epoch": 0.25, + "grad_norm": 0.9864400029182434, + "learning_rate": 3.760907504363002e-05, + "loss": 5.5029, + "step": 2556 + }, + { + "epoch": 0.25, + "grad_norm": 0.9575673937797546, + "learning_rate": 3.758968392476246e-05, + "loss": 5.4772, + "step": 2560 + }, + { + "epoch": 0.25, + "grad_norm": 0.9066863059997559, + "learning_rate": 3.75702928058949e-05, + "loss": 5.5936, + "step": 2564 + }, + { + "epoch": 0.25, + "grad_norm": 0.9975427985191345, + "learning_rate": 3.755090168702735e-05, + "loss": 5.5532, + "step": 2568 + }, + { + "epoch": 0.25, + "grad_norm": 0.935875415802002, + "learning_rate": 3.753151056815978e-05, + "loss": 5.4235, + "step": 2572 + }, + { + "epoch": 0.25, + "grad_norm": 1.0825345516204834, + "learning_rate": 3.751211944929223e-05, + "loss": 5.3898, + "step": 2576 + }, + { + "epoch": 0.25, + "grad_norm": 1.0362260341644287, + "learning_rate": 3.7492728330424664e-05, + "loss": 5.4427, + "step": 2580 + }, + { + "epoch": 0.25, + "grad_norm": 0.8955732583999634, + "learning_rate": 3.747333721155711e-05, + "loss": 5.5447, + "step": 2584 + }, + { + "epoch": 0.25, + "grad_norm": 1.0196340084075928, + "learning_rate": 3.745394609268955e-05, + "loss": 5.5636, + "step": 2588 + }, + { + "epoch": 0.25, + "grad_norm": 0.9525064826011658, + "learning_rate": 3.743455497382199e-05, + "loss": 5.5, + "step": 2592 + }, + { + "epoch": 0.25, + "grad_norm": 0.9297643899917603, + "learning_rate": 3.741516385495443e-05, + "loss": 5.5382, + "step": 2596 + }, + { + "epoch": 0.25, + "grad_norm": 0.9364489912986755, + "learning_rate": 3.739577273608687e-05, + "loss": 5.5248, + "step": 2600 + }, + { + "epoch": 0.25, + "grad_norm": 0.9031673073768616, + "learning_rate": 3.7376381617219314e-05, + "loss": 5.4689, + "step": 2604 + }, + { + "epoch": 0.25, + "grad_norm": 1.020928978919983, + "learning_rate": 3.7356990498351754e-05, + "loss": 5.4896, + "step": 2608 + }, + { + "epoch": 0.25, + "grad_norm": 0.9407410621643066, + "learning_rate": 3.73375993794842e-05, + "loss": 5.4259, + "step": 2612 + }, + { + "epoch": 0.25, + "grad_norm": 1.0018398761749268, + "learning_rate": 3.7318208260616635e-05, + "loss": 5.5738, + "step": 2616 + }, + { + "epoch": 0.25, + "grad_norm": 1.0489344596862793, + "learning_rate": 3.729881714174908e-05, + "loss": 5.404, + "step": 2620 + }, + { + "epoch": 0.25, + "grad_norm": 0.9759474992752075, + "learning_rate": 3.7279426022881516e-05, + "loss": 5.4977, + "step": 2624 + }, + { + "epoch": 0.25, + "grad_norm": 1.0580724477767944, + "learning_rate": 3.7260034904013964e-05, + "loss": 5.4477, + "step": 2628 + }, + { + "epoch": 0.26, + "grad_norm": 1.0281325578689575, + "learning_rate": 3.7240643785146404e-05, + "loss": 5.4882, + "step": 2632 + }, + { + "epoch": 0.26, + "grad_norm": 0.9635825157165527, + "learning_rate": 3.7221252666278845e-05, + "loss": 5.4303, + "step": 2636 + }, + { + "epoch": 0.26, + "grad_norm": 0.9374428391456604, + "learning_rate": 3.7201861547411285e-05, + "loss": 5.4308, + "step": 2640 + }, + { + "epoch": 0.26, + "grad_norm": 1.0188990831375122, + "learning_rate": 3.718247042854373e-05, + "loss": 5.4521, + "step": 2644 + }, + { + "epoch": 0.26, + "grad_norm": 1.0239014625549316, + "learning_rate": 3.7163079309676166e-05, + "loss": 5.5686, + "step": 2648 + }, + { + "epoch": 0.26, + "grad_norm": 0.946735680103302, + "learning_rate": 3.7143688190808614e-05, + "loss": 5.4329, + "step": 2652 + }, + { + "epoch": 0.26, + "grad_norm": 0.9370056986808777, + "learning_rate": 3.7124297071941054e-05, + "loss": 5.5174, + "step": 2656 + }, + { + "epoch": 0.26, + "grad_norm": 1.003072738647461, + "learning_rate": 3.7104905953073495e-05, + "loss": 5.3857, + "step": 2660 + }, + { + "epoch": 0.26, + "grad_norm": 0.9541458487510681, + "learning_rate": 3.7085514834205935e-05, + "loss": 5.448, + "step": 2664 + }, + { + "epoch": 0.26, + "grad_norm": 0.9533443450927734, + "learning_rate": 3.7066123715338376e-05, + "loss": 5.559, + "step": 2668 + }, + { + "epoch": 0.26, + "grad_norm": 1.0223768949508667, + "learning_rate": 3.7046732596470816e-05, + "loss": 5.503, + "step": 2672 + }, + { + "epoch": 0.26, + "grad_norm": 1.2174021005630493, + "learning_rate": 3.7027341477603264e-05, + "loss": 5.5322, + "step": 2676 + }, + { + "epoch": 0.26, + "grad_norm": 1.117325782775879, + "learning_rate": 3.70079503587357e-05, + "loss": 5.4969, + "step": 2680 + }, + { + "epoch": 0.26, + "grad_norm": 0.9299269318580627, + "learning_rate": 3.6988559239868145e-05, + "loss": 5.4264, + "step": 2684 + }, + { + "epoch": 0.26, + "grad_norm": 0.9750757813453674, + "learning_rate": 3.6969168121000585e-05, + "loss": 5.4953, + "step": 2688 + }, + { + "epoch": 0.26, + "grad_norm": 0.9810564517974854, + "learning_rate": 3.6949777002133026e-05, + "loss": 5.4762, + "step": 2692 + }, + { + "epoch": 0.26, + "grad_norm": 1.0046603679656982, + "learning_rate": 3.6930385883265466e-05, + "loss": 5.4572, + "step": 2696 + }, + { + "epoch": 0.26, + "grad_norm": 0.9024963974952698, + "learning_rate": 3.691099476439791e-05, + "loss": 5.5038, + "step": 2700 + }, + { + "epoch": 0.26, + "grad_norm": 0.9631572961807251, + "learning_rate": 3.689160364553035e-05, + "loss": 5.4895, + "step": 2704 + }, + { + "epoch": 0.26, + "grad_norm": 0.8802670240402222, + "learning_rate": 3.6872212526662795e-05, + "loss": 5.6245, + "step": 2708 + }, + { + "epoch": 0.26, + "grad_norm": 0.9694925546646118, + "learning_rate": 3.685282140779523e-05, + "loss": 5.4813, + "step": 2712 + }, + { + "epoch": 0.26, + "grad_norm": 1.0332534313201904, + "learning_rate": 3.6833430288927676e-05, + "loss": 5.5064, + "step": 2716 + }, + { + "epoch": 0.26, + "grad_norm": 0.9285298585891724, + "learning_rate": 3.6814039170060116e-05, + "loss": 5.4926, + "step": 2720 + }, + { + "epoch": 0.26, + "grad_norm": 0.9079506993293762, + "learning_rate": 3.679464805119256e-05, + "loss": 5.4837, + "step": 2724 + }, + { + "epoch": 0.26, + "grad_norm": 1.0010629892349243, + "learning_rate": 3.6775256932325e-05, + "loss": 5.4814, + "step": 2728 + }, + { + "epoch": 0.26, + "grad_norm": 0.9733301997184753, + "learning_rate": 3.675586581345744e-05, + "loss": 5.526, + "step": 2732 + }, + { + "epoch": 0.27, + "grad_norm": 0.9595903158187866, + "learning_rate": 3.673647469458988e-05, + "loss": 5.3726, + "step": 2736 + }, + { + "epoch": 0.27, + "grad_norm": 1.0144261121749878, + "learning_rate": 3.671708357572232e-05, + "loss": 5.4286, + "step": 2740 + }, + { + "epoch": 0.27, + "grad_norm": 1.0285661220550537, + "learning_rate": 3.669769245685476e-05, + "loss": 5.5113, + "step": 2744 + }, + { + "epoch": 0.27, + "grad_norm": 0.9807763695716858, + "learning_rate": 3.66783013379872e-05, + "loss": 5.5956, + "step": 2748 + }, + { + "epoch": 0.27, + "grad_norm": 1.0480782985687256, + "learning_rate": 3.665891021911965e-05, + "loss": 5.4205, + "step": 2752 + }, + { + "epoch": 0.27, + "grad_norm": 0.8527302145957947, + "learning_rate": 3.663951910025208e-05, + "loss": 5.5106, + "step": 2756 + }, + { + "epoch": 0.27, + "grad_norm": 0.9895337224006653, + "learning_rate": 3.662012798138453e-05, + "loss": 5.517, + "step": 2760 + }, + { + "epoch": 0.27, + "grad_norm": 1.005570888519287, + "learning_rate": 3.660073686251697e-05, + "loss": 5.4432, + "step": 2764 + }, + { + "epoch": 0.27, + "grad_norm": 1.0083740949630737, + "learning_rate": 3.658134574364941e-05, + "loss": 5.3863, + "step": 2768 + }, + { + "epoch": 0.27, + "grad_norm": 0.9431845545768738, + "learning_rate": 3.656195462478185e-05, + "loss": 5.3734, + "step": 2772 + }, + { + "epoch": 0.27, + "grad_norm": 0.9629083871841431, + "learning_rate": 3.65425635059143e-05, + "loss": 5.4737, + "step": 2776 + }, + { + "epoch": 0.27, + "grad_norm": 0.9649605751037598, + "learning_rate": 3.652317238704673e-05, + "loss": 5.4022, + "step": 2780 + }, + { + "epoch": 0.27, + "grad_norm": 0.9746363162994385, + "learning_rate": 3.650378126817918e-05, + "loss": 5.4826, + "step": 2784 + }, + { + "epoch": 0.27, + "grad_norm": 0.9153027534484863, + "learning_rate": 3.648439014931161e-05, + "loss": 5.5833, + "step": 2788 + }, + { + "epoch": 0.27, + "grad_norm": 0.940949559211731, + "learning_rate": 3.646499903044406e-05, + "loss": 5.4852, + "step": 2792 + }, + { + "epoch": 0.27, + "grad_norm": 0.9482103586196899, + "learning_rate": 3.64456079115765e-05, + "loss": 5.4711, + "step": 2796 + }, + { + "epoch": 0.27, + "grad_norm": 1.0848538875579834, + "learning_rate": 3.642621679270894e-05, + "loss": 5.464, + "step": 2800 + }, + { + "epoch": 0.27, + "grad_norm": 0.9354459643363953, + "learning_rate": 3.640682567384138e-05, + "loss": 5.4495, + "step": 2804 + }, + { + "epoch": 0.27, + "grad_norm": 0.9546772837638855, + "learning_rate": 3.638743455497383e-05, + "loss": 5.4562, + "step": 2808 + }, + { + "epoch": 0.27, + "grad_norm": 0.9387646317481995, + "learning_rate": 3.636804343610626e-05, + "loss": 5.4439, + "step": 2812 + }, + { + "epoch": 0.27, + "grad_norm": 0.9842014312744141, + "learning_rate": 3.634865231723871e-05, + "loss": 5.3971, + "step": 2816 + }, + { + "epoch": 0.27, + "grad_norm": 0.9438384175300598, + "learning_rate": 3.632926119837114e-05, + "loss": 5.4875, + "step": 2820 + }, + { + "epoch": 0.27, + "grad_norm": 0.998210072517395, + "learning_rate": 3.630987007950359e-05, + "loss": 5.4301, + "step": 2824 + }, + { + "epoch": 0.27, + "grad_norm": 0.948137640953064, + "learning_rate": 3.629047896063603e-05, + "loss": 5.5703, + "step": 2828 + }, + { + "epoch": 0.27, + "grad_norm": 1.0335065126419067, + "learning_rate": 3.627108784176847e-05, + "loss": 5.386, + "step": 2832 + }, + { + "epoch": 0.27, + "grad_norm": 0.9774126410484314, + "learning_rate": 3.625169672290091e-05, + "loss": 5.4288, + "step": 2836 + }, + { + "epoch": 0.28, + "grad_norm": 1.168003797531128, + "learning_rate": 3.623230560403336e-05, + "loss": 5.4524, + "step": 2840 + }, + { + "epoch": 0.28, + "grad_norm": 1.05010187625885, + "learning_rate": 3.621291448516579e-05, + "loss": 5.3784, + "step": 2844 + }, + { + "epoch": 0.28, + "grad_norm": 1.0008686780929565, + "learning_rate": 3.619352336629824e-05, + "loss": 5.351, + "step": 2848 + }, + { + "epoch": 0.28, + "grad_norm": 1.0319279432296753, + "learning_rate": 3.617413224743068e-05, + "loss": 5.5112, + "step": 2852 + }, + { + "epoch": 0.28, + "grad_norm": 0.9444233775138855, + "learning_rate": 3.615474112856312e-05, + "loss": 5.4225, + "step": 2856 + }, + { + "epoch": 0.28, + "grad_norm": 1.0696698427200317, + "learning_rate": 3.613535000969556e-05, + "loss": 5.5542, + "step": 2860 + }, + { + "epoch": 0.28, + "grad_norm": 0.9493553042411804, + "learning_rate": 3.6115958890828e-05, + "loss": 5.42, + "step": 2864 + }, + { + "epoch": 0.28, + "grad_norm": 1.0606472492218018, + "learning_rate": 3.609656777196044e-05, + "loss": 5.421, + "step": 2868 + }, + { + "epoch": 0.28, + "grad_norm": 1.045782208442688, + "learning_rate": 3.6077176653092884e-05, + "loss": 5.5086, + "step": 2872 + }, + { + "epoch": 0.28, + "grad_norm": 1.034601092338562, + "learning_rate": 3.6057785534225324e-05, + "loss": 5.4582, + "step": 2876 + }, + { + "epoch": 0.28, + "grad_norm": 1.0553306341171265, + "learning_rate": 3.6038394415357765e-05, + "loss": 5.5501, + "step": 2880 + }, + { + "epoch": 0.28, + "grad_norm": 1.0070221424102783, + "learning_rate": 3.601900329649021e-05, + "loss": 5.5016, + "step": 2884 + }, + { + "epoch": 0.28, + "grad_norm": 1.0406205654144287, + "learning_rate": 3.5999612177622646e-05, + "loss": 5.4743, + "step": 2888 + }, + { + "epoch": 0.28, + "grad_norm": 1.0793685913085938, + "learning_rate": 3.598022105875509e-05, + "loss": 5.4889, + "step": 2892 + }, + { + "epoch": 0.28, + "grad_norm": 1.1032297611236572, + "learning_rate": 3.5960829939887534e-05, + "loss": 5.4928, + "step": 2896 + }, + { + "epoch": 0.28, + "grad_norm": 0.9864259362220764, + "learning_rate": 3.5941438821019974e-05, + "loss": 5.4582, + "step": 2900 + }, + { + "epoch": 0.28, + "grad_norm": 0.9251708388328552, + "learning_rate": 3.5922047702152415e-05, + "loss": 5.4329, + "step": 2904 + }, + { + "epoch": 0.28, + "grad_norm": 0.9993565082550049, + "learning_rate": 3.5902656583284855e-05, + "loss": 5.5837, + "step": 2908 + }, + { + "epoch": 0.28, + "grad_norm": 0.9946919083595276, + "learning_rate": 3.5883265464417296e-05, + "loss": 5.4545, + "step": 2912 + }, + { + "epoch": 0.28, + "grad_norm": 0.9719089865684509, + "learning_rate": 3.586387434554974e-05, + "loss": 5.448, + "step": 2916 + }, + { + "epoch": 0.28, + "grad_norm": 0.900641143321991, + "learning_rate": 3.584448322668218e-05, + "loss": 5.5845, + "step": 2920 + }, + { + "epoch": 0.28, + "grad_norm": 0.9279571771621704, + "learning_rate": 3.5825092107814624e-05, + "loss": 5.4733, + "step": 2924 + }, + { + "epoch": 0.28, + "grad_norm": 1.0747668743133545, + "learning_rate": 3.5805700988947065e-05, + "loss": 5.3805, + "step": 2928 + }, + { + "epoch": 0.28, + "grad_norm": 0.8932091593742371, + "learning_rate": 3.5786309870079505e-05, + "loss": 5.4015, + "step": 2932 + }, + { + "epoch": 0.28, + "grad_norm": 0.9807014465332031, + "learning_rate": 3.5766918751211946e-05, + "loss": 5.5228, + "step": 2936 + }, + { + "epoch": 0.29, + "grad_norm": 0.9529114961624146, + "learning_rate": 3.5747527632344386e-05, + "loss": 5.5328, + "step": 2940 + }, + { + "epoch": 0.29, + "grad_norm": 0.9890924692153931, + "learning_rate": 3.572813651347683e-05, + "loss": 5.4686, + "step": 2944 + }, + { + "epoch": 0.29, + "grad_norm": 0.9855780005455017, + "learning_rate": 3.5708745394609274e-05, + "loss": 5.4921, + "step": 2948 + }, + { + "epoch": 0.29, + "grad_norm": 0.9508200287818909, + "learning_rate": 3.568935427574171e-05, + "loss": 5.5508, + "step": 2952 + }, + { + "epoch": 0.29, + "grad_norm": 0.9192949533462524, + "learning_rate": 3.5669963156874155e-05, + "loss": 5.5484, + "step": 2956 + }, + { + "epoch": 0.29, + "grad_norm": 0.9657400846481323, + "learning_rate": 3.5650572038006596e-05, + "loss": 5.4305, + "step": 2960 + }, + { + "epoch": 0.29, + "grad_norm": 1.1960434913635254, + "learning_rate": 3.5631180919139036e-05, + "loss": 5.4461, + "step": 2964 + }, + { + "epoch": 0.29, + "grad_norm": 0.9226086735725403, + "learning_rate": 3.561178980027148e-05, + "loss": 5.4288, + "step": 2968 + }, + { + "epoch": 0.29, + "grad_norm": 1.0317691564559937, + "learning_rate": 3.5592398681403924e-05, + "loss": 5.5107, + "step": 2972 + }, + { + "epoch": 0.29, + "grad_norm": 0.9748375415802002, + "learning_rate": 3.557300756253636e-05, + "loss": 5.5342, + "step": 2976 + }, + { + "epoch": 0.29, + "grad_norm": 0.9513313174247742, + "learning_rate": 3.5553616443668805e-05, + "loss": 5.4337, + "step": 2980 + }, + { + "epoch": 0.29, + "grad_norm": 0.9269315600395203, + "learning_rate": 3.553422532480124e-05, + "loss": 5.4853, + "step": 2984 + }, + { + "epoch": 0.29, + "grad_norm": 1.0136945247650146, + "learning_rate": 3.5514834205933686e-05, + "loss": 5.4767, + "step": 2988 + }, + { + "epoch": 0.29, + "grad_norm": 1.049842119216919, + "learning_rate": 3.549544308706613e-05, + "loss": 5.5028, + "step": 2992 + }, + { + "epoch": 0.29, + "grad_norm": 0.9470251798629761, + "learning_rate": 3.547605196819857e-05, + "loss": 5.4391, + "step": 2996 + }, + { + "epoch": 0.29, + "grad_norm": 1.1014668941497803, + "learning_rate": 3.545666084933101e-05, + "loss": 5.4564, + "step": 3000 + }, + { + "epoch": 0.29, + "grad_norm": 1.0358186960220337, + "learning_rate": 3.543726973046345e-05, + "loss": 5.5398, + "step": 3004 + }, + { + "epoch": 0.29, + "grad_norm": 1.0642112493515015, + "learning_rate": 3.541787861159589e-05, + "loss": 5.4548, + "step": 3008 + }, + { + "epoch": 0.29, + "grad_norm": 0.9606940150260925, + "learning_rate": 3.539848749272833e-05, + "loss": 5.5331, + "step": 3012 + }, + { + "epoch": 0.29, + "grad_norm": 1.0315513610839844, + "learning_rate": 3.537909637386078e-05, + "loss": 5.476, + "step": 3016 + }, + { + "epoch": 0.29, + "grad_norm": 1.0335862636566162, + "learning_rate": 3.535970525499321e-05, + "loss": 5.3247, + "step": 3020 + }, + { + "epoch": 0.29, + "grad_norm": 1.0279650688171387, + "learning_rate": 3.534031413612566e-05, + "loss": 5.425, + "step": 3024 + }, + { + "epoch": 0.29, + "grad_norm": 0.9862622022628784, + "learning_rate": 3.532092301725809e-05, + "loss": 5.5486, + "step": 3028 + }, + { + "epoch": 0.29, + "grad_norm": 1.0324090719223022, + "learning_rate": 3.530153189839054e-05, + "loss": 5.546, + "step": 3032 + }, + { + "epoch": 0.29, + "grad_norm": 1.0767823457717896, + "learning_rate": 3.528214077952298e-05, + "loss": 5.4906, + "step": 3036 + }, + { + "epoch": 0.29, + "grad_norm": 0.9795736074447632, + "learning_rate": 3.526274966065542e-05, + "loss": 5.4132, + "step": 3040 + }, + { + "epoch": 0.3, + "grad_norm": 0.9786263704299927, + "learning_rate": 3.524335854178786e-05, + "loss": 5.4225, + "step": 3044 + }, + { + "epoch": 0.3, + "grad_norm": 0.9306228756904602, + "learning_rate": 3.522396742292031e-05, + "loss": 5.5407, + "step": 3048 + }, + { + "epoch": 0.3, + "grad_norm": 0.9735816121101379, + "learning_rate": 3.520457630405274e-05, + "loss": 5.5262, + "step": 3052 + }, + { + "epoch": 0.3, + "grad_norm": 0.9719963073730469, + "learning_rate": 3.518518518518519e-05, + "loss": 5.4489, + "step": 3056 + }, + { + "epoch": 0.3, + "grad_norm": 0.9754818081855774, + "learning_rate": 3.516579406631762e-05, + "loss": 5.4656, + "step": 3060 + }, + { + "epoch": 0.3, + "grad_norm": 1.0036709308624268, + "learning_rate": 3.514640294745007e-05, + "loss": 5.4314, + "step": 3064 + }, + { + "epoch": 0.3, + "grad_norm": 0.956697940826416, + "learning_rate": 3.512701182858251e-05, + "loss": 5.4755, + "step": 3068 + }, + { + "epoch": 0.3, + "grad_norm": 1.1752293109893799, + "learning_rate": 3.510762070971495e-05, + "loss": 5.4411, + "step": 3072 + }, + { + "epoch": 0.3, + "grad_norm": 0.9563004374504089, + "learning_rate": 3.508822959084739e-05, + "loss": 5.3531, + "step": 3076 + }, + { + "epoch": 0.3, + "grad_norm": 1.0313175916671753, + "learning_rate": 3.506883847197984e-05, + "loss": 5.477, + "step": 3080 + }, + { + "epoch": 0.3, + "grad_norm": 0.997872531414032, + "learning_rate": 3.504944735311227e-05, + "loss": 5.4479, + "step": 3084 + }, + { + "epoch": 0.3, + "grad_norm": 0.9374060034751892, + "learning_rate": 3.503005623424472e-05, + "loss": 5.4492, + "step": 3088 + }, + { + "epoch": 0.3, + "grad_norm": 0.9899947643280029, + "learning_rate": 3.501066511537716e-05, + "loss": 5.4255, + "step": 3092 + }, + { + "epoch": 0.3, + "grad_norm": 1.0138983726501465, + "learning_rate": 3.49912739965096e-05, + "loss": 5.4234, + "step": 3096 + }, + { + "epoch": 0.3, + "grad_norm": 0.9544614553451538, + "learning_rate": 3.497188287764204e-05, + "loss": 5.4781, + "step": 3100 + }, + { + "epoch": 0.3, + "grad_norm": 1.0117629766464233, + "learning_rate": 3.495249175877448e-05, + "loss": 5.4445, + "step": 3104 + }, + { + "epoch": 0.3, + "grad_norm": 1.0312600135803223, + "learning_rate": 3.493310063990692e-05, + "loss": 5.439, + "step": 3108 + }, + { + "epoch": 0.3, + "grad_norm": 0.9983291029930115, + "learning_rate": 3.491370952103937e-05, + "loss": 5.5003, + "step": 3112 + }, + { + "epoch": 0.3, + "grad_norm": 1.0281239748001099, + "learning_rate": 3.4894318402171804e-05, + "loss": 5.4276, + "step": 3116 + }, + { + "epoch": 0.3, + "grad_norm": 0.9426625370979309, + "learning_rate": 3.487492728330425e-05, + "loss": 5.5267, + "step": 3120 + }, + { + "epoch": 0.3, + "grad_norm": 1.0768100023269653, + "learning_rate": 3.485553616443669e-05, + "loss": 5.5023, + "step": 3124 + }, + { + "epoch": 0.3, + "grad_norm": 1.0463875532150269, + "learning_rate": 3.483614504556913e-05, + "loss": 5.4019, + "step": 3128 + }, + { + "epoch": 0.3, + "grad_norm": 0.9380079507827759, + "learning_rate": 3.481675392670157e-05, + "loss": 5.5364, + "step": 3132 + }, + { + "epoch": 0.3, + "grad_norm": 0.9299972653388977, + "learning_rate": 3.4797362807834013e-05, + "loss": 5.4026, + "step": 3136 + }, + { + "epoch": 0.3, + "grad_norm": 0.9217830300331116, + "learning_rate": 3.4777971688966454e-05, + "loss": 5.6161, + "step": 3140 + }, + { + "epoch": 0.3, + "grad_norm": 0.983069121837616, + "learning_rate": 3.4758580570098894e-05, + "loss": 5.4112, + "step": 3144 + }, + { + "epoch": 0.31, + "grad_norm": 1.0096659660339355, + "learning_rate": 3.4739189451231335e-05, + "loss": 5.4705, + "step": 3148 + }, + { + "epoch": 0.31, + "grad_norm": 0.9388656616210938, + "learning_rate": 3.4719798332363776e-05, + "loss": 5.4713, + "step": 3152 + }, + { + "epoch": 0.31, + "grad_norm": 1.0393484830856323, + "learning_rate": 3.470040721349622e-05, + "loss": 5.4147, + "step": 3156 + }, + { + "epoch": 0.31, + "grad_norm": 1.001868724822998, + "learning_rate": 3.468101609462866e-05, + "loss": 5.3701, + "step": 3160 + }, + { + "epoch": 0.31, + "grad_norm": 0.9857000708580017, + "learning_rate": 3.4661624975761104e-05, + "loss": 5.4891, + "step": 3164 + }, + { + "epoch": 0.31, + "grad_norm": 1.0586354732513428, + "learning_rate": 3.4642233856893544e-05, + "loss": 5.4801, + "step": 3168 + }, + { + "epoch": 0.31, + "grad_norm": 1.0346976518630981, + "learning_rate": 3.4622842738025985e-05, + "loss": 5.4555, + "step": 3172 + }, + { + "epoch": 0.31, + "grad_norm": 0.9425565004348755, + "learning_rate": 3.4603451619158426e-05, + "loss": 5.4126, + "step": 3176 + }, + { + "epoch": 0.31, + "grad_norm": 0.984109103679657, + "learning_rate": 3.4584060500290866e-05, + "loss": 5.3515, + "step": 3180 + }, + { + "epoch": 0.31, + "grad_norm": 1.0639657974243164, + "learning_rate": 3.456466938142331e-05, + "loss": 5.3965, + "step": 3184 + }, + { + "epoch": 0.31, + "grad_norm": 0.9687911868095398, + "learning_rate": 3.4545278262555754e-05, + "loss": 5.4377, + "step": 3188 + }, + { + "epoch": 0.31, + "grad_norm": 1.1144814491271973, + "learning_rate": 3.452588714368819e-05, + "loss": 5.5135, + "step": 3192 + }, + { + "epoch": 0.31, + "grad_norm": 0.94063401222229, + "learning_rate": 3.4506496024820635e-05, + "loss": 5.4472, + "step": 3196 + }, + { + "epoch": 0.31, + "grad_norm": 0.9204466938972473, + "learning_rate": 3.4487104905953076e-05, + "loss": 5.3214, + "step": 3200 + }, + { + "epoch": 0.31, + "grad_norm": 1.0223456621170044, + "learning_rate": 3.4467713787085516e-05, + "loss": 5.5622, + "step": 3204 + }, + { + "epoch": 0.31, + "grad_norm": 0.9197329878807068, + "learning_rate": 3.444832266821796e-05, + "loss": 5.4799, + "step": 3208 + }, + { + "epoch": 0.31, + "grad_norm": 0.9634442329406738, + "learning_rate": 3.4428931549350404e-05, + "loss": 5.3426, + "step": 3212 + }, + { + "epoch": 0.31, + "grad_norm": 1.0423585176467896, + "learning_rate": 3.440954043048284e-05, + "loss": 5.4419, + "step": 3216 + }, + { + "epoch": 0.31, + "grad_norm": 1.0101354122161865, + "learning_rate": 3.4390149311615285e-05, + "loss": 5.4811, + "step": 3220 + }, + { + "epoch": 0.31, + "grad_norm": 1.0531187057495117, + "learning_rate": 3.437075819274772e-05, + "loss": 5.4645, + "step": 3224 + }, + { + "epoch": 0.31, + "grad_norm": 1.0495266914367676, + "learning_rate": 3.4351367073880166e-05, + "loss": 5.4345, + "step": 3228 + }, + { + "epoch": 0.31, + "grad_norm": 1.029811978340149, + "learning_rate": 3.433197595501261e-05, + "loss": 5.4607, + "step": 3232 + }, + { + "epoch": 0.31, + "grad_norm": 0.9279022812843323, + "learning_rate": 3.431258483614505e-05, + "loss": 5.4771, + "step": 3236 + }, + { + "epoch": 0.31, + "grad_norm": 1.026243805885315, + "learning_rate": 3.429319371727749e-05, + "loss": 5.4841, + "step": 3240 + }, + { + "epoch": 0.31, + "grad_norm": 0.9428079724311829, + "learning_rate": 3.4273802598409935e-05, + "loss": 5.5417, + "step": 3244 + }, + { + "epoch": 0.31, + "grad_norm": 0.9030874967575073, + "learning_rate": 3.425441147954237e-05, + "loss": 5.443, + "step": 3248 + }, + { + "epoch": 0.32, + "grad_norm": 0.981732189655304, + "learning_rate": 3.4235020360674816e-05, + "loss": 5.5341, + "step": 3252 + }, + { + "epoch": 0.32, + "grad_norm": 0.9747270941734314, + "learning_rate": 3.421562924180726e-05, + "loss": 5.4236, + "step": 3256 + }, + { + "epoch": 0.32, + "grad_norm": 0.9781522154808044, + "learning_rate": 3.41962381229397e-05, + "loss": 5.4404, + "step": 3260 + }, + { + "epoch": 0.32, + "grad_norm": 0.9788567423820496, + "learning_rate": 3.417684700407214e-05, + "loss": 5.3855, + "step": 3264 + }, + { + "epoch": 0.32, + "grad_norm": 0.9978493452072144, + "learning_rate": 3.415745588520458e-05, + "loss": 5.3939, + "step": 3268 + }, + { + "epoch": 0.32, + "grad_norm": 1.0338048934936523, + "learning_rate": 3.413806476633702e-05, + "loss": 5.464, + "step": 3272 + }, + { + "epoch": 0.32, + "grad_norm": 0.9052521586418152, + "learning_rate": 3.411867364746946e-05, + "loss": 5.3242, + "step": 3276 + }, + { + "epoch": 0.32, + "grad_norm": 0.9726389050483704, + "learning_rate": 3.40992825286019e-05, + "loss": 5.4116, + "step": 3280 + }, + { + "epoch": 0.32, + "grad_norm": 0.987234354019165, + "learning_rate": 3.407989140973434e-05, + "loss": 5.506, + "step": 3284 + }, + { + "epoch": 0.32, + "grad_norm": 1.0017744302749634, + "learning_rate": 3.406050029086679e-05, + "loss": 5.5025, + "step": 3288 + }, + { + "epoch": 0.32, + "grad_norm": 0.9768481850624084, + "learning_rate": 3.404110917199922e-05, + "loss": 5.484, + "step": 3292 + }, + { + "epoch": 0.32, + "grad_norm": 0.9455767273902893, + "learning_rate": 3.402171805313167e-05, + "loss": 5.4146, + "step": 3296 + }, + { + "epoch": 0.32, + "grad_norm": 0.9832062721252441, + "learning_rate": 3.400232693426411e-05, + "loss": 5.4591, + "step": 3300 + }, + { + "epoch": 0.32, + "grad_norm": 0.8915082216262817, + "learning_rate": 3.398293581539655e-05, + "loss": 5.3985, + "step": 3304 + }, + { + "epoch": 0.32, + "grad_norm": 1.0324420928955078, + "learning_rate": 3.396354469652899e-05, + "loss": 5.5157, + "step": 3308 + }, + { + "epoch": 0.32, + "grad_norm": 1.0704431533813477, + "learning_rate": 3.394415357766143e-05, + "loss": 5.4155, + "step": 3312 + }, + { + "epoch": 0.32, + "grad_norm": 0.9540812373161316, + "learning_rate": 3.392476245879387e-05, + "loss": 5.418, + "step": 3316 + }, + { + "epoch": 0.32, + "grad_norm": 1.0278005599975586, + "learning_rate": 3.390537133992632e-05, + "loss": 5.4143, + "step": 3320 + }, + { + "epoch": 0.32, + "grad_norm": 1.0574851036071777, + "learning_rate": 3.388598022105875e-05, + "loss": 5.3889, + "step": 3324 + }, + { + "epoch": 0.32, + "grad_norm": 1.1078550815582275, + "learning_rate": 3.38665891021912e-05, + "loss": 5.4035, + "step": 3328 + }, + { + "epoch": 0.32, + "grad_norm": 0.9876176714897156, + "learning_rate": 3.384719798332364e-05, + "loss": 5.4488, + "step": 3332 + }, + { + "epoch": 0.32, + "grad_norm": 0.9665130376815796, + "learning_rate": 3.382780686445608e-05, + "loss": 5.3989, + "step": 3336 + }, + { + "epoch": 0.32, + "grad_norm": 1.0209985971450806, + "learning_rate": 3.380841574558852e-05, + "loss": 5.4641, + "step": 3340 + }, + { + "epoch": 0.32, + "grad_norm": 0.920693039894104, + "learning_rate": 3.378902462672096e-05, + "loss": 5.3932, + "step": 3344 + }, + { + "epoch": 0.32, + "grad_norm": 1.0622704029083252, + "learning_rate": 3.37696335078534e-05, + "loss": 5.3708, + "step": 3348 + }, + { + "epoch": 0.32, + "grad_norm": 1.018336296081543, + "learning_rate": 3.375024238898585e-05, + "loss": 5.4366, + "step": 3352 + }, + { + "epoch": 0.33, + "grad_norm": 0.9588587284088135, + "learning_rate": 3.3730851270118284e-05, + "loss": 5.4806, + "step": 3356 + }, + { + "epoch": 0.33, + "grad_norm": 0.9901473522186279, + "learning_rate": 3.371146015125073e-05, + "loss": 5.4993, + "step": 3360 + }, + { + "epoch": 0.33, + "grad_norm": 1.0226725339889526, + "learning_rate": 3.369206903238317e-05, + "loss": 5.4429, + "step": 3364 + }, + { + "epoch": 0.33, + "grad_norm": 1.0187616348266602, + "learning_rate": 3.367267791351561e-05, + "loss": 5.4496, + "step": 3368 + }, + { + "epoch": 0.33, + "grad_norm": 0.9235848188400269, + "learning_rate": 3.365328679464805e-05, + "loss": 5.4033, + "step": 3372 + }, + { + "epoch": 0.33, + "grad_norm": 0.9983669519424438, + "learning_rate": 3.36338956757805e-05, + "loss": 5.3515, + "step": 3376 + }, + { + "epoch": 0.33, + "grad_norm": 0.9427633285522461, + "learning_rate": 3.3614504556912934e-05, + "loss": 5.5194, + "step": 3380 + }, + { + "epoch": 0.33, + "grad_norm": 0.9658553004264832, + "learning_rate": 3.359511343804538e-05, + "loss": 5.489, + "step": 3384 + }, + { + "epoch": 0.33, + "grad_norm": 0.9263963103294373, + "learning_rate": 3.3575722319177815e-05, + "loss": 5.4182, + "step": 3388 + }, + { + "epoch": 0.33, + "grad_norm": 1.0251826047897339, + "learning_rate": 3.355633120031026e-05, + "loss": 5.5208, + "step": 3392 + }, + { + "epoch": 0.33, + "grad_norm": 0.9037973880767822, + "learning_rate": 3.35369400814427e-05, + "loss": 5.3744, + "step": 3396 + }, + { + "epoch": 0.33, + "grad_norm": 1.096735954284668, + "learning_rate": 3.351754896257514e-05, + "loss": 5.4218, + "step": 3400 + }, + { + "epoch": 0.33, + "grad_norm": 0.9776617884635925, + "learning_rate": 3.3498157843707584e-05, + "loss": 5.4539, + "step": 3404 + }, + { + "epoch": 0.33, + "grad_norm": 1.0098875761032104, + "learning_rate": 3.3478766724840024e-05, + "loss": 5.4252, + "step": 3408 + }, + { + "epoch": 0.33, + "grad_norm": 1.0725504159927368, + "learning_rate": 3.3459375605972465e-05, + "loss": 5.4702, + "step": 3412 + }, + { + "epoch": 0.33, + "grad_norm": 0.8836826086044312, + "learning_rate": 3.3439984487104905e-05, + "loss": 5.4135, + "step": 3416 + }, + { + "epoch": 0.33, + "grad_norm": 0.9869784116744995, + "learning_rate": 3.3420593368237346e-05, + "loss": 5.4617, + "step": 3420 + }, + { + "epoch": 0.33, + "grad_norm": 1.0457979440689087, + "learning_rate": 3.3401202249369786e-05, + "loss": 5.4872, + "step": 3424 + }, + { + "epoch": 0.33, + "grad_norm": 1.0574203729629517, + "learning_rate": 3.3381811130502234e-05, + "loss": 5.4167, + "step": 3428 + }, + { + "epoch": 0.33, + "grad_norm": 1.001185655593872, + "learning_rate": 3.3362420011634674e-05, + "loss": 5.4038, + "step": 3432 + }, + { + "epoch": 0.33, + "grad_norm": 1.0334811210632324, + "learning_rate": 3.3343028892767115e-05, + "loss": 5.3715, + "step": 3436 + }, + { + "epoch": 0.33, + "grad_norm": 1.0406103134155273, + "learning_rate": 3.3323637773899555e-05, + "loss": 5.4744, + "step": 3440 + }, + { + "epoch": 0.33, + "grad_norm": 0.9958846569061279, + "learning_rate": 3.3304246655031996e-05, + "loss": 5.4423, + "step": 3444 + }, + { + "epoch": 0.33, + "grad_norm": 0.9611808657646179, + "learning_rate": 3.3284855536164436e-05, + "loss": 5.4374, + "step": 3448 + }, + { + "epoch": 0.33, + "grad_norm": 1.0334917306900024, + "learning_rate": 3.3265464417296884e-05, + "loss": 5.3888, + "step": 3452 + }, + { + "epoch": 0.34, + "grad_norm": 0.9894860982894897, + "learning_rate": 3.324607329842932e-05, + "loss": 5.5356, + "step": 3456 + }, + { + "epoch": 0.34, + "grad_norm": 0.9536516070365906, + "learning_rate": 3.3226682179561765e-05, + "loss": 5.451, + "step": 3460 + }, + { + "epoch": 0.34, + "grad_norm": 1.0449178218841553, + "learning_rate": 3.32072910606942e-05, + "loss": 5.4379, + "step": 3464 + }, + { + "epoch": 0.34, + "grad_norm": 0.958135724067688, + "learning_rate": 3.3187899941826646e-05, + "loss": 5.394, + "step": 3468 + }, + { + "epoch": 0.34, + "grad_norm": 1.049261212348938, + "learning_rate": 3.3168508822959086e-05, + "loss": 5.4298, + "step": 3472 + }, + { + "epoch": 0.34, + "grad_norm": 1.018259048461914, + "learning_rate": 3.314911770409153e-05, + "loss": 5.3572, + "step": 3476 + }, + { + "epoch": 0.34, + "grad_norm": 0.969048798084259, + "learning_rate": 3.312972658522397e-05, + "loss": 5.4854, + "step": 3480 + }, + { + "epoch": 0.34, + "grad_norm": 1.0267409086227417, + "learning_rate": 3.3110335466356415e-05, + "loss": 5.4474, + "step": 3484 + }, + { + "epoch": 0.34, + "grad_norm": 0.9933199286460876, + "learning_rate": 3.309094434748885e-05, + "loss": 5.4446, + "step": 3488 + }, + { + "epoch": 0.34, + "grad_norm": 1.0801371335983276, + "learning_rate": 3.3071553228621296e-05, + "loss": 5.4434, + "step": 3492 + }, + { + "epoch": 0.34, + "grad_norm": 0.953080415725708, + "learning_rate": 3.3052162109753736e-05, + "loss": 5.4595, + "step": 3496 + }, + { + "epoch": 0.34, + "grad_norm": 0.9589456915855408, + "learning_rate": 3.303277099088618e-05, + "loss": 5.433, + "step": 3500 + }, + { + "epoch": 0.34, + "grad_norm": 0.9284895658493042, + "learning_rate": 3.301337987201862e-05, + "loss": 5.4761, + "step": 3504 + }, + { + "epoch": 0.34, + "grad_norm": 0.9796357154846191, + "learning_rate": 3.299398875315106e-05, + "loss": 5.3998, + "step": 3508 + }, + { + "epoch": 0.34, + "grad_norm": 0.9407968521118164, + "learning_rate": 3.29745976342835e-05, + "loss": 5.3964, + "step": 3512 + }, + { + "epoch": 0.34, + "grad_norm": 1.0362569093704224, + "learning_rate": 3.2955206515415946e-05, + "loss": 5.5405, + "step": 3516 + }, + { + "epoch": 0.34, + "grad_norm": 1.024808645248413, + "learning_rate": 3.293581539654838e-05, + "loss": 5.3888, + "step": 3520 + }, + { + "epoch": 0.34, + "grad_norm": 1.185713291168213, + "learning_rate": 3.291642427768083e-05, + "loss": 5.3961, + "step": 3524 + }, + { + "epoch": 0.34, + "grad_norm": 0.9882110357284546, + "learning_rate": 3.289703315881327e-05, + "loss": 5.4918, + "step": 3528 + }, + { + "epoch": 0.34, + "grad_norm": 0.9622325897216797, + "learning_rate": 3.287764203994571e-05, + "loss": 5.409, + "step": 3532 + }, + { + "epoch": 0.34, + "grad_norm": 0.9925034642219543, + "learning_rate": 3.285825092107815e-05, + "loss": 5.4252, + "step": 3536 + }, + { + "epoch": 0.34, + "grad_norm": 1.0400789976119995, + "learning_rate": 3.283885980221059e-05, + "loss": 5.4415, + "step": 3540 + }, + { + "epoch": 0.34, + "grad_norm": 1.0452476739883423, + "learning_rate": 3.281946868334303e-05, + "loss": 5.4926, + "step": 3544 + }, + { + "epoch": 0.34, + "grad_norm": 1.106879472732544, + "learning_rate": 3.280007756447547e-05, + "loss": 5.3975, + "step": 3548 + }, + { + "epoch": 0.34, + "grad_norm": 0.9826605319976807, + "learning_rate": 3.278068644560791e-05, + "loss": 5.5062, + "step": 3552 + }, + { + "epoch": 0.34, + "grad_norm": 1.0423948764801025, + "learning_rate": 3.276129532674035e-05, + "loss": 5.4536, + "step": 3556 + }, + { + "epoch": 0.35, + "grad_norm": 1.0544408559799194, + "learning_rate": 3.27419042078728e-05, + "loss": 5.4063, + "step": 3560 + }, + { + "epoch": 0.35, + "grad_norm": 0.9723476767539978, + "learning_rate": 3.272251308900524e-05, + "loss": 5.5508, + "step": 3564 + }, + { + "epoch": 0.35, + "grad_norm": 1.0550904273986816, + "learning_rate": 3.270312197013768e-05, + "loss": 5.3544, + "step": 3568 + }, + { + "epoch": 0.35, + "grad_norm": 1.0153075456619263, + "learning_rate": 3.268373085127012e-05, + "loss": 5.4197, + "step": 3572 + }, + { + "epoch": 0.35, + "grad_norm": 0.9704105854034424, + "learning_rate": 3.266433973240256e-05, + "loss": 5.4494, + "step": 3576 + }, + { + "epoch": 0.35, + "grad_norm": 1.03487229347229, + "learning_rate": 3.2644948613535e-05, + "loss": 5.4456, + "step": 3580 + }, + { + "epoch": 0.35, + "grad_norm": 0.9594029784202576, + "learning_rate": 3.262555749466744e-05, + "loss": 5.4308, + "step": 3584 + }, + { + "epoch": 0.35, + "grad_norm": 0.9959999918937683, + "learning_rate": 3.260616637579988e-05, + "loss": 5.4625, + "step": 3588 + }, + { + "epoch": 0.35, + "grad_norm": 1.0654551982879639, + "learning_rate": 3.258677525693233e-05, + "loss": 5.4254, + "step": 3592 + }, + { + "epoch": 0.35, + "grad_norm": 0.9292247295379639, + "learning_rate": 3.256738413806476e-05, + "loss": 5.4191, + "step": 3596 + }, + { + "epoch": 0.35, + "grad_norm": 0.9685704708099365, + "learning_rate": 3.254799301919721e-05, + "loss": 5.4356, + "step": 3600 + }, + { + "epoch": 0.35, + "grad_norm": 1.0221501588821411, + "learning_rate": 3.252860190032965e-05, + "loss": 5.3416, + "step": 3604 + }, + { + "epoch": 0.35, + "grad_norm": 0.9916023015975952, + "learning_rate": 3.250921078146209e-05, + "loss": 5.5014, + "step": 3608 + }, + { + "epoch": 0.35, + "grad_norm": 0.9550511240959167, + "learning_rate": 3.248981966259453e-05, + "loss": 5.4269, + "step": 3612 + }, + { + "epoch": 0.35, + "grad_norm": 1.0043843984603882, + "learning_rate": 3.247042854372698e-05, + "loss": 5.4404, + "step": 3616 + }, + { + "epoch": 0.35, + "grad_norm": 0.9808074235916138, + "learning_rate": 3.245103742485941e-05, + "loss": 5.4492, + "step": 3620 + }, + { + "epoch": 0.35, + "grad_norm": 0.9921736121177673, + "learning_rate": 3.243164630599186e-05, + "loss": 5.358, + "step": 3624 + }, + { + "epoch": 0.35, + "grad_norm": 1.0501856803894043, + "learning_rate": 3.2412255187124294e-05, + "loss": 5.3826, + "step": 3628 + }, + { + "epoch": 0.35, + "grad_norm": 1.0625114440917969, + "learning_rate": 3.239286406825674e-05, + "loss": 5.3954, + "step": 3632 + }, + { + "epoch": 0.35, + "grad_norm": 0.9541780948638916, + "learning_rate": 3.237347294938918e-05, + "loss": 5.4487, + "step": 3636 + }, + { + "epoch": 0.35, + "grad_norm": 1.0044699907302856, + "learning_rate": 3.235408183052162e-05, + "loss": 5.4098, + "step": 3640 + }, + { + "epoch": 0.35, + "grad_norm": 0.9578327536582947, + "learning_rate": 3.233469071165406e-05, + "loss": 5.4887, + "step": 3644 + }, + { + "epoch": 0.35, + "grad_norm": 0.9381611347198486, + "learning_rate": 3.231529959278651e-05, + "loss": 5.3075, + "step": 3648 + }, + { + "epoch": 0.35, + "grad_norm": 0.9474562406539917, + "learning_rate": 3.2295908473918944e-05, + "loss": 5.6028, + "step": 3652 + }, + { + "epoch": 0.35, + "grad_norm": 0.9133497476577759, + "learning_rate": 3.227651735505139e-05, + "loss": 5.2818, + "step": 3656 + }, + { + "epoch": 0.35, + "grad_norm": 1.0017893314361572, + "learning_rate": 3.2257126236183825e-05, + "loss": 5.4843, + "step": 3660 + }, + { + "epoch": 0.36, + "grad_norm": 1.0744807720184326, + "learning_rate": 3.223773511731627e-05, + "loss": 5.3414, + "step": 3664 + }, + { + "epoch": 0.36, + "grad_norm": 0.9486330151557922, + "learning_rate": 3.221834399844871e-05, + "loss": 5.4186, + "step": 3668 + }, + { + "epoch": 0.36, + "grad_norm": 0.9700675010681152, + "learning_rate": 3.2198952879581154e-05, + "loss": 5.3885, + "step": 3672 + }, + { + "epoch": 0.36, + "grad_norm": 1.0845935344696045, + "learning_rate": 3.2179561760713594e-05, + "loss": 5.4627, + "step": 3676 + }, + { + "epoch": 0.36, + "grad_norm": 1.0409663915634155, + "learning_rate": 3.2160170641846035e-05, + "loss": 5.4881, + "step": 3680 + }, + { + "epoch": 0.36, + "grad_norm": 0.9628760814666748, + "learning_rate": 3.2140779522978475e-05, + "loss": 5.4565, + "step": 3684 + }, + { + "epoch": 0.36, + "grad_norm": 0.948780357837677, + "learning_rate": 3.2121388404110916e-05, + "loss": 5.4444, + "step": 3688 + }, + { + "epoch": 0.36, + "grad_norm": 0.9025591015815735, + "learning_rate": 3.210199728524336e-05, + "loss": 5.329, + "step": 3692 + }, + { + "epoch": 0.36, + "grad_norm": 1.059144377708435, + "learning_rate": 3.2082606166375804e-05, + "loss": 5.4179, + "step": 3696 + }, + { + "epoch": 0.36, + "grad_norm": 0.9723076820373535, + "learning_rate": 3.2063215047508244e-05, + "loss": 5.3901, + "step": 3700 + }, + { + "epoch": 0.36, + "grad_norm": 1.0358588695526123, + "learning_rate": 3.2043823928640685e-05, + "loss": 5.4644, + "step": 3704 + }, + { + "epoch": 0.36, + "grad_norm": 1.0152002573013306, + "learning_rate": 3.2024432809773125e-05, + "loss": 5.5068, + "step": 3708 + }, + { + "epoch": 0.36, + "grad_norm": 1.0357673168182373, + "learning_rate": 3.2005041690905566e-05, + "loss": 5.5428, + "step": 3712 + }, + { + "epoch": 0.36, + "grad_norm": 1.0267854928970337, + "learning_rate": 3.1985650572038006e-05, + "loss": 5.4718, + "step": 3716 + }, + { + "epoch": 0.36, + "grad_norm": 0.9849284887313843, + "learning_rate": 3.196625945317045e-05, + "loss": 5.3864, + "step": 3720 + }, + { + "epoch": 0.36, + "grad_norm": 1.0108904838562012, + "learning_rate": 3.1946868334302894e-05, + "loss": 5.4425, + "step": 3724 + }, + { + "epoch": 0.36, + "grad_norm": 1.0127886533737183, + "learning_rate": 3.192747721543533e-05, + "loss": 5.3571, + "step": 3728 + }, + { + "epoch": 0.36, + "grad_norm": 0.9837380051612854, + "learning_rate": 3.1908086096567775e-05, + "loss": 5.4701, + "step": 3732 + }, + { + "epoch": 0.36, + "grad_norm": 1.0696730613708496, + "learning_rate": 3.1888694977700216e-05, + "loss": 5.5145, + "step": 3736 + }, + { + "epoch": 0.36, + "grad_norm": 0.9534905552864075, + "learning_rate": 3.1869303858832656e-05, + "loss": 5.3795, + "step": 3740 + }, + { + "epoch": 0.36, + "grad_norm": 1.048189401626587, + "learning_rate": 3.18499127399651e-05, + "loss": 5.3971, + "step": 3744 + }, + { + "epoch": 0.36, + "grad_norm": 1.0402449369430542, + "learning_rate": 3.183052162109754e-05, + "loss": 5.4111, + "step": 3748 + }, + { + "epoch": 0.36, + "grad_norm": 1.022661805152893, + "learning_rate": 3.181113050222998e-05, + "loss": 5.4513, + "step": 3752 + }, + { + "epoch": 0.36, + "grad_norm": 0.9560799598693848, + "learning_rate": 3.1791739383362425e-05, + "loss": 5.3978, + "step": 3756 + }, + { + "epoch": 0.36, + "grad_norm": 0.9077816605567932, + "learning_rate": 3.177234826449486e-05, + "loss": 5.3508, + "step": 3760 + }, + { + "epoch": 0.36, + "grad_norm": 1.029691219329834, + "learning_rate": 3.1752957145627306e-05, + "loss": 5.3808, + "step": 3764 + }, + { + "epoch": 0.37, + "grad_norm": 0.9867807626724243, + "learning_rate": 3.173356602675975e-05, + "loss": 5.4433, + "step": 3768 + }, + { + "epoch": 0.37, + "grad_norm": 1.0569761991500854, + "learning_rate": 3.171417490789219e-05, + "loss": 5.4479, + "step": 3772 + }, + { + "epoch": 0.37, + "grad_norm": 1.0014142990112305, + "learning_rate": 3.169478378902463e-05, + "loss": 5.3415, + "step": 3776 + }, + { + "epoch": 0.37, + "grad_norm": 0.9502798318862915, + "learning_rate": 3.167539267015707e-05, + "loss": 5.4549, + "step": 3780 + }, + { + "epoch": 0.37, + "grad_norm": 0.9451482892036438, + "learning_rate": 3.165600155128951e-05, + "loss": 5.43, + "step": 3784 + }, + { + "epoch": 0.37, + "grad_norm": 1.002482295036316, + "learning_rate": 3.1636610432421956e-05, + "loss": 5.4482, + "step": 3788 + }, + { + "epoch": 0.37, + "grad_norm": 1.0348658561706543, + "learning_rate": 3.161721931355439e-05, + "loss": 5.471, + "step": 3792 + }, + { + "epoch": 0.37, + "grad_norm": 0.9318069815635681, + "learning_rate": 3.159782819468684e-05, + "loss": 5.3646, + "step": 3796 + }, + { + "epoch": 0.37, + "grad_norm": 1.1141440868377686, + "learning_rate": 3.157843707581928e-05, + "loss": 5.4423, + "step": 3800 + }, + { + "epoch": 0.37, + "grad_norm": 1.066623568534851, + "learning_rate": 3.155904595695172e-05, + "loss": 5.3612, + "step": 3804 + }, + { + "epoch": 0.37, + "grad_norm": 1.0264089107513428, + "learning_rate": 3.153965483808416e-05, + "loss": 5.383, + "step": 3808 + }, + { + "epoch": 0.37, + "grad_norm": 1.0358729362487793, + "learning_rate": 3.15202637192166e-05, + "loss": 5.5152, + "step": 3812 + }, + { + "epoch": 0.37, + "grad_norm": 0.9304607510566711, + "learning_rate": 3.150087260034904e-05, + "loss": 5.3978, + "step": 3816 + }, + { + "epoch": 0.37, + "grad_norm": 1.0691871643066406, + "learning_rate": 3.148148148148148e-05, + "loss": 5.5089, + "step": 3820 + }, + { + "epoch": 0.37, + "grad_norm": 1.0088552236557007, + "learning_rate": 3.146209036261392e-05, + "loss": 5.3989, + "step": 3824 + }, + { + "epoch": 0.37, + "grad_norm": 0.9631534218788147, + "learning_rate": 3.144269924374637e-05, + "loss": 5.3578, + "step": 3828 + }, + { + "epoch": 0.37, + "grad_norm": 0.964043378829956, + "learning_rate": 3.142330812487881e-05, + "loss": 5.4551, + "step": 3832 + }, + { + "epoch": 0.37, + "grad_norm": 1.0570262670516968, + "learning_rate": 3.140391700601125e-05, + "loss": 5.4109, + "step": 3836 + }, + { + "epoch": 0.37, + "grad_norm": 0.975766658782959, + "learning_rate": 3.138452588714369e-05, + "loss": 5.4265, + "step": 3840 + }, + { + "epoch": 0.37, + "grad_norm": 0.9857097268104553, + "learning_rate": 3.136513476827613e-05, + "loss": 5.4179, + "step": 3844 + }, + { + "epoch": 0.37, + "grad_norm": 0.9902443885803223, + "learning_rate": 3.134574364940857e-05, + "loss": 5.3386, + "step": 3848 + }, + { + "epoch": 0.37, + "grad_norm": 0.974892795085907, + "learning_rate": 3.132635253054101e-05, + "loss": 5.3901, + "step": 3852 + }, + { + "epoch": 0.37, + "grad_norm": 0.9388407468795776, + "learning_rate": 3.130696141167346e-05, + "loss": 5.5098, + "step": 3856 + }, + { + "epoch": 0.37, + "grad_norm": 1.0491032600402832, + "learning_rate": 3.128757029280589e-05, + "loss": 5.2734, + "step": 3860 + }, + { + "epoch": 0.37, + "grad_norm": 0.9660788178443909, + "learning_rate": 3.126817917393834e-05, + "loss": 5.347, + "step": 3864 + }, + { + "epoch": 0.38, + "grad_norm": 0.9738529324531555, + "learning_rate": 3.1248788055070774e-05, + "loss": 5.4238, + "step": 3868 + }, + { + "epoch": 0.38, + "grad_norm": 0.9557338356971741, + "learning_rate": 3.122939693620322e-05, + "loss": 5.3881, + "step": 3872 + }, + { + "epoch": 0.38, + "grad_norm": 1.1574413776397705, + "learning_rate": 3.121000581733566e-05, + "loss": 5.372, + "step": 3876 + }, + { + "epoch": 0.38, + "grad_norm": 1.046978235244751, + "learning_rate": 3.11906146984681e-05, + "loss": 5.3993, + "step": 3880 + }, + { + "epoch": 0.38, + "grad_norm": 1.0011602640151978, + "learning_rate": 3.117122357960054e-05, + "loss": 5.3449, + "step": 3884 + }, + { + "epoch": 0.38, + "grad_norm": 1.0066341161727905, + "learning_rate": 3.115183246073299e-05, + "loss": 5.5072, + "step": 3888 + }, + { + "epoch": 0.38, + "grad_norm": 0.9333812594413757, + "learning_rate": 3.1132441341865424e-05, + "loss": 5.4166, + "step": 3892 + }, + { + "epoch": 0.38, + "grad_norm": 0.9771477580070496, + "learning_rate": 3.111305022299787e-05, + "loss": 5.4957, + "step": 3896 + }, + { + "epoch": 0.38, + "grad_norm": 0.9672766923904419, + "learning_rate": 3.1093659104130305e-05, + "loss": 5.5576, + "step": 3900 + }, + { + "epoch": 0.38, + "grad_norm": 1.0771820545196533, + "learning_rate": 3.107426798526275e-05, + "loss": 5.432, + "step": 3904 + }, + { + "epoch": 0.38, + "grad_norm": 0.9599045515060425, + "learning_rate": 3.105487686639519e-05, + "loss": 5.3826, + "step": 3908 + }, + { + "epoch": 0.38, + "grad_norm": 1.0193896293640137, + "learning_rate": 3.1035485747527633e-05, + "loss": 5.3933, + "step": 3912 + }, + { + "epoch": 0.38, + "grad_norm": 0.9940204620361328, + "learning_rate": 3.1016094628660074e-05, + "loss": 5.4366, + "step": 3916 + }, + { + "epoch": 0.38, + "grad_norm": 0.9993325471878052, + "learning_rate": 3.099670350979252e-05, + "loss": 5.3764, + "step": 3920 + }, + { + "epoch": 0.38, + "grad_norm": 1.0441081523895264, + "learning_rate": 3.0977312390924955e-05, + "loss": 5.4612, + "step": 3924 + }, + { + "epoch": 0.38, + "grad_norm": 0.971049964427948, + "learning_rate": 3.09579212720574e-05, + "loss": 5.365, + "step": 3928 + }, + { + "epoch": 0.38, + "grad_norm": 0.9899376034736633, + "learning_rate": 3.093853015318984e-05, + "loss": 5.4119, + "step": 3932 + }, + { + "epoch": 0.38, + "grad_norm": 0.965894341468811, + "learning_rate": 3.0919139034322283e-05, + "loss": 5.5259, + "step": 3936 + }, + { + "epoch": 0.38, + "grad_norm": 1.007863998413086, + "learning_rate": 3.0899747915454724e-05, + "loss": 5.4325, + "step": 3940 + }, + { + "epoch": 0.38, + "grad_norm": 1.032435417175293, + "learning_rate": 3.0880356796587164e-05, + "loss": 5.5254, + "step": 3944 + }, + { + "epoch": 0.38, + "grad_norm": 0.9465643167495728, + "learning_rate": 3.0860965677719605e-05, + "loss": 5.443, + "step": 3948 + }, + { + "epoch": 0.38, + "grad_norm": 1.112241268157959, + "learning_rate": 3.0841574558852046e-05, + "loss": 5.4087, + "step": 3952 + }, + { + "epoch": 0.38, + "grad_norm": 0.9741985201835632, + "learning_rate": 3.0822183439984486e-05, + "loss": 5.4688, + "step": 3956 + }, + { + "epoch": 0.38, + "grad_norm": 1.0749200582504272, + "learning_rate": 3.0802792321116933e-05, + "loss": 5.3697, + "step": 3960 + }, + { + "epoch": 0.38, + "grad_norm": 1.0521398782730103, + "learning_rate": 3.0783401202249374e-05, + "loss": 5.4226, + "step": 3964 + }, + { + "epoch": 0.38, + "grad_norm": 0.995364248752594, + "learning_rate": 3.0764010083381814e-05, + "loss": 5.4333, + "step": 3968 + }, + { + "epoch": 0.39, + "grad_norm": 1.023390769958496, + "learning_rate": 3.0744618964514255e-05, + "loss": 5.4662, + "step": 3972 + }, + { + "epoch": 0.39, + "grad_norm": 0.9972986578941345, + "learning_rate": 3.0725227845646696e-05, + "loss": 5.4432, + "step": 3976 + }, + { + "epoch": 0.39, + "grad_norm": 1.0111867189407349, + "learning_rate": 3.0705836726779136e-05, + "loss": 5.4875, + "step": 3980 + }, + { + "epoch": 0.39, + "grad_norm": 0.930380642414093, + "learning_rate": 3.068644560791158e-05, + "loss": 5.335, + "step": 3984 + }, + { + "epoch": 0.39, + "grad_norm": 1.0326186418533325, + "learning_rate": 3.066705448904402e-05, + "loss": 5.3984, + "step": 3988 + }, + { + "epoch": 0.39, + "grad_norm": 0.9522334933280945, + "learning_rate": 3.064766337017646e-05, + "loss": 5.4895, + "step": 3992 + }, + { + "epoch": 0.39, + "grad_norm": 1.048785924911499, + "learning_rate": 3.0628272251308905e-05, + "loss": 5.3766, + "step": 3996 + }, + { + "epoch": 0.39, + "grad_norm": 1.0069239139556885, + "learning_rate": 3.060888113244134e-05, + "loss": 5.3243, + "step": 4000 + }, + { + "epoch": 0.39, + "grad_norm": 0.8954800963401794, + "learning_rate": 3.0589490013573786e-05, + "loss": 5.3281, + "step": 4004 + }, + { + "epoch": 0.39, + "grad_norm": 1.0596482753753662, + "learning_rate": 3.057009889470623e-05, + "loss": 5.5029, + "step": 4008 + }, + { + "epoch": 0.39, + "grad_norm": 0.9929758310317993, + "learning_rate": 3.055070777583867e-05, + "loss": 5.4649, + "step": 4012 + }, + { + "epoch": 0.39, + "grad_norm": 0.9867807626724243, + "learning_rate": 3.053131665697111e-05, + "loss": 5.3755, + "step": 4016 + }, + { + "epoch": 0.39, + "grad_norm": 0.9822360873222351, + "learning_rate": 3.0511925538103548e-05, + "loss": 5.4723, + "step": 4020 + }, + { + "epoch": 0.39, + "grad_norm": 0.9592394232749939, + "learning_rate": 3.0492534419235992e-05, + "loss": 5.4694, + "step": 4024 + }, + { + "epoch": 0.39, + "grad_norm": 0.993042528629303, + "learning_rate": 3.0473143300368433e-05, + "loss": 5.4091, + "step": 4028 + }, + { + "epoch": 0.39, + "grad_norm": 0.9652045965194702, + "learning_rate": 3.0453752181500873e-05, + "loss": 5.4072, + "step": 4032 + }, + { + "epoch": 0.39, + "grad_norm": 0.9788159728050232, + "learning_rate": 3.0434361062633314e-05, + "loss": 5.4358, + "step": 4036 + }, + { + "epoch": 0.39, + "grad_norm": 0.9906930923461914, + "learning_rate": 3.0414969943765758e-05, + "loss": 5.4567, + "step": 4040 + }, + { + "epoch": 0.39, + "grad_norm": 1.0816450119018555, + "learning_rate": 3.0395578824898195e-05, + "loss": 5.3435, + "step": 4044 + }, + { + "epoch": 0.39, + "grad_norm": 1.0458557605743408, + "learning_rate": 3.037618770603064e-05, + "loss": 5.5151, + "step": 4048 + }, + { + "epoch": 0.39, + "grad_norm": 0.9744831919670105, + "learning_rate": 3.0356796587163083e-05, + "loss": 5.486, + "step": 4052 + }, + { + "epoch": 0.39, + "grad_norm": 0.9657852649688721, + "learning_rate": 3.033740546829552e-05, + "loss": 5.441, + "step": 4056 + }, + { + "epoch": 0.39, + "grad_norm": 0.9370314478874207, + "learning_rate": 3.0318014349427964e-05, + "loss": 5.4117, + "step": 4060 + }, + { + "epoch": 0.39, + "grad_norm": 0.9673814177513123, + "learning_rate": 3.02986232305604e-05, + "loss": 5.5008, + "step": 4064 + }, + { + "epoch": 0.39, + "grad_norm": 1.004071593284607, + "learning_rate": 3.0279232111692845e-05, + "loss": 5.4525, + "step": 4068 + }, + { + "epoch": 0.39, + "grad_norm": 1.0029621124267578, + "learning_rate": 3.025984099282529e-05, + "loss": 5.4788, + "step": 4072 + }, + { + "epoch": 0.4, + "grad_norm": 1.0012478828430176, + "learning_rate": 3.0240449873957726e-05, + "loss": 5.443, + "step": 4076 + }, + { + "epoch": 0.4, + "grad_norm": 0.9706107974052429, + "learning_rate": 3.022105875509017e-05, + "loss": 5.35, + "step": 4080 + }, + { + "epoch": 0.4, + "grad_norm": 0.9983603358268738, + "learning_rate": 3.0201667636222614e-05, + "loss": 5.4215, + "step": 4084 + }, + { + "epoch": 0.4, + "grad_norm": 1.0420852899551392, + "learning_rate": 3.018227651735505e-05, + "loss": 5.486, + "step": 4088 + }, + { + "epoch": 0.4, + "grad_norm": 0.9456035494804382, + "learning_rate": 3.0162885398487495e-05, + "loss": 5.4257, + "step": 4092 + }, + { + "epoch": 0.4, + "grad_norm": 0.9700554609298706, + "learning_rate": 3.014349427961994e-05, + "loss": 5.4415, + "step": 4096 + }, + { + "epoch": 0.4, + "grad_norm": 1.0411101579666138, + "learning_rate": 3.0124103160752376e-05, + "loss": 5.4313, + "step": 4100 + }, + { + "epoch": 0.4, + "grad_norm": 1.0131868124008179, + "learning_rate": 3.010471204188482e-05, + "loss": 5.371, + "step": 4104 + }, + { + "epoch": 0.4, + "grad_norm": 0.9504354000091553, + "learning_rate": 3.0085320923017257e-05, + "loss": 5.4643, + "step": 4108 + }, + { + "epoch": 0.4, + "grad_norm": 0.9660660624504089, + "learning_rate": 3.00659298041497e-05, + "loss": 5.4337, + "step": 4112 + }, + { + "epoch": 0.4, + "grad_norm": 1.0248061418533325, + "learning_rate": 3.0046538685282145e-05, + "loss": 5.4113, + "step": 4116 + }, + { + "epoch": 0.4, + "grad_norm": 1.0795485973358154, + "learning_rate": 3.0027147566414582e-05, + "loss": 5.3737, + "step": 4120 + }, + { + "epoch": 0.4, + "grad_norm": 1.0381568670272827, + "learning_rate": 3.0007756447547026e-05, + "loss": 5.4409, + "step": 4124 + }, + { + "epoch": 0.4, + "grad_norm": 0.9909124970436096, + "learning_rate": 2.998836532867947e-05, + "loss": 5.358, + "step": 4128 + }, + { + "epoch": 0.4, + "grad_norm": 1.0048110485076904, + "learning_rate": 2.9968974209811907e-05, + "loss": 5.4109, + "step": 4132 + }, + { + "epoch": 0.4, + "grad_norm": 0.9982589483261108, + "learning_rate": 2.994958309094435e-05, + "loss": 5.4673, + "step": 4136 + }, + { + "epoch": 0.4, + "grad_norm": 1.010875940322876, + "learning_rate": 2.9930191972076788e-05, + "loss": 5.3333, + "step": 4140 + }, + { + "epoch": 0.4, + "grad_norm": 1.0325456857681274, + "learning_rate": 2.9910800853209232e-05, + "loss": 5.3687, + "step": 4144 + }, + { + "epoch": 0.4, + "grad_norm": 1.057373046875, + "learning_rate": 2.9891409734341676e-05, + "loss": 5.4218, + "step": 4148 + }, + { + "epoch": 0.4, + "grad_norm": 0.975649356842041, + "learning_rate": 2.9872018615474113e-05, + "loss": 5.3694, + "step": 4152 + }, + { + "epoch": 0.4, + "grad_norm": 0.9869158864021301, + "learning_rate": 2.9852627496606557e-05, + "loss": 5.3961, + "step": 4156 + }, + { + "epoch": 0.4, + "grad_norm": 0.9405871033668518, + "learning_rate": 2.9833236377738998e-05, + "loss": 5.3882, + "step": 4160 + }, + { + "epoch": 0.4, + "grad_norm": 0.9522705078125, + "learning_rate": 2.9813845258871438e-05, + "loss": 5.4143, + "step": 4164 + }, + { + "epoch": 0.4, + "grad_norm": 0.9004665613174438, + "learning_rate": 2.979445414000388e-05, + "loss": 5.4774, + "step": 4168 + }, + { + "epoch": 0.4, + "grad_norm": 1.01994788646698, + "learning_rate": 2.9775063021136323e-05, + "loss": 5.4938, + "step": 4172 + }, + { + "epoch": 0.4, + "grad_norm": 1.0324288606643677, + "learning_rate": 2.975567190226876e-05, + "loss": 5.351, + "step": 4176 + }, + { + "epoch": 0.41, + "grad_norm": 1.0081384181976318, + "learning_rate": 2.9736280783401204e-05, + "loss": 5.4233, + "step": 4180 + }, + { + "epoch": 0.41, + "grad_norm": 0.9753928780555725, + "learning_rate": 2.9716889664533644e-05, + "loss": 5.3791, + "step": 4184 + }, + { + "epoch": 0.41, + "grad_norm": 1.0055930614471436, + "learning_rate": 2.9697498545666085e-05, + "loss": 5.4083, + "step": 4188 + }, + { + "epoch": 0.41, + "grad_norm": 1.0010572671890259, + "learning_rate": 2.967810742679853e-05, + "loss": 5.3065, + "step": 4192 + }, + { + "epoch": 0.41, + "grad_norm": 1.0279070138931274, + "learning_rate": 2.9658716307930966e-05, + "loss": 5.359, + "step": 4196 + }, + { + "epoch": 0.41, + "grad_norm": 0.9655927419662476, + "learning_rate": 2.963932518906341e-05, + "loss": 5.4802, + "step": 4200 + }, + { + "epoch": 0.41, + "grad_norm": 0.9861381649971008, + "learning_rate": 2.9619934070195854e-05, + "loss": 5.3633, + "step": 4204 + }, + { + "epoch": 0.41, + "grad_norm": 1.0157438516616821, + "learning_rate": 2.960054295132829e-05, + "loss": 5.3607, + "step": 4208 + }, + { + "epoch": 0.41, + "grad_norm": 1.0280277729034424, + "learning_rate": 2.9581151832460735e-05, + "loss": 5.3693, + "step": 4212 + }, + { + "epoch": 0.41, + "grad_norm": 1.045616865158081, + "learning_rate": 2.956176071359318e-05, + "loss": 5.4264, + "step": 4216 + }, + { + "epoch": 0.41, + "grad_norm": 0.950824499130249, + "learning_rate": 2.9542369594725616e-05, + "loss": 5.3266, + "step": 4220 + }, + { + "epoch": 0.41, + "grad_norm": 0.9672490358352661, + "learning_rate": 2.952297847585806e-05, + "loss": 5.4075, + "step": 4224 + }, + { + "epoch": 0.41, + "grad_norm": 0.964094877243042, + "learning_rate": 2.9503587356990497e-05, + "loss": 5.4298, + "step": 4228 + }, + { + "epoch": 0.41, + "grad_norm": 0.9985701441764832, + "learning_rate": 2.948419623812294e-05, + "loss": 5.4677, + "step": 4232 + }, + { + "epoch": 0.41, + "grad_norm": 1.0406873226165771, + "learning_rate": 2.9464805119255385e-05, + "loss": 5.3258, + "step": 4236 + }, + { + "epoch": 0.41, + "grad_norm": 1.0154664516448975, + "learning_rate": 2.9445414000387822e-05, + "loss": 5.4148, + "step": 4240 + }, + { + "epoch": 0.41, + "grad_norm": 0.9826991558074951, + "learning_rate": 2.9426022881520266e-05, + "loss": 5.3293, + "step": 4244 + }, + { + "epoch": 0.41, + "grad_norm": 1.0213288068771362, + "learning_rate": 2.940663176265271e-05, + "loss": 5.3245, + "step": 4248 + }, + { + "epoch": 0.41, + "grad_norm": 1.0005340576171875, + "learning_rate": 2.9387240643785147e-05, + "loss": 5.4466, + "step": 4252 + }, + { + "epoch": 0.41, + "grad_norm": 1.0072083473205566, + "learning_rate": 2.936784952491759e-05, + "loss": 5.3103, + "step": 4256 + }, + { + "epoch": 0.41, + "grad_norm": 0.9992334246635437, + "learning_rate": 2.9348458406050028e-05, + "loss": 5.4595, + "step": 4260 + }, + { + "epoch": 0.41, + "grad_norm": 1.026963233947754, + "learning_rate": 2.9329067287182472e-05, + "loss": 5.4082, + "step": 4264 + }, + { + "epoch": 0.41, + "grad_norm": 1.0018388032913208, + "learning_rate": 2.9309676168314916e-05, + "loss": 5.3292, + "step": 4268 + }, + { + "epoch": 0.41, + "grad_norm": 1.106248140335083, + "learning_rate": 2.9290285049447353e-05, + "loss": 5.3176, + "step": 4272 + }, + { + "epoch": 0.41, + "grad_norm": 1.0284066200256348, + "learning_rate": 2.9270893930579797e-05, + "loss": 5.3839, + "step": 4276 + }, + { + "epoch": 0.41, + "grad_norm": 1.0233203172683716, + "learning_rate": 2.925150281171224e-05, + "loss": 5.4327, + "step": 4280 + }, + { + "epoch": 0.42, + "grad_norm": 1.0423204898834229, + "learning_rate": 2.9232111692844678e-05, + "loss": 5.4134, + "step": 4284 + }, + { + "epoch": 0.42, + "grad_norm": 1.0750269889831543, + "learning_rate": 2.9212720573977122e-05, + "loss": 5.3983, + "step": 4288 + }, + { + "epoch": 0.42, + "grad_norm": 1.035136342048645, + "learning_rate": 2.9193329455109562e-05, + "loss": 5.3828, + "step": 4292 + }, + { + "epoch": 0.42, + "grad_norm": 0.9154065847396851, + "learning_rate": 2.9173938336242003e-05, + "loss": 5.3261, + "step": 4296 + }, + { + "epoch": 0.42, + "grad_norm": 1.0032275915145874, + "learning_rate": 2.9154547217374443e-05, + "loss": 5.4712, + "step": 4300 + }, + { + "epoch": 0.42, + "grad_norm": 1.0405406951904297, + "learning_rate": 2.9135156098506884e-05, + "loss": 5.3499, + "step": 4304 + }, + { + "epoch": 0.42, + "grad_norm": 1.1554255485534668, + "learning_rate": 2.9115764979639324e-05, + "loss": 5.4097, + "step": 4308 + }, + { + "epoch": 0.42, + "grad_norm": 1.054694414138794, + "learning_rate": 2.909637386077177e-05, + "loss": 5.3183, + "step": 4312 + }, + { + "epoch": 0.42, + "grad_norm": 0.9714403748512268, + "learning_rate": 2.907698274190421e-05, + "loss": 5.5304, + "step": 4316 + }, + { + "epoch": 0.42, + "grad_norm": 1.0622743368148804, + "learning_rate": 2.905759162303665e-05, + "loss": 5.321, + "step": 4320 + }, + { + "epoch": 0.42, + "grad_norm": 1.0464075803756714, + "learning_rate": 2.9038200504169093e-05, + "loss": 5.3688, + "step": 4324 + }, + { + "epoch": 0.42, + "grad_norm": 0.9132739901542664, + "learning_rate": 2.901880938530153e-05, + "loss": 5.3441, + "step": 4328 + }, + { + "epoch": 0.42, + "grad_norm": 1.013675332069397, + "learning_rate": 2.8999418266433974e-05, + "loss": 5.4333, + "step": 4332 + }, + { + "epoch": 0.42, + "grad_norm": 1.0379986763000488, + "learning_rate": 2.898002714756642e-05, + "loss": 5.5111, + "step": 4336 + }, + { + "epoch": 0.42, + "grad_norm": 1.0619996786117554, + "learning_rate": 2.8960636028698856e-05, + "loss": 5.3242, + "step": 4340 + }, + { + "epoch": 0.42, + "grad_norm": 1.0503803491592407, + "learning_rate": 2.89412449098313e-05, + "loss": 5.4761, + "step": 4344 + }, + { + "epoch": 0.42, + "grad_norm": 1.006859540939331, + "learning_rate": 2.8921853790963737e-05, + "loss": 5.4661, + "step": 4348 + }, + { + "epoch": 0.42, + "grad_norm": 0.9837526082992554, + "learning_rate": 2.890246267209618e-05, + "loss": 5.4439, + "step": 4352 + }, + { + "epoch": 0.42, + "grad_norm": 1.0037223100662231, + "learning_rate": 2.8883071553228624e-05, + "loss": 5.3982, + "step": 4356 + }, + { + "epoch": 0.42, + "grad_norm": 0.9351251125335693, + "learning_rate": 2.886368043436106e-05, + "loss": 5.4419, + "step": 4360 + }, + { + "epoch": 0.42, + "grad_norm": 0.9317170977592468, + "learning_rate": 2.8844289315493506e-05, + "loss": 5.4067, + "step": 4364 + }, + { + "epoch": 0.42, + "grad_norm": 0.9478697776794434, + "learning_rate": 2.882489819662595e-05, + "loss": 5.4012, + "step": 4368 + }, + { + "epoch": 0.42, + "grad_norm": 0.9834426641464233, + "learning_rate": 2.8805507077758387e-05, + "loss": 5.4496, + "step": 4372 + }, + { + "epoch": 0.42, + "grad_norm": 1.0380594730377197, + "learning_rate": 2.878611595889083e-05, + "loss": 5.4042, + "step": 4376 + }, + { + "epoch": 0.42, + "grad_norm": 1.1123548746109009, + "learning_rate": 2.8766724840023268e-05, + "loss": 5.4814, + "step": 4380 + }, + { + "epoch": 0.43, + "grad_norm": 0.9241830706596375, + "learning_rate": 2.874733372115571e-05, + "loss": 5.4302, + "step": 4384 + }, + { + "epoch": 0.43, + "grad_norm": 0.9564453363418579, + "learning_rate": 2.8727942602288156e-05, + "loss": 5.3426, + "step": 4388 + }, + { + "epoch": 0.43, + "grad_norm": 1.0000247955322266, + "learning_rate": 2.8708551483420593e-05, + "loss": 5.3799, + "step": 4392 + }, + { + "epoch": 0.43, + "grad_norm": 0.9557441473007202, + "learning_rate": 2.8689160364553037e-05, + "loss": 5.4233, + "step": 4396 + }, + { + "epoch": 0.43, + "grad_norm": 0.920683741569519, + "learning_rate": 2.866976924568548e-05, + "loss": 5.351, + "step": 4400 + }, + { + "epoch": 0.43, + "grad_norm": 0.9885355234146118, + "learning_rate": 2.8650378126817918e-05, + "loss": 5.4735, + "step": 4404 + }, + { + "epoch": 0.43, + "grad_norm": 1.02256441116333, + "learning_rate": 2.863098700795036e-05, + "loss": 5.4872, + "step": 4408 + }, + { + "epoch": 0.43, + "grad_norm": 1.0346393585205078, + "learning_rate": 2.8611595889082806e-05, + "loss": 5.4895, + "step": 4412 + }, + { + "epoch": 0.43, + "grad_norm": 1.0645418167114258, + "learning_rate": 2.8592204770215243e-05, + "loss": 5.3678, + "step": 4416 + }, + { + "epoch": 0.43, + "grad_norm": 0.9786351323127747, + "learning_rate": 2.8572813651347687e-05, + "loss": 5.394, + "step": 4420 + }, + { + "epoch": 0.43, + "grad_norm": 1.0003682374954224, + "learning_rate": 2.8553422532480124e-05, + "loss": 5.3394, + "step": 4424 + }, + { + "epoch": 0.43, + "grad_norm": 1.0191590785980225, + "learning_rate": 2.8534031413612568e-05, + "loss": 5.3822, + "step": 4428 + }, + { + "epoch": 0.43, + "grad_norm": 0.9041575193405151, + "learning_rate": 2.8514640294745008e-05, + "loss": 5.3762, + "step": 4432 + }, + { + "epoch": 0.43, + "grad_norm": 1.1107348203659058, + "learning_rate": 2.849524917587745e-05, + "loss": 5.355, + "step": 4436 + }, + { + "epoch": 0.43, + "grad_norm": 0.9530418515205383, + "learning_rate": 2.847585805700989e-05, + "loss": 5.3681, + "step": 4440 + }, + { + "epoch": 0.43, + "grad_norm": 0.988714337348938, + "learning_rate": 2.8456466938142333e-05, + "loss": 5.3924, + "step": 4444 + }, + { + "epoch": 0.43, + "grad_norm": 1.0610371828079224, + "learning_rate": 2.8437075819274774e-05, + "loss": 5.3354, + "step": 4448 + }, + { + "epoch": 0.43, + "grad_norm": 1.005706548690796, + "learning_rate": 2.8417684700407214e-05, + "loss": 5.3619, + "step": 4452 + }, + { + "epoch": 0.43, + "grad_norm": 1.053621530532837, + "learning_rate": 2.8398293581539658e-05, + "loss": 5.344, + "step": 4456 + }, + { + "epoch": 0.43, + "grad_norm": 1.0087207555770874, + "learning_rate": 2.8378902462672095e-05, + "loss": 5.3222, + "step": 4460 + }, + { + "epoch": 0.43, + "grad_norm": 1.0023506879806519, + "learning_rate": 2.835951134380454e-05, + "loss": 5.4015, + "step": 4464 + }, + { + "epoch": 0.43, + "grad_norm": 0.9826129078865051, + "learning_rate": 2.8340120224936976e-05, + "loss": 5.4818, + "step": 4468 + }, + { + "epoch": 0.43, + "grad_norm": 0.9970731139183044, + "learning_rate": 2.832072910606942e-05, + "loss": 5.4439, + "step": 4472 + }, + { + "epoch": 0.43, + "grad_norm": 1.0162273645401, + "learning_rate": 2.8301337987201864e-05, + "loss": 5.4481, + "step": 4476 + }, + { + "epoch": 0.43, + "grad_norm": 1.065722942352295, + "learning_rate": 2.82819468683343e-05, + "loss": 5.4393, + "step": 4480 + }, + { + "epoch": 0.43, + "grad_norm": 1.0227274894714355, + "learning_rate": 2.8262555749466745e-05, + "loss": 5.4049, + "step": 4484 + }, + { + "epoch": 0.44, + "grad_norm": 1.058347463607788, + "learning_rate": 2.824316463059919e-05, + "loss": 5.4221, + "step": 4488 + }, + { + "epoch": 0.44, + "grad_norm": 0.981675922870636, + "learning_rate": 2.8223773511731626e-05, + "loss": 5.4298, + "step": 4492 + }, + { + "epoch": 0.44, + "grad_norm": 1.0078340768814087, + "learning_rate": 2.820438239286407e-05, + "loss": 5.4223, + "step": 4496 + }, + { + "epoch": 0.44, + "grad_norm": 1.0829211473464966, + "learning_rate": 2.8184991273996508e-05, + "loss": 5.4577, + "step": 4500 + }, + { + "epoch": 0.44, + "grad_norm": 0.9840219020843506, + "learning_rate": 2.816560015512895e-05, + "loss": 5.3451, + "step": 4504 + }, + { + "epoch": 0.44, + "grad_norm": 1.059767484664917, + "learning_rate": 2.8146209036261395e-05, + "loss": 5.4052, + "step": 4508 + }, + { + "epoch": 0.44, + "grad_norm": 1.1250829696655273, + "learning_rate": 2.8126817917393833e-05, + "loss": 5.3476, + "step": 4512 + }, + { + "epoch": 0.44, + "grad_norm": 0.9797293543815613, + "learning_rate": 2.8107426798526276e-05, + "loss": 5.3143, + "step": 4516 + }, + { + "epoch": 0.44, + "grad_norm": 0.9784405827522278, + "learning_rate": 2.808803567965872e-05, + "loss": 5.4072, + "step": 4520 + }, + { + "epoch": 0.44, + "grad_norm": 1.0686568021774292, + "learning_rate": 2.8068644560791158e-05, + "loss": 5.4029, + "step": 4524 + }, + { + "epoch": 0.44, + "grad_norm": 0.958621084690094, + "learning_rate": 2.80492534419236e-05, + "loss": 5.3838, + "step": 4528 + }, + { + "epoch": 0.44, + "grad_norm": 0.969975471496582, + "learning_rate": 2.8029862323056045e-05, + "loss": 5.4323, + "step": 4532 + }, + { + "epoch": 0.44, + "grad_norm": 1.0803236961364746, + "learning_rate": 2.8010471204188483e-05, + "loss": 5.4438, + "step": 4536 + }, + { + "epoch": 0.44, + "grad_norm": 1.0710035562515259, + "learning_rate": 2.7991080085320926e-05, + "loss": 5.4503, + "step": 4540 + }, + { + "epoch": 0.44, + "grad_norm": 0.9775360226631165, + "learning_rate": 2.7971688966453364e-05, + "loss": 5.3578, + "step": 4544 + }, + { + "epoch": 0.44, + "grad_norm": 0.9491469264030457, + "learning_rate": 2.7952297847585808e-05, + "loss": 5.3979, + "step": 4548 + }, + { + "epoch": 0.44, + "grad_norm": 1.0196774005889893, + "learning_rate": 2.793290672871825e-05, + "loss": 5.4313, + "step": 4552 + }, + { + "epoch": 0.44, + "grad_norm": 1.0691953897476196, + "learning_rate": 2.791351560985069e-05, + "loss": 5.5029, + "step": 4556 + }, + { + "epoch": 0.44, + "grad_norm": 1.0333261489868164, + "learning_rate": 2.7894124490983133e-05, + "loss": 5.4323, + "step": 4560 + }, + { + "epoch": 0.44, + "grad_norm": 1.0331099033355713, + "learning_rate": 2.7874733372115573e-05, + "loss": 5.4105, + "step": 4564 + }, + { + "epoch": 0.44, + "grad_norm": 1.0562598705291748, + "learning_rate": 2.7855342253248014e-05, + "loss": 5.4442, + "step": 4568 + }, + { + "epoch": 0.44, + "grad_norm": 1.0251268148422241, + "learning_rate": 2.7835951134380454e-05, + "loss": 5.4121, + "step": 4572 + }, + { + "epoch": 0.44, + "grad_norm": 0.9381951689720154, + "learning_rate": 2.7816560015512898e-05, + "loss": 5.303, + "step": 4576 + }, + { + "epoch": 0.44, + "grad_norm": 1.0383044481277466, + "learning_rate": 2.779716889664534e-05, + "loss": 5.3638, + "step": 4580 + }, + { + "epoch": 0.44, + "grad_norm": 1.0460588932037354, + "learning_rate": 2.777777777777778e-05, + "loss": 5.3688, + "step": 4584 + }, + { + "epoch": 0.44, + "grad_norm": 0.9689496159553528, + "learning_rate": 2.775838665891022e-05, + "loss": 5.499, + "step": 4588 + }, + { + "epoch": 0.45, + "grad_norm": 0.9622393846511841, + "learning_rate": 2.773899554004266e-05, + "loss": 5.3238, + "step": 4592 + }, + { + "epoch": 0.45, + "grad_norm": 0.9752629399299622, + "learning_rate": 2.7719604421175104e-05, + "loss": 5.3294, + "step": 4596 + }, + { + "epoch": 0.45, + "grad_norm": 0.9437540173530579, + "learning_rate": 2.770021330230754e-05, + "loss": 5.3397, + "step": 4600 + }, + { + "epoch": 0.45, + "grad_norm": 1.041926383972168, + "learning_rate": 2.7680822183439985e-05, + "loss": 5.4434, + "step": 4604 + }, + { + "epoch": 0.45, + "grad_norm": 1.045983076095581, + "learning_rate": 2.766143106457243e-05, + "loss": 5.3467, + "step": 4608 + }, + { + "epoch": 0.45, + "grad_norm": 0.9513012766838074, + "learning_rate": 2.7642039945704866e-05, + "loss": 5.4401, + "step": 4612 + }, + { + "epoch": 0.45, + "grad_norm": 1.0047541856765747, + "learning_rate": 2.762264882683731e-05, + "loss": 5.3485, + "step": 4616 + }, + { + "epoch": 0.45, + "grad_norm": 1.0038918256759644, + "learning_rate": 2.7603257707969747e-05, + "loss": 5.4189, + "step": 4620 + }, + { + "epoch": 0.45, + "grad_norm": 1.0029683113098145, + "learning_rate": 2.758386658910219e-05, + "loss": 5.3773, + "step": 4624 + }, + { + "epoch": 0.45, + "grad_norm": 0.9756178259849548, + "learning_rate": 2.7564475470234635e-05, + "loss": 5.4745, + "step": 4628 + }, + { + "epoch": 0.45, + "grad_norm": 1.028535008430481, + "learning_rate": 2.7545084351367072e-05, + "loss": 5.3767, + "step": 4632 + }, + { + "epoch": 0.45, + "grad_norm": 1.0276795625686646, + "learning_rate": 2.7525693232499516e-05, + "loss": 5.4127, + "step": 4636 + }, + { + "epoch": 0.45, + "grad_norm": 0.9759376645088196, + "learning_rate": 2.750630211363196e-05, + "loss": 5.4136, + "step": 4640 + }, + { + "epoch": 0.45, + "grad_norm": 0.9665167331695557, + "learning_rate": 2.7486910994764397e-05, + "loss": 5.4307, + "step": 4644 + }, + { + "epoch": 0.45, + "grad_norm": 1.0241793394088745, + "learning_rate": 2.746751987589684e-05, + "loss": 5.3753, + "step": 4648 + }, + { + "epoch": 0.45, + "grad_norm": 0.97547847032547, + "learning_rate": 2.7448128757029285e-05, + "loss": 5.4257, + "step": 4652 + }, + { + "epoch": 0.45, + "grad_norm": 0.9529567360877991, + "learning_rate": 2.7428737638161722e-05, + "loss": 5.4561, + "step": 4656 + }, + { + "epoch": 0.45, + "grad_norm": 0.9759612679481506, + "learning_rate": 2.7409346519294166e-05, + "loss": 5.3565, + "step": 4660 + }, + { + "epoch": 0.45, + "grad_norm": 1.0170910358428955, + "learning_rate": 2.7389955400426603e-05, + "loss": 5.3003, + "step": 4664 + }, + { + "epoch": 0.45, + "grad_norm": 1.0059340000152588, + "learning_rate": 2.7370564281559047e-05, + "loss": 5.4549, + "step": 4668 + }, + { + "epoch": 0.45, + "grad_norm": 0.9615738987922668, + "learning_rate": 2.735117316269149e-05, + "loss": 5.3853, + "step": 4672 + }, + { + "epoch": 0.45, + "grad_norm": 1.07008957862854, + "learning_rate": 2.733178204382393e-05, + "loss": 5.4426, + "step": 4676 + }, + { + "epoch": 0.45, + "grad_norm": 1.0182669162750244, + "learning_rate": 2.7312390924956372e-05, + "loss": 5.3921, + "step": 4680 + }, + { + "epoch": 0.45, + "grad_norm": 1.0533527135849, + "learning_rate": 2.7292999806088816e-05, + "loss": 5.4001, + "step": 4684 + }, + { + "epoch": 0.45, + "grad_norm": 1.0803202390670776, + "learning_rate": 2.7273608687221253e-05, + "loss": 5.4789, + "step": 4688 + }, + { + "epoch": 0.45, + "grad_norm": 1.0540350675582886, + "learning_rate": 2.7254217568353697e-05, + "loss": 5.4387, + "step": 4692 + }, + { + "epoch": 0.46, + "grad_norm": 0.9857541918754578, + "learning_rate": 2.7234826449486138e-05, + "loss": 5.4231, + "step": 4696 + }, + { + "epoch": 0.46, + "grad_norm": 0.9859492778778076, + "learning_rate": 2.721543533061858e-05, + "loss": 5.4614, + "step": 4700 + }, + { + "epoch": 0.46, + "grad_norm": 1.0070449113845825, + "learning_rate": 2.719604421175102e-05, + "loss": 5.4235, + "step": 4704 + }, + { + "epoch": 0.46, + "grad_norm": 1.0052629709243774, + "learning_rate": 2.717665309288346e-05, + "loss": 5.4255, + "step": 4708 + }, + { + "epoch": 0.46, + "grad_norm": 1.0358524322509766, + "learning_rate": 2.7157261974015903e-05, + "loss": 5.3811, + "step": 4712 + }, + { + "epoch": 0.46, + "grad_norm": 0.9567641615867615, + "learning_rate": 2.7137870855148344e-05, + "loss": 5.3977, + "step": 4716 + }, + { + "epoch": 0.46, + "grad_norm": 1.0303080081939697, + "learning_rate": 2.7118479736280784e-05, + "loss": 5.5686, + "step": 4720 + }, + { + "epoch": 0.46, + "grad_norm": 0.9873762726783752, + "learning_rate": 2.7099088617413225e-05, + "loss": 5.3691, + "step": 4724 + }, + { + "epoch": 0.46, + "grad_norm": 0.9921165704727173, + "learning_rate": 2.707969749854567e-05, + "loss": 5.4681, + "step": 4728 + }, + { + "epoch": 0.46, + "grad_norm": 0.9569845795631409, + "learning_rate": 2.7060306379678106e-05, + "loss": 5.2843, + "step": 4732 + }, + { + "epoch": 0.46, + "grad_norm": 1.0275355577468872, + "learning_rate": 2.704091526081055e-05, + "loss": 5.3745, + "step": 4736 + }, + { + "epoch": 0.46, + "grad_norm": 1.034792184829712, + "learning_rate": 2.7021524141942987e-05, + "loss": 5.3919, + "step": 4740 + }, + { + "epoch": 0.46, + "grad_norm": 1.05870521068573, + "learning_rate": 2.700213302307543e-05, + "loss": 5.4458, + "step": 4744 + }, + { + "epoch": 0.46, + "grad_norm": 1.1329647302627563, + "learning_rate": 2.6982741904207875e-05, + "loss": 5.4046, + "step": 4748 + }, + { + "epoch": 0.46, + "grad_norm": 0.9993448853492737, + "learning_rate": 2.6963350785340312e-05, + "loss": 5.3655, + "step": 4752 + }, + { + "epoch": 0.46, + "grad_norm": 1.0315309762954712, + "learning_rate": 2.6943959666472756e-05, + "loss": 5.3993, + "step": 4756 + }, + { + "epoch": 0.46, + "grad_norm": 1.0383795499801636, + "learning_rate": 2.69245685476052e-05, + "loss": 5.3556, + "step": 4760 + }, + { + "epoch": 0.46, + "grad_norm": 0.9584831595420837, + "learning_rate": 2.6905177428737637e-05, + "loss": 5.4116, + "step": 4764 + }, + { + "epoch": 0.46, + "grad_norm": 1.0869189500808716, + "learning_rate": 2.688578630987008e-05, + "loss": 5.3348, + "step": 4768 + }, + { + "epoch": 0.46, + "grad_norm": 1.1664848327636719, + "learning_rate": 2.6866395191002525e-05, + "loss": 5.3155, + "step": 4772 + }, + { + "epoch": 0.46, + "grad_norm": 1.0994501113891602, + "learning_rate": 2.6847004072134962e-05, + "loss": 5.3711, + "step": 4776 + }, + { + "epoch": 0.46, + "grad_norm": 0.9989491105079651, + "learning_rate": 2.6827612953267406e-05, + "loss": 5.3603, + "step": 4780 + }, + { + "epoch": 0.46, + "grad_norm": 1.1618521213531494, + "learning_rate": 2.6808221834399843e-05, + "loss": 5.4217, + "step": 4784 + }, + { + "epoch": 0.46, + "grad_norm": 0.9329230785369873, + "learning_rate": 2.6788830715532287e-05, + "loss": 5.4326, + "step": 4788 + }, + { + "epoch": 0.46, + "grad_norm": 0.9027351140975952, + "learning_rate": 2.676943959666473e-05, + "loss": 5.2881, + "step": 4792 + }, + { + "epoch": 0.46, + "grad_norm": 0.9692825078964233, + "learning_rate": 2.6750048477797168e-05, + "loss": 5.3769, + "step": 4796 + }, + { + "epoch": 0.47, + "grad_norm": 0.9520678520202637, + "learning_rate": 2.6730657358929612e-05, + "loss": 5.377, + "step": 4800 + }, + { + "epoch": 0.47, + "grad_norm": 0.9845559000968933, + "learning_rate": 2.6711266240062056e-05, + "loss": 5.3848, + "step": 4804 + }, + { + "epoch": 0.47, + "grad_norm": 1.0046534538269043, + "learning_rate": 2.6691875121194493e-05, + "loss": 5.3973, + "step": 4808 + }, + { + "epoch": 0.47, + "grad_norm": 1.0389518737792969, + "learning_rate": 2.6672484002326937e-05, + "loss": 5.4557, + "step": 4812 + }, + { + "epoch": 0.47, + "grad_norm": 1.0322433710098267, + "learning_rate": 2.665309288345938e-05, + "loss": 5.3843, + "step": 4816 + }, + { + "epoch": 0.47, + "grad_norm": 1.075420618057251, + "learning_rate": 2.6633701764591818e-05, + "loss": 5.3968, + "step": 4820 + }, + { + "epoch": 0.47, + "grad_norm": 1.004739761352539, + "learning_rate": 2.6614310645724262e-05, + "loss": 5.4449, + "step": 4824 + }, + { + "epoch": 0.47, + "grad_norm": 0.9824436902999878, + "learning_rate": 2.65949195268567e-05, + "loss": 5.348, + "step": 4828 + }, + { + "epoch": 0.47, + "grad_norm": 1.0312827825546265, + "learning_rate": 2.6575528407989143e-05, + "loss": 5.438, + "step": 4832 + }, + { + "epoch": 0.47, + "grad_norm": 0.9586296677589417, + "learning_rate": 2.6556137289121584e-05, + "loss": 5.4007, + "step": 4836 + }, + { + "epoch": 0.47, + "grad_norm": 1.0318596363067627, + "learning_rate": 2.6536746170254024e-05, + "loss": 5.3997, + "step": 4840 + }, + { + "epoch": 0.47, + "grad_norm": 0.9269830584526062, + "learning_rate": 2.6517355051386468e-05, + "loss": 5.3597, + "step": 4844 + }, + { + "epoch": 0.47, + "grad_norm": 1.0358200073242188, + "learning_rate": 2.649796393251891e-05, + "loss": 5.3729, + "step": 4848 + }, + { + "epoch": 0.47, + "grad_norm": 0.9825155735015869, + "learning_rate": 2.647857281365135e-05, + "loss": 5.3678, + "step": 4852 + }, + { + "epoch": 0.47, + "grad_norm": 1.0111230611801147, + "learning_rate": 2.645918169478379e-05, + "loss": 5.4858, + "step": 4856 + }, + { + "epoch": 0.47, + "grad_norm": 0.947890043258667, + "learning_rate": 2.643979057591623e-05, + "loss": 5.3939, + "step": 4860 + }, + { + "epoch": 0.47, + "grad_norm": 0.9673914909362793, + "learning_rate": 2.642039945704867e-05, + "loss": 5.3704, + "step": 4864 + }, + { + "epoch": 0.47, + "grad_norm": 1.0731589794158936, + "learning_rate": 2.6401008338181115e-05, + "loss": 5.4116, + "step": 4868 + }, + { + "epoch": 0.47, + "grad_norm": 1.0044986009597778, + "learning_rate": 2.6381617219313552e-05, + "loss": 5.4318, + "step": 4872 + }, + { + "epoch": 0.47, + "grad_norm": 0.9914534687995911, + "learning_rate": 2.6362226100445996e-05, + "loss": 5.3216, + "step": 4876 + }, + { + "epoch": 0.47, + "grad_norm": 0.9887456297874451, + "learning_rate": 2.634283498157844e-05, + "loss": 5.4072, + "step": 4880 + }, + { + "epoch": 0.47, + "grad_norm": 1.0049148797988892, + "learning_rate": 2.6323443862710877e-05, + "loss": 5.4255, + "step": 4884 + }, + { + "epoch": 0.47, + "grad_norm": 1.0179352760314941, + "learning_rate": 2.630405274384332e-05, + "loss": 5.3108, + "step": 4888 + }, + { + "epoch": 0.47, + "grad_norm": 0.9834827184677124, + "learning_rate": 2.6284661624975765e-05, + "loss": 5.3262, + "step": 4892 + }, + { + "epoch": 0.47, + "grad_norm": 1.032747507095337, + "learning_rate": 2.6265270506108202e-05, + "loss": 5.3302, + "step": 4896 + }, + { + "epoch": 0.48, + "grad_norm": 1.0606615543365479, + "learning_rate": 2.6245879387240646e-05, + "loss": 5.3351, + "step": 4900 + }, + { + "epoch": 0.48, + "grad_norm": 1.0665369033813477, + "learning_rate": 2.6226488268373083e-05, + "loss": 5.3851, + "step": 4904 + }, + { + "epoch": 0.48, + "grad_norm": 1.0293782949447632, + "learning_rate": 2.6207097149505527e-05, + "loss": 5.3772, + "step": 4908 + }, + { + "epoch": 0.48, + "grad_norm": 1.0471354722976685, + "learning_rate": 2.618770603063797e-05, + "loss": 5.4218, + "step": 4912 + }, + { + "epoch": 0.48, + "grad_norm": 0.9955822229385376, + "learning_rate": 2.6168314911770408e-05, + "loss": 5.3427, + "step": 4916 + }, + { + "epoch": 0.48, + "grad_norm": 0.9505192041397095, + "learning_rate": 2.6148923792902852e-05, + "loss": 5.4442, + "step": 4920 + }, + { + "epoch": 0.48, + "grad_norm": 1.0545083284378052, + "learning_rate": 2.6129532674035296e-05, + "loss": 5.3821, + "step": 4924 + }, + { + "epoch": 0.48, + "grad_norm": 1.077630639076233, + "learning_rate": 2.6110141555167733e-05, + "loss": 5.4215, + "step": 4928 + }, + { + "epoch": 0.48, + "grad_norm": 1.1127928495407104, + "learning_rate": 2.6090750436300177e-05, + "loss": 5.4017, + "step": 4932 + }, + { + "epoch": 0.48, + "grad_norm": 1.0543692111968994, + "learning_rate": 2.607135931743262e-05, + "loss": 5.3918, + "step": 4936 + }, + { + "epoch": 0.48, + "grad_norm": 1.0516538619995117, + "learning_rate": 2.6051968198565058e-05, + "loss": 5.5141, + "step": 4940 + }, + { + "epoch": 0.48, + "grad_norm": 1.077325463294983, + "learning_rate": 2.6032577079697502e-05, + "loss": 5.4855, + "step": 4944 + }, + { + "epoch": 0.48, + "grad_norm": 1.034180998802185, + "learning_rate": 2.601318596082994e-05, + "loss": 5.4097, + "step": 4948 + }, + { + "epoch": 0.48, + "grad_norm": 1.0808318853378296, + "learning_rate": 2.5993794841962383e-05, + "loss": 5.3711, + "step": 4952 + }, + { + "epoch": 0.48, + "grad_norm": 1.0749197006225586, + "learning_rate": 2.5974403723094827e-05, + "loss": 5.3917, + "step": 4956 + }, + { + "epoch": 0.48, + "grad_norm": 1.1579383611679077, + "learning_rate": 2.5955012604227264e-05, + "loss": 5.4251, + "step": 4960 + }, + { + "epoch": 0.48, + "grad_norm": 1.0081652402877808, + "learning_rate": 2.5935621485359708e-05, + "loss": 5.3684, + "step": 4964 + }, + { + "epoch": 0.48, + "grad_norm": 0.9971766471862793, + "learning_rate": 2.591623036649215e-05, + "loss": 5.5078, + "step": 4968 + }, + { + "epoch": 0.48, + "grad_norm": 0.9572992920875549, + "learning_rate": 2.589683924762459e-05, + "loss": 5.4004, + "step": 4972 + }, + { + "epoch": 0.48, + "grad_norm": 1.0008381605148315, + "learning_rate": 2.5877448128757033e-05, + "loss": 5.4194, + "step": 4976 + }, + { + "epoch": 0.48, + "grad_norm": 1.0567643642425537, + "learning_rate": 2.585805700988947e-05, + "loss": 5.4014, + "step": 4980 + }, + { + "epoch": 0.48, + "grad_norm": 1.084086298942566, + "learning_rate": 2.5838665891021914e-05, + "loss": 5.4578, + "step": 4984 + }, + { + "epoch": 0.48, + "grad_norm": 1.062583088874817, + "learning_rate": 2.5819274772154355e-05, + "loss": 5.4051, + "step": 4988 + }, + { + "epoch": 0.48, + "grad_norm": 1.018263578414917, + "learning_rate": 2.5799883653286795e-05, + "loss": 5.4411, + "step": 4992 + }, + { + "epoch": 0.48, + "grad_norm": 1.0082392692565918, + "learning_rate": 2.5780492534419236e-05, + "loss": 5.329, + "step": 4996 + }, + { + "epoch": 0.48, + "grad_norm": 1.0163345336914062, + "learning_rate": 2.576110141555168e-05, + "loss": 5.3487, + "step": 5000 + }, + { + "epoch": 0.49, + "grad_norm": 0.9743790030479431, + "learning_rate": 2.5741710296684117e-05, + "loss": 5.4595, + "step": 5004 + }, + { + "epoch": 0.49, + "grad_norm": 1.0049850940704346, + "learning_rate": 2.572231917781656e-05, + "loss": 5.3147, + "step": 5008 + }, + { + "epoch": 0.49, + "grad_norm": 0.9798968434333801, + "learning_rate": 2.5702928058949005e-05, + "loss": 5.4953, + "step": 5012 + }, + { + "epoch": 0.49, + "grad_norm": 1.1075799465179443, + "learning_rate": 2.5683536940081442e-05, + "loss": 5.4284, + "step": 5016 + }, + { + "epoch": 0.49, + "grad_norm": 0.9216572046279907, + "learning_rate": 2.5664145821213886e-05, + "loss": 5.4338, + "step": 5020 + }, + { + "epoch": 0.49, + "grad_norm": 1.0511445999145508, + "learning_rate": 2.5644754702346323e-05, + "loss": 5.35, + "step": 5024 + }, + { + "epoch": 0.49, + "grad_norm": 1.0618550777435303, + "learning_rate": 2.5625363583478767e-05, + "loss": 5.3539, + "step": 5028 + }, + { + "epoch": 0.49, + "grad_norm": 1.0445276498794556, + "learning_rate": 2.560597246461121e-05, + "loss": 5.3797, + "step": 5032 + }, + { + "epoch": 0.49, + "grad_norm": 1.0461180210113525, + "learning_rate": 2.5586581345743648e-05, + "loss": 5.4262, + "step": 5036 + }, + { + "epoch": 0.49, + "grad_norm": 1.1503204107284546, + "learning_rate": 2.5567190226876092e-05, + "loss": 5.3629, + "step": 5040 + }, + { + "epoch": 0.49, + "grad_norm": 0.9792975187301636, + "learning_rate": 2.5547799108008536e-05, + "loss": 5.3245, + "step": 5044 + }, + { + "epoch": 0.49, + "grad_norm": 1.0088895559310913, + "learning_rate": 2.5528407989140973e-05, + "loss": 5.3447, + "step": 5048 + }, + { + "epoch": 0.49, + "grad_norm": 0.9955673217773438, + "learning_rate": 2.5509016870273417e-05, + "loss": 5.2618, + "step": 5052 + }, + { + "epoch": 0.49, + "grad_norm": 0.9601762294769287, + "learning_rate": 2.548962575140586e-05, + "loss": 5.3711, + "step": 5056 + }, + { + "epoch": 0.49, + "grad_norm": 0.9723737835884094, + "learning_rate": 2.5470234632538298e-05, + "loss": 5.4062, + "step": 5060 + }, + { + "epoch": 0.49, + "grad_norm": 0.961095929145813, + "learning_rate": 2.5450843513670742e-05, + "loss": 5.4148, + "step": 5064 + }, + { + "epoch": 0.49, + "grad_norm": 1.071572184562683, + "learning_rate": 2.543145239480318e-05, + "loss": 5.3569, + "step": 5068 + }, + { + "epoch": 0.49, + "grad_norm": 1.0116100311279297, + "learning_rate": 2.5412061275935623e-05, + "loss": 5.4034, + "step": 5072 + }, + { + "epoch": 0.49, + "grad_norm": 1.0250638723373413, + "learning_rate": 2.5392670157068067e-05, + "loss": 5.3628, + "step": 5076 + }, + { + "epoch": 0.49, + "grad_norm": 0.9859606027603149, + "learning_rate": 2.5373279038200504e-05, + "loss": 5.4041, + "step": 5080 + }, + { + "epoch": 0.49, + "grad_norm": 1.01252019405365, + "learning_rate": 2.5353887919332948e-05, + "loss": 5.4278, + "step": 5084 + }, + { + "epoch": 0.49, + "grad_norm": 1.083108901977539, + "learning_rate": 2.5334496800465392e-05, + "loss": 5.3863, + "step": 5088 + }, + { + "epoch": 0.49, + "grad_norm": 0.969508945941925, + "learning_rate": 2.531510568159783e-05, + "loss": 5.3222, + "step": 5092 + }, + { + "epoch": 0.49, + "grad_norm": 0.9864810109138489, + "learning_rate": 2.5295714562730273e-05, + "loss": 5.4214, + "step": 5096 + }, + { + "epoch": 0.49, + "grad_norm": 0.9918404221534729, + "learning_rate": 2.527632344386271e-05, + "loss": 5.3679, + "step": 5100 + }, + { + "epoch": 0.49, + "grad_norm": 1.036550760269165, + "learning_rate": 2.5256932324995154e-05, + "loss": 5.2626, + "step": 5104 + }, + { + "epoch": 0.5, + "grad_norm": 0.9950888752937317, + "learning_rate": 2.5237541206127598e-05, + "loss": 5.3105, + "step": 5108 + }, + { + "epoch": 0.5, + "grad_norm": 1.0199189186096191, + "learning_rate": 2.5218150087260035e-05, + "loss": 5.3776, + "step": 5112 + }, + { + "epoch": 0.5, + "grad_norm": 1.0516811609268188, + "learning_rate": 2.519875896839248e-05, + "loss": 5.3124, + "step": 5116 + }, + { + "epoch": 0.5, + "grad_norm": 1.078303575515747, + "learning_rate": 2.517936784952492e-05, + "loss": 5.4144, + "step": 5120 + }, + { + "epoch": 0.5, + "grad_norm": 0.987095296382904, + "learning_rate": 2.515997673065736e-05, + "loss": 5.349, + "step": 5124 + }, + { + "epoch": 0.5, + "grad_norm": 0.9634592533111572, + "learning_rate": 2.51405856117898e-05, + "loss": 5.2509, + "step": 5128 + }, + { + "epoch": 0.5, + "grad_norm": 1.0986416339874268, + "learning_rate": 2.5121194492922244e-05, + "loss": 5.3225, + "step": 5132 + }, + { + "epoch": 0.5, + "grad_norm": 1.0385926961898804, + "learning_rate": 2.510180337405468e-05, + "loss": 5.3529, + "step": 5136 + }, + { + "epoch": 0.5, + "grad_norm": 0.9627379179000854, + "learning_rate": 2.5082412255187126e-05, + "loss": 5.3675, + "step": 5140 + }, + { + "epoch": 0.5, + "grad_norm": 1.003461480140686, + "learning_rate": 2.5063021136319566e-05, + "loss": 5.4202, + "step": 5144 + }, + { + "epoch": 0.5, + "grad_norm": 0.9771742224693298, + "learning_rate": 2.5043630017452007e-05, + "loss": 5.3042, + "step": 5148 + }, + { + "epoch": 0.5, + "grad_norm": 0.9279887676239014, + "learning_rate": 2.502423889858445e-05, + "loss": 5.3561, + "step": 5152 + }, + { + "epoch": 0.5, + "grad_norm": 0.9281094670295715, + "learning_rate": 2.5004847779716888e-05, + "loss": 5.3231, + "step": 5156 + }, + { + "epoch": 0.5, + "grad_norm": 1.0261962413787842, + "learning_rate": 2.498545666084933e-05, + "loss": 5.2869, + "step": 5160 + }, + { + "epoch": 0.5, + "grad_norm": 0.956251859664917, + "learning_rate": 2.4966065541981772e-05, + "loss": 5.3058, + "step": 5164 + }, + { + "epoch": 0.5, + "grad_norm": 1.0073554515838623, + "learning_rate": 2.4946674423114213e-05, + "loss": 5.3455, + "step": 5168 + }, + { + "epoch": 0.5, + "grad_norm": 0.9829320907592773, + "learning_rate": 2.4927283304246657e-05, + "loss": 5.2866, + "step": 5172 + }, + { + "epoch": 0.5, + "grad_norm": 1.067661166191101, + "learning_rate": 2.4907892185379097e-05, + "loss": 5.3966, + "step": 5176 + }, + { + "epoch": 0.5, + "grad_norm": 1.049310326576233, + "learning_rate": 2.4888501066511538e-05, + "loss": 5.471, + "step": 5180 + }, + { + "epoch": 0.5, + "grad_norm": 1.052420973777771, + "learning_rate": 2.486910994764398e-05, + "loss": 5.3907, + "step": 5184 + }, + { + "epoch": 0.5, + "grad_norm": 1.0315730571746826, + "learning_rate": 2.4849718828776422e-05, + "loss": 5.3528, + "step": 5188 + }, + { + "epoch": 0.5, + "grad_norm": 1.165887475013733, + "learning_rate": 2.4830327709908863e-05, + "loss": 5.399, + "step": 5192 + }, + { + "epoch": 0.5, + "grad_norm": 0.9832936525344849, + "learning_rate": 2.4810936591041303e-05, + "loss": 5.4077, + "step": 5196 + }, + { + "epoch": 0.5, + "grad_norm": 0.9647985100746155, + "learning_rate": 2.4791545472173747e-05, + "loss": 5.3483, + "step": 5200 + }, + { + "epoch": 0.5, + "grad_norm": 1.0132298469543457, + "learning_rate": 2.4772154353306188e-05, + "loss": 5.4671, + "step": 5204 + }, + { + "epoch": 0.5, + "grad_norm": 0.9725786447525024, + "learning_rate": 2.4752763234438628e-05, + "loss": 5.3745, + "step": 5208 + }, + { + "epoch": 0.51, + "grad_norm": 1.0451775789260864, + "learning_rate": 2.473337211557107e-05, + "loss": 5.4456, + "step": 5212 + }, + { + "epoch": 0.51, + "grad_norm": 0.9887628555297852, + "learning_rate": 2.4713980996703513e-05, + "loss": 5.333, + "step": 5216 + }, + { + "epoch": 0.51, + "grad_norm": 1.044665813446045, + "learning_rate": 2.4694589877835953e-05, + "loss": 5.3592, + "step": 5220 + }, + { + "epoch": 0.51, + "grad_norm": 0.9709651470184326, + "learning_rate": 2.4675198758968394e-05, + "loss": 5.3393, + "step": 5224 + }, + { + "epoch": 0.51, + "grad_norm": 0.9530077576637268, + "learning_rate": 2.4655807640100834e-05, + "loss": 5.3602, + "step": 5228 + }, + { + "epoch": 0.51, + "grad_norm": 0.9592558145523071, + "learning_rate": 2.4636416521233278e-05, + "loss": 5.3931, + "step": 5232 + }, + { + "epoch": 0.51, + "grad_norm": 1.0525456666946411, + "learning_rate": 2.461702540236572e-05, + "loss": 5.3367, + "step": 5236 + }, + { + "epoch": 0.51, + "grad_norm": 1.0699265003204346, + "learning_rate": 2.459763428349816e-05, + "loss": 5.3894, + "step": 5240 + }, + { + "epoch": 0.51, + "grad_norm": 1.0436670780181885, + "learning_rate": 2.4578243164630603e-05, + "loss": 5.3643, + "step": 5244 + }, + { + "epoch": 0.51, + "grad_norm": 1.0085383653640747, + "learning_rate": 2.4558852045763044e-05, + "loss": 5.2872, + "step": 5248 + }, + { + "epoch": 0.51, + "grad_norm": 1.0748775005340576, + "learning_rate": 2.4539460926895484e-05, + "loss": 5.4714, + "step": 5252 + }, + { + "epoch": 0.51, + "grad_norm": 1.0273828506469727, + "learning_rate": 2.4520069808027925e-05, + "loss": 5.3271, + "step": 5256 + }, + { + "epoch": 0.51, + "grad_norm": 0.9897158145904541, + "learning_rate": 2.4500678689160365e-05, + "loss": 5.3268, + "step": 5260 + }, + { + "epoch": 0.51, + "grad_norm": 0.9869460463523865, + "learning_rate": 2.4481287570292806e-05, + "loss": 5.3173, + "step": 5264 + }, + { + "epoch": 0.51, + "grad_norm": 1.0365619659423828, + "learning_rate": 2.4461896451425246e-05, + "loss": 5.4168, + "step": 5268 + }, + { + "epoch": 0.51, + "grad_norm": 1.106515884399414, + "learning_rate": 2.4442505332557687e-05, + "loss": 5.3515, + "step": 5272 + }, + { + "epoch": 0.51, + "grad_norm": 0.9866341352462769, + "learning_rate": 2.442311421369013e-05, + "loss": 5.4644, + "step": 5276 + }, + { + "epoch": 0.51, + "grad_norm": 1.1131459474563599, + "learning_rate": 2.440372309482257e-05, + "loss": 5.2931, + "step": 5280 + }, + { + "epoch": 0.51, + "grad_norm": 0.9824268221855164, + "learning_rate": 2.4384331975955012e-05, + "loss": 5.3094, + "step": 5284 + }, + { + "epoch": 0.51, + "grad_norm": 1.0347408056259155, + "learning_rate": 2.4364940857087452e-05, + "loss": 5.3462, + "step": 5288 + }, + { + "epoch": 0.51, + "grad_norm": 1.0148606300354004, + "learning_rate": 2.4345549738219896e-05, + "loss": 5.3224, + "step": 5292 + }, + { + "epoch": 0.51, + "grad_norm": 1.1260877847671509, + "learning_rate": 2.4326158619352337e-05, + "loss": 5.3337, + "step": 5296 + }, + { + "epoch": 0.51, + "grad_norm": 1.034441590309143, + "learning_rate": 2.4306767500484777e-05, + "loss": 5.3728, + "step": 5300 + }, + { + "epoch": 0.51, + "grad_norm": 1.0142707824707031, + "learning_rate": 2.428737638161722e-05, + "loss": 5.3848, + "step": 5304 + }, + { + "epoch": 0.51, + "grad_norm": 0.9446210861206055, + "learning_rate": 2.4267985262749662e-05, + "loss": 5.3927, + "step": 5308 + }, + { + "epoch": 0.52, + "grad_norm": 1.0099536180496216, + "learning_rate": 2.4248594143882102e-05, + "loss": 5.3282, + "step": 5312 + }, + { + "epoch": 0.52, + "grad_norm": 1.0873996019363403, + "learning_rate": 2.4229203025014543e-05, + "loss": 5.4533, + "step": 5316 + }, + { + "epoch": 0.52, + "grad_norm": 0.9772780537605286, + "learning_rate": 2.4209811906146987e-05, + "loss": 5.4258, + "step": 5320 + }, + { + "epoch": 0.52, + "grad_norm": 0.9593787789344788, + "learning_rate": 2.4190420787279427e-05, + "loss": 5.369, + "step": 5324 + }, + { + "epoch": 0.52, + "grad_norm": 1.0021839141845703, + "learning_rate": 2.4171029668411868e-05, + "loss": 5.3721, + "step": 5328 + }, + { + "epoch": 0.52, + "grad_norm": 0.9823392629623413, + "learning_rate": 2.415163854954431e-05, + "loss": 5.3277, + "step": 5332 + }, + { + "epoch": 0.52, + "grad_norm": 1.1319977045059204, + "learning_rate": 2.4132247430676752e-05, + "loss": 5.4176, + "step": 5336 + }, + { + "epoch": 0.52, + "grad_norm": 0.9715146422386169, + "learning_rate": 2.4112856311809193e-05, + "loss": 5.3833, + "step": 5340 + }, + { + "epoch": 0.52, + "grad_norm": 1.0981075763702393, + "learning_rate": 2.4093465192941634e-05, + "loss": 5.4515, + "step": 5344 + }, + { + "epoch": 0.52, + "grad_norm": 0.9924083948135376, + "learning_rate": 2.4074074074074074e-05, + "loss": 5.2979, + "step": 5348 + }, + { + "epoch": 0.52, + "grad_norm": 0.9601098895072937, + "learning_rate": 2.4054682955206518e-05, + "loss": 5.3825, + "step": 5352 + }, + { + "epoch": 0.52, + "grad_norm": 1.134017825126648, + "learning_rate": 2.403529183633896e-05, + "loss": 5.4259, + "step": 5356 + }, + { + "epoch": 0.52, + "grad_norm": 1.0065308809280396, + "learning_rate": 2.40159007174714e-05, + "loss": 5.2852, + "step": 5360 + }, + { + "epoch": 0.52, + "grad_norm": 1.120582103729248, + "learning_rate": 2.3996509598603843e-05, + "loss": 5.3872, + "step": 5364 + }, + { + "epoch": 0.52, + "grad_norm": 1.0139710903167725, + "learning_rate": 2.3977118479736284e-05, + "loss": 5.4521, + "step": 5368 + }, + { + "epoch": 0.52, + "grad_norm": 0.9832119941711426, + "learning_rate": 2.3957727360868724e-05, + "loss": 5.4052, + "step": 5372 + }, + { + "epoch": 0.52, + "grad_norm": 1.0349886417388916, + "learning_rate": 2.3938336242001165e-05, + "loss": 5.2876, + "step": 5376 + }, + { + "epoch": 0.52, + "grad_norm": 1.0244460105895996, + "learning_rate": 2.391894512313361e-05, + "loss": 5.4077, + "step": 5380 + }, + { + "epoch": 0.52, + "grad_norm": 1.0130488872528076, + "learning_rate": 2.389955400426605e-05, + "loss": 5.3853, + "step": 5384 + }, + { + "epoch": 0.52, + "grad_norm": 1.0110422372817993, + "learning_rate": 2.388016288539849e-05, + "loss": 5.4025, + "step": 5388 + }, + { + "epoch": 0.52, + "grad_norm": 1.1506974697113037, + "learning_rate": 2.386077176653093e-05, + "loss": 5.4613, + "step": 5392 + }, + { + "epoch": 0.52, + "grad_norm": 0.9795562624931335, + "learning_rate": 2.384138064766337e-05, + "loss": 5.3075, + "step": 5396 + }, + { + "epoch": 0.52, + "grad_norm": 1.1148579120635986, + "learning_rate": 2.382198952879581e-05, + "loss": 5.4263, + "step": 5400 + }, + { + "epoch": 0.52, + "grad_norm": 0.993959367275238, + "learning_rate": 2.3802598409928252e-05, + "loss": 5.3765, + "step": 5404 + }, + { + "epoch": 0.52, + "grad_norm": 0.9957119822502136, + "learning_rate": 2.3783207291060692e-05, + "loss": 5.3645, + "step": 5408 + }, + { + "epoch": 0.52, + "grad_norm": 0.975545346736908, + "learning_rate": 2.3763816172193136e-05, + "loss": 5.3522, + "step": 5412 + }, + { + "epoch": 0.53, + "grad_norm": 0.9839780330657959, + "learning_rate": 2.3744425053325577e-05, + "loss": 5.333, + "step": 5416 + }, + { + "epoch": 0.53, + "grad_norm": 1.0729457139968872, + "learning_rate": 2.3725033934458017e-05, + "loss": 5.4589, + "step": 5420 + }, + { + "epoch": 0.53, + "grad_norm": 1.0412721633911133, + "learning_rate": 2.370564281559046e-05, + "loss": 5.3807, + "step": 5424 + }, + { + "epoch": 0.53, + "grad_norm": 1.0309202671051025, + "learning_rate": 2.3686251696722902e-05, + "loss": 5.3716, + "step": 5428 + }, + { + "epoch": 0.53, + "grad_norm": 1.076830506324768, + "learning_rate": 2.3666860577855342e-05, + "loss": 5.4141, + "step": 5432 + }, + { + "epoch": 0.53, + "grad_norm": 1.1014715433120728, + "learning_rate": 2.3647469458987783e-05, + "loss": 5.3345, + "step": 5436 + }, + { + "epoch": 0.53, + "grad_norm": 1.107503056526184, + "learning_rate": 2.3628078340120227e-05, + "loss": 5.2971, + "step": 5440 + }, + { + "epoch": 0.53, + "grad_norm": 1.0111589431762695, + "learning_rate": 2.3608687221252667e-05, + "loss": 5.3824, + "step": 5444 + }, + { + "epoch": 0.53, + "grad_norm": 1.014420509338379, + "learning_rate": 2.3589296102385108e-05, + "loss": 5.4623, + "step": 5448 + }, + { + "epoch": 0.53, + "grad_norm": 0.9904616475105286, + "learning_rate": 2.356990498351755e-05, + "loss": 5.3542, + "step": 5452 + }, + { + "epoch": 0.53, + "grad_norm": 1.061886191368103, + "learning_rate": 2.3550513864649992e-05, + "loss": 5.4478, + "step": 5456 + }, + { + "epoch": 0.53, + "grad_norm": 1.016271948814392, + "learning_rate": 2.3531122745782433e-05, + "loss": 5.3555, + "step": 5460 + }, + { + "epoch": 0.53, + "grad_norm": 0.9344733953475952, + "learning_rate": 2.3511731626914873e-05, + "loss": 5.2854, + "step": 5464 + }, + { + "epoch": 0.53, + "grad_norm": 1.050699234008789, + "learning_rate": 2.3492340508047314e-05, + "loss": 5.3064, + "step": 5468 + }, + { + "epoch": 0.53, + "grad_norm": 1.024604320526123, + "learning_rate": 2.3472949389179758e-05, + "loss": 5.3682, + "step": 5472 + }, + { + "epoch": 0.53, + "grad_norm": 1.0533298254013062, + "learning_rate": 2.34535582703122e-05, + "loss": 5.3336, + "step": 5476 + }, + { + "epoch": 0.53, + "grad_norm": 1.0331076383590698, + "learning_rate": 2.343416715144464e-05, + "loss": 5.518, + "step": 5480 + }, + { + "epoch": 0.53, + "grad_norm": 1.101364254951477, + "learning_rate": 2.3414776032577083e-05, + "loss": 5.3366, + "step": 5484 + }, + { + "epoch": 0.53, + "grad_norm": 0.9391170740127563, + "learning_rate": 2.3395384913709523e-05, + "loss": 5.3567, + "step": 5488 + }, + { + "epoch": 0.53, + "grad_norm": 0.9690737724304199, + "learning_rate": 2.3375993794841964e-05, + "loss": 5.3323, + "step": 5492 + }, + { + "epoch": 0.53, + "grad_norm": 1.0040618181228638, + "learning_rate": 2.3356602675974404e-05, + "loss": 5.3209, + "step": 5496 + }, + { + "epoch": 0.53, + "grad_norm": 0.9501678347587585, + "learning_rate": 2.333721155710685e-05, + "loss": 5.3043, + "step": 5500 + }, + { + "epoch": 0.53, + "grad_norm": 0.9725291728973389, + "learning_rate": 2.331782043823929e-05, + "loss": 5.4104, + "step": 5504 + }, + { + "epoch": 0.53, + "grad_norm": 0.9561870098114014, + "learning_rate": 2.329842931937173e-05, + "loss": 5.4013, + "step": 5508 + }, + { + "epoch": 0.53, + "grad_norm": 1.0054466724395752, + "learning_rate": 2.327903820050417e-05, + "loss": 5.3692, + "step": 5512 + }, + { + "epoch": 0.53, + "grad_norm": 1.0419256687164307, + "learning_rate": 2.3259647081636614e-05, + "loss": 5.4082, + "step": 5516 + }, + { + "epoch": 0.54, + "grad_norm": 1.0327070951461792, + "learning_rate": 2.3240255962769054e-05, + "loss": 5.4721, + "step": 5520 + }, + { + "epoch": 0.54, + "grad_norm": 0.9534458518028259, + "learning_rate": 2.3220864843901495e-05, + "loss": 5.3484, + "step": 5524 + }, + { + "epoch": 0.54, + "grad_norm": 1.0182461738586426, + "learning_rate": 2.3201473725033936e-05, + "loss": 5.3595, + "step": 5528 + }, + { + "epoch": 0.54, + "grad_norm": 1.0508521795272827, + "learning_rate": 2.3182082606166376e-05, + "loss": 5.3651, + "step": 5532 + }, + { + "epoch": 0.54, + "grad_norm": 0.9299846887588501, + "learning_rate": 2.3162691487298817e-05, + "loss": 5.3989, + "step": 5536 + }, + { + "epoch": 0.54, + "grad_norm": 1.0000419616699219, + "learning_rate": 2.3143300368431257e-05, + "loss": 5.2571, + "step": 5540 + }, + { + "epoch": 0.54, + "grad_norm": 1.084428071975708, + "learning_rate": 2.31239092495637e-05, + "loss": 5.3619, + "step": 5544 + }, + { + "epoch": 0.54, + "grad_norm": 1.0404269695281982, + "learning_rate": 2.310451813069614e-05, + "loss": 5.3425, + "step": 5548 + }, + { + "epoch": 0.54, + "grad_norm": 1.00773286819458, + "learning_rate": 2.3085127011828582e-05, + "loss": 5.329, + "step": 5552 + }, + { + "epoch": 0.54, + "grad_norm": 1.000942349433899, + "learning_rate": 2.3065735892961023e-05, + "loss": 5.3361, + "step": 5556 + }, + { + "epoch": 0.54, + "grad_norm": 1.021449089050293, + "learning_rate": 2.3046344774093467e-05, + "loss": 5.3859, + "step": 5560 + }, + { + "epoch": 0.54, + "grad_norm": 1.0686131715774536, + "learning_rate": 2.3026953655225907e-05, + "loss": 5.4319, + "step": 5564 + }, + { + "epoch": 0.54, + "grad_norm": 1.0589152574539185, + "learning_rate": 2.3007562536358348e-05, + "loss": 5.3367, + "step": 5568 + }, + { + "epoch": 0.54, + "grad_norm": 1.0058645009994507, + "learning_rate": 2.2988171417490788e-05, + "loss": 5.3473, + "step": 5572 + }, + { + "epoch": 0.54, + "grad_norm": 1.0159991979599, + "learning_rate": 2.2968780298623232e-05, + "loss": 5.375, + "step": 5576 + }, + { + "epoch": 0.54, + "grad_norm": 1.0634344816207886, + "learning_rate": 2.2949389179755673e-05, + "loss": 5.401, + "step": 5580 + }, + { + "epoch": 0.54, + "grad_norm": 1.077378273010254, + "learning_rate": 2.2929998060888113e-05, + "loss": 5.4141, + "step": 5584 + }, + { + "epoch": 0.54, + "grad_norm": 0.9465997815132141, + "learning_rate": 2.2910606942020554e-05, + "loss": 5.357, + "step": 5588 + }, + { + "epoch": 0.54, + "grad_norm": 1.069690465927124, + "learning_rate": 2.2891215823152998e-05, + "loss": 5.3555, + "step": 5592 + }, + { + "epoch": 0.54, + "grad_norm": 0.9830535054206848, + "learning_rate": 2.2871824704285438e-05, + "loss": 5.3552, + "step": 5596 + }, + { + "epoch": 0.54, + "grad_norm": 1.0138301849365234, + "learning_rate": 2.285243358541788e-05, + "loss": 5.342, + "step": 5600 + }, + { + "epoch": 0.54, + "grad_norm": 1.0020854473114014, + "learning_rate": 2.2833042466550323e-05, + "loss": 5.3489, + "step": 5604 + }, + { + "epoch": 0.54, + "grad_norm": 1.0177456140518188, + "learning_rate": 2.2813651347682763e-05, + "loss": 5.269, + "step": 5608 + }, + { + "epoch": 0.54, + "grad_norm": 1.0663460493087769, + "learning_rate": 2.2794260228815204e-05, + "loss": 5.297, + "step": 5612 + }, + { + "epoch": 0.54, + "grad_norm": 0.9828472137451172, + "learning_rate": 2.2774869109947644e-05, + "loss": 5.3487, + "step": 5616 + }, + { + "epoch": 0.54, + "grad_norm": 1.0115931034088135, + "learning_rate": 2.2755477991080088e-05, + "loss": 5.2995, + "step": 5620 + }, + { + "epoch": 0.55, + "grad_norm": 0.974193274974823, + "learning_rate": 2.273608687221253e-05, + "loss": 5.3582, + "step": 5624 + }, + { + "epoch": 0.55, + "grad_norm": 1.0036460161209106, + "learning_rate": 2.271669575334497e-05, + "loss": 5.3034, + "step": 5628 + }, + { + "epoch": 0.55, + "grad_norm": 1.0930732488632202, + "learning_rate": 2.269730463447741e-05, + "loss": 5.3337, + "step": 5632 + }, + { + "epoch": 0.55, + "grad_norm": 0.9957634806632996, + "learning_rate": 2.2677913515609854e-05, + "loss": 5.3641, + "step": 5636 + }, + { + "epoch": 0.55, + "grad_norm": 1.0515607595443726, + "learning_rate": 2.2658522396742294e-05, + "loss": 5.5135, + "step": 5640 + }, + { + "epoch": 0.55, + "grad_norm": 0.9784524440765381, + "learning_rate": 2.2639131277874735e-05, + "loss": 5.344, + "step": 5644 + }, + { + "epoch": 0.55, + "grad_norm": 1.0054571628570557, + "learning_rate": 2.2619740159007175e-05, + "loss": 5.3933, + "step": 5648 + }, + { + "epoch": 0.55, + "grad_norm": 0.965859591960907, + "learning_rate": 2.260034904013962e-05, + "loss": 5.3461, + "step": 5652 + }, + { + "epoch": 0.55, + "grad_norm": 1.1478146314620972, + "learning_rate": 2.258095792127206e-05, + "loss": 5.3515, + "step": 5656 + }, + { + "epoch": 0.55, + "grad_norm": 0.9893291592597961, + "learning_rate": 2.25615668024045e-05, + "loss": 5.2155, + "step": 5660 + }, + { + "epoch": 0.55, + "grad_norm": 0.945925235748291, + "learning_rate": 2.254217568353694e-05, + "loss": 5.3156, + "step": 5664 + }, + { + "epoch": 0.55, + "grad_norm": 1.020790934562683, + "learning_rate": 2.252278456466938e-05, + "loss": 5.4443, + "step": 5668 + }, + { + "epoch": 0.55, + "grad_norm": 0.9524897933006287, + "learning_rate": 2.2503393445801822e-05, + "loss": 5.3692, + "step": 5672 + }, + { + "epoch": 0.55, + "grad_norm": 1.0599005222320557, + "learning_rate": 2.2484002326934266e-05, + "loss": 5.3473, + "step": 5676 + }, + { + "epoch": 0.55, + "grad_norm": 1.029077410697937, + "learning_rate": 2.2464611208066706e-05, + "loss": 5.3266, + "step": 5680 + }, + { + "epoch": 0.55, + "grad_norm": 1.0404125452041626, + "learning_rate": 2.2445220089199147e-05, + "loss": 5.3654, + "step": 5684 + }, + { + "epoch": 0.55, + "grad_norm": 0.9789564609527588, + "learning_rate": 2.2425828970331587e-05, + "loss": 5.3689, + "step": 5688 + }, + { + "epoch": 0.55, + "grad_norm": 0.9803569912910461, + "learning_rate": 2.2406437851464028e-05, + "loss": 5.3231, + "step": 5692 + }, + { + "epoch": 0.55, + "grad_norm": 1.036303997039795, + "learning_rate": 2.2387046732596472e-05, + "loss": 5.3055, + "step": 5696 + }, + { + "epoch": 0.55, + "grad_norm": 1.0285331010818481, + "learning_rate": 2.2367655613728912e-05, + "loss": 5.3553, + "step": 5700 + }, + { + "epoch": 0.55, + "grad_norm": 0.9725428819656372, + "learning_rate": 2.2348264494861353e-05, + "loss": 5.3338, + "step": 5704 + }, + { + "epoch": 0.55, + "grad_norm": 1.0550493001937866, + "learning_rate": 2.2328873375993794e-05, + "loss": 5.3542, + "step": 5708 + }, + { + "epoch": 0.55, + "grad_norm": 0.9990110993385315, + "learning_rate": 2.2309482257126237e-05, + "loss": 5.3457, + "step": 5712 + }, + { + "epoch": 0.55, + "grad_norm": 1.0040626525878906, + "learning_rate": 2.2290091138258678e-05, + "loss": 5.3244, + "step": 5716 + }, + { + "epoch": 0.55, + "grad_norm": 1.0053871870040894, + "learning_rate": 2.227070001939112e-05, + "loss": 5.3694, + "step": 5720 + }, + { + "epoch": 0.55, + "grad_norm": 1.0112636089324951, + "learning_rate": 2.2251308900523562e-05, + "loss": 5.4549, + "step": 5724 + }, + { + "epoch": 0.56, + "grad_norm": 1.0170855522155762, + "learning_rate": 2.2231917781656003e-05, + "loss": 5.3509, + "step": 5728 + }, + { + "epoch": 0.56, + "grad_norm": 0.9416339993476868, + "learning_rate": 2.2212526662788444e-05, + "loss": 5.2804, + "step": 5732 + }, + { + "epoch": 0.56, + "grad_norm": 1.014902114868164, + "learning_rate": 2.2193135543920884e-05, + "loss": 5.3263, + "step": 5736 + }, + { + "epoch": 0.56, + "grad_norm": 1.0529359579086304, + "learning_rate": 2.2173744425053328e-05, + "loss": 5.3151, + "step": 5740 + }, + { + "epoch": 0.56, + "grad_norm": 0.9608129262924194, + "learning_rate": 2.215435330618577e-05, + "loss": 5.2865, + "step": 5744 + }, + { + "epoch": 0.56, + "grad_norm": 1.0631046295166016, + "learning_rate": 2.213496218731821e-05, + "loss": 5.3379, + "step": 5748 + }, + { + "epoch": 0.56, + "grad_norm": 1.0569404363632202, + "learning_rate": 2.211557106845065e-05, + "loss": 5.4344, + "step": 5752 + }, + { + "epoch": 0.56, + "grad_norm": 1.0010555982589722, + "learning_rate": 2.2096179949583094e-05, + "loss": 5.3753, + "step": 5756 + }, + { + "epoch": 0.56, + "grad_norm": 0.9690674543380737, + "learning_rate": 2.2076788830715534e-05, + "loss": 5.3373, + "step": 5760 + }, + { + "epoch": 0.56, + "grad_norm": 1.03341805934906, + "learning_rate": 2.2057397711847975e-05, + "loss": 5.4154, + "step": 5764 + }, + { + "epoch": 0.56, + "grad_norm": 0.9742380976676941, + "learning_rate": 2.2038006592980415e-05, + "loss": 5.4585, + "step": 5768 + }, + { + "epoch": 0.56, + "grad_norm": 0.977780818939209, + "learning_rate": 2.201861547411286e-05, + "loss": 5.351, + "step": 5772 + }, + { + "epoch": 0.56, + "grad_norm": 0.9809128046035767, + "learning_rate": 2.19992243552453e-05, + "loss": 5.2699, + "step": 5776 + }, + { + "epoch": 0.56, + "grad_norm": 1.0754761695861816, + "learning_rate": 2.197983323637774e-05, + "loss": 5.4224, + "step": 5780 + }, + { + "epoch": 0.56, + "grad_norm": 1.1107863187789917, + "learning_rate": 2.1960442117510184e-05, + "loss": 5.3779, + "step": 5784 + }, + { + "epoch": 0.56, + "grad_norm": 0.9794926643371582, + "learning_rate": 2.1941050998642625e-05, + "loss": 5.4183, + "step": 5788 + }, + { + "epoch": 0.56, + "grad_norm": 1.0375285148620605, + "learning_rate": 2.1921659879775065e-05, + "loss": 5.3357, + "step": 5792 + }, + { + "epoch": 0.56, + "grad_norm": 1.028106451034546, + "learning_rate": 2.1902268760907506e-05, + "loss": 5.3636, + "step": 5796 + }, + { + "epoch": 0.56, + "grad_norm": 0.9496660828590393, + "learning_rate": 2.1882877642039946e-05, + "loss": 5.4075, + "step": 5800 + }, + { + "epoch": 0.56, + "grad_norm": 0.9957082867622375, + "learning_rate": 2.1863486523172387e-05, + "loss": 5.3508, + "step": 5804 + }, + { + "epoch": 0.56, + "grad_norm": 1.0487018823623657, + "learning_rate": 2.184409540430483e-05, + "loss": 5.3796, + "step": 5808 + }, + { + "epoch": 0.56, + "grad_norm": 0.9658469557762146, + "learning_rate": 2.182470428543727e-05, + "loss": 5.2512, + "step": 5812 + }, + { + "epoch": 0.56, + "grad_norm": 1.0086036920547485, + "learning_rate": 2.1805313166569712e-05, + "loss": 5.3492, + "step": 5816 + }, + { + "epoch": 0.56, + "grad_norm": 1.09529447555542, + "learning_rate": 2.1785922047702152e-05, + "loss": 5.3167, + "step": 5820 + }, + { + "epoch": 0.56, + "grad_norm": 1.0463392734527588, + "learning_rate": 2.1766530928834593e-05, + "loss": 5.3627, + "step": 5824 + }, + { + "epoch": 0.57, + "grad_norm": 1.0675616264343262, + "learning_rate": 2.1747139809967033e-05, + "loss": 5.3868, + "step": 5828 + }, + { + "epoch": 0.57, + "grad_norm": 0.9881440997123718, + "learning_rate": 2.1727748691099477e-05, + "loss": 5.3498, + "step": 5832 + }, + { + "epoch": 0.57, + "grad_norm": 1.0680946111679077, + "learning_rate": 2.1708357572231918e-05, + "loss": 5.3376, + "step": 5836 + }, + { + "epoch": 0.57, + "grad_norm": 1.1282621622085571, + "learning_rate": 2.168896645336436e-05, + "loss": 5.3426, + "step": 5840 + }, + { + "epoch": 0.57, + "grad_norm": 1.0684411525726318, + "learning_rate": 2.1669575334496802e-05, + "loss": 5.3769, + "step": 5844 + }, + { + "epoch": 0.57, + "grad_norm": 1.0463793277740479, + "learning_rate": 2.1650184215629243e-05, + "loss": 5.3831, + "step": 5848 + }, + { + "epoch": 0.57, + "grad_norm": 1.0346778631210327, + "learning_rate": 2.1630793096761683e-05, + "loss": 5.3522, + "step": 5852 + }, + { + "epoch": 0.57, + "grad_norm": 1.0426431894302368, + "learning_rate": 2.1611401977894124e-05, + "loss": 5.4098, + "step": 5856 + }, + { + "epoch": 0.57, + "grad_norm": 1.0200704336166382, + "learning_rate": 2.1592010859026568e-05, + "loss": 5.265, + "step": 5860 + }, + { + "epoch": 0.57, + "grad_norm": 1.0575344562530518, + "learning_rate": 2.157261974015901e-05, + "loss": 5.3004, + "step": 5864 + }, + { + "epoch": 0.57, + "grad_norm": 0.9970076680183411, + "learning_rate": 2.155322862129145e-05, + "loss": 5.3595, + "step": 5868 + }, + { + "epoch": 0.57, + "grad_norm": 1.0401124954223633, + "learning_rate": 2.153383750242389e-05, + "loss": 5.5118, + "step": 5872 + }, + { + "epoch": 0.57, + "grad_norm": 1.0695228576660156, + "learning_rate": 2.1514446383556333e-05, + "loss": 5.3078, + "step": 5876 + }, + { + "epoch": 0.57, + "grad_norm": 1.0231329202651978, + "learning_rate": 2.1495055264688774e-05, + "loss": 5.3619, + "step": 5880 + }, + { + "epoch": 0.57, + "grad_norm": 1.0032014846801758, + "learning_rate": 2.1475664145821214e-05, + "loss": 5.2561, + "step": 5884 + }, + { + "epoch": 0.57, + "grad_norm": 1.0087443590164185, + "learning_rate": 2.1456273026953655e-05, + "loss": 5.4882, + "step": 5888 + }, + { + "epoch": 0.57, + "grad_norm": 1.014003038406372, + "learning_rate": 2.14368819080861e-05, + "loss": 5.4139, + "step": 5892 + }, + { + "epoch": 0.57, + "grad_norm": 0.9923560619354248, + "learning_rate": 2.141749078921854e-05, + "loss": 5.3716, + "step": 5896 + }, + { + "epoch": 0.57, + "grad_norm": 1.022443175315857, + "learning_rate": 2.139809967035098e-05, + "loss": 5.3654, + "step": 5900 + }, + { + "epoch": 0.57, + "grad_norm": 1.0812815427780151, + "learning_rate": 2.1378708551483424e-05, + "loss": 5.4307, + "step": 5904 + }, + { + "epoch": 0.57, + "grad_norm": 0.9952316284179688, + "learning_rate": 2.1359317432615864e-05, + "loss": 5.3851, + "step": 5908 + }, + { + "epoch": 0.57, + "grad_norm": 1.0843716859817505, + "learning_rate": 2.1339926313748305e-05, + "loss": 5.3507, + "step": 5912 + }, + { + "epoch": 0.57, + "grad_norm": 0.9775163531303406, + "learning_rate": 2.1320535194880746e-05, + "loss": 5.2868, + "step": 5916 + }, + { + "epoch": 0.57, + "grad_norm": 1.051283597946167, + "learning_rate": 2.130114407601319e-05, + "loss": 5.4404, + "step": 5920 + }, + { + "epoch": 0.57, + "grad_norm": 1.0350865125656128, + "learning_rate": 2.128175295714563e-05, + "loss": 5.3286, + "step": 5924 + }, + { + "epoch": 0.57, + "grad_norm": 1.1702210903167725, + "learning_rate": 2.126236183827807e-05, + "loss": 5.4338, + "step": 5928 + }, + { + "epoch": 0.58, + "grad_norm": 1.0268408060073853, + "learning_rate": 2.124297071941051e-05, + "loss": 5.4108, + "step": 5932 + }, + { + "epoch": 0.58, + "grad_norm": 1.002463459968567, + "learning_rate": 2.122357960054295e-05, + "loss": 5.2631, + "step": 5936 + }, + { + "epoch": 0.58, + "grad_norm": 1.04947829246521, + "learning_rate": 2.1204188481675396e-05, + "loss": 5.3011, + "step": 5940 + }, + { + "epoch": 0.58, + "grad_norm": 1.0733132362365723, + "learning_rate": 2.1184797362807836e-05, + "loss": 5.5408, + "step": 5944 + }, + { + "epoch": 0.58, + "grad_norm": 1.1274093389511108, + "learning_rate": 2.1165406243940277e-05, + "loss": 5.4001, + "step": 5948 + }, + { + "epoch": 0.58, + "grad_norm": 1.0161489248275757, + "learning_rate": 2.1146015125072717e-05, + "loss": 5.392, + "step": 5952 + }, + { + "epoch": 0.58, + "grad_norm": 1.1048080921173096, + "learning_rate": 2.1126624006205158e-05, + "loss": 5.3586, + "step": 5956 + }, + { + "epoch": 0.58, + "grad_norm": 1.0635709762573242, + "learning_rate": 2.1107232887337598e-05, + "loss": 5.3164, + "step": 5960 + }, + { + "epoch": 0.58, + "grad_norm": 1.0826531648635864, + "learning_rate": 2.1087841768470042e-05, + "loss": 5.4166, + "step": 5964 + }, + { + "epoch": 0.58, + "grad_norm": 1.0716171264648438, + "learning_rate": 2.1068450649602483e-05, + "loss": 5.3028, + "step": 5968 + }, + { + "epoch": 0.58, + "grad_norm": 1.0432461500167847, + "learning_rate": 2.1049059530734923e-05, + "loss": 5.3777, + "step": 5972 + }, + { + "epoch": 0.58, + "grad_norm": 1.0327306985855103, + "learning_rate": 2.1029668411867364e-05, + "loss": 5.2718, + "step": 5976 + }, + { + "epoch": 0.58, + "grad_norm": 1.0294878482818604, + "learning_rate": 2.1010277292999808e-05, + "loss": 5.2555, + "step": 5980 + }, + { + "epoch": 0.58, + "grad_norm": 1.0690736770629883, + "learning_rate": 2.0990886174132248e-05, + "loss": 5.2835, + "step": 5984 + }, + { + "epoch": 0.58, + "grad_norm": 1.060434341430664, + "learning_rate": 2.097149505526469e-05, + "loss": 5.4269, + "step": 5988 + }, + { + "epoch": 0.58, + "grad_norm": 1.1109763383865356, + "learning_rate": 2.095210393639713e-05, + "loss": 5.4351, + "step": 5992 + }, + { + "epoch": 0.58, + "grad_norm": 1.0055971145629883, + "learning_rate": 2.0932712817529573e-05, + "loss": 5.2801, + "step": 5996 + }, + { + "epoch": 0.58, + "grad_norm": 0.990050733089447, + "learning_rate": 2.0913321698662014e-05, + "loss": 5.3526, + "step": 6000 + }, + { + "epoch": 0.58, + "grad_norm": 0.9774187207221985, + "learning_rate": 2.0893930579794454e-05, + "loss": 5.3241, + "step": 6004 + }, + { + "epoch": 0.58, + "grad_norm": 1.0026575326919556, + "learning_rate": 2.0874539460926895e-05, + "loss": 5.3575, + "step": 6008 + }, + { + "epoch": 0.58, + "grad_norm": 0.9780769944190979, + "learning_rate": 2.085514834205934e-05, + "loss": 5.2948, + "step": 6012 + }, + { + "epoch": 0.58, + "grad_norm": 1.0509179830551147, + "learning_rate": 2.083575722319178e-05, + "loss": 5.3332, + "step": 6016 + }, + { + "epoch": 0.58, + "grad_norm": 1.0327376127243042, + "learning_rate": 2.081636610432422e-05, + "loss": 5.38, + "step": 6020 + }, + { + "epoch": 0.58, + "grad_norm": 1.105682134628296, + "learning_rate": 2.0796974985456664e-05, + "loss": 5.3568, + "step": 6024 + }, + { + "epoch": 0.58, + "grad_norm": 1.0796016454696655, + "learning_rate": 2.0777583866589104e-05, + "loss": 5.3871, + "step": 6028 + }, + { + "epoch": 0.58, + "grad_norm": 0.983562171459198, + "learning_rate": 2.0758192747721545e-05, + "loss": 5.3737, + "step": 6032 + }, + { + "epoch": 0.59, + "grad_norm": 1.0870602130889893, + "learning_rate": 2.0738801628853985e-05, + "loss": 5.2752, + "step": 6036 + }, + { + "epoch": 0.59, + "grad_norm": 0.9455908536911011, + "learning_rate": 2.071941050998643e-05, + "loss": 5.3517, + "step": 6040 + }, + { + "epoch": 0.59, + "grad_norm": 1.0257951021194458, + "learning_rate": 2.070001939111887e-05, + "loss": 5.3043, + "step": 6044 + }, + { + "epoch": 0.59, + "grad_norm": 1.0068105459213257, + "learning_rate": 2.068062827225131e-05, + "loss": 5.2876, + "step": 6048 + }, + { + "epoch": 0.59, + "grad_norm": 1.0739387273788452, + "learning_rate": 2.066123715338375e-05, + "loss": 5.3432, + "step": 6052 + }, + { + "epoch": 0.59, + "grad_norm": 0.9713083505630493, + "learning_rate": 2.0641846034516195e-05, + "loss": 5.368, + "step": 6056 + }, + { + "epoch": 0.59, + "grad_norm": 0.9598544239997864, + "learning_rate": 2.0622454915648635e-05, + "loss": 5.467, + "step": 6060 + }, + { + "epoch": 0.59, + "grad_norm": 1.0342966318130493, + "learning_rate": 2.0603063796781076e-05, + "loss": 5.3495, + "step": 6064 + }, + { + "epoch": 0.59, + "grad_norm": 0.9927212595939636, + "learning_rate": 2.0583672677913516e-05, + "loss": 5.3898, + "step": 6068 + }, + { + "epoch": 0.59, + "grad_norm": 0.985098659992218, + "learning_rate": 2.056428155904596e-05, + "loss": 5.3315, + "step": 6072 + }, + { + "epoch": 0.59, + "grad_norm": 1.1182785034179688, + "learning_rate": 2.05448904401784e-05, + "loss": 5.3721, + "step": 6076 + }, + { + "epoch": 0.59, + "grad_norm": 1.0341837406158447, + "learning_rate": 2.052549932131084e-05, + "loss": 5.3284, + "step": 6080 + }, + { + "epoch": 0.59, + "grad_norm": 1.0580908060073853, + "learning_rate": 2.0506108202443282e-05, + "loss": 5.3776, + "step": 6084 + }, + { + "epoch": 0.59, + "grad_norm": 1.016257405281067, + "learning_rate": 2.0486717083575722e-05, + "loss": 5.3578, + "step": 6088 + }, + { + "epoch": 0.59, + "grad_norm": 1.0956933498382568, + "learning_rate": 2.0467325964708163e-05, + "loss": 5.3792, + "step": 6092 + }, + { + "epoch": 0.59, + "grad_norm": 1.0442465543746948, + "learning_rate": 2.0447934845840604e-05, + "loss": 5.2992, + "step": 6096 + }, + { + "epoch": 0.59, + "grad_norm": 1.1148533821105957, + "learning_rate": 2.0428543726973047e-05, + "loss": 5.2835, + "step": 6100 + }, + { + "epoch": 0.59, + "grad_norm": 1.056727409362793, + "learning_rate": 2.0409152608105488e-05, + "loss": 5.3336, + "step": 6104 + }, + { + "epoch": 0.59, + "grad_norm": 0.9661352634429932, + "learning_rate": 2.038976148923793e-05, + "loss": 5.356, + "step": 6108 + }, + { + "epoch": 0.59, + "grad_norm": 1.0282217264175415, + "learning_rate": 2.037037037037037e-05, + "loss": 5.3118, + "step": 6112 + }, + { + "epoch": 0.59, + "grad_norm": 1.0714175701141357, + "learning_rate": 2.0350979251502813e-05, + "loss": 5.3817, + "step": 6116 + }, + { + "epoch": 0.59, + "grad_norm": 1.0747861862182617, + "learning_rate": 2.0331588132635254e-05, + "loss": 5.3961, + "step": 6120 + }, + { + "epoch": 0.59, + "grad_norm": 0.9946739673614502, + "learning_rate": 2.0312197013767694e-05, + "loss": 5.343, + "step": 6124 + }, + { + "epoch": 0.59, + "grad_norm": 1.002004623413086, + "learning_rate": 2.0292805894900135e-05, + "loss": 5.3654, + "step": 6128 + }, + { + "epoch": 0.59, + "grad_norm": 1.0547658205032349, + "learning_rate": 2.027341477603258e-05, + "loss": 5.3869, + "step": 6132 + }, + { + "epoch": 0.59, + "grad_norm": 1.076196551322937, + "learning_rate": 2.025402365716502e-05, + "loss": 5.2561, + "step": 6136 + }, + { + "epoch": 0.6, + "grad_norm": 1.3180465698242188, + "learning_rate": 2.023463253829746e-05, + "loss": 5.3457, + "step": 6140 + }, + { + "epoch": 0.6, + "grad_norm": 1.0028260946273804, + "learning_rate": 2.0215241419429904e-05, + "loss": 5.4271, + "step": 6144 + }, + { + "epoch": 0.6, + "grad_norm": 1.0375735759735107, + "learning_rate": 2.0195850300562344e-05, + "loss": 5.3181, + "step": 6148 + }, + { + "epoch": 0.6, + "grad_norm": 1.0055443048477173, + "learning_rate": 2.0176459181694785e-05, + "loss": 5.3544, + "step": 6152 + }, + { + "epoch": 0.6, + "grad_norm": 1.0876275300979614, + "learning_rate": 2.0157068062827225e-05, + "loss": 5.3487, + "step": 6156 + }, + { + "epoch": 0.6, + "grad_norm": 1.018140196800232, + "learning_rate": 2.013767694395967e-05, + "loss": 5.4325, + "step": 6160 + }, + { + "epoch": 0.6, + "grad_norm": 1.0800230503082275, + "learning_rate": 2.011828582509211e-05, + "loss": 5.2748, + "step": 6164 + }, + { + "epoch": 0.6, + "grad_norm": 1.0820457935333252, + "learning_rate": 2.009889470622455e-05, + "loss": 5.3661, + "step": 6168 + }, + { + "epoch": 0.6, + "grad_norm": 1.0936886072158813, + "learning_rate": 2.007950358735699e-05, + "loss": 5.3191, + "step": 6172 + }, + { + "epoch": 0.6, + "grad_norm": 1.0739785432815552, + "learning_rate": 2.0060112468489435e-05, + "loss": 5.372, + "step": 6176 + }, + { + "epoch": 0.6, + "grad_norm": 1.0785382986068726, + "learning_rate": 2.0040721349621875e-05, + "loss": 5.3858, + "step": 6180 + }, + { + "epoch": 0.6, + "grad_norm": 1.000064492225647, + "learning_rate": 2.0021330230754316e-05, + "loss": 5.3544, + "step": 6184 + }, + { + "epoch": 0.6, + "grad_norm": 1.096794605255127, + "learning_rate": 2.0001939111886756e-05, + "loss": 5.3608, + "step": 6188 + }, + { + "epoch": 0.6, + "grad_norm": 1.0260350704193115, + "learning_rate": 1.99825479930192e-05, + "loss": 5.4049, + "step": 6192 + }, + { + "epoch": 0.6, + "grad_norm": 1.0056685209274292, + "learning_rate": 1.996315687415164e-05, + "loss": 5.3588, + "step": 6196 + }, + { + "epoch": 0.6, + "grad_norm": 1.003697156906128, + "learning_rate": 1.994376575528408e-05, + "loss": 5.4, + "step": 6200 + }, + { + "epoch": 0.6, + "grad_norm": 1.0048468112945557, + "learning_rate": 1.9924374636416525e-05, + "loss": 5.2654, + "step": 6204 + }, + { + "epoch": 0.6, + "grad_norm": 0.9437733292579651, + "learning_rate": 1.9904983517548966e-05, + "loss": 5.2859, + "step": 6208 + }, + { + "epoch": 0.6, + "grad_norm": 0.961186945438385, + "learning_rate": 1.9885592398681406e-05, + "loss": 5.3605, + "step": 6212 + }, + { + "epoch": 0.6, + "grad_norm": 1.0297905206680298, + "learning_rate": 1.9866201279813847e-05, + "loss": 5.3202, + "step": 6216 + }, + { + "epoch": 0.6, + "grad_norm": 1.3261497020721436, + "learning_rate": 1.9846810160946287e-05, + "loss": 5.3587, + "step": 6220 + }, + { + "epoch": 0.6, + "grad_norm": 1.1081327199935913, + "learning_rate": 1.9827419042078728e-05, + "loss": 5.427, + "step": 6224 + }, + { + "epoch": 0.6, + "grad_norm": 1.0671581029891968, + "learning_rate": 1.980802792321117e-05, + "loss": 5.453, + "step": 6228 + }, + { + "epoch": 0.6, + "grad_norm": 1.0665878057479858, + "learning_rate": 1.978863680434361e-05, + "loss": 5.3263, + "step": 6232 + }, + { + "epoch": 0.6, + "grad_norm": 0.9510138034820557, + "learning_rate": 1.9769245685476053e-05, + "loss": 5.3364, + "step": 6236 + }, + { + "epoch": 0.61, + "grad_norm": 0.9980200529098511, + "learning_rate": 1.9749854566608493e-05, + "loss": 5.2064, + "step": 6240 + }, + { + "epoch": 0.61, + "grad_norm": 1.0917850732803345, + "learning_rate": 1.9730463447740934e-05, + "loss": 5.3213, + "step": 6244 + }, + { + "epoch": 0.61, + "grad_norm": 1.0644116401672363, + "learning_rate": 1.9711072328873374e-05, + "loss": 5.2939, + "step": 6248 + }, + { + "epoch": 0.61, + "grad_norm": 1.0722259283065796, + "learning_rate": 1.969168121000582e-05, + "loss": 5.3984, + "step": 6252 + }, + { + "epoch": 0.61, + "grad_norm": 1.0094479322433472, + "learning_rate": 1.967229009113826e-05, + "loss": 5.3198, + "step": 6256 + }, + { + "epoch": 0.61, + "grad_norm": 1.0126482248306274, + "learning_rate": 1.96528989722707e-05, + "loss": 5.3356, + "step": 6260 + }, + { + "epoch": 0.61, + "grad_norm": 1.061038613319397, + "learning_rate": 1.9633507853403143e-05, + "loss": 5.294, + "step": 6264 + }, + { + "epoch": 0.61, + "grad_norm": 1.1081583499908447, + "learning_rate": 1.9614116734535584e-05, + "loss": 5.3518, + "step": 6268 + }, + { + "epoch": 0.61, + "grad_norm": 1.0472965240478516, + "learning_rate": 1.9594725615668024e-05, + "loss": 5.3526, + "step": 6272 + }, + { + "epoch": 0.61, + "grad_norm": 0.9935174584388733, + "learning_rate": 1.9575334496800465e-05, + "loss": 5.3108, + "step": 6276 + }, + { + "epoch": 0.61, + "grad_norm": 1.0617495775222778, + "learning_rate": 1.955594337793291e-05, + "loss": 5.3518, + "step": 6280 + }, + { + "epoch": 0.61, + "grad_norm": 1.0488762855529785, + "learning_rate": 1.953655225906535e-05, + "loss": 5.2888, + "step": 6284 + }, + { + "epoch": 0.61, + "grad_norm": 1.065202236175537, + "learning_rate": 1.951716114019779e-05, + "loss": 5.2927, + "step": 6288 + }, + { + "epoch": 0.61, + "grad_norm": 1.1323513984680176, + "learning_rate": 1.949777002133023e-05, + "loss": 5.3774, + "step": 6292 + }, + { + "epoch": 0.61, + "grad_norm": 1.0796220302581787, + "learning_rate": 1.9478378902462674e-05, + "loss": 5.3579, + "step": 6296 + }, + { + "epoch": 0.61, + "grad_norm": 1.0631530284881592, + "learning_rate": 1.9458987783595115e-05, + "loss": 5.2644, + "step": 6300 + }, + { + "epoch": 0.61, + "grad_norm": 1.0359313488006592, + "learning_rate": 1.9439596664727556e-05, + "loss": 5.3949, + "step": 6304 + }, + { + "epoch": 0.61, + "grad_norm": 1.0680855512619019, + "learning_rate": 1.9420205545859996e-05, + "loss": 5.3567, + "step": 6308 + }, + { + "epoch": 0.61, + "grad_norm": 1.013702154159546, + "learning_rate": 1.940081442699244e-05, + "loss": 5.4166, + "step": 6312 + }, + { + "epoch": 0.61, + "grad_norm": 0.9817814230918884, + "learning_rate": 1.938142330812488e-05, + "loss": 5.4169, + "step": 6316 + }, + { + "epoch": 0.61, + "grad_norm": 1.0646679401397705, + "learning_rate": 1.936203218925732e-05, + "loss": 5.3344, + "step": 6320 + }, + { + "epoch": 0.61, + "grad_norm": 1.0351269245147705, + "learning_rate": 1.9342641070389765e-05, + "loss": 5.358, + "step": 6324 + }, + { + "epoch": 0.61, + "grad_norm": 0.9547563195228577, + "learning_rate": 1.9323249951522206e-05, + "loss": 5.3869, + "step": 6328 + }, + { + "epoch": 0.61, + "grad_norm": 1.0959416627883911, + "learning_rate": 1.9303858832654646e-05, + "loss": 5.3785, + "step": 6332 + }, + { + "epoch": 0.61, + "grad_norm": 1.026036262512207, + "learning_rate": 1.9284467713787087e-05, + "loss": 5.3627, + "step": 6336 + }, + { + "epoch": 0.61, + "grad_norm": 1.0361682176589966, + "learning_rate": 1.926507659491953e-05, + "loss": 5.3649, + "step": 6340 + }, + { + "epoch": 0.62, + "grad_norm": 1.0472002029418945, + "learning_rate": 1.924568547605197e-05, + "loss": 5.2838, + "step": 6344 + }, + { + "epoch": 0.62, + "grad_norm": 1.0799552202224731, + "learning_rate": 1.922629435718441e-05, + "loss": 5.3879, + "step": 6348 + }, + { + "epoch": 0.62, + "grad_norm": 1.0271328687667847, + "learning_rate": 1.9206903238316852e-05, + "loss": 5.3387, + "step": 6352 + }, + { + "epoch": 0.62, + "grad_norm": 1.0310901403427124, + "learning_rate": 1.9187512119449293e-05, + "loss": 5.4013, + "step": 6356 + }, + { + "epoch": 0.62, + "grad_norm": 1.0406147241592407, + "learning_rate": 1.9168121000581733e-05, + "loss": 5.3939, + "step": 6360 + }, + { + "epoch": 0.62, + "grad_norm": 1.048988938331604, + "learning_rate": 1.9148729881714174e-05, + "loss": 5.3211, + "step": 6364 + }, + { + "epoch": 0.62, + "grad_norm": 1.0325740575790405, + "learning_rate": 1.9129338762846614e-05, + "loss": 5.4601, + "step": 6368 + }, + { + "epoch": 0.62, + "grad_norm": 0.9758936166763306, + "learning_rate": 1.9109947643979058e-05, + "loss": 5.3075, + "step": 6372 + }, + { + "epoch": 0.62, + "grad_norm": 1.064858078956604, + "learning_rate": 1.90905565251115e-05, + "loss": 5.366, + "step": 6376 + }, + { + "epoch": 0.62, + "grad_norm": 0.9335108399391174, + "learning_rate": 1.907116540624394e-05, + "loss": 5.2433, + "step": 6380 + }, + { + "epoch": 0.62, + "grad_norm": 1.0345276594161987, + "learning_rate": 1.9051774287376383e-05, + "loss": 5.3438, + "step": 6384 + }, + { + "epoch": 0.62, + "grad_norm": 1.0446693897247314, + "learning_rate": 1.9032383168508824e-05, + "loss": 5.4391, + "step": 6388 + }, + { + "epoch": 0.62, + "grad_norm": 1.0137783288955688, + "learning_rate": 1.9012992049641264e-05, + "loss": 5.3926, + "step": 6392 + }, + { + "epoch": 0.62, + "grad_norm": 0.953395664691925, + "learning_rate": 1.8993600930773705e-05, + "loss": 5.3703, + "step": 6396 + }, + { + "epoch": 0.62, + "grad_norm": 1.0034725666046143, + "learning_rate": 1.897420981190615e-05, + "loss": 5.3504, + "step": 6400 + }, + { + "epoch": 0.62, + "grad_norm": 1.0460563898086548, + "learning_rate": 1.895481869303859e-05, + "loss": 5.4165, + "step": 6404 + }, + { + "epoch": 0.62, + "grad_norm": 1.0266785621643066, + "learning_rate": 1.893542757417103e-05, + "loss": 5.3853, + "step": 6408 + }, + { + "epoch": 0.62, + "grad_norm": 1.1107487678527832, + "learning_rate": 1.891603645530347e-05, + "loss": 5.4166, + "step": 6412 + }, + { + "epoch": 0.62, + "grad_norm": 1.1155457496643066, + "learning_rate": 1.8896645336435914e-05, + "loss": 5.3838, + "step": 6416 + }, + { + "epoch": 0.62, + "grad_norm": 1.0329363346099854, + "learning_rate": 1.8877254217568355e-05, + "loss": 5.3541, + "step": 6420 + }, + { + "epoch": 0.62, + "grad_norm": 1.028456449508667, + "learning_rate": 1.8857863098700795e-05, + "loss": 5.3506, + "step": 6424 + }, + { + "epoch": 0.62, + "grad_norm": 1.090142011642456, + "learning_rate": 1.8838471979833236e-05, + "loss": 5.3296, + "step": 6428 + }, + { + "epoch": 0.62, + "grad_norm": 1.071015477180481, + "learning_rate": 1.881908086096568e-05, + "loss": 5.3341, + "step": 6432 + }, + { + "epoch": 0.62, + "grad_norm": 0.9764314293861389, + "learning_rate": 1.879968974209812e-05, + "loss": 5.3198, + "step": 6436 + }, + { + "epoch": 0.62, + "grad_norm": 1.046424388885498, + "learning_rate": 1.878029862323056e-05, + "loss": 5.3087, + "step": 6440 + }, + { + "epoch": 0.62, + "grad_norm": 1.0243570804595947, + "learning_rate": 1.8760907504363005e-05, + "loss": 5.3045, + "step": 6444 + }, + { + "epoch": 0.63, + "grad_norm": 0.9760267734527588, + "learning_rate": 1.8741516385495445e-05, + "loss": 5.2859, + "step": 6448 + }, + { + "epoch": 0.63, + "grad_norm": 1.1072561740875244, + "learning_rate": 1.8722125266627886e-05, + "loss": 5.359, + "step": 6452 + }, + { + "epoch": 0.63, + "grad_norm": 1.0756561756134033, + "learning_rate": 1.8702734147760326e-05, + "loss": 5.409, + "step": 6456 + }, + { + "epoch": 0.63, + "grad_norm": 0.9566421508789062, + "learning_rate": 1.868334302889277e-05, + "loss": 5.3902, + "step": 6460 + }, + { + "epoch": 0.63, + "grad_norm": 1.0135536193847656, + "learning_rate": 1.866395191002521e-05, + "loss": 5.504, + "step": 6464 + }, + { + "epoch": 0.63, + "grad_norm": 1.139150619506836, + "learning_rate": 1.864456079115765e-05, + "loss": 5.3502, + "step": 6468 + }, + { + "epoch": 0.63, + "grad_norm": 1.0426169633865356, + "learning_rate": 1.8625169672290092e-05, + "loss": 5.2562, + "step": 6472 + }, + { + "epoch": 0.63, + "grad_norm": 1.0229958295822144, + "learning_rate": 1.8605778553422536e-05, + "loss": 5.3472, + "step": 6476 + }, + { + "epoch": 0.63, + "grad_norm": 1.0061606168746948, + "learning_rate": 1.8586387434554976e-05, + "loss": 5.2391, + "step": 6480 + }, + { + "epoch": 0.63, + "grad_norm": 0.9669675230979919, + "learning_rate": 1.8566996315687417e-05, + "loss": 5.4266, + "step": 6484 + }, + { + "epoch": 0.63, + "grad_norm": 1.0888794660568237, + "learning_rate": 1.8547605196819857e-05, + "loss": 5.3798, + "step": 6488 + }, + { + "epoch": 0.63, + "grad_norm": 1.0779885053634644, + "learning_rate": 1.8528214077952298e-05, + "loss": 5.3159, + "step": 6492 + }, + { + "epoch": 0.63, + "grad_norm": 1.0792943239212036, + "learning_rate": 1.850882295908474e-05, + "loss": 5.3322, + "step": 6496 + }, + { + "epoch": 0.63, + "grad_norm": 1.0349109172821045, + "learning_rate": 1.848943184021718e-05, + "loss": 5.3125, + "step": 6500 + }, + { + "epoch": 0.63, + "grad_norm": 1.0294796228408813, + "learning_rate": 1.8470040721349623e-05, + "loss": 5.3059, + "step": 6504 + }, + { + "epoch": 0.63, + "grad_norm": 0.9547713994979858, + "learning_rate": 1.8450649602482064e-05, + "loss": 5.2556, + "step": 6508 + }, + { + "epoch": 0.63, + "grad_norm": 1.1305149793624878, + "learning_rate": 1.8431258483614504e-05, + "loss": 5.3476, + "step": 6512 + }, + { + "epoch": 0.63, + "grad_norm": 1.0296196937561035, + "learning_rate": 1.8411867364746945e-05, + "loss": 5.3404, + "step": 6516 + }, + { + "epoch": 0.63, + "grad_norm": 1.0865687131881714, + "learning_rate": 1.839247624587939e-05, + "loss": 5.3772, + "step": 6520 + }, + { + "epoch": 0.63, + "grad_norm": 1.065975546836853, + "learning_rate": 1.837308512701183e-05, + "loss": 5.3577, + "step": 6524 + }, + { + "epoch": 0.63, + "grad_norm": 1.1665056943893433, + "learning_rate": 1.835369400814427e-05, + "loss": 5.3344, + "step": 6528 + }, + { + "epoch": 0.63, + "grad_norm": 1.0219711065292358, + "learning_rate": 1.833430288927671e-05, + "loss": 5.3717, + "step": 6532 + }, + { + "epoch": 0.63, + "grad_norm": 1.01799738407135, + "learning_rate": 1.8314911770409154e-05, + "loss": 5.3329, + "step": 6536 + }, + { + "epoch": 0.63, + "grad_norm": 1.0811940431594849, + "learning_rate": 1.8295520651541595e-05, + "loss": 5.2708, + "step": 6540 + }, + { + "epoch": 0.63, + "grad_norm": 1.0369648933410645, + "learning_rate": 1.8276129532674035e-05, + "loss": 5.3731, + "step": 6544 + }, + { + "epoch": 0.63, + "grad_norm": 1.0611283779144287, + "learning_rate": 1.8256738413806476e-05, + "loss": 5.4037, + "step": 6548 + }, + { + "epoch": 0.64, + "grad_norm": 0.9396833777427673, + "learning_rate": 1.823734729493892e-05, + "loss": 5.3158, + "step": 6552 + }, + { + "epoch": 0.64, + "grad_norm": 0.9994028210639954, + "learning_rate": 1.821795617607136e-05, + "loss": 5.3562, + "step": 6556 + }, + { + "epoch": 0.64, + "grad_norm": 1.0078858137130737, + "learning_rate": 1.81985650572038e-05, + "loss": 5.322, + "step": 6560 + }, + { + "epoch": 0.64, + "grad_norm": 1.0305253267288208, + "learning_rate": 1.8179173938336245e-05, + "loss": 5.2791, + "step": 6564 + }, + { + "epoch": 0.64, + "grad_norm": 1.0548304319381714, + "learning_rate": 1.8159782819468685e-05, + "loss": 5.356, + "step": 6568 + }, + { + "epoch": 0.64, + "grad_norm": 0.9770938754081726, + "learning_rate": 1.8140391700601126e-05, + "loss": 5.3234, + "step": 6572 + }, + { + "epoch": 0.64, + "grad_norm": 1.087689757347107, + "learning_rate": 1.8121000581733566e-05, + "loss": 5.3944, + "step": 6576 + }, + { + "epoch": 0.64, + "grad_norm": 1.14712655544281, + "learning_rate": 1.810160946286601e-05, + "loss": 5.2835, + "step": 6580 + }, + { + "epoch": 0.64, + "grad_norm": 1.1165133714675903, + "learning_rate": 1.808221834399845e-05, + "loss": 5.2902, + "step": 6584 + }, + { + "epoch": 0.64, + "grad_norm": 1.063067078590393, + "learning_rate": 1.806282722513089e-05, + "loss": 5.3108, + "step": 6588 + }, + { + "epoch": 0.64, + "grad_norm": 1.0933597087860107, + "learning_rate": 1.8043436106263332e-05, + "loss": 5.431, + "step": 6592 + }, + { + "epoch": 0.64, + "grad_norm": 1.0003483295440674, + "learning_rate": 1.8024044987395776e-05, + "loss": 5.3212, + "step": 6596 + }, + { + "epoch": 0.64, + "grad_norm": 0.9593095779418945, + "learning_rate": 1.8004653868528216e-05, + "loss": 5.3799, + "step": 6600 + }, + { + "epoch": 0.64, + "grad_norm": 1.0600883960723877, + "learning_rate": 1.7985262749660657e-05, + "loss": 5.2845, + "step": 6604 + }, + { + "epoch": 0.64, + "grad_norm": 1.054432988166809, + "learning_rate": 1.7965871630793097e-05, + "loss": 5.3376, + "step": 6608 + }, + { + "epoch": 0.64, + "grad_norm": 1.0431993007659912, + "learning_rate": 1.794648051192554e-05, + "loss": 5.3923, + "step": 6612 + }, + { + "epoch": 0.64, + "grad_norm": 1.1059125661849976, + "learning_rate": 1.7927089393057982e-05, + "loss": 5.3124, + "step": 6616 + }, + { + "epoch": 0.64, + "grad_norm": 1.0165326595306396, + "learning_rate": 1.7907698274190422e-05, + "loss": 5.3649, + "step": 6620 + }, + { + "epoch": 0.64, + "grad_norm": 1.1015493869781494, + "learning_rate": 1.7888307155322863e-05, + "loss": 5.3207, + "step": 6624 + }, + { + "epoch": 0.64, + "grad_norm": 1.062280297279358, + "learning_rate": 1.7868916036455303e-05, + "loss": 5.3276, + "step": 6628 + }, + { + "epoch": 0.64, + "grad_norm": 1.101285696029663, + "learning_rate": 1.7849524917587744e-05, + "loss": 5.4415, + "step": 6632 + }, + { + "epoch": 0.64, + "grad_norm": 1.039291501045227, + "learning_rate": 1.7830133798720184e-05, + "loss": 5.3052, + "step": 6636 + }, + { + "epoch": 0.64, + "grad_norm": 1.0250643491744995, + "learning_rate": 1.781074267985263e-05, + "loss": 5.337, + "step": 6640 + }, + { + "epoch": 0.64, + "grad_norm": 1.0628857612609863, + "learning_rate": 1.779135156098507e-05, + "loss": 5.3236, + "step": 6644 + }, + { + "epoch": 0.64, + "grad_norm": 1.0590665340423584, + "learning_rate": 1.777196044211751e-05, + "loss": 5.2928, + "step": 6648 + }, + { + "epoch": 0.64, + "grad_norm": 1.009734034538269, + "learning_rate": 1.775256932324995e-05, + "loss": 5.3493, + "step": 6652 + }, + { + "epoch": 0.65, + "grad_norm": 0.9915148019790649, + "learning_rate": 1.7733178204382394e-05, + "loss": 5.3266, + "step": 6656 + }, + { + "epoch": 0.65, + "grad_norm": 1.0235103368759155, + "learning_rate": 1.7713787085514834e-05, + "loss": 5.3359, + "step": 6660 + }, + { + "epoch": 0.65, + "grad_norm": 1.003313422203064, + "learning_rate": 1.7694395966647275e-05, + "loss": 5.3196, + "step": 6664 + }, + { + "epoch": 0.65, + "grad_norm": 1.0449254512786865, + "learning_rate": 1.7675004847779716e-05, + "loss": 5.3287, + "step": 6668 + }, + { + "epoch": 0.65, + "grad_norm": 1.0780065059661865, + "learning_rate": 1.765561372891216e-05, + "loss": 5.3901, + "step": 6672 + }, + { + "epoch": 0.65, + "grad_norm": 1.134416103363037, + "learning_rate": 1.76362226100446e-05, + "loss": 5.3008, + "step": 6676 + }, + { + "epoch": 0.65, + "grad_norm": 1.095788836479187, + "learning_rate": 1.761683149117704e-05, + "loss": 5.3973, + "step": 6680 + }, + { + "epoch": 0.65, + "grad_norm": 1.038216471672058, + "learning_rate": 1.7597440372309484e-05, + "loss": 5.3031, + "step": 6684 + }, + { + "epoch": 0.65, + "grad_norm": 1.033199429512024, + "learning_rate": 1.7578049253441925e-05, + "loss": 5.2332, + "step": 6688 + }, + { + "epoch": 0.65, + "grad_norm": 1.0989290475845337, + "learning_rate": 1.7558658134574366e-05, + "loss": 5.3559, + "step": 6692 + }, + { + "epoch": 0.65, + "grad_norm": 1.0528738498687744, + "learning_rate": 1.7539267015706806e-05, + "loss": 5.3435, + "step": 6696 + }, + { + "epoch": 0.65, + "grad_norm": 1.0338224172592163, + "learning_rate": 1.751987589683925e-05, + "loss": 5.4003, + "step": 6700 + }, + { + "epoch": 0.65, + "grad_norm": 1.021791934967041, + "learning_rate": 1.750048477797169e-05, + "loss": 5.2902, + "step": 6704 + }, + { + "epoch": 0.65, + "grad_norm": 1.069692850112915, + "learning_rate": 1.748109365910413e-05, + "loss": 5.2897, + "step": 6708 + }, + { + "epoch": 0.65, + "grad_norm": 1.0573803186416626, + "learning_rate": 1.746170254023657e-05, + "loss": 5.386, + "step": 6712 + }, + { + "epoch": 0.65, + "grad_norm": 1.0095142126083374, + "learning_rate": 1.7442311421369016e-05, + "loss": 5.3541, + "step": 6716 + }, + { + "epoch": 0.65, + "grad_norm": 1.0076450109481812, + "learning_rate": 1.7422920302501456e-05, + "loss": 5.3088, + "step": 6720 + }, + { + "epoch": 0.65, + "grad_norm": 0.9816309809684753, + "learning_rate": 1.7403529183633897e-05, + "loss": 5.3906, + "step": 6724 + }, + { + "epoch": 0.65, + "grad_norm": 1.1446229219436646, + "learning_rate": 1.7384138064766337e-05, + "loss": 5.3372, + "step": 6728 + }, + { + "epoch": 0.65, + "grad_norm": 0.9934061169624329, + "learning_rate": 1.736474694589878e-05, + "loss": 5.4092, + "step": 6732 + }, + { + "epoch": 0.65, + "grad_norm": 0.9881500005722046, + "learning_rate": 1.734535582703122e-05, + "loss": 5.2318, + "step": 6736 + }, + { + "epoch": 0.65, + "grad_norm": 1.0100693702697754, + "learning_rate": 1.7325964708163662e-05, + "loss": 5.35, + "step": 6740 + }, + { + "epoch": 0.65, + "grad_norm": 1.0123350620269775, + "learning_rate": 1.7306573589296106e-05, + "loss": 5.2857, + "step": 6744 + }, + { + "epoch": 0.65, + "grad_norm": 1.0262787342071533, + "learning_rate": 1.7287182470428547e-05, + "loss": 5.425, + "step": 6748 + }, + { + "epoch": 0.65, + "grad_norm": 1.0568041801452637, + "learning_rate": 1.7267791351560987e-05, + "loss": 5.3167, + "step": 6752 + }, + { + "epoch": 0.66, + "grad_norm": 1.0791805982589722, + "learning_rate": 1.7248400232693428e-05, + "loss": 5.3329, + "step": 6756 + }, + { + "epoch": 0.66, + "grad_norm": 1.0418261289596558, + "learning_rate": 1.7229009113825868e-05, + "loss": 5.3187, + "step": 6760 + }, + { + "epoch": 0.66, + "grad_norm": 1.0648219585418701, + "learning_rate": 1.720961799495831e-05, + "loss": 5.2998, + "step": 6764 + }, + { + "epoch": 0.66, + "grad_norm": 1.0518834590911865, + "learning_rate": 1.719022687609075e-05, + "loss": 5.4489, + "step": 6768 + }, + { + "epoch": 0.66, + "grad_norm": 1.1231526136398315, + "learning_rate": 1.7170835757223193e-05, + "loss": 5.2959, + "step": 6772 + }, + { + "epoch": 0.66, + "grad_norm": 1.0702370405197144, + "learning_rate": 1.7151444638355634e-05, + "loss": 5.321, + "step": 6776 + }, + { + "epoch": 0.66, + "grad_norm": 1.0818907022476196, + "learning_rate": 1.7132053519488074e-05, + "loss": 5.2864, + "step": 6780 + }, + { + "epoch": 0.66, + "grad_norm": 0.990020215511322, + "learning_rate": 1.7112662400620515e-05, + "loss": 5.2245, + "step": 6784 + }, + { + "epoch": 0.66, + "grad_norm": 1.104169249534607, + "learning_rate": 1.7093271281752955e-05, + "loss": 5.29, + "step": 6788 + }, + { + "epoch": 0.66, + "grad_norm": 1.0738272666931152, + "learning_rate": 1.70738801628854e-05, + "loss": 5.2843, + "step": 6792 + }, + { + "epoch": 0.66, + "grad_norm": 1.009252905845642, + "learning_rate": 1.705448904401784e-05, + "loss": 5.4593, + "step": 6796 + }, + { + "epoch": 0.66, + "grad_norm": 1.0361759662628174, + "learning_rate": 1.703509792515028e-05, + "loss": 5.2717, + "step": 6800 + }, + { + "epoch": 0.66, + "grad_norm": 1.1224076747894287, + "learning_rate": 1.7015706806282724e-05, + "loss": 5.3675, + "step": 6804 + }, + { + "epoch": 0.66, + "grad_norm": 1.094973087310791, + "learning_rate": 1.6996315687415165e-05, + "loss": 5.3096, + "step": 6808 + }, + { + "epoch": 0.66, + "grad_norm": 1.0065085887908936, + "learning_rate": 1.6976924568547605e-05, + "loss": 5.41, + "step": 6812 + }, + { + "epoch": 0.66, + "grad_norm": 1.0526219606399536, + "learning_rate": 1.6957533449680046e-05, + "loss": 5.408, + "step": 6816 + }, + { + "epoch": 0.66, + "grad_norm": 1.0697109699249268, + "learning_rate": 1.693814233081249e-05, + "loss": 5.3375, + "step": 6820 + }, + { + "epoch": 0.66, + "grad_norm": 1.0227476358413696, + "learning_rate": 1.691875121194493e-05, + "loss": 5.3254, + "step": 6824 + }, + { + "epoch": 0.66, + "grad_norm": 0.9872763156890869, + "learning_rate": 1.689936009307737e-05, + "loss": 5.2915, + "step": 6828 + }, + { + "epoch": 0.66, + "grad_norm": 1.028903603553772, + "learning_rate": 1.687996897420981e-05, + "loss": 5.2651, + "step": 6832 + }, + { + "epoch": 0.66, + "grad_norm": 1.095890998840332, + "learning_rate": 1.6860577855342255e-05, + "loss": 5.4315, + "step": 6836 + }, + { + "epoch": 0.66, + "grad_norm": 1.0813144445419312, + "learning_rate": 1.6841186736474696e-05, + "loss": 5.3725, + "step": 6840 + }, + { + "epoch": 0.66, + "grad_norm": 1.0944348573684692, + "learning_rate": 1.6821795617607136e-05, + "loss": 5.3591, + "step": 6844 + }, + { + "epoch": 0.66, + "grad_norm": 1.023173451423645, + "learning_rate": 1.6802404498739577e-05, + "loss": 5.3222, + "step": 6848 + }, + { + "epoch": 0.66, + "grad_norm": 1.1330089569091797, + "learning_rate": 1.678301337987202e-05, + "loss": 5.3623, + "step": 6852 + }, + { + "epoch": 0.66, + "grad_norm": 1.1375625133514404, + "learning_rate": 1.676362226100446e-05, + "loss": 5.3356, + "step": 6856 + }, + { + "epoch": 0.67, + "grad_norm": 1.1181647777557373, + "learning_rate": 1.6744231142136902e-05, + "loss": 5.3939, + "step": 6860 + }, + { + "epoch": 0.67, + "grad_norm": 1.0055108070373535, + "learning_rate": 1.6724840023269346e-05, + "loss": 5.339, + "step": 6864 + }, + { + "epoch": 0.67, + "grad_norm": 1.001213788986206, + "learning_rate": 1.6705448904401786e-05, + "loss": 5.4536, + "step": 6868 + }, + { + "epoch": 0.67, + "grad_norm": 1.0213780403137207, + "learning_rate": 1.6686057785534227e-05, + "loss": 5.3785, + "step": 6872 + }, + { + "epoch": 0.67, + "grad_norm": 1.0076147317886353, + "learning_rate": 1.6666666666666667e-05, + "loss": 5.2837, + "step": 6876 + }, + { + "epoch": 0.67, + "grad_norm": 1.0241953134536743, + "learning_rate": 1.664727554779911e-05, + "loss": 5.3633, + "step": 6880 + }, + { + "epoch": 0.67, + "grad_norm": 1.022125005722046, + "learning_rate": 1.6627884428931552e-05, + "loss": 5.3523, + "step": 6884 + }, + { + "epoch": 0.67, + "grad_norm": 1.0576834678649902, + "learning_rate": 1.6608493310063992e-05, + "loss": 5.315, + "step": 6888 + }, + { + "epoch": 0.67, + "grad_norm": 1.0498878955841064, + "learning_rate": 1.6589102191196433e-05, + "loss": 5.3896, + "step": 6892 + }, + { + "epoch": 0.67, + "grad_norm": 1.003936529159546, + "learning_rate": 1.6569711072328874e-05, + "loss": 5.3293, + "step": 6896 + }, + { + "epoch": 0.67, + "grad_norm": 0.9774566292762756, + "learning_rate": 1.6550319953461314e-05, + "loss": 5.3388, + "step": 6900 + }, + { + "epoch": 0.67, + "grad_norm": 1.0642743110656738, + "learning_rate": 1.6530928834593755e-05, + "loss": 5.3104, + "step": 6904 + }, + { + "epoch": 0.67, + "grad_norm": 0.9880549907684326, + "learning_rate": 1.65115377157262e-05, + "loss": 5.3273, + "step": 6908 + }, + { + "epoch": 0.67, + "grad_norm": 1.0308412313461304, + "learning_rate": 1.649214659685864e-05, + "loss": 5.293, + "step": 6912 + }, + { + "epoch": 0.67, + "grad_norm": 1.0004191398620605, + "learning_rate": 1.647275547799108e-05, + "loss": 5.3624, + "step": 6916 + }, + { + "epoch": 0.67, + "grad_norm": 1.0223098993301392, + "learning_rate": 1.645336435912352e-05, + "loss": 5.3298, + "step": 6920 + }, + { + "epoch": 0.67, + "grad_norm": 1.0578629970550537, + "learning_rate": 1.6433973240255964e-05, + "loss": 5.3331, + "step": 6924 + }, + { + "epoch": 0.67, + "grad_norm": 1.0649224519729614, + "learning_rate": 1.6414582121388405e-05, + "loss": 5.4143, + "step": 6928 + }, + { + "epoch": 0.67, + "grad_norm": 1.0141630172729492, + "learning_rate": 1.6395191002520845e-05, + "loss": 5.2971, + "step": 6932 + }, + { + "epoch": 0.67, + "grad_norm": 1.06792414188385, + "learning_rate": 1.6375799883653286e-05, + "loss": 5.359, + "step": 6936 + }, + { + "epoch": 0.67, + "grad_norm": 1.0014034509658813, + "learning_rate": 1.635640876478573e-05, + "loss": 5.3262, + "step": 6940 + }, + { + "epoch": 0.67, + "grad_norm": 1.0073959827423096, + "learning_rate": 1.633701764591817e-05, + "loss": 5.2941, + "step": 6944 + }, + { + "epoch": 0.67, + "grad_norm": 1.0175294876098633, + "learning_rate": 1.631762652705061e-05, + "loss": 5.4023, + "step": 6948 + }, + { + "epoch": 0.67, + "grad_norm": 1.0541809797286987, + "learning_rate": 1.629823540818305e-05, + "loss": 5.3224, + "step": 6952 + }, + { + "epoch": 0.67, + "grad_norm": 1.0851467847824097, + "learning_rate": 1.6278844289315495e-05, + "loss": 5.3333, + "step": 6956 + }, + { + "epoch": 0.67, + "grad_norm": 1.081163763999939, + "learning_rate": 1.6259453170447936e-05, + "loss": 5.4189, + "step": 6960 + }, + { + "epoch": 0.68, + "grad_norm": 1.0276130437850952, + "learning_rate": 1.6240062051580376e-05, + "loss": 5.2743, + "step": 6964 + }, + { + "epoch": 0.68, + "grad_norm": 1.0390230417251587, + "learning_rate": 1.6220670932712817e-05, + "loss": 5.3315, + "step": 6968 + }, + { + "epoch": 0.68, + "grad_norm": 0.9651821851730347, + "learning_rate": 1.620127981384526e-05, + "loss": 5.33, + "step": 6972 + }, + { + "epoch": 0.68, + "grad_norm": 1.05519437789917, + "learning_rate": 1.61818886949777e-05, + "loss": 5.3139, + "step": 6976 + }, + { + "epoch": 0.68, + "grad_norm": 0.9778598546981812, + "learning_rate": 1.6162497576110142e-05, + "loss": 5.303, + "step": 6980 + }, + { + "epoch": 0.68, + "grad_norm": 1.0078487396240234, + "learning_rate": 1.6143106457242586e-05, + "loss": 5.3094, + "step": 6984 + }, + { + "epoch": 0.68, + "grad_norm": 1.0488457679748535, + "learning_rate": 1.6123715338375026e-05, + "loss": 5.2953, + "step": 6988 + }, + { + "epoch": 0.68, + "grad_norm": 1.0421743392944336, + "learning_rate": 1.6104324219507467e-05, + "loss": 5.394, + "step": 6992 + }, + { + "epoch": 0.68, + "grad_norm": 0.9976141452789307, + "learning_rate": 1.6084933100639907e-05, + "loss": 5.3734, + "step": 6996 + }, + { + "epoch": 0.68, + "grad_norm": 1.0560002326965332, + "learning_rate": 1.606554198177235e-05, + "loss": 5.3631, + "step": 7000 + }, + { + "epoch": 0.68, + "grad_norm": 1.0117374658584595, + "learning_rate": 1.6046150862904792e-05, + "loss": 5.3586, + "step": 7004 + }, + { + "epoch": 0.68, + "grad_norm": 1.0476561784744263, + "learning_rate": 1.6026759744037232e-05, + "loss": 5.3215, + "step": 7008 + }, + { + "epoch": 0.68, + "grad_norm": 1.0643686056137085, + "learning_rate": 1.6007368625169673e-05, + "loss": 5.2853, + "step": 7012 + }, + { + "epoch": 0.68, + "grad_norm": 1.0695873498916626, + "learning_rate": 1.5987977506302117e-05, + "loss": 5.3231, + "step": 7016 + }, + { + "epoch": 0.68, + "grad_norm": 1.0597788095474243, + "learning_rate": 1.5968586387434557e-05, + "loss": 5.3185, + "step": 7020 + }, + { + "epoch": 0.68, + "grad_norm": 1.0414984226226807, + "learning_rate": 1.5949195268566998e-05, + "loss": 5.3407, + "step": 7024 + }, + { + "epoch": 0.68, + "grad_norm": 0.9906083941459656, + "learning_rate": 1.592980414969944e-05, + "loss": 5.2572, + "step": 7028 + }, + { + "epoch": 0.68, + "grad_norm": 0.9810131192207336, + "learning_rate": 1.591041303083188e-05, + "loss": 5.2424, + "step": 7032 + }, + { + "epoch": 0.68, + "grad_norm": 1.0690699815750122, + "learning_rate": 1.589102191196432e-05, + "loss": 5.3503, + "step": 7036 + }, + { + "epoch": 0.68, + "grad_norm": 1.037235140800476, + "learning_rate": 1.5871630793096763e-05, + "loss": 5.3903, + "step": 7040 + }, + { + "epoch": 0.68, + "grad_norm": 0.9991822838783264, + "learning_rate": 1.5852239674229204e-05, + "loss": 5.3439, + "step": 7044 + }, + { + "epoch": 0.68, + "grad_norm": 1.0834139585494995, + "learning_rate": 1.5832848555361644e-05, + "loss": 5.2871, + "step": 7048 + }, + { + "epoch": 0.68, + "grad_norm": 0.9821889400482178, + "learning_rate": 1.5813457436494085e-05, + "loss": 5.248, + "step": 7052 + }, + { + "epoch": 0.68, + "grad_norm": 1.0037583112716675, + "learning_rate": 1.5794066317626526e-05, + "loss": 5.2412, + "step": 7056 + }, + { + "epoch": 0.68, + "grad_norm": 1.059874176979065, + "learning_rate": 1.577467519875897e-05, + "loss": 5.2965, + "step": 7060 + }, + { + "epoch": 0.68, + "grad_norm": 1.0708321332931519, + "learning_rate": 1.575528407989141e-05, + "loss": 5.1961, + "step": 7064 + }, + { + "epoch": 0.69, + "grad_norm": 1.0226424932479858, + "learning_rate": 1.573589296102385e-05, + "loss": 5.3895, + "step": 7068 + }, + { + "epoch": 0.69, + "grad_norm": 1.088083028793335, + "learning_rate": 1.571650184215629e-05, + "loss": 5.3202, + "step": 7072 + }, + { + "epoch": 0.69, + "grad_norm": 1.1663302183151245, + "learning_rate": 1.5697110723288735e-05, + "loss": 5.2799, + "step": 7076 + }, + { + "epoch": 0.69, + "grad_norm": 1.049932599067688, + "learning_rate": 1.5677719604421175e-05, + "loss": 5.3477, + "step": 7080 + }, + { + "epoch": 0.69, + "grad_norm": 1.0670647621154785, + "learning_rate": 1.5658328485553616e-05, + "loss": 5.321, + "step": 7084 + }, + { + "epoch": 0.69, + "grad_norm": 1.0027042627334595, + "learning_rate": 1.5638937366686057e-05, + "loss": 5.2589, + "step": 7088 + }, + { + "epoch": 0.69, + "grad_norm": 1.0624514818191528, + "learning_rate": 1.56195462478185e-05, + "loss": 5.3408, + "step": 7092 + }, + { + "epoch": 0.69, + "grad_norm": 1.1033272743225098, + "learning_rate": 1.560015512895094e-05, + "loss": 5.3141, + "step": 7096 + }, + { + "epoch": 0.69, + "grad_norm": 0.9922211766242981, + "learning_rate": 1.558076401008338e-05, + "loss": 5.3127, + "step": 7100 + }, + { + "epoch": 0.69, + "grad_norm": 1.1714577674865723, + "learning_rate": 1.5561372891215825e-05, + "loss": 5.3356, + "step": 7104 + }, + { + "epoch": 0.69, + "grad_norm": 1.092363715171814, + "learning_rate": 1.5541981772348266e-05, + "loss": 5.3375, + "step": 7108 + }, + { + "epoch": 0.69, + "grad_norm": 1.1520874500274658, + "learning_rate": 1.5522590653480707e-05, + "loss": 5.38, + "step": 7112 + }, + { + "epoch": 0.69, + "grad_norm": 1.0429631471633911, + "learning_rate": 1.5503199534613147e-05, + "loss": 5.2862, + "step": 7116 + }, + { + "epoch": 0.69, + "grad_norm": 1.133023977279663, + "learning_rate": 1.548380841574559e-05, + "loss": 5.3376, + "step": 7120 + }, + { + "epoch": 0.69, + "grad_norm": 1.0379852056503296, + "learning_rate": 1.546441729687803e-05, + "loss": 5.3617, + "step": 7124 + }, + { + "epoch": 0.69, + "grad_norm": 0.9884594082832336, + "learning_rate": 1.5445026178010472e-05, + "loss": 5.3478, + "step": 7128 + }, + { + "epoch": 0.69, + "grad_norm": 1.0757851600646973, + "learning_rate": 1.5425635059142913e-05, + "loss": 5.3307, + "step": 7132 + }, + { + "epoch": 0.69, + "grad_norm": 1.0310724973678589, + "learning_rate": 1.5406243940275357e-05, + "loss": 5.2984, + "step": 7136 + }, + { + "epoch": 0.69, + "grad_norm": 0.989493727684021, + "learning_rate": 1.5386852821407797e-05, + "loss": 5.4285, + "step": 7140 + }, + { + "epoch": 0.69, + "grad_norm": 0.950312614440918, + "learning_rate": 1.5367461702540238e-05, + "loss": 5.2814, + "step": 7144 + }, + { + "epoch": 0.69, + "grad_norm": 1.0180491209030151, + "learning_rate": 1.5348070583672678e-05, + "loss": 5.3137, + "step": 7148 + }, + { + "epoch": 0.69, + "grad_norm": 1.0413585901260376, + "learning_rate": 1.5328679464805122e-05, + "loss": 5.3889, + "step": 7152 + }, + { + "epoch": 0.69, + "grad_norm": 1.0932857990264893, + "learning_rate": 1.5309288345937563e-05, + "loss": 5.4341, + "step": 7156 + }, + { + "epoch": 0.69, + "grad_norm": 1.06010901927948, + "learning_rate": 1.5289897227070003e-05, + "loss": 5.3442, + "step": 7160 + }, + { + "epoch": 0.69, + "grad_norm": 1.0883803367614746, + "learning_rate": 1.5270506108202444e-05, + "loss": 5.3341, + "step": 7164 + }, + { + "epoch": 0.69, + "grad_norm": 1.0533767938613892, + "learning_rate": 1.5251114989334886e-05, + "loss": 5.2995, + "step": 7168 + }, + { + "epoch": 0.7, + "grad_norm": 0.9598619937896729, + "learning_rate": 1.5231723870467326e-05, + "loss": 5.3465, + "step": 7172 + }, + { + "epoch": 0.7, + "grad_norm": 1.0123393535614014, + "learning_rate": 1.5212332751599767e-05, + "loss": 5.2892, + "step": 7176 + }, + { + "epoch": 0.7, + "grad_norm": 1.0960266590118408, + "learning_rate": 1.5192941632732211e-05, + "loss": 5.332, + "step": 7180 + }, + { + "epoch": 0.7, + "grad_norm": 1.1648544073104858, + "learning_rate": 1.5173550513864651e-05, + "loss": 5.4285, + "step": 7184 + }, + { + "epoch": 0.7, + "grad_norm": 1.051171064376831, + "learning_rate": 1.5154159394997092e-05, + "loss": 5.4299, + "step": 7188 + }, + { + "epoch": 0.7, + "grad_norm": 1.0466375350952148, + "learning_rate": 1.5134768276129533e-05, + "loss": 5.3216, + "step": 7192 + }, + { + "epoch": 0.7, + "grad_norm": 0.9910897612571716, + "learning_rate": 1.5115377157261976e-05, + "loss": 5.3598, + "step": 7196 + }, + { + "epoch": 0.7, + "grad_norm": 1.1121070384979248, + "learning_rate": 1.5095986038394417e-05, + "loss": 5.3169, + "step": 7200 + }, + { + "epoch": 0.7, + "grad_norm": 1.0514254570007324, + "learning_rate": 1.5076594919526858e-05, + "loss": 5.3502, + "step": 7204 + }, + { + "epoch": 0.7, + "grad_norm": 1.0363171100616455, + "learning_rate": 1.5057203800659298e-05, + "loss": 5.3192, + "step": 7208 + }, + { + "epoch": 0.7, + "grad_norm": 1.0386378765106201, + "learning_rate": 1.503781268179174e-05, + "loss": 5.3244, + "step": 7212 + }, + { + "epoch": 0.7, + "grad_norm": 1.0868582725524902, + "learning_rate": 1.5018421562924181e-05, + "loss": 5.2998, + "step": 7216 + }, + { + "epoch": 0.7, + "grad_norm": 1.106695532798767, + "learning_rate": 1.4999030444056621e-05, + "loss": 5.2703, + "step": 7220 + }, + { + "epoch": 0.7, + "grad_norm": 1.0727300643920898, + "learning_rate": 1.4979639325189065e-05, + "loss": 5.2557, + "step": 7224 + }, + { + "epoch": 0.7, + "grad_norm": 1.0959160327911377, + "learning_rate": 1.4960248206321506e-05, + "loss": 5.2666, + "step": 7228 + }, + { + "epoch": 0.7, + "grad_norm": 1.0157089233398438, + "learning_rate": 1.4940857087453946e-05, + "loss": 5.3451, + "step": 7232 + }, + { + "epoch": 0.7, + "grad_norm": 1.1046866178512573, + "learning_rate": 1.4921465968586387e-05, + "loss": 5.2349, + "step": 7236 + }, + { + "epoch": 0.7, + "grad_norm": 1.0781642198562622, + "learning_rate": 1.4902074849718831e-05, + "loss": 5.2609, + "step": 7240 + }, + { + "epoch": 0.7, + "grad_norm": 1.0019387006759644, + "learning_rate": 1.4882683730851271e-05, + "loss": 5.3954, + "step": 7244 + }, + { + "epoch": 0.7, + "grad_norm": 1.1083266735076904, + "learning_rate": 1.4863292611983712e-05, + "loss": 5.2636, + "step": 7248 + }, + { + "epoch": 0.7, + "grad_norm": 1.2309002876281738, + "learning_rate": 1.4843901493116152e-05, + "loss": 5.2955, + "step": 7252 + }, + { + "epoch": 0.7, + "grad_norm": 1.087774634361267, + "learning_rate": 1.4824510374248596e-05, + "loss": 5.3, + "step": 7256 + }, + { + "epoch": 0.7, + "grad_norm": 1.075287938117981, + "learning_rate": 1.4805119255381037e-05, + "loss": 5.3727, + "step": 7260 + }, + { + "epoch": 0.7, + "grad_norm": 1.0246081352233887, + "learning_rate": 1.4785728136513477e-05, + "loss": 5.3558, + "step": 7264 + }, + { + "epoch": 0.7, + "grad_norm": 1.124543309211731, + "learning_rate": 1.4766337017645918e-05, + "loss": 5.3379, + "step": 7268 + }, + { + "epoch": 0.71, + "grad_norm": 1.047892689704895, + "learning_rate": 1.474694589877836e-05, + "loss": 5.3975, + "step": 7272 + }, + { + "epoch": 0.71, + "grad_norm": 1.0381947755813599, + "learning_rate": 1.47275547799108e-05, + "loss": 5.3515, + "step": 7276 + }, + { + "epoch": 0.71, + "grad_norm": 1.0230307579040527, + "learning_rate": 1.4708163661043243e-05, + "loss": 5.2925, + "step": 7280 + }, + { + "epoch": 0.71, + "grad_norm": 1.0596458911895752, + "learning_rate": 1.4688772542175685e-05, + "loss": 5.3191, + "step": 7284 + }, + { + "epoch": 0.71, + "grad_norm": 1.1031346321105957, + "learning_rate": 1.4669381423308126e-05, + "loss": 5.402, + "step": 7288 + }, + { + "epoch": 0.71, + "grad_norm": 1.0289580821990967, + "learning_rate": 1.4649990304440566e-05, + "loss": 5.3957, + "step": 7292 + }, + { + "epoch": 0.71, + "grad_norm": 1.1469511985778809, + "learning_rate": 1.4630599185573007e-05, + "loss": 5.3244, + "step": 7296 + }, + { + "epoch": 0.71, + "grad_norm": 1.0669410228729248, + "learning_rate": 1.461120806670545e-05, + "loss": 5.431, + "step": 7300 + }, + { + "epoch": 0.71, + "grad_norm": 1.05574631690979, + "learning_rate": 1.4591816947837891e-05, + "loss": 5.3382, + "step": 7304 + }, + { + "epoch": 0.71, + "grad_norm": 1.0296452045440674, + "learning_rate": 1.4572425828970332e-05, + "loss": 5.337, + "step": 7308 + }, + { + "epoch": 0.71, + "grad_norm": 1.0180591344833374, + "learning_rate": 1.4553034710102772e-05, + "loss": 5.3239, + "step": 7312 + }, + { + "epoch": 0.71, + "grad_norm": 1.0508371591567993, + "learning_rate": 1.4533643591235216e-05, + "loss": 5.2944, + "step": 7316 + }, + { + "epoch": 0.71, + "grad_norm": 1.0255225896835327, + "learning_rate": 1.4514252472367657e-05, + "loss": 5.3574, + "step": 7320 + }, + { + "epoch": 0.71, + "grad_norm": 1.0599967241287231, + "learning_rate": 1.4494861353500097e-05, + "loss": 5.3568, + "step": 7324 + }, + { + "epoch": 0.71, + "grad_norm": 0.9832557439804077, + "learning_rate": 1.4475470234632538e-05, + "loss": 5.2891, + "step": 7328 + }, + { + "epoch": 0.71, + "grad_norm": 1.0541869401931763, + "learning_rate": 1.4456079115764982e-05, + "loss": 5.2502, + "step": 7332 + }, + { + "epoch": 0.71, + "grad_norm": 1.069973111152649, + "learning_rate": 1.4436687996897422e-05, + "loss": 5.3376, + "step": 7336 + }, + { + "epoch": 0.71, + "grad_norm": 1.0768502950668335, + "learning_rate": 1.4417296878029863e-05, + "loss": 5.2392, + "step": 7340 + }, + { + "epoch": 0.71, + "grad_norm": 1.000628113746643, + "learning_rate": 1.4397905759162305e-05, + "loss": 5.3293, + "step": 7344 + }, + { + "epoch": 0.71, + "grad_norm": 0.9960778951644897, + "learning_rate": 1.4378514640294746e-05, + "loss": 5.2655, + "step": 7348 + }, + { + "epoch": 0.71, + "grad_norm": 1.0327279567718506, + "learning_rate": 1.4359123521427186e-05, + "loss": 5.3477, + "step": 7352 + }, + { + "epoch": 0.71, + "grad_norm": 1.0809035301208496, + "learning_rate": 1.4339732402559627e-05, + "loss": 5.331, + "step": 7356 + }, + { + "epoch": 0.71, + "grad_norm": 1.0690807104110718, + "learning_rate": 1.432034128369207e-05, + "loss": 5.3986, + "step": 7360 + }, + { + "epoch": 0.71, + "grad_norm": 1.0245548486709595, + "learning_rate": 1.4300950164824511e-05, + "loss": 5.3024, + "step": 7364 + }, + { + "epoch": 0.71, + "grad_norm": 0.9999493956565857, + "learning_rate": 1.4281559045956952e-05, + "loss": 5.2878, + "step": 7368 + }, + { + "epoch": 0.71, + "grad_norm": 1.0037769079208374, + "learning_rate": 1.4262167927089392e-05, + "loss": 5.2216, + "step": 7372 + }, + { + "epoch": 0.72, + "grad_norm": 1.002113699913025, + "learning_rate": 1.4242776808221836e-05, + "loss": 5.4347, + "step": 7376 + }, + { + "epoch": 0.72, + "grad_norm": 1.04192054271698, + "learning_rate": 1.4223385689354277e-05, + "loss": 5.3812, + "step": 7380 + }, + { + "epoch": 0.72, + "grad_norm": 1.0271295309066772, + "learning_rate": 1.4203994570486717e-05, + "loss": 5.427, + "step": 7384 + }, + { + "epoch": 0.72, + "grad_norm": 1.0071477890014648, + "learning_rate": 1.4184603451619158e-05, + "loss": 5.2727, + "step": 7388 + }, + { + "epoch": 0.72, + "grad_norm": 1.0228255987167358, + "learning_rate": 1.4165212332751602e-05, + "loss": 5.3244, + "step": 7392 + }, + { + "epoch": 0.72, + "grad_norm": 1.0034810304641724, + "learning_rate": 1.4145821213884042e-05, + "loss": 5.3883, + "step": 7396 + }, + { + "epoch": 0.72, + "grad_norm": 1.0305688381195068, + "learning_rate": 1.4126430095016483e-05, + "loss": 5.3851, + "step": 7400 + }, + { + "epoch": 0.72, + "grad_norm": 1.074646234512329, + "learning_rate": 1.4107038976148925e-05, + "loss": 5.2406, + "step": 7404 + }, + { + "epoch": 0.72, + "grad_norm": 0.9961770176887512, + "learning_rate": 1.4087647857281366e-05, + "loss": 5.2848, + "step": 7408 + }, + { + "epoch": 0.72, + "grad_norm": 1.034627079963684, + "learning_rate": 1.4068256738413808e-05, + "loss": 5.2984, + "step": 7412 + }, + { + "epoch": 0.72, + "grad_norm": 1.072096824645996, + "learning_rate": 1.4048865619546248e-05, + "loss": 5.3182, + "step": 7416 + }, + { + "epoch": 0.72, + "grad_norm": 1.0899096727371216, + "learning_rate": 1.402947450067869e-05, + "loss": 5.3017, + "step": 7420 + }, + { + "epoch": 0.72, + "grad_norm": 1.1063376665115356, + "learning_rate": 1.4010083381811131e-05, + "loss": 5.311, + "step": 7424 + }, + { + "epoch": 0.72, + "grad_norm": 1.0748202800750732, + "learning_rate": 1.3990692262943572e-05, + "loss": 5.318, + "step": 7428 + }, + { + "epoch": 0.72, + "grad_norm": 1.0851057767868042, + "learning_rate": 1.3971301144076012e-05, + "loss": 5.3425, + "step": 7432 + }, + { + "epoch": 0.72, + "grad_norm": 0.9942495822906494, + "learning_rate": 1.3951910025208456e-05, + "loss": 5.3258, + "step": 7436 + }, + { + "epoch": 0.72, + "grad_norm": 1.0894228219985962, + "learning_rate": 1.3932518906340897e-05, + "loss": 5.2769, + "step": 7440 + }, + { + "epoch": 0.72, + "grad_norm": 0.9603523015975952, + "learning_rate": 1.3913127787473337e-05, + "loss": 5.4087, + "step": 7444 + }, + { + "epoch": 0.72, + "grad_norm": 1.1049365997314453, + "learning_rate": 1.3893736668605778e-05, + "loss": 5.339, + "step": 7448 + }, + { + "epoch": 0.72, + "grad_norm": 1.021468162536621, + "learning_rate": 1.3874345549738222e-05, + "loss": 5.3447, + "step": 7452 + }, + { + "epoch": 0.72, + "grad_norm": 1.088437557220459, + "learning_rate": 1.3854954430870662e-05, + "loss": 5.3546, + "step": 7456 + }, + { + "epoch": 0.72, + "grad_norm": 1.0266107320785522, + "learning_rate": 1.3835563312003103e-05, + "loss": 5.1867, + "step": 7460 + }, + { + "epoch": 0.72, + "grad_norm": 1.088911533355713, + "learning_rate": 1.3816172193135547e-05, + "loss": 5.3247, + "step": 7464 + }, + { + "epoch": 0.72, + "grad_norm": 1.0675928592681885, + "learning_rate": 1.3796781074267987e-05, + "loss": 5.2807, + "step": 7468 + }, + { + "epoch": 0.72, + "grad_norm": 0.9511438608169556, + "learning_rate": 1.3777389955400428e-05, + "loss": 5.3774, + "step": 7472 + }, + { + "epoch": 0.72, + "grad_norm": 1.0546114444732666, + "learning_rate": 1.3757998836532868e-05, + "loss": 5.2856, + "step": 7476 + }, + { + "epoch": 0.73, + "grad_norm": 1.0445231199264526, + "learning_rate": 1.373860771766531e-05, + "loss": 5.3805, + "step": 7480 + }, + { + "epoch": 0.73, + "grad_norm": 1.0493693351745605, + "learning_rate": 1.3719216598797751e-05, + "loss": 5.3603, + "step": 7484 + }, + { + "epoch": 0.73, + "grad_norm": 1.0047773122787476, + "learning_rate": 1.3699825479930192e-05, + "loss": 5.3224, + "step": 7488 + }, + { + "epoch": 0.73, + "grad_norm": 1.0536508560180664, + "learning_rate": 1.3680434361062632e-05, + "loss": 5.4235, + "step": 7492 + }, + { + "epoch": 0.73, + "grad_norm": 1.0347819328308105, + "learning_rate": 1.3661043242195076e-05, + "loss": 5.2528, + "step": 7496 + }, + { + "epoch": 0.73, + "grad_norm": 1.0392394065856934, + "learning_rate": 1.3641652123327517e-05, + "loss": 5.4127, + "step": 7500 + }, + { + "epoch": 0.73, + "grad_norm": 1.0930792093276978, + "learning_rate": 1.3622261004459957e-05, + "loss": 5.3221, + "step": 7504 + }, + { + "epoch": 0.73, + "grad_norm": 0.9962918758392334, + "learning_rate": 1.3602869885592398e-05, + "loss": 5.3108, + "step": 7508 + }, + { + "epoch": 0.73, + "grad_norm": 1.030331015586853, + "learning_rate": 1.3583478766724842e-05, + "loss": 5.2632, + "step": 7512 + }, + { + "epoch": 0.73, + "grad_norm": 1.008636236190796, + "learning_rate": 1.3564087647857282e-05, + "loss": 5.3089, + "step": 7516 + }, + { + "epoch": 0.73, + "grad_norm": 1.006934642791748, + "learning_rate": 1.3544696528989723e-05, + "loss": 5.2903, + "step": 7520 + }, + { + "epoch": 0.73, + "grad_norm": 1.0194462537765503, + "learning_rate": 1.3525305410122167e-05, + "loss": 5.3827, + "step": 7524 + }, + { + "epoch": 0.73, + "grad_norm": 0.9879323840141296, + "learning_rate": 1.3505914291254607e-05, + "loss": 5.4307, + "step": 7528 + }, + { + "epoch": 0.73, + "grad_norm": 1.0651185512542725, + "learning_rate": 1.3486523172387048e-05, + "loss": 5.3134, + "step": 7532 + }, + { + "epoch": 0.73, + "grad_norm": 1.0858125686645508, + "learning_rate": 1.3467132053519488e-05, + "loss": 5.3024, + "step": 7536 + }, + { + "epoch": 0.73, + "grad_norm": 1.0240780115127563, + "learning_rate": 1.344774093465193e-05, + "loss": 5.3555, + "step": 7540 + }, + { + "epoch": 0.73, + "grad_norm": 1.0629985332489014, + "learning_rate": 1.3428349815784373e-05, + "loss": 5.3652, + "step": 7544 + }, + { + "epoch": 0.73, + "grad_norm": 1.1222716569900513, + "learning_rate": 1.3408958696916813e-05, + "loss": 5.3845, + "step": 7548 + }, + { + "epoch": 0.73, + "grad_norm": 1.0008291006088257, + "learning_rate": 1.3389567578049254e-05, + "loss": 5.3552, + "step": 7552 + }, + { + "epoch": 0.73, + "grad_norm": 1.0860753059387207, + "learning_rate": 1.3370176459181696e-05, + "loss": 5.3044, + "step": 7556 + }, + { + "epoch": 0.73, + "grad_norm": 1.0495448112487793, + "learning_rate": 1.3350785340314136e-05, + "loss": 5.3259, + "step": 7560 + }, + { + "epoch": 0.73, + "grad_norm": 1.0432664155960083, + "learning_rate": 1.3331394221446577e-05, + "loss": 5.3482, + "step": 7564 + }, + { + "epoch": 0.73, + "grad_norm": 1.0440526008605957, + "learning_rate": 1.3312003102579018e-05, + "loss": 5.2362, + "step": 7568 + }, + { + "epoch": 0.73, + "grad_norm": 1.0317118167877197, + "learning_rate": 1.3292611983711461e-05, + "loss": 5.3579, + "step": 7572 + }, + { + "epoch": 0.73, + "grad_norm": 0.9613714218139648, + "learning_rate": 1.3273220864843902e-05, + "loss": 5.3631, + "step": 7576 + }, + { + "epoch": 0.73, + "grad_norm": 1.1608860492706299, + "learning_rate": 1.3253829745976343e-05, + "loss": 5.3547, + "step": 7580 + }, + { + "epoch": 0.74, + "grad_norm": 1.06599760055542, + "learning_rate": 1.3234438627108786e-05, + "loss": 5.3728, + "step": 7584 + }, + { + "epoch": 0.74, + "grad_norm": 1.0115044116973877, + "learning_rate": 1.3215047508241227e-05, + "loss": 5.2954, + "step": 7588 + }, + { + "epoch": 0.74, + "grad_norm": 1.0504167079925537, + "learning_rate": 1.3195656389373668e-05, + "loss": 5.3378, + "step": 7592 + }, + { + "epoch": 0.74, + "grad_norm": 1.1047917604446411, + "learning_rate": 1.3176265270506108e-05, + "loss": 5.3153, + "step": 7596 + }, + { + "epoch": 0.74, + "grad_norm": 1.0862175226211548, + "learning_rate": 1.3156874151638552e-05, + "loss": 5.4315, + "step": 7600 + }, + { + "epoch": 0.74, + "grad_norm": 1.05397629737854, + "learning_rate": 1.3137483032770993e-05, + "loss": 5.3807, + "step": 7604 + }, + { + "epoch": 0.74, + "grad_norm": 1.0143108367919922, + "learning_rate": 1.3118091913903433e-05, + "loss": 5.346, + "step": 7608 + }, + { + "epoch": 0.74, + "grad_norm": 1.0487464666366577, + "learning_rate": 1.3098700795035874e-05, + "loss": 5.2966, + "step": 7612 + }, + { + "epoch": 0.74, + "grad_norm": 1.156467318534851, + "learning_rate": 1.3079309676168316e-05, + "loss": 5.24, + "step": 7616 + }, + { + "epoch": 0.74, + "grad_norm": 1.0586912631988525, + "learning_rate": 1.3059918557300756e-05, + "loss": 5.3162, + "step": 7620 + }, + { + "epoch": 0.74, + "grad_norm": 1.0472930669784546, + "learning_rate": 1.3040527438433197e-05, + "loss": 5.4309, + "step": 7624 + }, + { + "epoch": 0.74, + "grad_norm": 1.0239797830581665, + "learning_rate": 1.3021136319565637e-05, + "loss": 5.3124, + "step": 7628 + }, + { + "epoch": 0.74, + "grad_norm": 1.084915041923523, + "learning_rate": 1.3001745200698081e-05, + "loss": 5.382, + "step": 7632 + }, + { + "epoch": 0.74, + "grad_norm": 1.121639370918274, + "learning_rate": 1.2982354081830522e-05, + "loss": 5.2888, + "step": 7636 + }, + { + "epoch": 0.74, + "grad_norm": 1.06790030002594, + "learning_rate": 1.2962962962962962e-05, + "loss": 5.3484, + "step": 7640 + }, + { + "epoch": 0.74, + "grad_norm": 1.0664012432098389, + "learning_rate": 1.2943571844095406e-05, + "loss": 5.3624, + "step": 7644 + }, + { + "epoch": 0.74, + "grad_norm": 1.032593846321106, + "learning_rate": 1.2924180725227847e-05, + "loss": 5.3109, + "step": 7648 + }, + { + "epoch": 0.74, + "grad_norm": 1.0182029008865356, + "learning_rate": 1.2904789606360287e-05, + "loss": 5.3785, + "step": 7652 + }, + { + "epoch": 0.74, + "grad_norm": 0.9787065982818604, + "learning_rate": 1.2885398487492728e-05, + "loss": 5.2586, + "step": 7656 + }, + { + "epoch": 0.74, + "grad_norm": 1.14923095703125, + "learning_rate": 1.2866007368625172e-05, + "loss": 5.3378, + "step": 7660 + }, + { + "epoch": 0.74, + "grad_norm": 1.0064685344696045, + "learning_rate": 1.2846616249757612e-05, + "loss": 5.3339, + "step": 7664 + }, + { + "epoch": 0.74, + "grad_norm": 1.015594720840454, + "learning_rate": 1.2827225130890053e-05, + "loss": 5.2412, + "step": 7668 + }, + { + "epoch": 0.74, + "grad_norm": 1.1527953147888184, + "learning_rate": 1.2807834012022494e-05, + "loss": 5.4026, + "step": 7672 + }, + { + "epoch": 0.74, + "grad_norm": 1.024170994758606, + "learning_rate": 1.2788442893154936e-05, + "loss": 5.2431, + "step": 7676 + }, + { + "epoch": 0.74, + "grad_norm": 1.0834672451019287, + "learning_rate": 1.2769051774287378e-05, + "loss": 5.3018, + "step": 7680 + }, + { + "epoch": 0.75, + "grad_norm": 1.00787353515625, + "learning_rate": 1.2749660655419819e-05, + "loss": 5.3129, + "step": 7684 + }, + { + "epoch": 0.75, + "grad_norm": 1.0804412364959717, + "learning_rate": 1.2730269536552259e-05, + "loss": 5.3974, + "step": 7688 + }, + { + "epoch": 0.75, + "grad_norm": 1.0361813306808472, + "learning_rate": 1.2710878417684701e-05, + "loss": 5.291, + "step": 7692 + }, + { + "epoch": 0.75, + "grad_norm": 0.9639879465103149, + "learning_rate": 1.2691487298817142e-05, + "loss": 5.3649, + "step": 7696 + }, + { + "epoch": 0.75, + "grad_norm": 1.035768985748291, + "learning_rate": 1.2672096179949582e-05, + "loss": 5.3159, + "step": 7700 + }, + { + "epoch": 0.75, + "grad_norm": 1.002968430519104, + "learning_rate": 1.2652705061082026e-05, + "loss": 5.1975, + "step": 7704 + }, + { + "epoch": 0.75, + "grad_norm": 1.0666626691818237, + "learning_rate": 1.2633313942214467e-05, + "loss": 5.2216, + "step": 7708 + }, + { + "epoch": 0.75, + "grad_norm": 1.031752347946167, + "learning_rate": 1.2613922823346907e-05, + "loss": 5.4174, + "step": 7712 + }, + { + "epoch": 0.75, + "grad_norm": 1.0436227321624756, + "learning_rate": 1.2594531704479348e-05, + "loss": 5.2295, + "step": 7716 + }, + { + "epoch": 0.75, + "grad_norm": 1.0823688507080078, + "learning_rate": 1.2575140585611792e-05, + "loss": 5.3274, + "step": 7720 + }, + { + "epoch": 0.75, + "grad_norm": 1.0667200088500977, + "learning_rate": 1.2555749466744232e-05, + "loss": 5.3149, + "step": 7724 + }, + { + "epoch": 0.75, + "grad_norm": 1.0479573011398315, + "learning_rate": 1.2536358347876673e-05, + "loss": 5.2653, + "step": 7728 + }, + { + "epoch": 0.75, + "grad_norm": 1.1973553895950317, + "learning_rate": 1.2516967229009113e-05, + "loss": 5.2504, + "step": 7732 + }, + { + "epoch": 0.75, + "grad_norm": 1.0791184902191162, + "learning_rate": 1.2497576110141556e-05, + "loss": 5.2797, + "step": 7736 + }, + { + "epoch": 0.75, + "grad_norm": 1.091112732887268, + "learning_rate": 1.2478184991273998e-05, + "loss": 5.3129, + "step": 7740 + }, + { + "epoch": 0.75, + "grad_norm": 1.0607527494430542, + "learning_rate": 1.2458793872406438e-05, + "loss": 5.2997, + "step": 7744 + }, + { + "epoch": 0.75, + "grad_norm": 1.0152305364608765, + "learning_rate": 1.243940275353888e-05, + "loss": 5.3655, + "step": 7748 + }, + { + "epoch": 0.75, + "grad_norm": 1.009064793586731, + "learning_rate": 1.2420011634671321e-05, + "loss": 5.3058, + "step": 7752 + }, + { + "epoch": 0.75, + "grad_norm": 1.0338549613952637, + "learning_rate": 1.2400620515803762e-05, + "loss": 5.2527, + "step": 7756 + }, + { + "epoch": 0.75, + "grad_norm": 1.0737717151641846, + "learning_rate": 1.2381229396936204e-05, + "loss": 5.3859, + "step": 7760 + }, + { + "epoch": 0.75, + "grad_norm": 1.0524028539657593, + "learning_rate": 1.2361838278068645e-05, + "loss": 5.407, + "step": 7764 + }, + { + "epoch": 0.75, + "grad_norm": 1.0182230472564697, + "learning_rate": 1.2342447159201087e-05, + "loss": 5.3243, + "step": 7768 + }, + { + "epoch": 0.75, + "grad_norm": 1.042325496673584, + "learning_rate": 1.2323056040333527e-05, + "loss": 5.372, + "step": 7772 + }, + { + "epoch": 0.75, + "grad_norm": 1.099138855934143, + "learning_rate": 1.230366492146597e-05, + "loss": 5.3577, + "step": 7776 + }, + { + "epoch": 0.75, + "grad_norm": 1.007309913635254, + "learning_rate": 1.228427380259841e-05, + "loss": 5.252, + "step": 7780 + }, + { + "epoch": 0.75, + "grad_norm": 1.1510696411132812, + "learning_rate": 1.2264882683730852e-05, + "loss": 5.3726, + "step": 7784 + }, + { + "epoch": 0.76, + "grad_norm": 1.0110242366790771, + "learning_rate": 1.2245491564863293e-05, + "loss": 5.3932, + "step": 7788 + }, + { + "epoch": 0.76, + "grad_norm": 1.0620808601379395, + "learning_rate": 1.2226100445995735e-05, + "loss": 5.2691, + "step": 7792 + }, + { + "epoch": 0.76, + "grad_norm": 1.052786946296692, + "learning_rate": 1.2206709327128176e-05, + "loss": 5.3597, + "step": 7796 + }, + { + "epoch": 0.76, + "grad_norm": 1.1259844303131104, + "learning_rate": 1.2187318208260618e-05, + "loss": 5.3324, + "step": 7800 + }, + { + "epoch": 0.76, + "grad_norm": 1.0244724750518799, + "learning_rate": 1.2167927089393058e-05, + "loss": 5.3643, + "step": 7804 + }, + { + "epoch": 0.76, + "grad_norm": 1.1225011348724365, + "learning_rate": 1.21485359705255e-05, + "loss": 5.3501, + "step": 7808 + }, + { + "epoch": 0.76, + "grad_norm": 0.9966182112693787, + "learning_rate": 1.2129144851657941e-05, + "loss": 5.3966, + "step": 7812 + }, + { + "epoch": 0.76, + "grad_norm": 1.102308988571167, + "learning_rate": 1.2109753732790383e-05, + "loss": 5.2625, + "step": 7816 + }, + { + "epoch": 0.76, + "grad_norm": 1.0900803804397583, + "learning_rate": 1.2095210393639714e-05, + "loss": 5.365, + "step": 7820 + }, + { + "epoch": 0.76, + "grad_norm": 1.1339807510375977, + "learning_rate": 1.2075819274772154e-05, + "loss": 5.3816, + "step": 7824 + }, + { + "epoch": 0.76, + "grad_norm": 1.077379822731018, + "learning_rate": 1.2056428155904597e-05, + "loss": 5.1332, + "step": 7828 + }, + { + "epoch": 0.76, + "grad_norm": 1.1201417446136475, + "learning_rate": 1.2037037037037037e-05, + "loss": 5.3037, + "step": 7832 + }, + { + "epoch": 0.76, + "grad_norm": 1.1047320365905762, + "learning_rate": 1.201764591816948e-05, + "loss": 5.4453, + "step": 7836 + }, + { + "epoch": 0.76, + "grad_norm": 1.0625344514846802, + "learning_rate": 1.1998254799301922e-05, + "loss": 5.2676, + "step": 7840 + }, + { + "epoch": 0.76, + "grad_norm": 1.0774505138397217, + "learning_rate": 1.1978863680434362e-05, + "loss": 5.3222, + "step": 7844 + }, + { + "epoch": 0.76, + "grad_norm": 1.0557003021240234, + "learning_rate": 1.1959472561566804e-05, + "loss": 5.3235, + "step": 7848 + }, + { + "epoch": 0.76, + "grad_norm": 0.9856312274932861, + "learning_rate": 1.1940081442699245e-05, + "loss": 5.2947, + "step": 7852 + }, + { + "epoch": 0.76, + "grad_norm": 1.0185786485671997, + "learning_rate": 1.1920690323831685e-05, + "loss": 5.3262, + "step": 7856 + }, + { + "epoch": 0.76, + "grad_norm": 1.0777360200881958, + "learning_rate": 1.1901299204964126e-05, + "loss": 5.254, + "step": 7860 + }, + { + "epoch": 0.76, + "grad_norm": 1.0238205194473267, + "learning_rate": 1.1881908086096568e-05, + "loss": 5.356, + "step": 7864 + }, + { + "epoch": 0.76, + "grad_norm": 1.0025471448898315, + "learning_rate": 1.1862516967229009e-05, + "loss": 5.2541, + "step": 7868 + }, + { + "epoch": 0.76, + "grad_norm": 1.030316710472107, + "learning_rate": 1.1843125848361451e-05, + "loss": 5.2578, + "step": 7872 + }, + { + "epoch": 0.76, + "grad_norm": 1.091535210609436, + "learning_rate": 1.1823734729493891e-05, + "loss": 5.3234, + "step": 7876 + }, + { + "epoch": 0.76, + "grad_norm": 1.0827471017837524, + "learning_rate": 1.1804343610626334e-05, + "loss": 5.2876, + "step": 7880 + }, + { + "epoch": 0.76, + "grad_norm": 1.0991337299346924, + "learning_rate": 1.1784952491758774e-05, + "loss": 5.473, + "step": 7884 + }, + { + "epoch": 0.76, + "grad_norm": 1.0421675443649292, + "learning_rate": 1.1765561372891216e-05, + "loss": 5.3059, + "step": 7888 + }, + { + "epoch": 0.77, + "grad_norm": 1.1006362438201904, + "learning_rate": 1.1746170254023657e-05, + "loss": 5.2836, + "step": 7892 + }, + { + "epoch": 0.77, + "grad_norm": 1.0210435390472412, + "learning_rate": 1.17267791351561e-05, + "loss": 5.3632, + "step": 7896 + }, + { + "epoch": 0.77, + "grad_norm": 0.9662442207336426, + "learning_rate": 1.1707388016288541e-05, + "loss": 5.3246, + "step": 7900 + }, + { + "epoch": 0.77, + "grad_norm": 1.1299954652786255, + "learning_rate": 1.1687996897420982e-05, + "loss": 5.2499, + "step": 7904 + }, + { + "epoch": 0.77, + "grad_norm": 1.0942116975784302, + "learning_rate": 1.1668605778553424e-05, + "loss": 5.2834, + "step": 7908 + }, + { + "epoch": 0.77, + "grad_norm": 1.0262593030929565, + "learning_rate": 1.1649214659685865e-05, + "loss": 5.3228, + "step": 7912 + }, + { + "epoch": 0.77, + "grad_norm": 1.0624995231628418, + "learning_rate": 1.1629823540818307e-05, + "loss": 5.2603, + "step": 7916 + }, + { + "epoch": 0.77, + "grad_norm": 1.0851109027862549, + "learning_rate": 1.1610432421950747e-05, + "loss": 5.2343, + "step": 7920 + }, + { + "epoch": 0.77, + "grad_norm": 1.0987358093261719, + "learning_rate": 1.1591041303083188e-05, + "loss": 5.2615, + "step": 7924 + }, + { + "epoch": 0.77, + "grad_norm": 1.0852704048156738, + "learning_rate": 1.1571650184215629e-05, + "loss": 5.2886, + "step": 7928 + }, + { + "epoch": 0.77, + "grad_norm": 1.0782544612884521, + "learning_rate": 1.155225906534807e-05, + "loss": 5.3585, + "step": 7932 + }, + { + "epoch": 0.77, + "grad_norm": 1.0928773880004883, + "learning_rate": 1.1532867946480511e-05, + "loss": 5.2083, + "step": 7936 + }, + { + "epoch": 0.77, + "grad_norm": 0.9779551029205322, + "learning_rate": 1.1513476827612954e-05, + "loss": 5.2969, + "step": 7940 + }, + { + "epoch": 0.77, + "grad_norm": 1.0859659910202026, + "learning_rate": 1.1494085708745394e-05, + "loss": 5.3044, + "step": 7944 + }, + { + "epoch": 0.77, + "grad_norm": 1.0626839399337769, + "learning_rate": 1.1474694589877836e-05, + "loss": 5.3455, + "step": 7948 + }, + { + "epoch": 0.77, + "grad_norm": 1.1160836219787598, + "learning_rate": 1.1455303471010277e-05, + "loss": 5.3082, + "step": 7952 + }, + { + "epoch": 0.77, + "grad_norm": 1.089357614517212, + "learning_rate": 1.1435912352142719e-05, + "loss": 5.3013, + "step": 7956 + }, + { + "epoch": 0.77, + "grad_norm": 0.9616773128509521, + "learning_rate": 1.1416521233275161e-05, + "loss": 5.2995, + "step": 7960 + }, + { + "epoch": 0.77, + "grad_norm": 1.0657833814620972, + "learning_rate": 1.1397130114407602e-05, + "loss": 5.3208, + "step": 7964 + }, + { + "epoch": 0.77, + "grad_norm": 1.0845454931259155, + "learning_rate": 1.1377738995540044e-05, + "loss": 5.3213, + "step": 7968 + }, + { + "epoch": 0.77, + "grad_norm": 1.0332484245300293, + "learning_rate": 1.1358347876672485e-05, + "loss": 5.2833, + "step": 7972 + }, + { + "epoch": 0.77, + "grad_norm": 1.0877047777175903, + "learning_rate": 1.1338956757804927e-05, + "loss": 5.2889, + "step": 7976 + }, + { + "epoch": 0.77, + "grad_norm": 1.0782090425491333, + "learning_rate": 1.1319565638937367e-05, + "loss": 5.2914, + "step": 7980 + }, + { + "epoch": 0.77, + "grad_norm": 1.0773468017578125, + "learning_rate": 1.130017452006981e-05, + "loss": 5.2853, + "step": 7984 + }, + { + "epoch": 0.77, + "grad_norm": 1.1031116247177124, + "learning_rate": 1.128078340120225e-05, + "loss": 5.2285, + "step": 7988 + }, + { + "epoch": 0.77, + "grad_norm": 1.0579017400741577, + "learning_rate": 1.126139228233469e-05, + "loss": 5.2608, + "step": 7992 + }, + { + "epoch": 0.78, + "grad_norm": 1.0039610862731934, + "learning_rate": 1.1242001163467133e-05, + "loss": 5.2801, + "step": 7996 + }, + { + "epoch": 0.78, + "grad_norm": 1.1101170778274536, + "learning_rate": 1.1222610044599573e-05, + "loss": 5.3003, + "step": 8000 + }, + { + "epoch": 0.78, + "grad_norm": 1.1231920719146729, + "learning_rate": 1.1203218925732014e-05, + "loss": 5.2951, + "step": 8004 + }, + { + "epoch": 0.78, + "grad_norm": 1.104062557220459, + "learning_rate": 1.1183827806864456e-05, + "loss": 5.2419, + "step": 8008 + }, + { + "epoch": 0.78, + "grad_norm": 1.0631533861160278, + "learning_rate": 1.1164436687996897e-05, + "loss": 5.2927, + "step": 8012 + }, + { + "epoch": 0.78, + "grad_norm": 1.0923023223876953, + "learning_rate": 1.1145045569129339e-05, + "loss": 5.3566, + "step": 8016 + }, + { + "epoch": 0.78, + "grad_norm": 1.1646323204040527, + "learning_rate": 1.1125654450261781e-05, + "loss": 5.229, + "step": 8020 + }, + { + "epoch": 0.78, + "grad_norm": 1.0775333642959595, + "learning_rate": 1.1106263331394222e-05, + "loss": 5.3121, + "step": 8024 + }, + { + "epoch": 0.78, + "grad_norm": 1.101682186126709, + "learning_rate": 1.1086872212526664e-05, + "loss": 5.3746, + "step": 8028 + }, + { + "epoch": 0.78, + "grad_norm": 1.06061589717865, + "learning_rate": 1.1067481093659105e-05, + "loss": 5.2878, + "step": 8032 + }, + { + "epoch": 0.78, + "grad_norm": 1.0500963926315308, + "learning_rate": 1.1048089974791547e-05, + "loss": 5.3188, + "step": 8036 + }, + { + "epoch": 0.78, + "grad_norm": 1.1280819177627563, + "learning_rate": 1.1028698855923987e-05, + "loss": 5.2829, + "step": 8040 + }, + { + "epoch": 0.78, + "grad_norm": 1.0025454759597778, + "learning_rate": 1.100930773705643e-05, + "loss": 5.3121, + "step": 8044 + }, + { + "epoch": 0.78, + "grad_norm": 1.0445626974105835, + "learning_rate": 1.098991661818887e-05, + "loss": 5.3264, + "step": 8048 + }, + { + "epoch": 0.78, + "grad_norm": 1.0949641466140747, + "learning_rate": 1.0970525499321312e-05, + "loss": 5.3306, + "step": 8052 + }, + { + "epoch": 0.78, + "grad_norm": 1.044668197631836, + "learning_rate": 1.0951134380453753e-05, + "loss": 5.3335, + "step": 8056 + }, + { + "epoch": 0.78, + "grad_norm": 1.079334020614624, + "learning_rate": 1.0931743261586193e-05, + "loss": 5.3383, + "step": 8060 + }, + { + "epoch": 0.78, + "grad_norm": 1.0334477424621582, + "learning_rate": 1.0912352142718636e-05, + "loss": 5.3128, + "step": 8064 + }, + { + "epoch": 0.78, + "grad_norm": 0.9899519681930542, + "learning_rate": 1.0892961023851076e-05, + "loss": 5.3245, + "step": 8068 + }, + { + "epoch": 0.78, + "grad_norm": 1.0130146741867065, + "learning_rate": 1.0873569904983517e-05, + "loss": 5.2891, + "step": 8072 + }, + { + "epoch": 0.78, + "grad_norm": 1.0125179290771484, + "learning_rate": 1.0854178786115959e-05, + "loss": 5.3007, + "step": 8076 + }, + { + "epoch": 0.78, + "grad_norm": 1.0482258796691895, + "learning_rate": 1.0834787667248401e-05, + "loss": 5.2417, + "step": 8080 + }, + { + "epoch": 0.78, + "grad_norm": 1.055640697479248, + "learning_rate": 1.0815396548380842e-05, + "loss": 5.3779, + "step": 8084 + }, + { + "epoch": 0.78, + "grad_norm": 1.0579723119735718, + "learning_rate": 1.0796005429513284e-05, + "loss": 5.232, + "step": 8088 + }, + { + "epoch": 0.78, + "grad_norm": 1.083598017692566, + "learning_rate": 1.0776614310645724e-05, + "loss": 5.4277, + "step": 8092 + }, + { + "epoch": 0.78, + "grad_norm": 0.9834340214729309, + "learning_rate": 1.0757223191778167e-05, + "loss": 5.2498, + "step": 8096 + }, + { + "epoch": 0.79, + "grad_norm": 1.0400574207305908, + "learning_rate": 1.0737832072910607e-05, + "loss": 5.3321, + "step": 8100 + }, + { + "epoch": 0.79, + "grad_norm": 1.083775281906128, + "learning_rate": 1.071844095404305e-05, + "loss": 5.1589, + "step": 8104 + }, + { + "epoch": 0.79, + "grad_norm": 1.035290241241455, + "learning_rate": 1.069904983517549e-05, + "loss": 5.4106, + "step": 8108 + }, + { + "epoch": 0.79, + "grad_norm": 1.0348436832427979, + "learning_rate": 1.0679658716307932e-05, + "loss": 5.2728, + "step": 8112 + }, + { + "epoch": 0.79, + "grad_norm": 1.058597445487976, + "learning_rate": 1.0660267597440373e-05, + "loss": 5.2081, + "step": 8116 + }, + { + "epoch": 0.79, + "grad_norm": 1.0467309951782227, + "learning_rate": 1.0640876478572815e-05, + "loss": 5.232, + "step": 8120 + }, + { + "epoch": 0.79, + "grad_norm": 0.9697101712226868, + "learning_rate": 1.0621485359705256e-05, + "loss": 5.2641, + "step": 8124 + }, + { + "epoch": 0.79, + "grad_norm": 1.0596665143966675, + "learning_rate": 1.0602094240837698e-05, + "loss": 5.2865, + "step": 8128 + }, + { + "epoch": 0.79, + "grad_norm": 1.0948309898376465, + "learning_rate": 1.0582703121970138e-05, + "loss": 5.2826, + "step": 8132 + }, + { + "epoch": 0.79, + "grad_norm": 1.0270111560821533, + "learning_rate": 1.0563312003102579e-05, + "loss": 5.2726, + "step": 8136 + }, + { + "epoch": 0.79, + "grad_norm": 1.0012414455413818, + "learning_rate": 1.0543920884235021e-05, + "loss": 5.3184, + "step": 8140 + }, + { + "epoch": 0.79, + "grad_norm": 1.019332766532898, + "learning_rate": 1.0524529765367462e-05, + "loss": 5.2908, + "step": 8144 + }, + { + "epoch": 0.79, + "grad_norm": 1.0300483703613281, + "learning_rate": 1.0505138646499904e-05, + "loss": 5.3333, + "step": 8148 + }, + { + "epoch": 0.79, + "grad_norm": 1.0536975860595703, + "learning_rate": 1.0485747527632344e-05, + "loss": 5.2944, + "step": 8152 + }, + { + "epoch": 0.79, + "grad_norm": 0.9881764650344849, + "learning_rate": 1.0466356408764787e-05, + "loss": 5.2767, + "step": 8156 + }, + { + "epoch": 0.79, + "grad_norm": 1.0163639783859253, + "learning_rate": 1.0446965289897227e-05, + "loss": 5.2723, + "step": 8160 + }, + { + "epoch": 0.79, + "grad_norm": 1.0919207334518433, + "learning_rate": 1.042757417102967e-05, + "loss": 5.3327, + "step": 8164 + }, + { + "epoch": 0.79, + "grad_norm": 1.0306917428970337, + "learning_rate": 1.040818305216211e-05, + "loss": 5.2565, + "step": 8168 + }, + { + "epoch": 0.79, + "grad_norm": 1.0125017166137695, + "learning_rate": 1.0388791933294552e-05, + "loss": 5.342, + "step": 8172 + }, + { + "epoch": 0.79, + "grad_norm": 1.0879104137420654, + "learning_rate": 1.0369400814426993e-05, + "loss": 5.3122, + "step": 8176 + }, + { + "epoch": 0.79, + "grad_norm": 0.9527262449264526, + "learning_rate": 1.0350009695559435e-05, + "loss": 5.284, + "step": 8180 + }, + { + "epoch": 0.79, + "grad_norm": 1.0486680269241333, + "learning_rate": 1.0330618576691875e-05, + "loss": 5.2428, + "step": 8184 + }, + { + "epoch": 0.79, + "grad_norm": 1.0278397798538208, + "learning_rate": 1.0311227457824318e-05, + "loss": 5.3466, + "step": 8188 + }, + { + "epoch": 0.79, + "grad_norm": 1.0635344982147217, + "learning_rate": 1.0291836338956758e-05, + "loss": 5.2645, + "step": 8192 + }, + { + "epoch": 0.79, + "grad_norm": 1.0180613994598389, + "learning_rate": 1.02724452200892e-05, + "loss": 5.3552, + "step": 8196 + }, + { + "epoch": 0.8, + "grad_norm": 1.0265159606933594, + "learning_rate": 1.0253054101221641e-05, + "loss": 5.3422, + "step": 8200 + }, + { + "epoch": 0.8, + "grad_norm": 1.029842495918274, + "learning_rate": 1.0233662982354082e-05, + "loss": 5.4009, + "step": 8204 + }, + { + "epoch": 0.8, + "grad_norm": 1.0848013162612915, + "learning_rate": 1.0214271863486524e-05, + "loss": 5.3982, + "step": 8208 + }, + { + "epoch": 0.8, + "grad_norm": 1.0277658700942993, + "learning_rate": 1.0194880744618964e-05, + "loss": 5.3567, + "step": 8212 + }, + { + "epoch": 0.8, + "grad_norm": 1.0273760557174683, + "learning_rate": 1.0175489625751407e-05, + "loss": 5.2762, + "step": 8216 + }, + { + "epoch": 0.8, + "grad_norm": 0.9951087832450867, + "learning_rate": 1.0156098506883847e-05, + "loss": 5.3267, + "step": 8220 + }, + { + "epoch": 0.8, + "grad_norm": 1.0347890853881836, + "learning_rate": 1.013670738801629e-05, + "loss": 5.2681, + "step": 8224 + }, + { + "epoch": 0.8, + "grad_norm": 1.076242446899414, + "learning_rate": 1.011731626914873e-05, + "loss": 5.3621, + "step": 8228 + }, + { + "epoch": 0.8, + "grad_norm": 1.02762770652771, + "learning_rate": 1.0097925150281172e-05, + "loss": 5.3412, + "step": 8232 + }, + { + "epoch": 0.8, + "grad_norm": 1.0394989252090454, + "learning_rate": 1.0078534031413613e-05, + "loss": 5.2555, + "step": 8236 + }, + { + "epoch": 0.8, + "grad_norm": 1.0912150144577026, + "learning_rate": 1.0059142912546055e-05, + "loss": 5.3784, + "step": 8240 + }, + { + "epoch": 0.8, + "grad_norm": 1.0681991577148438, + "learning_rate": 1.0039751793678495e-05, + "loss": 5.3226, + "step": 8244 + }, + { + "epoch": 0.8, + "grad_norm": 1.0572381019592285, + "learning_rate": 1.0020360674810938e-05, + "loss": 5.2935, + "step": 8248 + }, + { + "epoch": 0.8, + "grad_norm": 1.0432649850845337, + "learning_rate": 1.0000969555943378e-05, + "loss": 5.3087, + "step": 8252 + }, + { + "epoch": 0.8, + "grad_norm": 1.0084688663482666, + "learning_rate": 9.98157843707582e-06, + "loss": 5.2778, + "step": 8256 + }, + { + "epoch": 0.8, + "grad_norm": 1.036868691444397, + "learning_rate": 9.962187318208263e-06, + "loss": 5.31, + "step": 8260 + }, + { + "epoch": 0.8, + "grad_norm": 1.0464235544204712, + "learning_rate": 9.942796199340703e-06, + "loss": 5.3512, + "step": 8264 + }, + { + "epoch": 0.8, + "grad_norm": 1.001470685005188, + "learning_rate": 9.923405080473144e-06, + "loss": 5.2698, + "step": 8268 + }, + { + "epoch": 0.8, + "grad_norm": 1.0910736322402954, + "learning_rate": 9.904013961605584e-06, + "loss": 5.363, + "step": 8272 + }, + { + "epoch": 0.8, + "grad_norm": 1.087928056716919, + "learning_rate": 9.884622842738026e-06, + "loss": 5.3454, + "step": 8276 + }, + { + "epoch": 0.8, + "grad_norm": 1.055014967918396, + "learning_rate": 9.865231723870467e-06, + "loss": 5.3134, + "step": 8280 + }, + { + "epoch": 0.8, + "grad_norm": 1.1186180114746094, + "learning_rate": 9.84584060500291e-06, + "loss": 5.3016, + "step": 8284 + }, + { + "epoch": 0.8, + "grad_norm": 1.0159074068069458, + "learning_rate": 9.82644948613535e-06, + "loss": 5.3974, + "step": 8288 + }, + { + "epoch": 0.8, + "grad_norm": 1.1419733762741089, + "learning_rate": 9.807058367267792e-06, + "loss": 5.333, + "step": 8292 + }, + { + "epoch": 0.8, + "grad_norm": 1.078598976135254, + "learning_rate": 9.787667248400232e-06, + "loss": 5.2157, + "step": 8296 + }, + { + "epoch": 0.8, + "grad_norm": 1.0978525876998901, + "learning_rate": 9.768276129532675e-06, + "loss": 5.3084, + "step": 8300 + }, + { + "epoch": 0.81, + "grad_norm": 1.0307817459106445, + "learning_rate": 9.748885010665115e-06, + "loss": 5.2962, + "step": 8304 + }, + { + "epoch": 0.81, + "grad_norm": 0.9767160415649414, + "learning_rate": 9.729493891797557e-06, + "loss": 5.3765, + "step": 8308 + }, + { + "epoch": 0.81, + "grad_norm": 1.048744797706604, + "learning_rate": 9.710102772929998e-06, + "loss": 5.3491, + "step": 8312 + }, + { + "epoch": 0.81, + "grad_norm": 1.1452877521514893, + "learning_rate": 9.69071165406244e-06, + "loss": 5.3348, + "step": 8316 + }, + { + "epoch": 0.81, + "grad_norm": 1.0742149353027344, + "learning_rate": 9.671320535194882e-06, + "loss": 5.4119, + "step": 8320 + }, + { + "epoch": 0.81, + "grad_norm": 1.016554594039917, + "learning_rate": 9.651929416327323e-06, + "loss": 5.2832, + "step": 8324 + }, + { + "epoch": 0.81, + "grad_norm": 1.1286094188690186, + "learning_rate": 9.632538297459765e-06, + "loss": 5.3436, + "step": 8328 + }, + { + "epoch": 0.81, + "grad_norm": 1.0719375610351562, + "learning_rate": 9.613147178592206e-06, + "loss": 5.2561, + "step": 8332 + }, + { + "epoch": 0.81, + "grad_norm": 1.0239946842193604, + "learning_rate": 9.593756059724646e-06, + "loss": 5.3411, + "step": 8336 + }, + { + "epoch": 0.81, + "grad_norm": 1.16642165184021, + "learning_rate": 9.574364940857087e-06, + "loss": 5.2173, + "step": 8340 + }, + { + "epoch": 0.81, + "grad_norm": 1.056943655014038, + "learning_rate": 9.554973821989529e-06, + "loss": 5.3138, + "step": 8344 + }, + { + "epoch": 0.81, + "grad_norm": 1.0310717821121216, + "learning_rate": 9.53558270312197e-06, + "loss": 5.2147, + "step": 8348 + }, + { + "epoch": 0.81, + "grad_norm": 1.0939549207687378, + "learning_rate": 9.516191584254412e-06, + "loss": 5.3849, + "step": 8352 + }, + { + "epoch": 0.81, + "grad_norm": 1.0846009254455566, + "learning_rate": 9.496800465386852e-06, + "loss": 5.2793, + "step": 8356 + }, + { + "epoch": 0.81, + "grad_norm": 1.0984148979187012, + "learning_rate": 9.477409346519295e-06, + "loss": 5.3142, + "step": 8360 + }, + { + "epoch": 0.81, + "grad_norm": 1.035758376121521, + "learning_rate": 9.458018227651735e-06, + "loss": 5.264, + "step": 8364 + }, + { + "epoch": 0.81, + "grad_norm": 1.0837132930755615, + "learning_rate": 9.438627108784177e-06, + "loss": 5.432, + "step": 8368 + }, + { + "epoch": 0.81, + "grad_norm": 1.0333995819091797, + "learning_rate": 9.419235989916618e-06, + "loss": 5.2622, + "step": 8372 + }, + { + "epoch": 0.81, + "grad_norm": 1.054474949836731, + "learning_rate": 9.39984487104906e-06, + "loss": 5.2721, + "step": 8376 + }, + { + "epoch": 0.81, + "grad_norm": 1.0750809907913208, + "learning_rate": 9.380453752181502e-06, + "loss": 5.2433, + "step": 8380 + }, + { + "epoch": 0.81, + "grad_norm": 1.054914116859436, + "learning_rate": 9.361062633313943e-06, + "loss": 5.3407, + "step": 8384 + }, + { + "epoch": 0.81, + "grad_norm": 1.1101247072219849, + "learning_rate": 9.341671514446385e-06, + "loss": 5.2774, + "step": 8388 + }, + { + "epoch": 0.81, + "grad_norm": 1.1054069995880127, + "learning_rate": 9.322280395578826e-06, + "loss": 5.3978, + "step": 8392 + }, + { + "epoch": 0.81, + "grad_norm": 1.0813637971878052, + "learning_rate": 9.302889276711268e-06, + "loss": 5.3735, + "step": 8396 + }, + { + "epoch": 0.81, + "grad_norm": 1.151734709739685, + "learning_rate": 9.283498157843708e-06, + "loss": 5.4109, + "step": 8400 + }, + { + "epoch": 0.81, + "grad_norm": 1.0774791240692139, + "learning_rate": 9.264107038976149e-06, + "loss": 5.2963, + "step": 8404 + }, + { + "epoch": 0.82, + "grad_norm": 1.0468578338623047, + "learning_rate": 9.24471592010859e-06, + "loss": 5.334, + "step": 8408 + }, + { + "epoch": 0.82, + "grad_norm": 1.0247350931167603, + "learning_rate": 9.225324801241032e-06, + "loss": 5.3307, + "step": 8412 + }, + { + "epoch": 0.82, + "grad_norm": 1.021700143814087, + "learning_rate": 9.205933682373472e-06, + "loss": 5.3315, + "step": 8416 + }, + { + "epoch": 0.82, + "grad_norm": 1.0852890014648438, + "learning_rate": 9.186542563505915e-06, + "loss": 5.23, + "step": 8420 + }, + { + "epoch": 0.82, + "grad_norm": 1.0435699224472046, + "learning_rate": 9.167151444638355e-06, + "loss": 5.3353, + "step": 8424 + }, + { + "epoch": 0.82, + "grad_norm": 1.0124831199645996, + "learning_rate": 9.147760325770797e-06, + "loss": 5.3257, + "step": 8428 + }, + { + "epoch": 0.82, + "grad_norm": 1.0061957836151123, + "learning_rate": 9.128369206903238e-06, + "loss": 5.3437, + "step": 8432 + }, + { + "epoch": 0.82, + "grad_norm": 1.0675718784332275, + "learning_rate": 9.10897808803568e-06, + "loss": 5.3543, + "step": 8436 + }, + { + "epoch": 0.82, + "grad_norm": 1.0940582752227783, + "learning_rate": 9.089586969168122e-06, + "loss": 5.3205, + "step": 8440 + }, + { + "epoch": 0.82, + "grad_norm": 1.0253922939300537, + "learning_rate": 9.070195850300563e-06, + "loss": 5.2836, + "step": 8444 + }, + { + "epoch": 0.82, + "grad_norm": 1.0632801055908203, + "learning_rate": 9.050804731433005e-06, + "loss": 5.3634, + "step": 8448 + }, + { + "epoch": 0.82, + "grad_norm": 1.151405692100525, + "learning_rate": 9.031413612565446e-06, + "loss": 5.266, + "step": 8452 + }, + { + "epoch": 0.82, + "grad_norm": 1.1215803623199463, + "learning_rate": 9.012022493697888e-06, + "loss": 5.3605, + "step": 8456 + }, + { + "epoch": 0.82, + "grad_norm": 1.0406687259674072, + "learning_rate": 8.992631374830328e-06, + "loss": 5.3037, + "step": 8460 + }, + { + "epoch": 0.82, + "grad_norm": 1.0305143594741821, + "learning_rate": 8.97324025596277e-06, + "loss": 5.2419, + "step": 8464 + }, + { + "epoch": 0.82, + "grad_norm": 1.0159248113632202, + "learning_rate": 8.953849137095211e-06, + "loss": 5.2576, + "step": 8468 + }, + { + "epoch": 0.82, + "grad_norm": 1.0444971323013306, + "learning_rate": 8.934458018227652e-06, + "loss": 5.2461, + "step": 8472 + }, + { + "epoch": 0.82, + "grad_norm": 1.062738299369812, + "learning_rate": 8.915066899360092e-06, + "loss": 5.2934, + "step": 8476 + }, + { + "epoch": 0.82, + "grad_norm": 1.0445396900177002, + "learning_rate": 8.895675780492534e-06, + "loss": 5.3011, + "step": 8480 + }, + { + "epoch": 0.82, + "grad_norm": 1.1299471855163574, + "learning_rate": 8.876284661624975e-06, + "loss": 5.1977, + "step": 8484 + }, + { + "epoch": 0.82, + "grad_norm": 1.0241918563842773, + "learning_rate": 8.856893542757417e-06, + "loss": 5.2821, + "step": 8488 + }, + { + "epoch": 0.82, + "grad_norm": 1.0978903770446777, + "learning_rate": 8.837502423889858e-06, + "loss": 5.3411, + "step": 8492 + }, + { + "epoch": 0.82, + "grad_norm": 1.0630090236663818, + "learning_rate": 8.8181113050223e-06, + "loss": 5.2487, + "step": 8496 + }, + { + "epoch": 0.82, + "grad_norm": 1.0468335151672363, + "learning_rate": 8.798720186154742e-06, + "loss": 5.2238, + "step": 8500 + }, + { + "epoch": 0.82, + "grad_norm": 1.0701797008514404, + "learning_rate": 8.779329067287183e-06, + "loss": 5.2461, + "step": 8504 + }, + { + "epoch": 0.82, + "grad_norm": 1.0644505023956299, + "learning_rate": 8.759937948419625e-06, + "loss": 5.2462, + "step": 8508 + }, + { + "epoch": 0.83, + "grad_norm": 1.0857114791870117, + "learning_rate": 8.740546829552066e-06, + "loss": 5.3715, + "step": 8512 + }, + { + "epoch": 0.83, + "grad_norm": 1.0911511182785034, + "learning_rate": 8.721155710684508e-06, + "loss": 5.3341, + "step": 8516 + }, + { + "epoch": 0.83, + "grad_norm": 1.066658854484558, + "learning_rate": 8.701764591816948e-06, + "loss": 5.2962, + "step": 8520 + }, + { + "epoch": 0.83, + "grad_norm": 1.0484204292297363, + "learning_rate": 8.68237347294939e-06, + "loss": 5.2937, + "step": 8524 + }, + { + "epoch": 0.83, + "grad_norm": 1.1011348962783813, + "learning_rate": 8.662982354081831e-06, + "loss": 5.3372, + "step": 8528 + }, + { + "epoch": 0.83, + "grad_norm": 1.1982569694519043, + "learning_rate": 8.643591235214273e-06, + "loss": 5.1987, + "step": 8532 + }, + { + "epoch": 0.83, + "grad_norm": 1.1187928915023804, + "learning_rate": 8.624200116346714e-06, + "loss": 5.3326, + "step": 8536 + }, + { + "epoch": 0.83, + "grad_norm": 1.0191898345947266, + "learning_rate": 8.604808997479154e-06, + "loss": 5.384, + "step": 8540 + }, + { + "epoch": 0.83, + "grad_norm": 1.1834492683410645, + "learning_rate": 8.585417878611597e-06, + "loss": 5.3774, + "step": 8544 + }, + { + "epoch": 0.83, + "grad_norm": 1.0459861755371094, + "learning_rate": 8.566026759744037e-06, + "loss": 5.3309, + "step": 8548 + }, + { + "epoch": 0.83, + "grad_norm": 1.019656777381897, + "learning_rate": 8.546635640876478e-06, + "loss": 5.3024, + "step": 8552 + }, + { + "epoch": 0.83, + "grad_norm": 1.1104713678359985, + "learning_rate": 8.52724452200892e-06, + "loss": 5.2309, + "step": 8556 + }, + { + "epoch": 0.83, + "grad_norm": 1.0480828285217285, + "learning_rate": 8.507853403141362e-06, + "loss": 5.256, + "step": 8560 + }, + { + "epoch": 0.83, + "grad_norm": 1.0698785781860352, + "learning_rate": 8.488462284273803e-06, + "loss": 5.2322, + "step": 8564 + }, + { + "epoch": 0.83, + "grad_norm": 1.0998084545135498, + "learning_rate": 8.469071165406245e-06, + "loss": 5.2919, + "step": 8568 + }, + { + "epoch": 0.83, + "grad_norm": 1.0241094827651978, + "learning_rate": 8.449680046538685e-06, + "loss": 5.3195, + "step": 8572 + }, + { + "epoch": 0.83, + "grad_norm": 1.1492643356323242, + "learning_rate": 8.430288927671128e-06, + "loss": 5.2311, + "step": 8576 + }, + { + "epoch": 0.83, + "grad_norm": 1.1048632860183716, + "learning_rate": 8.410897808803568e-06, + "loss": 5.3227, + "step": 8580 + }, + { + "epoch": 0.83, + "grad_norm": 1.0702450275421143, + "learning_rate": 8.39150668993601e-06, + "loss": 5.3173, + "step": 8584 + }, + { + "epoch": 0.83, + "grad_norm": 1.0409200191497803, + "learning_rate": 8.372115571068451e-06, + "loss": 5.3185, + "step": 8588 + }, + { + "epoch": 0.83, + "grad_norm": 1.046713948249817, + "learning_rate": 8.352724452200893e-06, + "loss": 5.3996, + "step": 8592 + }, + { + "epoch": 0.83, + "grad_norm": 1.039919376373291, + "learning_rate": 8.333333333333334e-06, + "loss": 5.2483, + "step": 8596 + }, + { + "epoch": 0.83, + "grad_norm": 1.0520331859588623, + "learning_rate": 8.313942214465776e-06, + "loss": 5.3082, + "step": 8600 + }, + { + "epoch": 0.83, + "grad_norm": 1.0794312953948975, + "learning_rate": 8.294551095598217e-06, + "loss": 5.382, + "step": 8604 + }, + { + "epoch": 0.83, + "grad_norm": 1.0222445726394653, + "learning_rate": 8.275159976730657e-06, + "loss": 5.2186, + "step": 8608 + }, + { + "epoch": 0.83, + "grad_norm": 1.043550729751587, + "learning_rate": 8.2557688578631e-06, + "loss": 5.2447, + "step": 8612 + }, + { + "epoch": 0.84, + "grad_norm": 1.065027117729187, + "learning_rate": 8.23637773899554e-06, + "loss": 5.3149, + "step": 8616 + }, + { + "epoch": 0.84, + "grad_norm": 1.089449405670166, + "learning_rate": 8.216986620127982e-06, + "loss": 5.231, + "step": 8620 + }, + { + "epoch": 0.84, + "grad_norm": 1.0330005884170532, + "learning_rate": 8.197595501260423e-06, + "loss": 5.3099, + "step": 8624 + }, + { + "epoch": 0.84, + "grad_norm": 1.088131070137024, + "learning_rate": 8.178204382392865e-06, + "loss": 5.3941, + "step": 8628 + }, + { + "epoch": 0.84, + "grad_norm": 1.0130773782730103, + "learning_rate": 8.158813263525305e-06, + "loss": 5.2442, + "step": 8632 + }, + { + "epoch": 0.84, + "grad_norm": 1.035882592201233, + "learning_rate": 8.139422144657748e-06, + "loss": 5.21, + "step": 8636 + }, + { + "epoch": 0.84, + "grad_norm": 1.0550565719604492, + "learning_rate": 8.120031025790188e-06, + "loss": 5.2888, + "step": 8640 + }, + { + "epoch": 0.84, + "grad_norm": 1.1436634063720703, + "learning_rate": 8.10063990692263e-06, + "loss": 5.3587, + "step": 8644 + }, + { + "epoch": 0.84, + "grad_norm": 1.1211497783660889, + "learning_rate": 8.081248788055071e-06, + "loss": 5.3188, + "step": 8648 + }, + { + "epoch": 0.84, + "grad_norm": 1.1205918788909912, + "learning_rate": 8.061857669187513e-06, + "loss": 5.2688, + "step": 8652 + }, + { + "epoch": 0.84, + "grad_norm": 1.0907244682312012, + "learning_rate": 8.042466550319954e-06, + "loss": 5.2518, + "step": 8656 + }, + { + "epoch": 0.84, + "grad_norm": 1.0856692790985107, + "learning_rate": 8.023075431452396e-06, + "loss": 5.3384, + "step": 8660 + }, + { + "epoch": 0.84, + "grad_norm": 1.0303173065185547, + "learning_rate": 8.003684312584836e-06, + "loss": 5.2906, + "step": 8664 + }, + { + "epoch": 0.84, + "grad_norm": 1.0908282995224, + "learning_rate": 7.984293193717279e-06, + "loss": 5.3331, + "step": 8668 + }, + { + "epoch": 0.84, + "grad_norm": 1.088040828704834, + "learning_rate": 7.96490207484972e-06, + "loss": 5.3521, + "step": 8672 + }, + { + "epoch": 0.84, + "grad_norm": 1.0086363554000854, + "learning_rate": 7.94551095598216e-06, + "loss": 5.3294, + "step": 8676 + }, + { + "epoch": 0.84, + "grad_norm": 1.1544169187545776, + "learning_rate": 7.926119837114602e-06, + "loss": 5.3184, + "step": 8680 + }, + { + "epoch": 0.84, + "grad_norm": 1.0530931949615479, + "learning_rate": 7.906728718247042e-06, + "loss": 5.2866, + "step": 8684 + }, + { + "epoch": 0.84, + "grad_norm": 1.080121397972107, + "learning_rate": 7.887337599379485e-06, + "loss": 5.2793, + "step": 8688 + }, + { + "epoch": 0.84, + "grad_norm": 1.0175666809082031, + "learning_rate": 7.867946480511925e-06, + "loss": 5.3282, + "step": 8692 + }, + { + "epoch": 0.84, + "grad_norm": 1.0283890962600708, + "learning_rate": 7.848555361644367e-06, + "loss": 5.3439, + "step": 8696 + }, + { + "epoch": 0.84, + "grad_norm": 1.0480095148086548, + "learning_rate": 7.829164242776808e-06, + "loss": 5.3042, + "step": 8700 + }, + { + "epoch": 0.84, + "grad_norm": 0.9947773814201355, + "learning_rate": 7.80977312390925e-06, + "loss": 5.251, + "step": 8704 + }, + { + "epoch": 0.84, + "grad_norm": 1.033823847770691, + "learning_rate": 7.79038200504169e-06, + "loss": 5.4016, + "step": 8708 + }, + { + "epoch": 0.84, + "grad_norm": 1.129824161529541, + "learning_rate": 7.770990886174133e-06, + "loss": 5.2176, + "step": 8712 + }, + { + "epoch": 0.85, + "grad_norm": 1.0202304124832153, + "learning_rate": 7.751599767306574e-06, + "loss": 5.3219, + "step": 8716 + }, + { + "epoch": 0.85, + "grad_norm": 1.0748639106750488, + "learning_rate": 7.732208648439016e-06, + "loss": 5.2945, + "step": 8720 + }, + { + "epoch": 0.85, + "grad_norm": 1.0026463270187378, + "learning_rate": 7.712817529571456e-06, + "loss": 5.3474, + "step": 8724 + }, + { + "epoch": 0.85, + "grad_norm": 0.98891282081604, + "learning_rate": 7.693426410703899e-06, + "loss": 5.2612, + "step": 8728 + }, + { + "epoch": 0.85, + "grad_norm": 1.079750418663025, + "learning_rate": 7.674035291836339e-06, + "loss": 5.2398, + "step": 8732 + }, + { + "epoch": 0.85, + "grad_norm": 1.057255744934082, + "learning_rate": 7.654644172968781e-06, + "loss": 5.4364, + "step": 8736 + }, + { + "epoch": 0.85, + "grad_norm": 1.0570470094680786, + "learning_rate": 7.635253054101222e-06, + "loss": 5.2695, + "step": 8740 + }, + { + "epoch": 0.85, + "grad_norm": 1.0664699077606201, + "learning_rate": 7.615861935233663e-06, + "loss": 5.3378, + "step": 8744 + }, + { + "epoch": 0.85, + "grad_norm": 1.0890289545059204, + "learning_rate": 7.5964708163661055e-06, + "loss": 5.375, + "step": 8748 + }, + { + "epoch": 0.85, + "grad_norm": 1.0607807636260986, + "learning_rate": 7.577079697498546e-06, + "loss": 5.3258, + "step": 8752 + }, + { + "epoch": 0.85, + "grad_norm": 1.0911225080490112, + "learning_rate": 7.557688578630988e-06, + "loss": 5.242, + "step": 8756 + }, + { + "epoch": 0.85, + "grad_norm": 1.1005817651748657, + "learning_rate": 7.538297459763429e-06, + "loss": 5.2734, + "step": 8760 + }, + { + "epoch": 0.85, + "grad_norm": 1.0650907754898071, + "learning_rate": 7.51890634089587e-06, + "loss": 5.2939, + "step": 8764 + }, + { + "epoch": 0.85, + "grad_norm": 1.0343334674835205, + "learning_rate": 7.499515222028311e-06, + "loss": 5.2601, + "step": 8768 + }, + { + "epoch": 0.85, + "grad_norm": 1.092239260673523, + "learning_rate": 7.480124103160753e-06, + "loss": 5.266, + "step": 8772 + }, + { + "epoch": 0.85, + "grad_norm": 1.140648603439331, + "learning_rate": 7.4607329842931935e-06, + "loss": 5.2353, + "step": 8776 + }, + { + "epoch": 0.85, + "grad_norm": 1.0743423700332642, + "learning_rate": 7.441341865425636e-06, + "loss": 5.389, + "step": 8780 + }, + { + "epoch": 0.85, + "grad_norm": 1.0870285034179688, + "learning_rate": 7.421950746558076e-06, + "loss": 5.3212, + "step": 8784 + }, + { + "epoch": 0.85, + "grad_norm": 1.0318245887756348, + "learning_rate": 7.4025596276905185e-06, + "loss": 5.2854, + "step": 8788 + }, + { + "epoch": 0.85, + "grad_norm": 1.0597593784332275, + "learning_rate": 7.383168508822959e-06, + "loss": 5.4204, + "step": 8792 + }, + { + "epoch": 0.85, + "grad_norm": 1.0621132850646973, + "learning_rate": 7.3637773899554e-06, + "loss": 5.2903, + "step": 8796 + }, + { + "epoch": 0.85, + "grad_norm": 1.057024598121643, + "learning_rate": 7.344386271087843e-06, + "loss": 5.3975, + "step": 8800 + }, + { + "epoch": 0.85, + "grad_norm": 1.1537240743637085, + "learning_rate": 7.324995152220283e-06, + "loss": 5.3118, + "step": 8804 + }, + { + "epoch": 0.85, + "grad_norm": 1.161657691001892, + "learning_rate": 7.305604033352725e-06, + "loss": 5.3516, + "step": 8808 + }, + { + "epoch": 0.85, + "grad_norm": 1.0824769735336304, + "learning_rate": 7.286212914485166e-06, + "loss": 5.3601, + "step": 8812 + }, + { + "epoch": 0.85, + "grad_norm": 1.0471476316452026, + "learning_rate": 7.266821795617608e-06, + "loss": 5.2858, + "step": 8816 + }, + { + "epoch": 0.86, + "grad_norm": 1.0438990592956543, + "learning_rate": 7.247430676750049e-06, + "loss": 5.2688, + "step": 8820 + }, + { + "epoch": 0.86, + "grad_norm": 1.0383694171905518, + "learning_rate": 7.228039557882491e-06, + "loss": 5.3404, + "step": 8824 + }, + { + "epoch": 0.86, + "grad_norm": 1.039699673652649, + "learning_rate": 7.2086484390149315e-06, + "loss": 5.2846, + "step": 8828 + }, + { + "epoch": 0.86, + "grad_norm": 1.1256271600723267, + "learning_rate": 7.189257320147373e-06, + "loss": 5.327, + "step": 8832 + }, + { + "epoch": 0.86, + "grad_norm": 0.9789720177650452, + "learning_rate": 7.169866201279813e-06, + "loss": 5.3369, + "step": 8836 + }, + { + "epoch": 0.86, + "grad_norm": 1.0387988090515137, + "learning_rate": 7.150475082412256e-06, + "loss": 5.3402, + "step": 8840 + }, + { + "epoch": 0.86, + "grad_norm": 1.0737075805664062, + "learning_rate": 7.131083963544696e-06, + "loss": 5.2837, + "step": 8844 + }, + { + "epoch": 0.86, + "grad_norm": 1.1215327978134155, + "learning_rate": 7.111692844677138e-06, + "loss": 5.2237, + "step": 8848 + }, + { + "epoch": 0.86, + "grad_norm": 1.0533177852630615, + "learning_rate": 7.092301725809579e-06, + "loss": 5.3783, + "step": 8852 + }, + { + "epoch": 0.86, + "grad_norm": 1.1658439636230469, + "learning_rate": 7.072910606942021e-06, + "loss": 5.3175, + "step": 8856 + }, + { + "epoch": 0.86, + "grad_norm": 1.0966906547546387, + "learning_rate": 7.0535194880744625e-06, + "loss": 5.4216, + "step": 8860 + }, + { + "epoch": 0.86, + "grad_norm": 1.0952768325805664, + "learning_rate": 7.034128369206904e-06, + "loss": 5.2748, + "step": 8864 + }, + { + "epoch": 0.86, + "grad_norm": 1.096529483795166, + "learning_rate": 7.014737250339345e-06, + "loss": 5.1666, + "step": 8868 + }, + { + "epoch": 0.86, + "grad_norm": 1.0736936330795288, + "learning_rate": 6.995346131471786e-06, + "loss": 5.24, + "step": 8872 + }, + { + "epoch": 0.86, + "grad_norm": 1.0133376121520996, + "learning_rate": 6.975955012604228e-06, + "loss": 5.2827, + "step": 8876 + }, + { + "epoch": 0.86, + "grad_norm": 1.0580708980560303, + "learning_rate": 6.956563893736669e-06, + "loss": 5.3801, + "step": 8880 + }, + { + "epoch": 0.86, + "grad_norm": 1.1220327615737915, + "learning_rate": 6.937172774869111e-06, + "loss": 5.2949, + "step": 8884 + }, + { + "epoch": 0.86, + "grad_norm": 1.136806845664978, + "learning_rate": 6.917781656001551e-06, + "loss": 5.347, + "step": 8888 + }, + { + "epoch": 0.86, + "grad_norm": 1.1147714853286743, + "learning_rate": 6.898390537133994e-06, + "loss": 5.2648, + "step": 8892 + }, + { + "epoch": 0.86, + "grad_norm": 1.0525692701339722, + "learning_rate": 6.878999418266434e-06, + "loss": 5.2665, + "step": 8896 + }, + { + "epoch": 0.86, + "grad_norm": 1.0400636196136475, + "learning_rate": 6.8596082993988755e-06, + "loss": 5.3571, + "step": 8900 + }, + { + "epoch": 0.86, + "grad_norm": 1.0699836015701294, + "learning_rate": 6.840217180531316e-06, + "loss": 5.3079, + "step": 8904 + }, + { + "epoch": 0.86, + "grad_norm": 1.0171644687652588, + "learning_rate": 6.820826061663758e-06, + "loss": 5.3438, + "step": 8908 + }, + { + "epoch": 0.86, + "grad_norm": 1.0426756143569946, + "learning_rate": 6.801434942796199e-06, + "loss": 5.417, + "step": 8912 + }, + { + "epoch": 0.86, + "grad_norm": 1.1138461828231812, + "learning_rate": 6.782043823928641e-06, + "loss": 5.2769, + "step": 8916 + }, + { + "epoch": 0.86, + "grad_norm": 1.0219694375991821, + "learning_rate": 6.762652705061083e-06, + "loss": 5.2811, + "step": 8920 + }, + { + "epoch": 0.87, + "grad_norm": 1.1686511039733887, + "learning_rate": 6.743261586193524e-06, + "loss": 5.3218, + "step": 8924 + }, + { + "epoch": 0.87, + "grad_norm": 1.0407147407531738, + "learning_rate": 6.723870467325965e-06, + "loss": 5.339, + "step": 8928 + }, + { + "epoch": 0.87, + "grad_norm": 1.1056681871414185, + "learning_rate": 6.704479348458407e-06, + "loss": 5.2758, + "step": 8932 + }, + { + "epoch": 0.87, + "grad_norm": 1.0969740152359009, + "learning_rate": 6.685088229590848e-06, + "loss": 5.2432, + "step": 8936 + }, + { + "epoch": 0.87, + "grad_norm": 0.9841113090515137, + "learning_rate": 6.6656971107232885e-06, + "loss": 5.2596, + "step": 8940 + }, + { + "epoch": 0.87, + "grad_norm": 1.1172292232513428, + "learning_rate": 6.646305991855731e-06, + "loss": 5.2884, + "step": 8944 + }, + { + "epoch": 0.87, + "grad_norm": 0.9936596155166626, + "learning_rate": 6.626914872988171e-06, + "loss": 5.2968, + "step": 8948 + }, + { + "epoch": 0.87, + "grad_norm": 1.0389301776885986, + "learning_rate": 6.6075237541206135e-06, + "loss": 5.2827, + "step": 8952 + }, + { + "epoch": 0.87, + "grad_norm": 1.020494818687439, + "learning_rate": 6.588132635253054e-06, + "loss": 5.3882, + "step": 8956 + }, + { + "epoch": 0.87, + "grad_norm": 1.0391160249710083, + "learning_rate": 6.568741516385496e-06, + "loss": 5.2172, + "step": 8960 + }, + { + "epoch": 0.87, + "grad_norm": 1.0213825702667236, + "learning_rate": 6.549350397517937e-06, + "loss": 5.3307, + "step": 8964 + }, + { + "epoch": 0.87, + "grad_norm": 1.0745649337768555, + "learning_rate": 6.529959278650378e-06, + "loss": 5.2638, + "step": 8968 + }, + { + "epoch": 0.87, + "grad_norm": 1.0567609071731567, + "learning_rate": 6.510568159782819e-06, + "loss": 5.3164, + "step": 8972 + }, + { + "epoch": 0.87, + "grad_norm": 1.0450811386108398, + "learning_rate": 6.491177040915261e-06, + "loss": 5.3648, + "step": 8976 + }, + { + "epoch": 0.87, + "grad_norm": 1.0880790948867798, + "learning_rate": 6.471785922047703e-06, + "loss": 5.3617, + "step": 8980 + }, + { + "epoch": 0.87, + "grad_norm": 1.0606417655944824, + "learning_rate": 6.452394803180144e-06, + "loss": 5.2201, + "step": 8984 + }, + { + "epoch": 0.87, + "grad_norm": 1.0124664306640625, + "learning_rate": 6.433003684312586e-06, + "loss": 5.2499, + "step": 8988 + }, + { + "epoch": 0.87, + "grad_norm": 1.1352604627609253, + "learning_rate": 6.4136125654450265e-06, + "loss": 5.3782, + "step": 8992 + }, + { + "epoch": 0.87, + "grad_norm": 1.1061619520187378, + "learning_rate": 6.394221446577468e-06, + "loss": 5.3408, + "step": 8996 + }, + { + "epoch": 0.87, + "grad_norm": 1.1135718822479248, + "learning_rate": 6.374830327709909e-06, + "loss": 5.2323, + "step": 9000 + }, + { + "epoch": 0.87, + "grad_norm": 1.0155010223388672, + "learning_rate": 6.355439208842351e-06, + "loss": 5.2942, + "step": 9004 + }, + { + "epoch": 0.87, + "grad_norm": 1.044931173324585, + "learning_rate": 6.336048089974791e-06, + "loss": 5.3267, + "step": 9008 + }, + { + "epoch": 0.87, + "grad_norm": 1.0643398761749268, + "learning_rate": 6.316656971107233e-06, + "loss": 5.2651, + "step": 9012 + }, + { + "epoch": 0.87, + "grad_norm": 1.0203381776809692, + "learning_rate": 6.297265852239674e-06, + "loss": 5.3493, + "step": 9016 + }, + { + "epoch": 0.87, + "grad_norm": 1.0563126802444458, + "learning_rate": 6.277874733372116e-06, + "loss": 5.362, + "step": 9020 + }, + { + "epoch": 0.87, + "grad_norm": 1.0667084455490112, + "learning_rate": 6.258483614504557e-06, + "loss": 5.2151, + "step": 9024 + }, + { + "epoch": 0.88, + "grad_norm": 1.0885251760482788, + "learning_rate": 6.239092495636999e-06, + "loss": 5.3404, + "step": 9028 + }, + { + "epoch": 0.88, + "grad_norm": 1.0210630893707275, + "learning_rate": 6.21970137676944e-06, + "loss": 5.2937, + "step": 9032 + }, + { + "epoch": 0.88, + "grad_norm": 1.1303844451904297, + "learning_rate": 6.200310257901881e-06, + "loss": 5.3373, + "step": 9036 + }, + { + "epoch": 0.88, + "grad_norm": 1.1023499965667725, + "learning_rate": 6.180919139034322e-06, + "loss": 5.2946, + "step": 9040 + }, + { + "epoch": 0.88, + "grad_norm": 1.0469759702682495, + "learning_rate": 6.161528020166764e-06, + "loss": 5.3265, + "step": 9044 + }, + { + "epoch": 0.88, + "grad_norm": 0.9917576313018799, + "learning_rate": 6.142136901299205e-06, + "loss": 5.2984, + "step": 9048 + }, + { + "epoch": 0.88, + "grad_norm": 1.0630229711532593, + "learning_rate": 6.122745782431646e-06, + "loss": 5.2025, + "step": 9052 + }, + { + "epoch": 0.88, + "grad_norm": 1.0385984182357788, + "learning_rate": 6.103354663564088e-06, + "loss": 5.2934, + "step": 9056 + }, + { + "epoch": 0.88, + "grad_norm": 1.0480278730392456, + "learning_rate": 6.083963544696529e-06, + "loss": 5.2933, + "step": 9060 + }, + { + "epoch": 0.88, + "grad_norm": 1.0395824909210205, + "learning_rate": 6.0645724258289706e-06, + "loss": 5.267, + "step": 9064 + }, + { + "epoch": 0.88, + "grad_norm": 1.0996421575546265, + "learning_rate": 6.045181306961412e-06, + "loss": 5.3497, + "step": 9068 + }, + { + "epoch": 0.88, + "grad_norm": 1.0920591354370117, + "learning_rate": 6.025790188093853e-06, + "loss": 5.2752, + "step": 9072 + }, + { + "epoch": 0.88, + "grad_norm": 1.0730946063995361, + "learning_rate": 6.006399069226295e-06, + "loss": 5.3234, + "step": 9076 + }, + { + "epoch": 0.88, + "grad_norm": 1.077646017074585, + "learning_rate": 5.987007950358736e-06, + "loss": 5.3009, + "step": 9080 + }, + { + "epoch": 0.88, + "grad_norm": 1.0901986360549927, + "learning_rate": 5.9676168314911775e-06, + "loss": 5.2653, + "step": 9084 + }, + { + "epoch": 0.88, + "grad_norm": 1.1307499408721924, + "learning_rate": 5.948225712623619e-06, + "loss": 5.393, + "step": 9088 + }, + { + "epoch": 0.88, + "grad_norm": 1.1002899408340454, + "learning_rate": 5.92883459375606e-06, + "loss": 5.2785, + "step": 9092 + }, + { + "epoch": 0.88, + "grad_norm": 1.1348730325698853, + "learning_rate": 5.909443474888502e-06, + "loss": 5.3749, + "step": 9096 + }, + { + "epoch": 0.88, + "grad_norm": 1.2404577732086182, + "learning_rate": 5.890052356020943e-06, + "loss": 5.2968, + "step": 9100 + }, + { + "epoch": 0.88, + "grad_norm": 1.1860145330429077, + "learning_rate": 5.8706612371533835e-06, + "loss": 5.2951, + "step": 9104 + }, + { + "epoch": 0.88, + "grad_norm": 1.0747588872909546, + "learning_rate": 5.851270118285825e-06, + "loss": 5.2725, + "step": 9108 + }, + { + "epoch": 0.88, + "grad_norm": 1.0642809867858887, + "learning_rate": 5.831878999418266e-06, + "loss": 5.3347, + "step": 9112 + }, + { + "epoch": 0.88, + "grad_norm": 1.0361077785491943, + "learning_rate": 5.812487880550708e-06, + "loss": 5.2899, + "step": 9116 + }, + { + "epoch": 0.88, + "grad_norm": 1.027950406074524, + "learning_rate": 5.793096761683149e-06, + "loss": 5.3016, + "step": 9120 + }, + { + "epoch": 0.88, + "grad_norm": 1.0341978073120117, + "learning_rate": 5.7737056428155905e-06, + "loss": 5.3066, + "step": 9124 + }, + { + "epoch": 0.89, + "grad_norm": 1.0980345010757446, + "learning_rate": 5.754314523948033e-06, + "loss": 5.2685, + "step": 9128 + }, + { + "epoch": 0.89, + "grad_norm": 1.0036580562591553, + "learning_rate": 5.734923405080474e-06, + "loss": 5.2972, + "step": 9132 + }, + { + "epoch": 0.89, + "grad_norm": 1.0951625108718872, + "learning_rate": 5.715532286212915e-06, + "loss": 5.2363, + "step": 9136 + }, + { + "epoch": 0.89, + "grad_norm": 1.010358214378357, + "learning_rate": 5.696141167345356e-06, + "loss": 5.2715, + "step": 9140 + }, + { + "epoch": 0.89, + "grad_norm": 1.0721516609191895, + "learning_rate": 5.676750048477797e-06, + "loss": 5.3006, + "step": 9144 + }, + { + "epoch": 0.89, + "grad_norm": 0.9922645688056946, + "learning_rate": 5.657358929610239e-06, + "loss": 5.2985, + "step": 9148 + }, + { + "epoch": 0.89, + "grad_norm": 1.0958447456359863, + "learning_rate": 5.63796781074268e-06, + "loss": 5.2388, + "step": 9152 + }, + { + "epoch": 0.89, + "grad_norm": 0.9977266788482666, + "learning_rate": 5.6185766918751215e-06, + "loss": 5.4273, + "step": 9156 + }, + { + "epoch": 0.89, + "grad_norm": 1.1025915145874023, + "learning_rate": 5.599185573007563e-06, + "loss": 5.3768, + "step": 9160 + }, + { + "epoch": 0.89, + "grad_norm": 1.0521866083145142, + "learning_rate": 5.579794454140004e-06, + "loss": 5.2555, + "step": 9164 + }, + { + "epoch": 0.89, + "grad_norm": 1.0546320676803589, + "learning_rate": 5.560403335272446e-06, + "loss": 5.259, + "step": 9168 + }, + { + "epoch": 0.89, + "grad_norm": 1.084153413772583, + "learning_rate": 5.541012216404887e-06, + "loss": 5.2733, + "step": 9172 + }, + { + "epoch": 0.89, + "grad_norm": 1.1298420429229736, + "learning_rate": 5.521621097537328e-06, + "loss": 5.2499, + "step": 9176 + }, + { + "epoch": 0.89, + "grad_norm": 1.0296047925949097, + "learning_rate": 5.502229978669769e-06, + "loss": 5.3293, + "step": 9180 + }, + { + "epoch": 0.89, + "grad_norm": 1.0231281518936157, + "learning_rate": 5.48283885980221e-06, + "loss": 5.2614, + "step": 9184 + }, + { + "epoch": 0.89, + "grad_norm": 1.1063759326934814, + "learning_rate": 5.463447740934653e-06, + "loss": 5.3569, + "step": 9188 + }, + { + "epoch": 0.89, + "grad_norm": 1.0762827396392822, + "learning_rate": 5.444056622067094e-06, + "loss": 5.3734, + "step": 9192 + }, + { + "epoch": 0.89, + "grad_norm": 1.0667394399642944, + "learning_rate": 5.424665503199535e-06, + "loss": 5.3136, + "step": 9196 + }, + { + "epoch": 0.89, + "grad_norm": 1.0663567781448364, + "learning_rate": 5.405274384331977e-06, + "loss": 5.2178, + "step": 9200 + }, + { + "epoch": 0.89, + "grad_norm": 1.1014020442962646, + "learning_rate": 5.385883265464417e-06, + "loss": 5.3121, + "step": 9204 + }, + { + "epoch": 0.89, + "grad_norm": 1.0944761037826538, + "learning_rate": 5.366492146596859e-06, + "loss": 5.3321, + "step": 9208 + }, + { + "epoch": 0.89, + "grad_norm": 1.0576825141906738, + "learning_rate": 5.3471010277293e-06, + "loss": 5.187, + "step": 9212 + }, + { + "epoch": 0.89, + "grad_norm": 1.102414608001709, + "learning_rate": 5.3277099088617414e-06, + "loss": 5.2198, + "step": 9216 + }, + { + "epoch": 0.89, + "grad_norm": 1.0515443086624146, + "learning_rate": 5.308318789994183e-06, + "loss": 5.3509, + "step": 9220 + }, + { + "epoch": 0.89, + "grad_norm": 1.0225639343261719, + "learning_rate": 5.288927671126624e-06, + "loss": 5.3264, + "step": 9224 + }, + { + "epoch": 0.89, + "grad_norm": 1.0482409000396729, + "learning_rate": 5.269536552259066e-06, + "loss": 5.314, + "step": 9228 + }, + { + "epoch": 0.9, + "grad_norm": 1.0829366445541382, + "learning_rate": 5.250145433391507e-06, + "loss": 5.3288, + "step": 9232 + }, + { + "epoch": 0.9, + "grad_norm": 1.018813967704773, + "learning_rate": 5.230754314523948e-06, + "loss": 5.245, + "step": 9236 + }, + { + "epoch": 0.9, + "grad_norm": 1.1042280197143555, + "learning_rate": 5.21136319565639e-06, + "loss": 5.3871, + "step": 9240 + }, + { + "epoch": 0.9, + "grad_norm": 1.0457403659820557, + "learning_rate": 5.19197207678883e-06, + "loss": 5.3858, + "step": 9244 + }, + { + "epoch": 0.9, + "grad_norm": 1.0873547792434692, + "learning_rate": 5.1725809579212725e-06, + "loss": 5.2907, + "step": 9248 + }, + { + "epoch": 0.9, + "grad_norm": 1.0615798234939575, + "learning_rate": 5.153189839053714e-06, + "loss": 5.2729, + "step": 9252 + }, + { + "epoch": 0.9, + "grad_norm": 1.1086784601211548, + "learning_rate": 5.133798720186155e-06, + "loss": 5.388, + "step": 9256 + }, + { + "epoch": 0.9, + "grad_norm": 1.0895764827728271, + "learning_rate": 5.114407601318597e-06, + "loss": 5.2892, + "step": 9260 + }, + { + "epoch": 0.9, + "grad_norm": 1.0369954109191895, + "learning_rate": 5.095016482451038e-06, + "loss": 5.2989, + "step": 9264 + }, + { + "epoch": 0.9, + "grad_norm": 1.049892544746399, + "learning_rate": 5.0756253635834794e-06, + "loss": 5.3419, + "step": 9268 + }, + { + "epoch": 0.9, + "grad_norm": 1.0920242071151733, + "learning_rate": 5.056234244715921e-06, + "loss": 5.3116, + "step": 9272 + }, + { + "epoch": 0.9, + "grad_norm": 1.0999177694320679, + "learning_rate": 5.036843125848361e-06, + "loss": 5.2779, + "step": 9276 + }, + { + "epoch": 0.9, + "grad_norm": 1.0239474773406982, + "learning_rate": 5.017452006980803e-06, + "loss": 5.35, + "step": 9280 + }, + { + "epoch": 0.9, + "grad_norm": 1.0571128129959106, + "learning_rate": 4.998060888113244e-06, + "loss": 5.3139, + "step": 9284 + }, + { + "epoch": 0.9, + "grad_norm": 1.1082773208618164, + "learning_rate": 4.9786697692456855e-06, + "loss": 5.2779, + "step": 9288 + }, + { + "epoch": 0.9, + "grad_norm": 1.0475021600723267, + "learning_rate": 4.959278650378127e-06, + "loss": 5.1911, + "step": 9292 + }, + { + "epoch": 0.9, + "grad_norm": 1.0756545066833496, + "learning_rate": 4.939887531510568e-06, + "loss": 5.2851, + "step": 9296 + }, + { + "epoch": 0.9, + "grad_norm": 0.9714142680168152, + "learning_rate": 4.92049641264301e-06, + "loss": 5.2584, + "step": 9300 + }, + { + "epoch": 0.9, + "grad_norm": 1.0212279558181763, + "learning_rate": 4.901105293775451e-06, + "loss": 5.2914, + "step": 9304 + }, + { + "epoch": 0.9, + "grad_norm": 1.038690209388733, + "learning_rate": 4.8817141749078924e-06, + "loss": 5.3318, + "step": 9308 + }, + { + "epoch": 0.9, + "grad_norm": 1.0438801050186157, + "learning_rate": 4.862323056040334e-06, + "loss": 5.3556, + "step": 9312 + }, + { + "epoch": 0.9, + "grad_norm": 1.0903195142745972, + "learning_rate": 4.842931937172775e-06, + "loss": 5.2413, + "step": 9316 + }, + { + "epoch": 0.9, + "grad_norm": 1.029875636100769, + "learning_rate": 4.823540818305217e-06, + "loss": 5.2558, + "step": 9320 + }, + { + "epoch": 0.9, + "grad_norm": 1.0729329586029053, + "learning_rate": 4.804149699437658e-06, + "loss": 5.2387, + "step": 9324 + }, + { + "epoch": 0.9, + "grad_norm": 1.012494683265686, + "learning_rate": 4.784758580570099e-06, + "loss": 5.2488, + "step": 9328 + }, + { + "epoch": 0.9, + "grad_norm": 1.0358749628067017, + "learning_rate": 4.765367461702541e-06, + "loss": 5.2961, + "step": 9332 + }, + { + "epoch": 0.91, + "grad_norm": 1.0714225769042969, + "learning_rate": 4.745976342834982e-06, + "loss": 5.2666, + "step": 9336 + }, + { + "epoch": 0.91, + "grad_norm": 1.0506253242492676, + "learning_rate": 4.7265852239674235e-06, + "loss": 5.2997, + "step": 9340 + }, + { + "epoch": 0.91, + "grad_norm": 1.067970633506775, + "learning_rate": 4.707194105099864e-06, + "loss": 5.2415, + "step": 9344 + }, + { + "epoch": 0.91, + "grad_norm": 1.1353143453598022, + "learning_rate": 4.687802986232305e-06, + "loss": 5.2687, + "step": 9348 + }, + { + "epoch": 0.91, + "grad_norm": 1.0400621891021729, + "learning_rate": 4.668411867364747e-06, + "loss": 5.269, + "step": 9352 + }, + { + "epoch": 0.91, + "grad_norm": 1.1021333932876587, + "learning_rate": 4.649020748497188e-06, + "loss": 5.3425, + "step": 9356 + }, + { + "epoch": 0.91, + "grad_norm": 1.0333983898162842, + "learning_rate": 4.6296296296296296e-06, + "loss": 5.3178, + "step": 9360 + }, + { + "epoch": 0.91, + "grad_norm": 1.1346157789230347, + "learning_rate": 4.610238510762071e-06, + "loss": 5.2779, + "step": 9364 + }, + { + "epoch": 0.91, + "grad_norm": 1.1103806495666504, + "learning_rate": 4.590847391894513e-06, + "loss": 5.3102, + "step": 9368 + }, + { + "epoch": 0.91, + "grad_norm": 1.109861135482788, + "learning_rate": 4.5714562730269546e-06, + "loss": 5.2864, + "step": 9372 + }, + { + "epoch": 0.91, + "grad_norm": 1.0895229578018188, + "learning_rate": 4.552065154159395e-06, + "loss": 5.3538, + "step": 9376 + }, + { + "epoch": 0.91, + "grad_norm": 1.0834227800369263, + "learning_rate": 4.5326740352918365e-06, + "loss": 5.328, + "step": 9380 + }, + { + "epoch": 0.91, + "grad_norm": 1.0530760288238525, + "learning_rate": 4.513282916424278e-06, + "loss": 5.3337, + "step": 9384 + }, + { + "epoch": 0.91, + "grad_norm": 1.0888159275054932, + "learning_rate": 4.493891797556719e-06, + "loss": 5.2769, + "step": 9388 + }, + { + "epoch": 0.91, + "grad_norm": 0.9955815076828003, + "learning_rate": 4.474500678689161e-06, + "loss": 5.3042, + "step": 9392 + }, + { + "epoch": 0.91, + "grad_norm": 1.1201573610305786, + "learning_rate": 4.455109559821602e-06, + "loss": 5.3797, + "step": 9396 + }, + { + "epoch": 0.91, + "grad_norm": 1.0425370931625366, + "learning_rate": 4.435718440954043e-06, + "loss": 5.2661, + "step": 9400 + }, + { + "epoch": 0.91, + "grad_norm": 1.10947847366333, + "learning_rate": 4.416327322086485e-06, + "loss": 5.3799, + "step": 9404 + }, + { + "epoch": 0.91, + "grad_norm": 1.088013768196106, + "learning_rate": 4.396936203218926e-06, + "loss": 5.4089, + "step": 9408 + }, + { + "epoch": 0.91, + "grad_norm": 1.0372400283813477, + "learning_rate": 4.377545084351367e-06, + "loss": 5.2819, + "step": 9412 + }, + { + "epoch": 0.91, + "grad_norm": 1.030407428741455, + "learning_rate": 4.358153965483808e-06, + "loss": 5.3247, + "step": 9416 + }, + { + "epoch": 0.91, + "grad_norm": 1.0526556968688965, + "learning_rate": 4.3387628466162495e-06, + "loss": 5.243, + "step": 9420 + }, + { + "epoch": 0.91, + "grad_norm": 1.0113731622695923, + "learning_rate": 4.319371727748691e-06, + "loss": 5.3229, + "step": 9424 + }, + { + "epoch": 0.91, + "grad_norm": 1.1396671533584595, + "learning_rate": 4.299980608881133e-06, + "loss": 5.2698, + "step": 9428 + }, + { + "epoch": 0.91, + "grad_norm": 1.066570520401001, + "learning_rate": 4.2805894900135745e-06, + "loss": 5.3542, + "step": 9432 + }, + { + "epoch": 0.91, + "grad_norm": 1.0811322927474976, + "learning_rate": 4.261198371146016e-06, + "loss": 5.2388, + "step": 9436 + }, + { + "epoch": 0.92, + "grad_norm": 1.0718379020690918, + "learning_rate": 4.241807252278457e-06, + "loss": 5.2413, + "step": 9440 + }, + { + "epoch": 0.92, + "grad_norm": 1.0602755546569824, + "learning_rate": 4.222416133410898e-06, + "loss": 5.299, + "step": 9444 + }, + { + "epoch": 0.92, + "grad_norm": 1.0844111442565918, + "learning_rate": 4.203025014543339e-06, + "loss": 5.407, + "step": 9448 + }, + { + "epoch": 0.92, + "grad_norm": 1.0383292436599731, + "learning_rate": 4.1836338956757805e-06, + "loss": 5.3359, + "step": 9452 + }, + { + "epoch": 0.92, + "grad_norm": 1.0988202095031738, + "learning_rate": 4.164242776808222e-06, + "loss": 5.4107, + "step": 9456 + }, + { + "epoch": 0.92, + "grad_norm": 1.1423723697662354, + "learning_rate": 4.144851657940663e-06, + "loss": 5.3404, + "step": 9460 + }, + { + "epoch": 0.92, + "grad_norm": 1.1802666187286377, + "learning_rate": 4.125460539073105e-06, + "loss": 5.247, + "step": 9464 + }, + { + "epoch": 0.92, + "grad_norm": 0.9723669290542603, + "learning_rate": 4.106069420205546e-06, + "loss": 5.2306, + "step": 9468 + }, + { + "epoch": 0.92, + "grad_norm": 1.0496985912322998, + "learning_rate": 4.0866783013379875e-06, + "loss": 5.2929, + "step": 9472 + }, + { + "epoch": 0.92, + "grad_norm": 1.0715490579605103, + "learning_rate": 4.067287182470429e-06, + "loss": 5.2874, + "step": 9476 + }, + { + "epoch": 0.92, + "grad_norm": 1.0244431495666504, + "learning_rate": 4.04789606360287e-06, + "loss": 5.3395, + "step": 9480 + }, + { + "epoch": 0.92, + "grad_norm": 1.0559965372085571, + "learning_rate": 4.028504944735311e-06, + "loss": 5.4143, + "step": 9484 + }, + { + "epoch": 0.92, + "grad_norm": 1.0647709369659424, + "learning_rate": 4.009113825867752e-06, + "loss": 5.3425, + "step": 9488 + }, + { + "epoch": 0.92, + "grad_norm": 1.0821641683578491, + "learning_rate": 3.989722707000194e-06, + "loss": 5.2985, + "step": 9492 + }, + { + "epoch": 0.92, + "grad_norm": 1.0766880512237549, + "learning_rate": 3.970331588132636e-06, + "loss": 5.3269, + "step": 9496 + }, + { + "epoch": 0.92, + "grad_norm": 1.0378237962722778, + "learning_rate": 3.950940469265077e-06, + "loss": 5.2643, + "step": 9500 + }, + { + "epoch": 0.92, + "grad_norm": 1.1061269044876099, + "learning_rate": 3.9315493503975185e-06, + "loss": 5.3131, + "step": 9504 + }, + { + "epoch": 0.92, + "grad_norm": 1.0113786458969116, + "learning_rate": 3.91215823152996e-06, + "loss": 5.3312, + "step": 9508 + }, + { + "epoch": 0.92, + "grad_norm": 1.0934914350509644, + "learning_rate": 3.8927671126624005e-06, + "loss": 5.2506, + "step": 9512 + }, + { + "epoch": 0.92, + "grad_norm": 1.024383544921875, + "learning_rate": 3.873375993794842e-06, + "loss": 5.3126, + "step": 9516 + }, + { + "epoch": 0.92, + "grad_norm": 1.0432902574539185, + "learning_rate": 3.853984874927283e-06, + "loss": 5.2686, + "step": 9520 + }, + { + "epoch": 0.92, + "grad_norm": 1.0603485107421875, + "learning_rate": 3.834593756059725e-06, + "loss": 5.2765, + "step": 9524 + }, + { + "epoch": 0.92, + "grad_norm": 1.0203194618225098, + "learning_rate": 3.815202637192166e-06, + "loss": 5.2347, + "step": 9528 + }, + { + "epoch": 0.92, + "grad_norm": 1.0024125576019287, + "learning_rate": 3.7958115183246074e-06, + "loss": 5.2144, + "step": 9532 + }, + { + "epoch": 0.92, + "grad_norm": 1.098630666732788, + "learning_rate": 3.7764203994570488e-06, + "loss": 5.327, + "step": 9536 + }, + { + "epoch": 0.92, + "grad_norm": 1.0508458614349365, + "learning_rate": 3.75702928058949e-06, + "loss": 5.2154, + "step": 9540 + }, + { + "epoch": 0.93, + "grad_norm": 1.0391416549682617, + "learning_rate": 3.737638161721931e-06, + "loss": 5.3172, + "step": 9544 + }, + { + "epoch": 0.93, + "grad_norm": 1.0499507188796997, + "learning_rate": 3.7182470428543725e-06, + "loss": 5.3587, + "step": 9548 + }, + { + "epoch": 0.93, + "grad_norm": 1.1225402355194092, + "learning_rate": 3.6988559239868147e-06, + "loss": 5.2066, + "step": 9552 + }, + { + "epoch": 0.93, + "grad_norm": 1.0803992748260498, + "learning_rate": 3.6794648051192557e-06, + "loss": 5.2294, + "step": 9556 + }, + { + "epoch": 0.93, + "grad_norm": 1.0833008289337158, + "learning_rate": 3.660073686251697e-06, + "loss": 5.3644, + "step": 9560 + }, + { + "epoch": 0.93, + "grad_norm": 1.069279670715332, + "learning_rate": 3.6406825673841384e-06, + "loss": 5.2461, + "step": 9564 + }, + { + "epoch": 0.93, + "grad_norm": 1.013576626777649, + "learning_rate": 3.62129144851658e-06, + "loss": 5.3483, + "step": 9568 + }, + { + "epoch": 0.93, + "grad_norm": 1.082595944404602, + "learning_rate": 3.601900329649021e-06, + "loss": 5.2905, + "step": 9572 + }, + { + "epoch": 0.93, + "grad_norm": 1.0759378671646118, + "learning_rate": 3.582509210781462e-06, + "loss": 5.3518, + "step": 9576 + }, + { + "epoch": 0.93, + "grad_norm": 1.070561170578003, + "learning_rate": 3.5631180919139036e-06, + "loss": 5.3375, + "step": 9580 + }, + { + "epoch": 0.93, + "grad_norm": 1.0099434852600098, + "learning_rate": 3.543726973046345e-06, + "loss": 5.3133, + "step": 9584 + }, + { + "epoch": 0.93, + "grad_norm": 1.1549813747406006, + "learning_rate": 3.5243358541787863e-06, + "loss": 5.3642, + "step": 9588 + }, + { + "epoch": 0.93, + "grad_norm": 1.024865746498108, + "learning_rate": 3.5049447353112273e-06, + "loss": 5.3313, + "step": 9592 + }, + { + "epoch": 0.93, + "grad_norm": 1.074829339981079, + "learning_rate": 3.4855536164436687e-06, + "loss": 5.3294, + "step": 9596 + }, + { + "epoch": 0.93, + "grad_norm": 1.0802452564239502, + "learning_rate": 3.46616249757611e-06, + "loss": 5.3024, + "step": 9600 + }, + { + "epoch": 0.93, + "grad_norm": 1.1103779077529907, + "learning_rate": 3.4467713787085514e-06, + "loss": 5.259, + "step": 9604 + }, + { + "epoch": 0.93, + "grad_norm": 1.122125506401062, + "learning_rate": 3.427380259840993e-06, + "loss": 5.388, + "step": 9608 + }, + { + "epoch": 0.93, + "grad_norm": 1.106959581375122, + "learning_rate": 3.4079891409734346e-06, + "loss": 5.344, + "step": 9612 + }, + { + "epoch": 0.93, + "grad_norm": 1.15294349193573, + "learning_rate": 3.388598022105876e-06, + "loss": 5.3002, + "step": 9616 + }, + { + "epoch": 0.93, + "grad_norm": 1.124644160270691, + "learning_rate": 3.3692069032383174e-06, + "loss": 5.406, + "step": 9620 + }, + { + "epoch": 0.93, + "grad_norm": 0.9895834922790527, + "learning_rate": 3.3498157843707584e-06, + "loss": 5.3485, + "step": 9624 + }, + { + "epoch": 0.93, + "grad_norm": 1.155667781829834, + "learning_rate": 3.3304246655031997e-06, + "loss": 5.3161, + "step": 9628 + }, + { + "epoch": 0.93, + "grad_norm": 1.0220144987106323, + "learning_rate": 3.311033546635641e-06, + "loss": 5.3078, + "step": 9632 + }, + { + "epoch": 0.93, + "grad_norm": 1.0805678367614746, + "learning_rate": 3.2916424277680825e-06, + "loss": 5.3262, + "step": 9636 + }, + { + "epoch": 0.93, + "grad_norm": 1.0154497623443604, + "learning_rate": 3.272251308900524e-06, + "loss": 5.2922, + "step": 9640 + }, + { + "epoch": 0.94, + "grad_norm": 1.0291320085525513, + "learning_rate": 3.252860190032965e-06, + "loss": 5.2613, + "step": 9644 + }, + { + "epoch": 0.94, + "grad_norm": 1.0393972396850586, + "learning_rate": 3.2334690711654062e-06, + "loss": 5.3226, + "step": 9648 + }, + { + "epoch": 0.94, + "grad_norm": 1.0496784448623657, + "learning_rate": 3.2140779522978476e-06, + "loss": 5.3214, + "step": 9652 + }, + { + "epoch": 0.94, + "grad_norm": 1.1420729160308838, + "learning_rate": 3.194686833430289e-06, + "loss": 5.3446, + "step": 9656 + }, + { + "epoch": 0.94, + "grad_norm": 1.1252871751785278, + "learning_rate": 3.17529571456273e-06, + "loss": 5.2488, + "step": 9660 + }, + { + "epoch": 0.94, + "grad_norm": 1.0620160102844238, + "learning_rate": 3.1559045956951714e-06, + "loss": 5.3523, + "step": 9664 + }, + { + "epoch": 0.94, + "grad_norm": 1.14591646194458, + "learning_rate": 3.1365134768276127e-06, + "loss": 5.277, + "step": 9668 + }, + { + "epoch": 0.94, + "grad_norm": 1.0520758628845215, + "learning_rate": 3.1171223579600545e-06, + "loss": 5.2889, + "step": 9672 + }, + { + "epoch": 0.94, + "grad_norm": 1.0912026166915894, + "learning_rate": 3.0977312390924955e-06, + "loss": 5.3545, + "step": 9676 + }, + { + "epoch": 0.94, + "grad_norm": 1.0775065422058105, + "learning_rate": 3.078340120224937e-06, + "loss": 5.3119, + "step": 9680 + }, + { + "epoch": 0.94, + "grad_norm": 1.0335484743118286, + "learning_rate": 3.0589490013573787e-06, + "loss": 5.3732, + "step": 9684 + }, + { + "epoch": 0.94, + "grad_norm": 1.0346179008483887, + "learning_rate": 3.03955788248982e-06, + "loss": 5.2468, + "step": 9688 + }, + { + "epoch": 0.94, + "grad_norm": 1.091991662979126, + "learning_rate": 3.020166763622261e-06, + "loss": 5.3091, + "step": 9692 + }, + { + "epoch": 0.94, + "grad_norm": 1.07473886013031, + "learning_rate": 3.0007756447547024e-06, + "loss": 5.2884, + "step": 9696 + }, + { + "epoch": 0.94, + "grad_norm": 1.125009298324585, + "learning_rate": 2.981384525887144e-06, + "loss": 5.1994, + "step": 9700 + }, + { + "epoch": 0.94, + "grad_norm": 1.2514116764068604, + "learning_rate": 2.961993407019585e-06, + "loss": 5.3651, + "step": 9704 + }, + { + "epoch": 0.94, + "grad_norm": 1.1177825927734375, + "learning_rate": 2.9426022881520266e-06, + "loss": 5.3326, + "step": 9708 + }, + { + "epoch": 0.94, + "grad_norm": 0.9657092690467834, + "learning_rate": 2.9232111692844675e-06, + "loss": 5.2113, + "step": 9712 + }, + { + "epoch": 0.94, + "grad_norm": 1.1035900115966797, + "learning_rate": 2.9038200504169093e-06, + "loss": 5.3596, + "step": 9716 + }, + { + "epoch": 0.94, + "grad_norm": 1.0969198942184448, + "learning_rate": 2.8844289315493507e-06, + "loss": 5.2202, + "step": 9720 + }, + { + "epoch": 0.94, + "grad_norm": 1.0725688934326172, + "learning_rate": 2.865037812681792e-06, + "loss": 5.3028, + "step": 9724 + }, + { + "epoch": 0.94, + "grad_norm": 1.0617296695709229, + "learning_rate": 2.845646693814233e-06, + "loss": 5.2939, + "step": 9728 + }, + { + "epoch": 0.94, + "grad_norm": 1.0573393106460571, + "learning_rate": 2.8262555749466745e-06, + "loss": 5.2684, + "step": 9732 + }, + { + "epoch": 0.94, + "grad_norm": 1.0836176872253418, + "learning_rate": 2.806864456079116e-06, + "loss": 5.2659, + "step": 9736 + }, + { + "epoch": 0.94, + "grad_norm": 1.157784342765808, + "learning_rate": 2.7874733372115572e-06, + "loss": 5.326, + "step": 9740 + }, + { + "epoch": 0.94, + "grad_norm": 1.1517001390457153, + "learning_rate": 2.7680822183439986e-06, + "loss": 5.3198, + "step": 9744 + }, + { + "epoch": 0.95, + "grad_norm": 1.0895476341247559, + "learning_rate": 2.74869109947644e-06, + "loss": 5.2795, + "step": 9748 + }, + { + "epoch": 0.95, + "grad_norm": 1.0055677890777588, + "learning_rate": 2.7292999806088814e-06, + "loss": 5.2201, + "step": 9752 + }, + { + "epoch": 0.95, + "grad_norm": 1.0537750720977783, + "learning_rate": 2.7099088617413228e-06, + "loss": 5.2557, + "step": 9756 + }, + { + "epoch": 0.95, + "grad_norm": 1.1611661911010742, + "learning_rate": 2.690517742873764e-06, + "loss": 5.2528, + "step": 9760 + }, + { + "epoch": 0.95, + "grad_norm": 1.0563558340072632, + "learning_rate": 2.671126624006205e-06, + "loss": 5.3437, + "step": 9764 + }, + { + "epoch": 0.95, + "grad_norm": 1.08133065700531, + "learning_rate": 2.6517355051386465e-06, + "loss": 5.278, + "step": 9768 + }, + { + "epoch": 0.95, + "grad_norm": 1.0893269777297974, + "learning_rate": 2.632344386271088e-06, + "loss": 5.4056, + "step": 9772 + }, + { + "epoch": 0.95, + "grad_norm": 1.0335135459899902, + "learning_rate": 2.6129532674035292e-06, + "loss": 5.2513, + "step": 9776 + }, + { + "epoch": 0.95, + "grad_norm": 1.0350239276885986, + "learning_rate": 2.5935621485359706e-06, + "loss": 5.2965, + "step": 9780 + }, + { + "epoch": 0.95, + "grad_norm": 1.1059588193893433, + "learning_rate": 2.574171029668412e-06, + "loss": 5.291, + "step": 9784 + }, + { + "epoch": 0.95, + "grad_norm": 1.109610915184021, + "learning_rate": 2.5547799108008534e-06, + "loss": 5.3289, + "step": 9788 + }, + { + "epoch": 0.95, + "grad_norm": 1.0264822244644165, + "learning_rate": 2.5353887919332948e-06, + "loss": 5.3212, + "step": 9792 + }, + { + "epoch": 0.95, + "grad_norm": 1.0764135122299194, + "learning_rate": 2.5159976730657357e-06, + "loss": 5.4082, + "step": 9796 + }, + { + "epoch": 0.95, + "grad_norm": 1.0444653034210205, + "learning_rate": 2.496606554198177e-06, + "loss": 5.2801, + "step": 9800 + }, + { + "epoch": 0.95, + "grad_norm": 1.23245108127594, + "learning_rate": 2.477215435330619e-06, + "loss": 5.4502, + "step": 9804 + }, + { + "epoch": 0.95, + "grad_norm": 1.0822020769119263, + "learning_rate": 2.4578243164630603e-06, + "loss": 5.2585, + "step": 9808 + }, + { + "epoch": 0.95, + "grad_norm": 1.0727524757385254, + "learning_rate": 2.4384331975955013e-06, + "loss": 5.3905, + "step": 9812 + }, + { + "epoch": 0.95, + "grad_norm": 1.1105926036834717, + "learning_rate": 2.4190420787279427e-06, + "loss": 5.3339, + "step": 9816 + }, + { + "epoch": 0.95, + "grad_norm": 1.0885437726974487, + "learning_rate": 2.399650959860384e-06, + "loss": 5.277, + "step": 9820 + }, + { + "epoch": 0.95, + "grad_norm": 1.0500329732894897, + "learning_rate": 2.3802598409928254e-06, + "loss": 5.3009, + "step": 9824 + }, + { + "epoch": 0.95, + "grad_norm": 1.0552301406860352, + "learning_rate": 2.360868722125267e-06, + "loss": 5.3666, + "step": 9828 + }, + { + "epoch": 0.95, + "grad_norm": 1.1031043529510498, + "learning_rate": 2.3414776032577078e-06, + "loss": 5.3025, + "step": 9832 + }, + { + "epoch": 0.95, + "grad_norm": 1.0527067184448242, + "learning_rate": 2.3220864843901496e-06, + "loss": 5.2945, + "step": 9836 + }, + { + "epoch": 0.95, + "grad_norm": 1.0818541049957275, + "learning_rate": 2.302695365522591e-06, + "loss": 5.3044, + "step": 9840 + }, + { + "epoch": 0.95, + "grad_norm": 1.071460485458374, + "learning_rate": 2.2833042466550324e-06, + "loss": 5.3087, + "step": 9844 + }, + { + "epoch": 0.95, + "grad_norm": 1.1150330305099487, + "learning_rate": 2.2639131277874733e-06, + "loss": 5.2367, + "step": 9848 + }, + { + "epoch": 0.96, + "grad_norm": 1.1372606754302979, + "learning_rate": 2.2445220089199147e-06, + "loss": 5.3709, + "step": 9852 + }, + { + "epoch": 0.96, + "grad_norm": 1.1396896839141846, + "learning_rate": 2.225130890052356e-06, + "loss": 5.2739, + "step": 9856 + }, + { + "epoch": 0.96, + "grad_norm": 1.0782867670059204, + "learning_rate": 2.2057397711847975e-06, + "loss": 5.3314, + "step": 9860 + }, + { + "epoch": 0.96, + "grad_norm": 1.0799883604049683, + "learning_rate": 2.186348652317239e-06, + "loss": 5.2398, + "step": 9864 + }, + { + "epoch": 0.96, + "grad_norm": 1.063509464263916, + "learning_rate": 2.1669575334496802e-06, + "loss": 5.3209, + "step": 9868 + }, + { + "epoch": 0.96, + "grad_norm": 1.074567437171936, + "learning_rate": 2.1475664145821216e-06, + "loss": 5.3116, + "step": 9872 + }, + { + "epoch": 0.96, + "grad_norm": 1.0963295698165894, + "learning_rate": 2.128175295714563e-06, + "loss": 5.2356, + "step": 9876 + }, + { + "epoch": 0.96, + "grad_norm": 1.0509377717971802, + "learning_rate": 2.108784176847004e-06, + "loss": 5.3888, + "step": 9880 + }, + { + "epoch": 0.96, + "grad_norm": 1.0329670906066895, + "learning_rate": 2.0893930579794453e-06, + "loss": 5.2065, + "step": 9884 + }, + { + "epoch": 0.96, + "grad_norm": 1.0395572185516357, + "learning_rate": 2.0700019391118867e-06, + "loss": 5.2347, + "step": 9888 + }, + { + "epoch": 0.96, + "grad_norm": 1.0882209539413452, + "learning_rate": 2.050610820244328e-06, + "loss": 5.3839, + "step": 9892 + }, + { + "epoch": 0.96, + "grad_norm": 1.0747255086898804, + "learning_rate": 2.0312197013767695e-06, + "loss": 5.3768, + "step": 9896 + }, + { + "epoch": 0.96, + "grad_norm": 1.0426535606384277, + "learning_rate": 2.011828582509211e-06, + "loss": 5.2553, + "step": 9900 + }, + { + "epoch": 0.96, + "grad_norm": 0.9872206449508667, + "learning_rate": 1.9924374636416523e-06, + "loss": 5.246, + "step": 9904 + }, + { + "epoch": 0.96, + "grad_norm": 1.0624208450317383, + "learning_rate": 1.9730463447740936e-06, + "loss": 5.2859, + "step": 9908 + }, + { + "epoch": 0.96, + "grad_norm": 1.0884760618209839, + "learning_rate": 1.953655225906535e-06, + "loss": 5.3412, + "step": 9912 + }, + { + "epoch": 0.96, + "grad_norm": 1.0986016988754272, + "learning_rate": 1.934264107038976e-06, + "loss": 5.2388, + "step": 9916 + }, + { + "epoch": 0.96, + "grad_norm": 1.0865901708602905, + "learning_rate": 1.9148729881714174e-06, + "loss": 5.2769, + "step": 9920 + }, + { + "epoch": 0.96, + "grad_norm": 1.0592422485351562, + "learning_rate": 1.8954818693038588e-06, + "loss": 5.3446, + "step": 9924 + }, + { + "epoch": 0.96, + "grad_norm": 1.0826843976974487, + "learning_rate": 1.8760907504363004e-06, + "loss": 5.254, + "step": 9928 + }, + { + "epoch": 0.96, + "grad_norm": 1.1321392059326172, + "learning_rate": 1.8566996315687417e-06, + "loss": 5.266, + "step": 9932 + }, + { + "epoch": 0.96, + "grad_norm": 1.0519055128097534, + "learning_rate": 1.837308512701183e-06, + "loss": 5.3199, + "step": 9936 + }, + { + "epoch": 0.96, + "grad_norm": 1.0540200471878052, + "learning_rate": 1.8179173938336243e-06, + "loss": 5.3855, + "step": 9940 + }, + { + "epoch": 0.96, + "grad_norm": 1.0404717922210693, + "learning_rate": 1.7985262749660657e-06, + "loss": 5.3105, + "step": 9944 + }, + { + "epoch": 0.96, + "grad_norm": 1.076065182685852, + "learning_rate": 1.7791351560985068e-06, + "loss": 5.3344, + "step": 9948 + }, + { + "epoch": 0.96, + "grad_norm": 1.0112926959991455, + "learning_rate": 1.7597440372309482e-06, + "loss": 5.4095, + "step": 9952 + }, + { + "epoch": 0.97, + "grad_norm": 1.0115467309951782, + "learning_rate": 1.7403529183633898e-06, + "loss": 5.2626, + "step": 9956 + }, + { + "epoch": 0.97, + "grad_norm": 1.0050586462020874, + "learning_rate": 1.720961799495831e-06, + "loss": 5.346, + "step": 9960 + }, + { + "epoch": 0.97, + "grad_norm": 1.0998096466064453, + "learning_rate": 1.7015706806282724e-06, + "loss": 5.2961, + "step": 9964 + }, + { + "epoch": 0.97, + "grad_norm": 1.0492075681686401, + "learning_rate": 1.6821795617607138e-06, + "loss": 5.3103, + "step": 9968 + }, + { + "epoch": 0.97, + "grad_norm": 1.1068048477172852, + "learning_rate": 1.662788442893155e-06, + "loss": 5.3157, + "step": 9972 + }, + { + "epoch": 0.97, + "grad_norm": 1.0755717754364014, + "learning_rate": 1.6433973240255963e-06, + "loss": 5.3995, + "step": 9976 + }, + { + "epoch": 0.97, + "grad_norm": 1.0472952127456665, + "learning_rate": 1.6240062051580375e-06, + "loss": 5.2634, + "step": 9980 + }, + { + "epoch": 0.97, + "grad_norm": 1.0923290252685547, + "learning_rate": 1.6046150862904789e-06, + "loss": 5.267, + "step": 9984 + }, + { + "epoch": 0.97, + "grad_norm": 1.0469489097595215, + "learning_rate": 1.5852239674229205e-06, + "loss": 5.2445, + "step": 9988 + }, + { + "epoch": 0.97, + "grad_norm": 1.055059552192688, + "learning_rate": 1.5658328485553619e-06, + "loss": 5.2376, + "step": 9992 + }, + { + "epoch": 0.97, + "grad_norm": 0.9697688817977905, + "learning_rate": 1.546441729687803e-06, + "loss": 5.2468, + "step": 9996 + }, + { + "epoch": 0.97, + "grad_norm": 1.0974935293197632, + "learning_rate": 1.5270506108202444e-06, + "loss": 5.3911, + "step": 10000 + }, + { + "epoch": 0.97, + "grad_norm": 1.0364536046981812, + "learning_rate": 1.5076594919526856e-06, + "loss": 5.171, + "step": 10004 + }, + { + "epoch": 0.97, + "grad_norm": 1.0847764015197754, + "learning_rate": 1.4882683730851272e-06, + "loss": 5.3467, + "step": 10008 + }, + { + "epoch": 0.97, + "grad_norm": 1.0459425449371338, + "learning_rate": 1.4688772542175684e-06, + "loss": 5.2316, + "step": 10012 + }, + { + "epoch": 0.97, + "grad_norm": 1.084526777267456, + "learning_rate": 1.4494861353500097e-06, + "loss": 5.279, + "step": 10016 + }, + { + "epoch": 0.97, + "grad_norm": 1.141874074935913, + "learning_rate": 1.4300950164824511e-06, + "loss": 5.1842, + "step": 10020 + }, + { + "epoch": 0.97, + "grad_norm": 1.1359285116195679, + "learning_rate": 1.4107038976148925e-06, + "loss": 5.2476, + "step": 10024 + }, + { + "epoch": 0.97, + "grad_norm": 1.1360441446304321, + "learning_rate": 1.3913127787473339e-06, + "loss": 5.4255, + "step": 10028 + }, + { + "epoch": 0.97, + "grad_norm": 1.0821614265441895, + "learning_rate": 1.371921659879775e-06, + "loss": 5.2677, + "step": 10032 + }, + { + "epoch": 0.97, + "grad_norm": 1.108450174331665, + "learning_rate": 1.3525305410122164e-06, + "loss": 5.3634, + "step": 10036 + }, + { + "epoch": 0.97, + "grad_norm": 1.1564322710037231, + "learning_rate": 1.3331394221446578e-06, + "loss": 5.3358, + "step": 10040 + }, + { + "epoch": 0.97, + "grad_norm": 1.0499944686889648, + "learning_rate": 1.3137483032770992e-06, + "loss": 5.2706, + "step": 10044 + }, + { + "epoch": 0.97, + "grad_norm": 1.115499496459961, + "learning_rate": 1.2943571844095404e-06, + "loss": 5.2685, + "step": 10048 + }, + { + "epoch": 0.97, + "grad_norm": 1.0848426818847656, + "learning_rate": 1.274966065541982e-06, + "loss": 5.1596, + "step": 10052 + }, + { + "epoch": 0.97, + "grad_norm": 1.014477014541626, + "learning_rate": 1.2555749466744232e-06, + "loss": 5.3194, + "step": 10056 + }, + { + "epoch": 0.98, + "grad_norm": 1.0822529792785645, + "learning_rate": 1.2361838278068645e-06, + "loss": 5.2521, + "step": 10060 + }, + { + "epoch": 0.98, + "grad_norm": 1.097193717956543, + "learning_rate": 1.2167927089393057e-06, + "loss": 5.2256, + "step": 10064 + }, + { + "epoch": 0.98, + "grad_norm": 1.0768396854400635, + "learning_rate": 1.1974015900717473e-06, + "loss": 5.3245, + "step": 10068 + }, + { + "epoch": 0.98, + "grad_norm": 1.1191695928573608, + "learning_rate": 1.1780104712041885e-06, + "loss": 5.3045, + "step": 10072 + }, + { + "epoch": 0.98, + "grad_norm": 1.0832325220108032, + "learning_rate": 1.1586193523366299e-06, + "loss": 5.3106, + "step": 10076 + }, + { + "epoch": 0.98, + "grad_norm": 1.083602786064148, + "learning_rate": 1.1392282334690712e-06, + "loss": 5.3153, + "step": 10080 + }, + { + "epoch": 0.98, + "grad_norm": 1.1011203527450562, + "learning_rate": 1.1198371146015126e-06, + "loss": 5.3236, + "step": 10084 + }, + { + "epoch": 0.98, + "grad_norm": 1.1766139268875122, + "learning_rate": 1.100445995733954e-06, + "loss": 5.2973, + "step": 10088 + }, + { + "epoch": 0.98, + "grad_norm": 1.0857359170913696, + "learning_rate": 1.0810548768663952e-06, + "loss": 5.2986, + "step": 10092 + }, + { + "epoch": 0.98, + "grad_norm": 1.1640616655349731, + "learning_rate": 1.0616637579988366e-06, + "loss": 5.3083, + "step": 10096 + }, + { + "epoch": 0.98, + "grad_norm": 1.1388750076293945, + "learning_rate": 1.042272639131278e-06, + "loss": 5.3472, + "step": 10100 + }, + { + "epoch": 0.98, + "grad_norm": 1.1172057390213013, + "learning_rate": 1.0228815202637193e-06, + "loss": 5.2601, + "step": 10104 + }, + { + "epoch": 0.98, + "grad_norm": 1.078047275543213, + "learning_rate": 1.0034904013961605e-06, + "loss": 5.2567, + "step": 10108 + }, + { + "epoch": 0.98, + "grad_norm": 1.134878158569336, + "learning_rate": 9.840992825286019e-07, + "loss": 5.2999, + "step": 10112 + }, + { + "epoch": 0.98, + "grad_norm": 1.049080729484558, + "learning_rate": 9.647081636610433e-07, + "loss": 5.2823, + "step": 10116 + }, + { + "epoch": 0.98, + "grad_norm": 1.0992294549942017, + "learning_rate": 9.453170447934847e-07, + "loss": 5.264, + "step": 10120 + }, + { + "epoch": 0.98, + "grad_norm": 0.9882086515426636, + "learning_rate": 9.259259259259259e-07, + "loss": 5.2944, + "step": 10124 + }, + { + "epoch": 0.98, + "grad_norm": 1.0819331407546997, + "learning_rate": 9.065348070583674e-07, + "loss": 5.3227, + "step": 10128 + }, + { + "epoch": 0.98, + "grad_norm": 1.0847752094268799, + "learning_rate": 8.871436881908087e-07, + "loss": 5.375, + "step": 10132 + }, + { + "epoch": 0.98, + "grad_norm": 1.0530517101287842, + "learning_rate": 8.6775256932325e-07, + "loss": 5.2639, + "step": 10136 + }, + { + "epoch": 0.98, + "grad_norm": 1.010219931602478, + "learning_rate": 8.483614504556913e-07, + "loss": 5.2476, + "step": 10140 + }, + { + "epoch": 0.98, + "grad_norm": 0.9741299748420715, + "learning_rate": 8.289703315881327e-07, + "loss": 5.3081, + "step": 10144 + }, + { + "epoch": 0.98, + "grad_norm": 1.1004730463027954, + "learning_rate": 8.09579212720574e-07, + "loss": 5.2126, + "step": 10148 + }, + { + "epoch": 0.98, + "grad_norm": 1.1317744255065918, + "learning_rate": 7.901880938530153e-07, + "loss": 5.2943, + "step": 10152 + }, + { + "epoch": 0.98, + "grad_norm": 1.1052905321121216, + "learning_rate": 7.707969749854567e-07, + "loss": 5.4339, + "step": 10156 + }, + { + "epoch": 0.99, + "grad_norm": 1.0474963188171387, + "learning_rate": 7.51405856117898e-07, + "loss": 5.3417, + "step": 10160 + }, + { + "epoch": 0.99, + "grad_norm": 1.0810647010803223, + "learning_rate": 7.320147372503393e-07, + "loss": 5.4219, + "step": 10164 + }, + { + "epoch": 0.99, + "grad_norm": 1.0385664701461792, + "learning_rate": 7.126236183827807e-07, + "loss": 5.3207, + "step": 10168 + }, + { + "epoch": 0.99, + "grad_norm": 1.1225430965423584, + "learning_rate": 6.93232499515222e-07, + "loss": 5.3818, + "step": 10172 + }, + { + "epoch": 0.99, + "grad_norm": 1.021005630493164, + "learning_rate": 6.738413806476634e-07, + "loss": 5.3095, + "step": 10176 + }, + { + "epoch": 0.99, + "grad_norm": 1.0524823665618896, + "learning_rate": 6.544502617801048e-07, + "loss": 5.2837, + "step": 10180 + }, + { + "epoch": 0.99, + "grad_norm": 1.0468004941940308, + "learning_rate": 6.350591429125462e-07, + "loss": 5.2493, + "step": 10184 + }, + { + "epoch": 0.99, + "grad_norm": 1.0307637453079224, + "learning_rate": 6.156680240449874e-07, + "loss": 5.2559, + "step": 10188 + }, + { + "epoch": 0.99, + "grad_norm": 1.0332852602005005, + "learning_rate": 5.962769051774288e-07, + "loss": 5.4378, + "step": 10192 + }, + { + "epoch": 0.99, + "grad_norm": 1.0646268129348755, + "learning_rate": 5.768857863098701e-07, + "loss": 5.2816, + "step": 10196 + }, + { + "epoch": 0.99, + "grad_norm": 1.0905969142913818, + "learning_rate": 5.574946674423115e-07, + "loss": 5.363, + "step": 10200 + }, + { + "epoch": 0.99, + "grad_norm": 1.0439486503601074, + "learning_rate": 5.381035485747528e-07, + "loss": 5.3199, + "step": 10204 + }, + { + "epoch": 0.99, + "grad_norm": 1.0738022327423096, + "learning_rate": 5.187124297071941e-07, + "loss": 5.251, + "step": 10208 + }, + { + "epoch": 0.99, + "grad_norm": 1.030446171760559, + "learning_rate": 4.993213108396354e-07, + "loss": 5.3156, + "step": 10212 + }, + { + "epoch": 0.99, + "grad_norm": 1.042098045349121, + "learning_rate": 4.799301919720768e-07, + "loss": 5.2401, + "step": 10216 + }, + { + "epoch": 0.99, + "grad_norm": 0.9965024590492249, + "learning_rate": 4.6053907310451814e-07, + "loss": 5.2561, + "step": 10220 + }, + { + "epoch": 0.99, + "grad_norm": 1.0656026601791382, + "learning_rate": 4.411479542369595e-07, + "loss": 5.3587, + "step": 10224 + }, + { + "epoch": 0.99, + "grad_norm": 1.0533535480499268, + "learning_rate": 4.217568353694008e-07, + "loss": 5.3381, + "step": 10228 + }, + { + "epoch": 0.99, + "grad_norm": 1.0184639692306519, + "learning_rate": 4.023657165018422e-07, + "loss": 5.3548, + "step": 10232 + }, + { + "epoch": 0.99, + "grad_norm": 1.0845178365707397, + "learning_rate": 3.829745976342835e-07, + "loss": 5.3597, + "step": 10236 + }, + { + "epoch": 0.99, + "grad_norm": 1.068476915359497, + "learning_rate": 3.6358347876672485e-07, + "loss": 5.2314, + "step": 10240 + }, + { + "epoch": 0.99, + "grad_norm": 1.1127116680145264, + "learning_rate": 3.441923598991662e-07, + "loss": 5.2483, + "step": 10244 + }, + { + "epoch": 0.99, + "grad_norm": 0.9709210395812988, + "learning_rate": 3.248012410316075e-07, + "loss": 5.2665, + "step": 10248 + }, + { + "epoch": 0.99, + "grad_norm": 1.0695501565933228, + "learning_rate": 3.054101221640489e-07, + "loss": 5.2079, + "step": 10252 + }, + { + "epoch": 0.99, + "grad_norm": 1.1100637912750244, + "learning_rate": 2.860190032964902e-07, + "loss": 5.3078, + "step": 10256 + }, + { + "epoch": 0.99, + "grad_norm": 1.0800820589065552, + "learning_rate": 2.6662788442893156e-07, + "loss": 5.4309, + "step": 10260 + }, + { + "epoch": 1.0, + "grad_norm": 1.0710211992263794, + "learning_rate": 2.472367655613729e-07, + "loss": 5.3063, + "step": 10264 + }, + { + "epoch": 1.0, + "grad_norm": 1.0967445373535156, + "learning_rate": 2.2784564669381422e-07, + "loss": 5.2245, + "step": 10268 + }, + { + "epoch": 1.0, + "grad_norm": 1.05315101146698, + "learning_rate": 2.084545278262556e-07, + "loss": 5.3429, + "step": 10272 + }, + { + "epoch": 1.0, + "grad_norm": 0.97821044921875, + "learning_rate": 1.8906340895869693e-07, + "loss": 5.2508, + "step": 10276 + }, + { + "epoch": 1.0, + "grad_norm": 1.0844597816467285, + "learning_rate": 1.6967229009113826e-07, + "loss": 5.2811, + "step": 10280 + }, + { + "epoch": 1.0, + "grad_norm": 1.0489965677261353, + "learning_rate": 1.5028117122357962e-07, + "loss": 5.3193, + "step": 10284 + }, + { + "epoch": 1.0, + "grad_norm": 1.0502104759216309, + "learning_rate": 1.3089005235602095e-07, + "loss": 5.3323, + "step": 10288 + }, + { + "epoch": 1.0, + "grad_norm": 1.1087231636047363, + "learning_rate": 1.114989334884623e-07, + "loss": 5.2669, + "step": 10292 + }, + { + "epoch": 1.0, + "grad_norm": 1.0919489860534668, + "learning_rate": 9.210781462090364e-08, + "loss": 5.2765, + "step": 10296 + }, + { + "epoch": 1.0, + "grad_norm": 1.017907738685608, + "learning_rate": 7.271669575334497e-08, + "loss": 5.3242, + "step": 10300 + }, + { + "epoch": 1.0, + "grad_norm": 1.1086193323135376, + "learning_rate": 5.332557688578631e-08, + "loss": 5.2659, + "step": 10304 + }, + { + "epoch": 1.0, + "grad_norm": 1.0139724016189575, + "learning_rate": 3.393445801822765e-08, + "loss": 5.3324, + "step": 10308 + }, + { + "epoch": 1.0, + "grad_norm": 0.9419534802436829, + "learning_rate": 1.4543339150668994e-08, + "loss": 5.3066, + "step": 10312 + }, + { + "epoch": 1.0, + "step": 10314, + "total_flos": 8.68702227309527e+16, + "train_loss": 5.446075817446203, + "train_runtime": 2459.6002, + "train_samples_per_second": 67.094, + "train_steps_per_second": 4.193 + } + ], + "logging_steps": 4, + "max_steps": 10314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1032, + "total_flos": 8.68702227309527e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}