diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16026 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013157894736842105, + "grad_norm": 0.9943248451862411, + "learning_rate": 8.771929824561404e-07, + "loss": 1.3563, + "step": 1 + }, + { + "epoch": 0.002631578947368421, + "grad_norm": 1.0356516381932348, + "learning_rate": 1.7543859649122807e-06, + "loss": 1.3798, + "step": 2 + }, + { + "epoch": 0.003947368421052632, + "grad_norm": 0.9909513605669136, + "learning_rate": 2.631578947368421e-06, + "loss": 1.3634, + "step": 3 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 1.008401142445123, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.39, + "step": 4 + }, + { + "epoch": 0.006578947368421052, + "grad_norm": 0.9851935697509421, + "learning_rate": 4.3859649122807014e-06, + "loss": 1.366, + "step": 5 + }, + { + "epoch": 0.007894736842105263, + "grad_norm": 0.9899431502727841, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3624, + "step": 6 + }, + { + "epoch": 0.009210526315789473, + "grad_norm": 0.9636771026547807, + "learning_rate": 6.140350877192982e-06, + "loss": 1.3356, + "step": 7 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 0.9342738734933147, + "learning_rate": 7.017543859649123e-06, + "loss": 1.3189, + "step": 8 + }, + { + "epoch": 0.011842105263157895, + "grad_norm": 0.9962701021709995, + "learning_rate": 7.894736842105263e-06, + "loss": 1.3505, + "step": 9 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 0.942842160351429, + "learning_rate": 8.771929824561403e-06, + "loss": 1.3352, + "step": 10 + }, + { + "epoch": 0.014473684210526316, + "grad_norm": 0.926592433083301, + "learning_rate": 9.649122807017545e-06, + "loss": 1.3057, + "step": 11 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 0.8649365529618144, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2733, + "step": 12 + }, + { + "epoch": 0.017105263157894738, + "grad_norm": 0.8589185686142112, + "learning_rate": 1.1403508771929824e-05, + "loss": 1.2501, + "step": 13 + }, + { + "epoch": 0.018421052631578946, + "grad_norm": 0.7827884394796245, + "learning_rate": 1.2280701754385964e-05, + "loss": 1.2434, + "step": 14 + }, + { + "epoch": 0.019736842105263157, + "grad_norm": 0.7242909203107426, + "learning_rate": 1.3157894736842106e-05, + "loss": 1.2158, + "step": 15 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 0.6258453259636476, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.174, + "step": 16 + }, + { + "epoch": 0.02236842105263158, + "grad_norm": 0.581184200984528, + "learning_rate": 1.4912280701754386e-05, + "loss": 1.1096, + "step": 17 + }, + { + "epoch": 0.02368421052631579, + "grad_norm": 0.5605953225429362, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.1116, + "step": 18 + }, + { + "epoch": 0.025, + "grad_norm": 0.5408018123073384, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.0664, + "step": 19 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 0.5521377988070432, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.0341, + "step": 20 + }, + { + "epoch": 0.02763157894736842, + "grad_norm": 0.5881737820139201, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.994, + "step": 21 + }, + { + "epoch": 0.02894736842105263, + "grad_norm": 0.635129231733668, + "learning_rate": 1.929824561403509e-05, + "loss": 0.9912, + "step": 22 + }, + { + "epoch": 0.030263157894736843, + "grad_norm": 0.6075255860392289, + "learning_rate": 2.0175438596491227e-05, + "loss": 0.9355, + "step": 23 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 0.6242744152482619, + "learning_rate": 2.105263157894737e-05, + "loss": 0.8811, + "step": 24 + }, + { + "epoch": 0.03289473684210526, + "grad_norm": 0.5870365386284072, + "learning_rate": 2.1929824561403507e-05, + "loss": 0.8533, + "step": 25 + }, + { + "epoch": 0.034210526315789476, + "grad_norm": 0.59700551817252, + "learning_rate": 2.280701754385965e-05, + "loss": 0.8276, + "step": 26 + }, + { + "epoch": 0.035526315789473684, + "grad_norm": 0.5437014014509646, + "learning_rate": 2.368421052631579e-05, + "loss": 0.8294, + "step": 27 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 0.5524675830958683, + "learning_rate": 2.456140350877193e-05, + "loss": 0.7643, + "step": 28 + }, + { + "epoch": 0.038157894736842106, + "grad_norm": 0.5314274045755922, + "learning_rate": 2.5438596491228074e-05, + "loss": 0.7262, + "step": 29 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 0.47225342909039525, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.6754, + "step": 30 + }, + { + "epoch": 0.04078947368421053, + "grad_norm": 0.44700447040118774, + "learning_rate": 2.7192982456140354e-05, + "loss": 0.6517, + "step": 31 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 0.4497880712541435, + "learning_rate": 2.8070175438596492e-05, + "loss": 0.5831, + "step": 32 + }, + { + "epoch": 0.04342105263157895, + "grad_norm": 0.3556582069847081, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.6016, + "step": 33 + }, + { + "epoch": 0.04473684210526316, + "grad_norm": 0.26076585757651716, + "learning_rate": 2.9824561403508772e-05, + "loss": 0.578, + "step": 34 + }, + { + "epoch": 0.046052631578947366, + "grad_norm": 0.2673773341905264, + "learning_rate": 3.0701754385964913e-05, + "loss": 0.5595, + "step": 35 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 0.2456506716030528, + "learning_rate": 3.157894736842105e-05, + "loss": 0.5533, + "step": 36 + }, + { + "epoch": 0.04868421052631579, + "grad_norm": 0.22542446425364246, + "learning_rate": 3.24561403508772e-05, + "loss": 0.5515, + "step": 37 + }, + { + "epoch": 0.05, + "grad_norm": 0.22177522330874816, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5391, + "step": 38 + }, + { + "epoch": 0.05131578947368421, + "grad_norm": 0.2205640272585472, + "learning_rate": 3.421052631578947e-05, + "loss": 0.5154, + "step": 39 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.1985886053937116, + "learning_rate": 3.508771929824561e-05, + "loss": 0.4966, + "step": 40 + }, + { + "epoch": 0.05394736842105263, + "grad_norm": 0.17484163584959408, + "learning_rate": 3.5964912280701756e-05, + "loss": 0.5112, + "step": 41 + }, + { + "epoch": 0.05526315789473684, + "grad_norm": 0.2050282059933558, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.5032, + "step": 42 + }, + { + "epoch": 0.056578947368421055, + "grad_norm": 0.21199590152167688, + "learning_rate": 3.771929824561404e-05, + "loss": 0.4952, + "step": 43 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 0.17849681012135296, + "learning_rate": 3.859649122807018e-05, + "loss": 0.4918, + "step": 44 + }, + { + "epoch": 0.05921052631578947, + "grad_norm": 0.2019786429643155, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.4886, + "step": 45 + }, + { + "epoch": 0.060526315789473685, + "grad_norm": 0.20225571639981366, + "learning_rate": 4.0350877192982455e-05, + "loss": 0.4926, + "step": 46 + }, + { + "epoch": 0.06184210526315789, + "grad_norm": 0.1777962019167991, + "learning_rate": 4.12280701754386e-05, + "loss": 0.47, + "step": 47 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 0.15246060517847743, + "learning_rate": 4.210526315789474e-05, + "loss": 0.4767, + "step": 48 + }, + { + "epoch": 0.06447368421052632, + "grad_norm": 0.14898464152134963, + "learning_rate": 4.298245614035088e-05, + "loss": 0.4713, + "step": 49 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 0.13773290443803046, + "learning_rate": 4.3859649122807014e-05, + "loss": 0.4743, + "step": 50 + }, + { + "epoch": 0.06710526315789474, + "grad_norm": 0.1408929769423494, + "learning_rate": 4.473684210526316e-05, + "loss": 0.4501, + "step": 51 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 0.1368756686186497, + "learning_rate": 4.56140350877193e-05, + "loss": 0.4526, + "step": 52 + }, + { + "epoch": 0.06973684210526315, + "grad_norm": 0.13392249182682364, + "learning_rate": 4.649122807017544e-05, + "loss": 0.4347, + "step": 53 + }, + { + "epoch": 0.07105263157894737, + "grad_norm": 0.13153597559965788, + "learning_rate": 4.736842105263158e-05, + "loss": 0.4631, + "step": 54 + }, + { + "epoch": 0.07236842105263158, + "grad_norm": 0.1345131441878225, + "learning_rate": 4.824561403508772e-05, + "loss": 0.4495, + "step": 55 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 0.13825684252520345, + "learning_rate": 4.912280701754386e-05, + "loss": 0.4393, + "step": 56 + }, + { + "epoch": 0.075, + "grad_norm": 0.11938240156936133, + "learning_rate": 5e-05, + "loss": 0.4216, + "step": 57 + }, + { + "epoch": 0.07631578947368421, + "grad_norm": 0.12724516953784049, + "learning_rate": 5.087719298245615e-05, + "loss": 0.4387, + "step": 58 + }, + { + "epoch": 0.07763157894736843, + "grad_norm": 0.1202517377574683, + "learning_rate": 5.1754385964912286e-05, + "loss": 0.4375, + "step": 59 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 0.11502282485893392, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.43, + "step": 60 + }, + { + "epoch": 0.08026315789473684, + "grad_norm": 0.11833243534328311, + "learning_rate": 5.350877192982456e-05, + "loss": 0.4502, + "step": 61 + }, + { + "epoch": 0.08157894736842106, + "grad_norm": 0.1162576327218189, + "learning_rate": 5.438596491228071e-05, + "loss": 0.4122, + "step": 62 + }, + { + "epoch": 0.08289473684210526, + "grad_norm": 0.11646926324993098, + "learning_rate": 5.526315789473685e-05, + "loss": 0.4281, + "step": 63 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.10805987411499896, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.4222, + "step": 64 + }, + { + "epoch": 0.08552631578947369, + "grad_norm": 0.11176891376288024, + "learning_rate": 5.701754385964912e-05, + "loss": 0.416, + "step": 65 + }, + { + "epoch": 0.0868421052631579, + "grad_norm": 0.12524757529866518, + "learning_rate": 5.789473684210527e-05, + "loss": 0.4047, + "step": 66 + }, + { + "epoch": 0.0881578947368421, + "grad_norm": 0.11006405173241035, + "learning_rate": 5.877192982456141e-05, + "loss": 0.4266, + "step": 67 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 0.12250978018085709, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.4265, + "step": 68 + }, + { + "epoch": 0.09078947368421053, + "grad_norm": 0.11723491918915026, + "learning_rate": 6.052631578947369e-05, + "loss": 0.4173, + "step": 69 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 0.11327108284284121, + "learning_rate": 6.140350877192983e-05, + "loss": 0.428, + "step": 70 + }, + { + "epoch": 0.09342105263157895, + "grad_norm": 0.11624261730248432, + "learning_rate": 6.228070175438597e-05, + "loss": 0.4181, + "step": 71 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 0.1097767429764226, + "learning_rate": 6.31578947368421e-05, + "loss": 0.4172, + "step": 72 + }, + { + "epoch": 0.09605263157894736, + "grad_norm": 0.10948304839484718, + "learning_rate": 6.403508771929825e-05, + "loss": 0.4114, + "step": 73 + }, + { + "epoch": 0.09736842105263158, + "grad_norm": 0.11381746923579758, + "learning_rate": 6.49122807017544e-05, + "loss": 0.4138, + "step": 74 + }, + { + "epoch": 0.09868421052631579, + "grad_norm": 0.11879309263584711, + "learning_rate": 6.578947368421054e-05, + "loss": 0.4234, + "step": 75 + }, + { + "epoch": 0.1, + "grad_norm": 0.11504992983557438, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4069, + "step": 76 + }, + { + "epoch": 0.1013157894736842, + "grad_norm": 0.1289568391364873, + "learning_rate": 6.754385964912281e-05, + "loss": 0.4172, + "step": 77 + }, + { + "epoch": 0.10263157894736842, + "grad_norm": 0.1027991596519212, + "learning_rate": 6.842105263157895e-05, + "loss": 0.4064, + "step": 78 + }, + { + "epoch": 0.10394736842105264, + "grad_norm": 0.1703158129160332, + "learning_rate": 6.929824561403509e-05, + "loss": 0.4146, + "step": 79 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.10978980803904749, + "learning_rate": 7.017543859649122e-05, + "loss": 0.3953, + "step": 80 + }, + { + "epoch": 0.10657894736842105, + "grad_norm": 0.11715465011733911, + "learning_rate": 7.105263157894737e-05, + "loss": 0.4195, + "step": 81 + }, + { + "epoch": 0.10789473684210527, + "grad_norm": 0.12584313901221594, + "learning_rate": 7.192982456140351e-05, + "loss": 0.4095, + "step": 82 + }, + { + "epoch": 0.10921052631578948, + "grad_norm": 0.11940093441042002, + "learning_rate": 7.280701754385966e-05, + "loss": 0.3918, + "step": 83 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 0.12190899565903182, + "learning_rate": 7.368421052631579e-05, + "loss": 0.3912, + "step": 84 + }, + { + "epoch": 0.1118421052631579, + "grad_norm": 0.13982192975495122, + "learning_rate": 7.456140350877193e-05, + "loss": 0.3741, + "step": 85 + }, + { + "epoch": 0.11315789473684211, + "grad_norm": 0.11868632446195661, + "learning_rate": 7.543859649122808e-05, + "loss": 0.4007, + "step": 86 + }, + { + "epoch": 0.11447368421052631, + "grad_norm": 0.12861533906094824, + "learning_rate": 7.631578947368422e-05, + "loss": 0.4054, + "step": 87 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 0.11712395700046184, + "learning_rate": 7.719298245614036e-05, + "loss": 0.3942, + "step": 88 + }, + { + "epoch": 0.11710526315789474, + "grad_norm": 0.11911543233284826, + "learning_rate": 7.807017543859649e-05, + "loss": 0.3992, + "step": 89 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 0.12597614512349814, + "learning_rate": 7.894736842105263e-05, + "loss": 0.3868, + "step": 90 + }, + { + "epoch": 0.11973684210526316, + "grad_norm": 0.11923027766958509, + "learning_rate": 7.982456140350878e-05, + "loss": 0.3994, + "step": 91 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 0.11704509885009143, + "learning_rate": 8.070175438596491e-05, + "loss": 0.3813, + "step": 92 + }, + { + "epoch": 0.12236842105263158, + "grad_norm": 0.1415178300748815, + "learning_rate": 8.157894736842105e-05, + "loss": 0.3995, + "step": 93 + }, + { + "epoch": 0.12368421052631579, + "grad_norm": 0.12356686708530311, + "learning_rate": 8.24561403508772e-05, + "loss": 0.385, + "step": 94 + }, + { + "epoch": 0.125, + "grad_norm": 0.12008298974430553, + "learning_rate": 8.333333333333334e-05, + "loss": 0.3849, + "step": 95 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 0.11678986714389192, + "learning_rate": 8.421052631578948e-05, + "loss": 0.3763, + "step": 96 + }, + { + "epoch": 0.12763157894736843, + "grad_norm": 0.130987416202565, + "learning_rate": 8.508771929824562e-05, + "loss": 0.3795, + "step": 97 + }, + { + "epoch": 0.12894736842105264, + "grad_norm": 0.13585638455268312, + "learning_rate": 8.596491228070177e-05, + "loss": 0.3714, + "step": 98 + }, + { + "epoch": 0.13026315789473683, + "grad_norm": 0.1343330070878276, + "learning_rate": 8.68421052631579e-05, + "loss": 0.3686, + "step": 99 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.1344794137365553, + "learning_rate": 8.771929824561403e-05, + "loss": 0.3963, + "step": 100 + }, + { + "epoch": 0.13289473684210526, + "grad_norm": 0.13882732763835956, + "learning_rate": 8.859649122807017e-05, + "loss": 0.3759, + "step": 101 + }, + { + "epoch": 0.13421052631578947, + "grad_norm": 0.13720598333651746, + "learning_rate": 8.947368421052632e-05, + "loss": 0.3821, + "step": 102 + }, + { + "epoch": 0.1355263157894737, + "grad_norm": 0.14777326263342197, + "learning_rate": 9.035087719298246e-05, + "loss": 0.3667, + "step": 103 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 0.13337880990510198, + "learning_rate": 9.12280701754386e-05, + "loss": 0.3978, + "step": 104 + }, + { + "epoch": 0.13815789473684212, + "grad_norm": 0.14435581876340933, + "learning_rate": 9.210526315789474e-05, + "loss": 0.3751, + "step": 105 + }, + { + "epoch": 0.1394736842105263, + "grad_norm": 0.1307662742116557, + "learning_rate": 9.298245614035089e-05, + "loss": 0.3898, + "step": 106 + }, + { + "epoch": 0.14078947368421052, + "grad_norm": 0.13020669111992744, + "learning_rate": 9.385964912280703e-05, + "loss": 0.3904, + "step": 107 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 0.14245838078165218, + "learning_rate": 9.473684210526316e-05, + "loss": 0.3814, + "step": 108 + }, + { + "epoch": 0.14342105263157895, + "grad_norm": 0.13114307459864877, + "learning_rate": 9.56140350877193e-05, + "loss": 0.374, + "step": 109 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 0.12257270456238782, + "learning_rate": 9.649122807017544e-05, + "loss": 0.3835, + "step": 110 + }, + { + "epoch": 0.14605263157894738, + "grad_norm": 0.1377457018530249, + "learning_rate": 9.736842105263158e-05, + "loss": 0.3772, + "step": 111 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.13906509029436523, + "learning_rate": 9.824561403508771e-05, + "loss": 0.3777, + "step": 112 + }, + { + "epoch": 0.14868421052631578, + "grad_norm": 0.13702341765645296, + "learning_rate": 9.912280701754386e-05, + "loss": 0.3778, + "step": 113 + }, + { + "epoch": 0.15, + "grad_norm": 0.131545294973937, + "learning_rate": 0.0001, + "loss": 0.3676, + "step": 114 + }, + { + "epoch": 0.1513157894736842, + "grad_norm": 0.144839378587314, + "learning_rate": 0.00010087719298245615, + "loss": 0.3656, + "step": 115 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 0.13801404126106712, + "learning_rate": 0.0001017543859649123, + "loss": 0.3698, + "step": 116 + }, + { + "epoch": 0.15394736842105264, + "grad_norm": 0.14250664033740984, + "learning_rate": 0.00010263157894736844, + "loss": 0.3556, + "step": 117 + }, + { + "epoch": 0.15526315789473685, + "grad_norm": 0.14495828716442766, + "learning_rate": 0.00010350877192982457, + "loss": 0.3874, + "step": 118 + }, + { + "epoch": 0.15657894736842104, + "grad_norm": 0.1458656065808899, + "learning_rate": 0.0001043859649122807, + "loss": 0.3707, + "step": 119 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.15169186829219217, + "learning_rate": 0.00010526315789473685, + "loss": 0.3807, + "step": 120 + }, + { + "epoch": 0.15921052631578947, + "grad_norm": 0.1384837433981043, + "learning_rate": 0.00010614035087719298, + "loss": 0.3606, + "step": 121 + }, + { + "epoch": 0.16052631578947368, + "grad_norm": 0.1397696554751311, + "learning_rate": 0.00010701754385964912, + "loss": 0.3703, + "step": 122 + }, + { + "epoch": 0.1618421052631579, + "grad_norm": 0.14890695623711342, + "learning_rate": 0.00010789473684210527, + "loss": 0.3613, + "step": 123 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 0.15703056192380713, + "learning_rate": 0.00010877192982456141, + "loss": 0.3504, + "step": 124 + }, + { + "epoch": 0.16447368421052633, + "grad_norm": 0.1392335253666652, + "learning_rate": 0.00010964912280701756, + "loss": 0.3708, + "step": 125 + }, + { + "epoch": 0.16578947368421051, + "grad_norm": 0.14687002166712368, + "learning_rate": 0.0001105263157894737, + "loss": 0.3698, + "step": 126 + }, + { + "epoch": 0.16710526315789473, + "grad_norm": 0.15870994148978584, + "learning_rate": 0.00011140350877192982, + "loss": 0.3779, + "step": 127 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.14754629529682614, + "learning_rate": 0.00011228070175438597, + "loss": 0.3623, + "step": 128 + }, + { + "epoch": 0.16973684210526316, + "grad_norm": 0.15124044879001075, + "learning_rate": 0.00011315789473684211, + "loss": 0.3608, + "step": 129 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 0.14959917991128074, + "learning_rate": 0.00011403508771929824, + "loss": 0.3642, + "step": 130 + }, + { + "epoch": 0.1723684210526316, + "grad_norm": 0.1571767924920792, + "learning_rate": 0.00011491228070175439, + "loss": 0.3631, + "step": 131 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 0.14490254678965067, + "learning_rate": 0.00011578947368421053, + "loss": 0.3559, + "step": 132 + }, + { + "epoch": 0.175, + "grad_norm": 0.144229202921289, + "learning_rate": 0.00011666666666666668, + "loss": 0.3546, + "step": 133 + }, + { + "epoch": 0.1763157894736842, + "grad_norm": 0.1600537686886715, + "learning_rate": 0.00011754385964912282, + "loss": 0.3455, + "step": 134 + }, + { + "epoch": 0.17763157894736842, + "grad_norm": 0.1313150257696866, + "learning_rate": 0.00011842105263157894, + "loss": 0.3552, + "step": 135 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 0.13732726897904224, + "learning_rate": 0.00011929824561403509, + "loss": 0.3497, + "step": 136 + }, + { + "epoch": 0.18026315789473685, + "grad_norm": 0.1463571482716898, + "learning_rate": 0.00012017543859649123, + "loss": 0.3519, + "step": 137 + }, + { + "epoch": 0.18157894736842106, + "grad_norm": 0.1424850899462053, + "learning_rate": 0.00012105263157894738, + "loss": 0.3475, + "step": 138 + }, + { + "epoch": 0.18289473684210528, + "grad_norm": 0.15624853614650133, + "learning_rate": 0.00012192982456140352, + "loss": 0.3541, + "step": 139 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.15391671545839364, + "learning_rate": 0.00012280701754385965, + "loss": 0.347, + "step": 140 + }, + { + "epoch": 0.18552631578947368, + "grad_norm": 0.1476455568748672, + "learning_rate": 0.0001236842105263158, + "loss": 0.3641, + "step": 141 + }, + { + "epoch": 0.1868421052631579, + "grad_norm": 0.15837728677358026, + "learning_rate": 0.00012456140350877194, + "loss": 0.3542, + "step": 142 + }, + { + "epoch": 0.1881578947368421, + "grad_norm": 0.1387983783021263, + "learning_rate": 0.00012543859649122806, + "loss": 0.3425, + "step": 143 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.15375599222156547, + "learning_rate": 0.0001263157894736842, + "loss": 0.353, + "step": 144 + }, + { + "epoch": 0.19078947368421054, + "grad_norm": 0.14286343766479473, + "learning_rate": 0.00012719298245614035, + "loss": 0.3425, + "step": 145 + }, + { + "epoch": 0.19210526315789472, + "grad_norm": 0.15500148844719516, + "learning_rate": 0.0001280701754385965, + "loss": 0.3536, + "step": 146 + }, + { + "epoch": 0.19342105263157894, + "grad_norm": 0.1527274581882464, + "learning_rate": 0.00012894736842105264, + "loss": 0.3529, + "step": 147 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 0.13911203035920786, + "learning_rate": 0.0001298245614035088, + "loss": 0.3582, + "step": 148 + }, + { + "epoch": 0.19605263157894737, + "grad_norm": 0.1603641558907569, + "learning_rate": 0.00013070175438596493, + "loss": 0.3379, + "step": 149 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 0.15724848532335062, + "learning_rate": 0.00013157894736842108, + "loss": 0.3447, + "step": 150 + }, + { + "epoch": 0.1986842105263158, + "grad_norm": 0.15104286411263596, + "learning_rate": 0.0001324561403508772, + "loss": 0.3592, + "step": 151 + }, + { + "epoch": 0.2, + "grad_norm": 0.14256111417333361, + "learning_rate": 0.00013333333333333334, + "loss": 0.3418, + "step": 152 + }, + { + "epoch": 0.2013157894736842, + "grad_norm": 0.14616033132964826, + "learning_rate": 0.00013421052631578948, + "loss": 0.365, + "step": 153 + }, + { + "epoch": 0.2026315789473684, + "grad_norm": 0.13617570529003223, + "learning_rate": 0.00013508771929824563, + "loss": 0.3687, + "step": 154 + }, + { + "epoch": 0.20394736842105263, + "grad_norm": 0.15392252191058636, + "learning_rate": 0.00013596491228070177, + "loss": 0.3481, + "step": 155 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 0.14731210087429397, + "learning_rate": 0.0001368421052631579, + "loss": 0.3578, + "step": 156 + }, + { + "epoch": 0.20657894736842106, + "grad_norm": 0.15561784274185514, + "learning_rate": 0.00013771929824561404, + "loss": 0.3592, + "step": 157 + }, + { + "epoch": 0.20789473684210527, + "grad_norm": 0.15066657618186907, + "learning_rate": 0.00013859649122807018, + "loss": 0.3467, + "step": 158 + }, + { + "epoch": 0.20921052631578949, + "grad_norm": 0.1420611399655039, + "learning_rate": 0.0001394736842105263, + "loss": 0.3616, + "step": 159 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.1488250715622558, + "learning_rate": 0.00014035087719298245, + "loss": 0.339, + "step": 160 + }, + { + "epoch": 0.2118421052631579, + "grad_norm": 0.1462212684656958, + "learning_rate": 0.0001412280701754386, + "loss": 0.3414, + "step": 161 + }, + { + "epoch": 0.2131578947368421, + "grad_norm": 0.13807950981607273, + "learning_rate": 0.00014210526315789474, + "loss": 0.3342, + "step": 162 + }, + { + "epoch": 0.21447368421052632, + "grad_norm": 0.15320658283775204, + "learning_rate": 0.00014298245614035088, + "loss": 0.3448, + "step": 163 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 0.1465532353318612, + "learning_rate": 0.00014385964912280703, + "loss": 0.3472, + "step": 164 + }, + { + "epoch": 0.21710526315789475, + "grad_norm": 0.14706679089630642, + "learning_rate": 0.00014473684210526317, + "loss": 0.3437, + "step": 165 + }, + { + "epoch": 0.21842105263157896, + "grad_norm": 0.15624893484875924, + "learning_rate": 0.00014561403508771932, + "loss": 0.3417, + "step": 166 + }, + { + "epoch": 0.21973684210526315, + "grad_norm": 0.14729900208480234, + "learning_rate": 0.00014649122807017543, + "loss": 0.3651, + "step": 167 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 0.13340629262561993, + "learning_rate": 0.00014736842105263158, + "loss": 0.3452, + "step": 168 + }, + { + "epoch": 0.22236842105263158, + "grad_norm": 0.1427638316007593, + "learning_rate": 0.00014824561403508772, + "loss": 0.3368, + "step": 169 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 0.1394836492366252, + "learning_rate": 0.00014912280701754387, + "loss": 0.3366, + "step": 170 + }, + { + "epoch": 0.225, + "grad_norm": 0.12982465074656047, + "learning_rate": 0.00015000000000000001, + "loss": 0.3266, + "step": 171 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 0.1489587612015448, + "learning_rate": 0.00015087719298245616, + "loss": 0.3342, + "step": 172 + }, + { + "epoch": 0.22763157894736843, + "grad_norm": 0.15658289578441603, + "learning_rate": 0.0001517543859649123, + "loss": 0.3461, + "step": 173 + }, + { + "epoch": 0.22894736842105262, + "grad_norm": 0.13299000630905108, + "learning_rate": 0.00015263157894736845, + "loss": 0.3343, + "step": 174 + }, + { + "epoch": 0.23026315789473684, + "grad_norm": 0.1501297297839127, + "learning_rate": 0.00015350877192982457, + "loss": 0.3319, + "step": 175 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.1447458092769113, + "learning_rate": 0.0001543859649122807, + "loss": 0.3538, + "step": 176 + }, + { + "epoch": 0.23289473684210527, + "grad_norm": 0.12642159107807327, + "learning_rate": 0.00015526315789473686, + "loss": 0.3507, + "step": 177 + }, + { + "epoch": 0.23421052631578948, + "grad_norm": 0.1307467882888706, + "learning_rate": 0.00015614035087719297, + "loss": 0.3352, + "step": 178 + }, + { + "epoch": 0.2355263157894737, + "grad_norm": 0.14575680367060406, + "learning_rate": 0.00015701754385964912, + "loss": 0.3415, + "step": 179 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.13772178353175255, + "learning_rate": 0.00015789473684210527, + "loss": 0.3517, + "step": 180 + }, + { + "epoch": 0.2381578947368421, + "grad_norm": 0.13177565526904855, + "learning_rate": 0.0001587719298245614, + "loss": 0.326, + "step": 181 + }, + { + "epoch": 0.2394736842105263, + "grad_norm": 0.13895296380351296, + "learning_rate": 0.00015964912280701756, + "loss": 0.3418, + "step": 182 + }, + { + "epoch": 0.24078947368421053, + "grad_norm": 0.13234030291334134, + "learning_rate": 0.0001605263157894737, + "loss": 0.3492, + "step": 183 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 0.1343103364584722, + "learning_rate": 0.00016140350877192982, + "loss": 0.334, + "step": 184 + }, + { + "epoch": 0.24342105263157895, + "grad_norm": 0.13330977836549726, + "learning_rate": 0.00016228070175438596, + "loss": 0.342, + "step": 185 + }, + { + "epoch": 0.24473684210526317, + "grad_norm": 0.12933054253662082, + "learning_rate": 0.0001631578947368421, + "loss": 0.3476, + "step": 186 + }, + { + "epoch": 0.24605263157894736, + "grad_norm": 0.12637139899067856, + "learning_rate": 0.00016403508771929825, + "loss": 0.3317, + "step": 187 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 0.12268533782351379, + "learning_rate": 0.0001649122807017544, + "loss": 0.3369, + "step": 188 + }, + { + "epoch": 0.24868421052631579, + "grad_norm": 0.1269784203784765, + "learning_rate": 0.00016578947368421054, + "loss": 0.3555, + "step": 189 + }, + { + "epoch": 0.25, + "grad_norm": 0.13455995822795366, + "learning_rate": 0.0001666666666666667, + "loss": 0.3352, + "step": 190 + }, + { + "epoch": 0.2513157894736842, + "grad_norm": 0.13042417788105207, + "learning_rate": 0.00016754385964912283, + "loss": 0.3425, + "step": 191 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.13259270895700348, + "learning_rate": 0.00016842105263157895, + "loss": 0.3388, + "step": 192 + }, + { + "epoch": 0.25394736842105264, + "grad_norm": 0.13229060651188568, + "learning_rate": 0.0001692982456140351, + "loss": 0.3444, + "step": 193 + }, + { + "epoch": 0.25526315789473686, + "grad_norm": 0.1409002440562877, + "learning_rate": 0.00017017543859649124, + "loss": 0.326, + "step": 194 + }, + { + "epoch": 0.2565789473684211, + "grad_norm": 0.13331986255382416, + "learning_rate": 0.00017105263157894739, + "loss": 0.3333, + "step": 195 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 0.13211357309903607, + "learning_rate": 0.00017192982456140353, + "loss": 0.3303, + "step": 196 + }, + { + "epoch": 0.25921052631578945, + "grad_norm": 0.13392529140137704, + "learning_rate": 0.00017280701754385965, + "loss": 0.3266, + "step": 197 + }, + { + "epoch": 0.26052631578947366, + "grad_norm": 0.135716179073066, + "learning_rate": 0.0001736842105263158, + "loss": 0.3372, + "step": 198 + }, + { + "epoch": 0.2618421052631579, + "grad_norm": 0.1325252534873712, + "learning_rate": 0.00017456140350877194, + "loss": 0.3377, + "step": 199 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.13891964420010164, + "learning_rate": 0.00017543859649122806, + "loss": 0.3359, + "step": 200 + }, + { + "epoch": 0.2644736842105263, + "grad_norm": 0.14101443449526624, + "learning_rate": 0.0001763157894736842, + "loss": 0.3254, + "step": 201 + }, + { + "epoch": 0.2657894736842105, + "grad_norm": 0.12637174218931757, + "learning_rate": 0.00017719298245614035, + "loss": 0.3372, + "step": 202 + }, + { + "epoch": 0.26710526315789473, + "grad_norm": 0.13174772610887958, + "learning_rate": 0.0001780701754385965, + "loss": 0.3365, + "step": 203 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 0.13681575981535574, + "learning_rate": 0.00017894736842105264, + "loss": 0.3356, + "step": 204 + }, + { + "epoch": 0.26973684210526316, + "grad_norm": 0.1328471909403007, + "learning_rate": 0.00017982456140350878, + "loss": 0.33, + "step": 205 + }, + { + "epoch": 0.2710526315789474, + "grad_norm": 0.1407266391635488, + "learning_rate": 0.00018070175438596493, + "loss": 0.3391, + "step": 206 + }, + { + "epoch": 0.2723684210526316, + "grad_norm": 0.1348033610448726, + "learning_rate": 0.00018157894736842107, + "loss": 0.3239, + "step": 207 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 0.1265119642350682, + "learning_rate": 0.0001824561403508772, + "loss": 0.3345, + "step": 208 + }, + { + "epoch": 0.275, + "grad_norm": 0.12645375412506446, + "learning_rate": 0.00018333333333333334, + "loss": 0.3241, + "step": 209 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 0.13126982713178578, + "learning_rate": 0.00018421052631578948, + "loss": 0.3367, + "step": 210 + }, + { + "epoch": 0.2776315789473684, + "grad_norm": 0.1243720842002383, + "learning_rate": 0.00018508771929824563, + "loss": 0.329, + "step": 211 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 0.14402206242169863, + "learning_rate": 0.00018596491228070177, + "loss": 0.3321, + "step": 212 + }, + { + "epoch": 0.2802631578947368, + "grad_norm": 0.12463209754003, + "learning_rate": 0.00018684210526315792, + "loss": 0.3191, + "step": 213 + }, + { + "epoch": 0.28157894736842104, + "grad_norm": 0.11980526105079334, + "learning_rate": 0.00018771929824561406, + "loss": 0.3238, + "step": 214 + }, + { + "epoch": 0.28289473684210525, + "grad_norm": 0.13443132852719006, + "learning_rate": 0.0001885964912280702, + "loss": 0.3409, + "step": 215 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 0.12477177793237498, + "learning_rate": 0.00018947368421052632, + "loss": 0.334, + "step": 216 + }, + { + "epoch": 0.2855263157894737, + "grad_norm": 0.12434521968333248, + "learning_rate": 0.00019035087719298247, + "loss": 0.3301, + "step": 217 + }, + { + "epoch": 0.2868421052631579, + "grad_norm": 0.12926013045173962, + "learning_rate": 0.0001912280701754386, + "loss": 0.3403, + "step": 218 + }, + { + "epoch": 0.2881578947368421, + "grad_norm": 0.12767169045744933, + "learning_rate": 0.00019210526315789473, + "loss": 0.3285, + "step": 219 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.12496998663175744, + "learning_rate": 0.00019298245614035088, + "loss": 0.3485, + "step": 220 + }, + { + "epoch": 0.29078947368421054, + "grad_norm": 0.12044174136139364, + "learning_rate": 0.00019385964912280702, + "loss": 0.3264, + "step": 221 + }, + { + "epoch": 0.29210526315789476, + "grad_norm": 0.13199190001366287, + "learning_rate": 0.00019473684210526317, + "loss": 0.3422, + "step": 222 + }, + { + "epoch": 0.29342105263157897, + "grad_norm": 0.1201369924836855, + "learning_rate": 0.0001956140350877193, + "loss": 0.3244, + "step": 223 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 0.11423784430905021, + "learning_rate": 0.00019649122807017543, + "loss": 0.3347, + "step": 224 + }, + { + "epoch": 0.29605263157894735, + "grad_norm": 0.12466053172472819, + "learning_rate": 0.00019736842105263157, + "loss": 0.3309, + "step": 225 + }, + { + "epoch": 0.29736842105263156, + "grad_norm": 0.12442689437527811, + "learning_rate": 0.00019824561403508772, + "loss": 0.3354, + "step": 226 + }, + { + "epoch": 0.2986842105263158, + "grad_norm": 0.11914676425566721, + "learning_rate": 0.00019912280701754386, + "loss": 0.3367, + "step": 227 + }, + { + "epoch": 0.3, + "grad_norm": 0.12114432656113984, + "learning_rate": 0.0002, + "loss": 0.3225, + "step": 228 + }, + { + "epoch": 0.3013157894736842, + "grad_norm": 0.12708865662155097, + "learning_rate": 0.00019999988280341633, + "loss": 0.3267, + "step": 229 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 0.12093461278364943, + "learning_rate": 0.00019999953121394002, + "loss": 0.3341, + "step": 230 + }, + { + "epoch": 0.30394736842105263, + "grad_norm": 0.1259425626165656, + "learning_rate": 0.00019999894523239515, + "loss": 0.3352, + "step": 231 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 0.11881152231373332, + "learning_rate": 0.00019999812486015523, + "loss": 0.3263, + "step": 232 + }, + { + "epoch": 0.30657894736842106, + "grad_norm": 0.13525301351060123, + "learning_rate": 0.00019999707009914317, + "loss": 0.3405, + "step": 233 + }, + { + "epoch": 0.3078947368421053, + "grad_norm": 0.12155988934380389, + "learning_rate": 0.00019999578095183124, + "loss": 0.3264, + "step": 234 + }, + { + "epoch": 0.3092105263157895, + "grad_norm": 0.12309971712704007, + "learning_rate": 0.00019999425742124114, + "loss": 0.3126, + "step": 235 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 0.12559904653470885, + "learning_rate": 0.00019999249951094388, + "loss": 0.3436, + "step": 236 + }, + { + "epoch": 0.3118421052631579, + "grad_norm": 0.11464091226244318, + "learning_rate": 0.00019999050722505993, + "loss": 0.3212, + "step": 237 + }, + { + "epoch": 0.3131578947368421, + "grad_norm": 0.11798628745307271, + "learning_rate": 0.000199988280568259, + "loss": 0.3187, + "step": 238 + }, + { + "epoch": 0.3144736842105263, + "grad_norm": 0.11937212399519956, + "learning_rate": 0.00019998581954576032, + "loss": 0.3261, + "step": 239 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.10973860755841888, + "learning_rate": 0.00019998312416333227, + "loss": 0.3097, + "step": 240 + }, + { + "epoch": 0.3171052631578947, + "grad_norm": 0.11870178537603401, + "learning_rate": 0.00019998019442729273, + "loss": 0.3264, + "step": 241 + }, + { + "epoch": 0.31842105263157894, + "grad_norm": 0.11706254926047537, + "learning_rate": 0.0001999770303445087, + "loss": 0.3276, + "step": 242 + }, + { + "epoch": 0.31973684210526315, + "grad_norm": 0.12369716377452905, + "learning_rate": 0.00019997363192239664, + "loss": 0.3178, + "step": 243 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 0.10989749847863158, + "learning_rate": 0.0001999699991689222, + "loss": 0.3126, + "step": 244 + }, + { + "epoch": 0.3223684210526316, + "grad_norm": 0.11488770235752266, + "learning_rate": 0.00019996613209260033, + "loss": 0.3305, + "step": 245 + }, + { + "epoch": 0.3236842105263158, + "grad_norm": 0.11550955957920862, + "learning_rate": 0.00019996203070249516, + "loss": 0.3395, + "step": 246 + }, + { + "epoch": 0.325, + "grad_norm": 0.10896874824994508, + "learning_rate": 0.0001999576950082201, + "loss": 0.3266, + "step": 247 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 0.11429922245859891, + "learning_rate": 0.00019995312501993765, + "loss": 0.3241, + "step": 248 + }, + { + "epoch": 0.32763157894736844, + "grad_norm": 0.11185855088004702, + "learning_rate": 0.00019994832074835963, + "loss": 0.3215, + "step": 249 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 0.10618407427160971, + "learning_rate": 0.00019994328220474688, + "loss": 0.3114, + "step": 250 + }, + { + "epoch": 0.33026315789473687, + "grad_norm": 0.10827179543543274, + "learning_rate": 0.00019993800940090942, + "loss": 0.3275, + "step": 251 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 0.11600822718215488, + "learning_rate": 0.00019993250234920636, + "loss": 0.3205, + "step": 252 + }, + { + "epoch": 0.33289473684210524, + "grad_norm": 0.11660805309271545, + "learning_rate": 0.00019992676106254584, + "loss": 0.338, + "step": 253 + }, + { + "epoch": 0.33421052631578946, + "grad_norm": 0.11992791956747861, + "learning_rate": 0.00019992078555438502, + "loss": 0.33, + "step": 254 + }, + { + "epoch": 0.3355263157894737, + "grad_norm": 0.11271215008229082, + "learning_rate": 0.0001999145758387301, + "loss": 0.3312, + "step": 255 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 0.10842995849313372, + "learning_rate": 0.00019990813193013625, + "loss": 0.3257, + "step": 256 + }, + { + "epoch": 0.3381578947368421, + "grad_norm": 0.10874370438818506, + "learning_rate": 0.0001999014538437075, + "loss": 0.3128, + "step": 257 + }, + { + "epoch": 0.3394736842105263, + "grad_norm": 0.11809880440680368, + "learning_rate": 0.0001998945415950969, + "loss": 0.3211, + "step": 258 + }, + { + "epoch": 0.34078947368421053, + "grad_norm": 0.11661765420730584, + "learning_rate": 0.00019988739520050618, + "loss": 0.3148, + "step": 259 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.10343508953195514, + "learning_rate": 0.0001998800146766861, + "loss": 0.3217, + "step": 260 + }, + { + "epoch": 0.34342105263157896, + "grad_norm": 0.11172563699747126, + "learning_rate": 0.0001998724000409361, + "loss": 0.3098, + "step": 261 + }, + { + "epoch": 0.3447368421052632, + "grad_norm": 0.11124141647463505, + "learning_rate": 0.00019986455131110428, + "loss": 0.3177, + "step": 262 + }, + { + "epoch": 0.3460526315789474, + "grad_norm": 0.1049604206863391, + "learning_rate": 0.00019985646850558764, + "loss": 0.3202, + "step": 263 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 0.11674285217463065, + "learning_rate": 0.00019984815164333163, + "loss": 0.3325, + "step": 264 + }, + { + "epoch": 0.34868421052631576, + "grad_norm": 0.10939260536399624, + "learning_rate": 0.00019983960074383046, + "loss": 0.3135, + "step": 265 + }, + { + "epoch": 0.35, + "grad_norm": 0.11228143225921142, + "learning_rate": 0.00019983081582712685, + "loss": 0.3184, + "step": 266 + }, + { + "epoch": 0.3513157894736842, + "grad_norm": 0.11736606638037488, + "learning_rate": 0.000199821796913812, + "loss": 0.3228, + "step": 267 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 0.10870195278237296, + "learning_rate": 0.00019981254402502566, + "loss": 0.3162, + "step": 268 + }, + { + "epoch": 0.3539473684210526, + "grad_norm": 0.10319062608987703, + "learning_rate": 0.000199803057182456, + "loss": 0.3088, + "step": 269 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 0.11157466968877176, + "learning_rate": 0.00019979333640833947, + "loss": 0.3157, + "step": 270 + }, + { + "epoch": 0.35657894736842105, + "grad_norm": 0.11150128423279522, + "learning_rate": 0.00019978338172546093, + "loss": 0.3101, + "step": 271 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 0.1085421577182214, + "learning_rate": 0.0001997731931571535, + "loss": 0.3126, + "step": 272 + }, + { + "epoch": 0.3592105263157895, + "grad_norm": 0.10475914894230995, + "learning_rate": 0.00019976277072729845, + "loss": 0.3224, + "step": 273 + }, + { + "epoch": 0.3605263157894737, + "grad_norm": 0.1082831141347499, + "learning_rate": 0.00019975211446032526, + "loss": 0.3082, + "step": 274 + }, + { + "epoch": 0.3618421052631579, + "grad_norm": 0.10740486742911703, + "learning_rate": 0.0001997412243812115, + "loss": 0.3257, + "step": 275 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 0.1020992105394808, + "learning_rate": 0.00019973010051548275, + "loss": 0.3156, + "step": 276 + }, + { + "epoch": 0.36447368421052634, + "grad_norm": 0.10963434549263805, + "learning_rate": 0.0001997187428892126, + "loss": 0.312, + "step": 277 + }, + { + "epoch": 0.36578947368421055, + "grad_norm": 0.10738473907360331, + "learning_rate": 0.00019970715152902254, + "loss": 0.3264, + "step": 278 + }, + { + "epoch": 0.3671052631578947, + "grad_norm": 0.10172040132714297, + "learning_rate": 0.00019969532646208195, + "loss": 0.3, + "step": 279 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.10348844172942914, + "learning_rate": 0.00019968326771610797, + "loss": 0.3116, + "step": 280 + }, + { + "epoch": 0.36973684210526314, + "grad_norm": 0.11435260003026458, + "learning_rate": 0.00019967097531936546, + "loss": 0.3196, + "step": 281 + }, + { + "epoch": 0.37105263157894736, + "grad_norm": 0.1121081703641299, + "learning_rate": 0.000199658449300667, + "loss": 0.3226, + "step": 282 + }, + { + "epoch": 0.37236842105263157, + "grad_norm": 0.1067537506801231, + "learning_rate": 0.00019964568968937267, + "loss": 0.3027, + "step": 283 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 0.11350941919576825, + "learning_rate": 0.00019963269651539017, + "loss": 0.3328, + "step": 284 + }, + { + "epoch": 0.375, + "grad_norm": 0.10315345770730597, + "learning_rate": 0.00019961946980917456, + "loss": 0.3007, + "step": 285 + }, + { + "epoch": 0.3763157894736842, + "grad_norm": 0.11217086438246132, + "learning_rate": 0.0001996060096017284, + "loss": 0.307, + "step": 286 + }, + { + "epoch": 0.37763157894736843, + "grad_norm": 0.10830051369447247, + "learning_rate": 0.00019959231592460143, + "loss": 0.3128, + "step": 287 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 0.11069575850173559, + "learning_rate": 0.00019957838880989078, + "loss": 0.3096, + "step": 288 + }, + { + "epoch": 0.38026315789473686, + "grad_norm": 0.1085222475355183, + "learning_rate": 0.00019956422829024055, + "loss": 0.315, + "step": 289 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 0.09952894778052519, + "learning_rate": 0.0001995498343988421, + "loss": 0.3101, + "step": 290 + }, + { + "epoch": 0.3828947368421053, + "grad_norm": 0.11065531030038152, + "learning_rate": 0.00019953520716943371, + "loss": 0.3088, + "step": 291 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 0.11350469293630033, + "learning_rate": 0.00019952034663630062, + "loss": 0.318, + "step": 292 + }, + { + "epoch": 0.38552631578947366, + "grad_norm": 0.11212904813152226, + "learning_rate": 0.00019950525283427491, + "loss": 0.3172, + "step": 293 + }, + { + "epoch": 0.3868421052631579, + "grad_norm": 0.11122338935927506, + "learning_rate": 0.00019948992579873538, + "loss": 0.3058, + "step": 294 + }, + { + "epoch": 0.3881578947368421, + "grad_norm": 0.11166549208778644, + "learning_rate": 0.0001994743655656076, + "loss": 0.3014, + "step": 295 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 0.11276484187285273, + "learning_rate": 0.00019945857217136363, + "loss": 0.3177, + "step": 296 + }, + { + "epoch": 0.3907894736842105, + "grad_norm": 0.111642408366338, + "learning_rate": 0.00019944254565302217, + "loss": 0.3077, + "step": 297 + }, + { + "epoch": 0.39210526315789473, + "grad_norm": 0.12415010602542942, + "learning_rate": 0.00019942628604814825, + "loss": 0.3109, + "step": 298 + }, + { + "epoch": 0.39342105263157895, + "grad_norm": 0.10701081689961081, + "learning_rate": 0.00019940979339485332, + "loss": 0.3027, + "step": 299 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.1036095978911461, + "learning_rate": 0.00019939306773179497, + "loss": 0.3155, + "step": 300 + }, + { + "epoch": 0.3960526315789474, + "grad_norm": 0.12314325764301473, + "learning_rate": 0.00019937610909817702, + "loss": 0.3197, + "step": 301 + }, + { + "epoch": 0.3973684210526316, + "grad_norm": 0.10965174611997063, + "learning_rate": 0.0001993589175337494, + "loss": 0.3055, + "step": 302 + }, + { + "epoch": 0.3986842105263158, + "grad_norm": 0.10691327868633747, + "learning_rate": 0.00019934149307880791, + "loss": 0.3047, + "step": 303 + }, + { + "epoch": 0.4, + "grad_norm": 0.11048488675527199, + "learning_rate": 0.00019932383577419432, + "loss": 0.3162, + "step": 304 + }, + { + "epoch": 0.40131578947368424, + "grad_norm": 0.115786509511319, + "learning_rate": 0.0001993059456612961, + "loss": 0.2998, + "step": 305 + }, + { + "epoch": 0.4026315789473684, + "grad_norm": 0.10197052941221454, + "learning_rate": 0.0001992878227820465, + "loss": 0.3167, + "step": 306 + }, + { + "epoch": 0.4039473684210526, + "grad_norm": 0.10818823997320136, + "learning_rate": 0.00019926946717892428, + "loss": 0.3175, + "step": 307 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 0.11647859718654414, + "learning_rate": 0.00019925087889495374, + "loss": 0.3223, + "step": 308 + }, + { + "epoch": 0.40657894736842104, + "grad_norm": 0.10764841122975913, + "learning_rate": 0.0001992320579737045, + "loss": 0.3134, + "step": 309 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 0.10744012941703222, + "learning_rate": 0.0001992130044592916, + "loss": 0.3072, + "step": 310 + }, + { + "epoch": 0.40921052631578947, + "grad_norm": 0.11064482894831129, + "learning_rate": 0.00019919371839637512, + "loss": 0.315, + "step": 311 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 0.10891430893886998, + "learning_rate": 0.00019917419983016025, + "loss": 0.3016, + "step": 312 + }, + { + "epoch": 0.4118421052631579, + "grad_norm": 0.09966222612672655, + "learning_rate": 0.0001991544488063972, + "loss": 0.3104, + "step": 313 + }, + { + "epoch": 0.4131578947368421, + "grad_norm": 0.09948289810327957, + "learning_rate": 0.00019913446537138106, + "loss": 0.3199, + "step": 314 + }, + { + "epoch": 0.4144736842105263, + "grad_norm": 0.10221948739250226, + "learning_rate": 0.00019911424957195158, + "loss": 0.3027, + "step": 315 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 0.10319916825175317, + "learning_rate": 0.00019909380145549324, + "loss": 0.2962, + "step": 316 + }, + { + "epoch": 0.41710526315789476, + "grad_norm": 0.10112207458406539, + "learning_rate": 0.00019907312106993503, + "loss": 0.3072, + "step": 317 + }, + { + "epoch": 0.41842105263157897, + "grad_norm": 0.1056216300520592, + "learning_rate": 0.00019905220846375032, + "loss": 0.3108, + "step": 318 + }, + { + "epoch": 0.41973684210526313, + "grad_norm": 0.10432503631292016, + "learning_rate": 0.0001990310636859569, + "loss": 0.3084, + "step": 319 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.10088521223549587, + "learning_rate": 0.00019900968678611666, + "loss": 0.3094, + "step": 320 + }, + { + "epoch": 0.42236842105263156, + "grad_norm": 0.09994245819164545, + "learning_rate": 0.00019898807781433555, + "loss": 0.3142, + "step": 321 + }, + { + "epoch": 0.4236842105263158, + "grad_norm": 0.09903460972489679, + "learning_rate": 0.00019896623682126355, + "loss": 0.307, + "step": 322 + }, + { + "epoch": 0.425, + "grad_norm": 0.09914885914415125, + "learning_rate": 0.00019894416385809444, + "loss": 0.3014, + "step": 323 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 0.10122296873459234, + "learning_rate": 0.00019892185897656578, + "loss": 0.303, + "step": 324 + }, + { + "epoch": 0.4276315789473684, + "grad_norm": 0.09910489155114076, + "learning_rate": 0.0001988993222289587, + "loss": 0.3117, + "step": 325 + }, + { + "epoch": 0.42894736842105263, + "grad_norm": 0.09241116281014006, + "learning_rate": 0.0001988765536680977, + "loss": 0.2934, + "step": 326 + }, + { + "epoch": 0.43026315789473685, + "grad_norm": 0.09770357586471753, + "learning_rate": 0.00019885355334735082, + "loss": 0.317, + "step": 327 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 0.09697496010174222, + "learning_rate": 0.00019883032132062925, + "loss": 0.2998, + "step": 328 + }, + { + "epoch": 0.4328947368421053, + "grad_norm": 0.10013197876212468, + "learning_rate": 0.0001988068576423872, + "loss": 0.3105, + "step": 329 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 0.09453550444418382, + "learning_rate": 0.00019878316236762196, + "loss": 0.3091, + "step": 330 + }, + { + "epoch": 0.4355263157894737, + "grad_norm": 0.10361296309003387, + "learning_rate": 0.00019875923555187365, + "loss": 0.3117, + "step": 331 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 0.09867438093722174, + "learning_rate": 0.00019873507725122504, + "loss": 0.3105, + "step": 332 + }, + { + "epoch": 0.4381578947368421, + "grad_norm": 0.09914171961719935, + "learning_rate": 0.00019871068752230162, + "loss": 0.3299, + "step": 333 + }, + { + "epoch": 0.4394736842105263, + "grad_norm": 0.09508589565593037, + "learning_rate": 0.00019868606642227122, + "loss": 0.3057, + "step": 334 + }, + { + "epoch": 0.4407894736842105, + "grad_norm": 0.10553369850294496, + "learning_rate": 0.00019866121400884397, + "loss": 0.2968, + "step": 335 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.09693152875657689, + "learning_rate": 0.00019863613034027224, + "loss": 0.3157, + "step": 336 + }, + { + "epoch": 0.44342105263157894, + "grad_norm": 0.09914036757559133, + "learning_rate": 0.0001986108154753505, + "loss": 0.3075, + "step": 337 + }, + { + "epoch": 0.44473684210526315, + "grad_norm": 0.10323372097858888, + "learning_rate": 0.00019858526947341497, + "loss": 0.316, + "step": 338 + }, + { + "epoch": 0.44605263157894737, + "grad_norm": 0.1047670642802198, + "learning_rate": 0.0001985594923943438, + "loss": 0.3016, + "step": 339 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.10038200225737869, + "learning_rate": 0.00019853348429855672, + "loss": 0.313, + "step": 340 + }, + { + "epoch": 0.4486842105263158, + "grad_norm": 0.09995800002881376, + "learning_rate": 0.00019850724524701486, + "loss": 0.3201, + "step": 341 + }, + { + "epoch": 0.45, + "grad_norm": 0.09706393223860003, + "learning_rate": 0.00019848077530122083, + "loss": 0.302, + "step": 342 + }, + { + "epoch": 0.4513157894736842, + "grad_norm": 0.10217585842187529, + "learning_rate": 0.0001984540745232183, + "loss": 0.3052, + "step": 343 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 0.09716163731483823, + "learning_rate": 0.00019842714297559213, + "loss": 0.3182, + "step": 344 + }, + { + "epoch": 0.45394736842105265, + "grad_norm": 0.09580525044842596, + "learning_rate": 0.000198399980721468, + "loss": 0.3016, + "step": 345 + }, + { + "epoch": 0.45526315789473687, + "grad_norm": 0.10216214300781075, + "learning_rate": 0.0001983725878245124, + "loss": 0.3093, + "step": 346 + }, + { + "epoch": 0.45657894736842103, + "grad_norm": 0.09748580988527811, + "learning_rate": 0.0001983449643489324, + "loss": 0.3077, + "step": 347 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 0.10212802995697862, + "learning_rate": 0.0001983171103594755, + "loss": 0.3039, + "step": 348 + }, + { + "epoch": 0.45921052631578946, + "grad_norm": 0.10018192206504958, + "learning_rate": 0.00019828902592142962, + "loss": 0.3023, + "step": 349 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 0.09425076167869935, + "learning_rate": 0.0001982607111006227, + "loss": 0.3119, + "step": 350 + }, + { + "epoch": 0.4618421052631579, + "grad_norm": 0.0967215301525356, + "learning_rate": 0.0001982321659634228, + "loss": 0.3097, + "step": 351 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.09622422612500942, + "learning_rate": 0.0001982033905767377, + "loss": 0.3114, + "step": 352 + }, + { + "epoch": 0.4644736842105263, + "grad_norm": 0.09360899731223273, + "learning_rate": 0.00019817438500801502, + "loss": 0.3099, + "step": 353 + }, + { + "epoch": 0.46578947368421053, + "grad_norm": 0.09987640487171251, + "learning_rate": 0.0001981451493252418, + "loss": 0.2994, + "step": 354 + }, + { + "epoch": 0.46710526315789475, + "grad_norm": 0.10065235571453063, + "learning_rate": 0.00019811568359694447, + "loss": 0.3048, + "step": 355 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 0.10208391531367922, + "learning_rate": 0.00019808598789218865, + "loss": 0.2939, + "step": 356 + }, + { + "epoch": 0.4697368421052632, + "grad_norm": 0.10402670400636525, + "learning_rate": 0.00019805606228057916, + "loss": 0.3024, + "step": 357 + }, + { + "epoch": 0.4710526315789474, + "grad_norm": 0.09930311628562899, + "learning_rate": 0.00019802590683225946, + "loss": 0.3024, + "step": 358 + }, + { + "epoch": 0.4723684210526316, + "grad_norm": 0.10262536271155684, + "learning_rate": 0.0001979955216179119, + "loss": 0.3158, + "step": 359 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.0943881489890398, + "learning_rate": 0.0001979649067087574, + "loss": 0.3095, + "step": 360 + }, + { + "epoch": 0.475, + "grad_norm": 0.09827781548848098, + "learning_rate": 0.00019793406217655517, + "loss": 0.3084, + "step": 361 + }, + { + "epoch": 0.4763157894736842, + "grad_norm": 0.09396956618890727, + "learning_rate": 0.00019790298809360267, + "loss": 0.3108, + "step": 362 + }, + { + "epoch": 0.4776315789473684, + "grad_norm": 0.0971493552146761, + "learning_rate": 0.00019787168453273544, + "loss": 0.3234, + "step": 363 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 0.09635250868834104, + "learning_rate": 0.00019784015156732693, + "loss": 0.2961, + "step": 364 + }, + { + "epoch": 0.48026315789473684, + "grad_norm": 0.0957638945230677, + "learning_rate": 0.00019780838927128822, + "loss": 0.2999, + "step": 365 + }, + { + "epoch": 0.48157894736842105, + "grad_norm": 0.09673326599521005, + "learning_rate": 0.00019777639771906795, + "loss": 0.3065, + "step": 366 + }, + { + "epoch": 0.48289473684210527, + "grad_norm": 0.10037616383525308, + "learning_rate": 0.00019774417698565215, + "loss": 0.3155, + "step": 367 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.09978049327969367, + "learning_rate": 0.000197711727146564, + "loss": 0.3165, + "step": 368 + }, + { + "epoch": 0.4855263157894737, + "grad_norm": 0.09791958592311975, + "learning_rate": 0.00019767904827786375, + "loss": 0.3089, + "step": 369 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.09995882365072112, + "learning_rate": 0.00019764614045614836, + "loss": 0.2988, + "step": 370 + }, + { + "epoch": 0.4881578947368421, + "grad_norm": 0.09232234693055072, + "learning_rate": 0.0001976130037585516, + "loss": 0.3042, + "step": 371 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 0.09643481824906593, + "learning_rate": 0.00019757963826274357, + "loss": 0.294, + "step": 372 + }, + { + "epoch": 0.49078947368421055, + "grad_norm": 0.09697913699025175, + "learning_rate": 0.00019754604404693073, + "loss": 0.3053, + "step": 373 + }, + { + "epoch": 0.4921052631578947, + "grad_norm": 0.09907221334912318, + "learning_rate": 0.00019751222118985563, + "loss": 0.2968, + "step": 374 + }, + { + "epoch": 0.4934210526315789, + "grad_norm": 0.09502914913791907, + "learning_rate": 0.00019747816977079671, + "loss": 0.3078, + "step": 375 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 0.09936470649154033, + "learning_rate": 0.00019744388986956822, + "loss": 0.3097, + "step": 376 + }, + { + "epoch": 0.49605263157894736, + "grad_norm": 0.09802911974074544, + "learning_rate": 0.0001974093815665199, + "loss": 0.2958, + "step": 377 + }, + { + "epoch": 0.49736842105263157, + "grad_norm": 0.09898755664324496, + "learning_rate": 0.0001973746449425368, + "loss": 0.3083, + "step": 378 + }, + { + "epoch": 0.4986842105263158, + "grad_norm": 0.0961406374333812, + "learning_rate": 0.00019733968007903922, + "loss": 0.2976, + "step": 379 + }, + { + "epoch": 0.5, + "grad_norm": 0.10159753311874659, + "learning_rate": 0.00019730448705798239, + "loss": 0.3033, + "step": 380 + }, + { + "epoch": 0.5013157894736842, + "grad_norm": 0.0947441745895011, + "learning_rate": 0.0001972690659618564, + "loss": 0.2852, + "step": 381 + }, + { + "epoch": 0.5026315789473684, + "grad_norm": 0.09775748236249004, + "learning_rate": 0.00019723341687368583, + "loss": 0.2929, + "step": 382 + }, + { + "epoch": 0.5039473684210526, + "grad_norm": 0.09935691145327614, + "learning_rate": 0.00019719753987702978, + "loss": 0.3111, + "step": 383 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 0.09893488807438079, + "learning_rate": 0.0001971614350559814, + "loss": 0.3158, + "step": 384 + }, + { + "epoch": 0.506578947368421, + "grad_norm": 0.0947497912778192, + "learning_rate": 0.00019712510249516793, + "loss": 0.2921, + "step": 385 + }, + { + "epoch": 0.5078947368421053, + "grad_norm": 0.08737508895101483, + "learning_rate": 0.00019708854227975048, + "loss": 0.2927, + "step": 386 + }, + { + "epoch": 0.5092105263157894, + "grad_norm": 0.09835779795374115, + "learning_rate": 0.00019705175449542358, + "loss": 0.2976, + "step": 387 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 0.09107621488427446, + "learning_rate": 0.0001970147392284154, + "loss": 0.2814, + "step": 388 + }, + { + "epoch": 0.5118421052631579, + "grad_norm": 0.09743589157185283, + "learning_rate": 0.00019697749656548714, + "loss": 0.3003, + "step": 389 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.09595683633527828, + "learning_rate": 0.00019694002659393305, + "loss": 0.3033, + "step": 390 + }, + { + "epoch": 0.5144736842105263, + "grad_norm": 0.09879879361599342, + "learning_rate": 0.0001969023294015802, + "loss": 0.3133, + "step": 391 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 0.10105822465511066, + "learning_rate": 0.00019686440507678824, + "loss": 0.3103, + "step": 392 + }, + { + "epoch": 0.5171052631578947, + "grad_norm": 0.10182072904717256, + "learning_rate": 0.00019682625370844918, + "loss": 0.2978, + "step": 393 + }, + { + "epoch": 0.5184210526315789, + "grad_norm": 0.09540364410216295, + "learning_rate": 0.00019678787538598725, + "loss": 0.3243, + "step": 394 + }, + { + "epoch": 0.5197368421052632, + "grad_norm": 0.09754493311070281, + "learning_rate": 0.00019674927019935857, + "loss": 0.2957, + "step": 395 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 0.09261668278184784, + "learning_rate": 0.0001967104382390511, + "loss": 0.2909, + "step": 396 + }, + { + "epoch": 0.5223684210526316, + "grad_norm": 0.0978202914794332, + "learning_rate": 0.00019667137959608426, + "loss": 0.2875, + "step": 397 + }, + { + "epoch": 0.5236842105263158, + "grad_norm": 0.09087851403559502, + "learning_rate": 0.00019663209436200887, + "loss": 0.2959, + "step": 398 + }, + { + "epoch": 0.525, + "grad_norm": 0.09362515028658314, + "learning_rate": 0.00019659258262890683, + "loss": 0.3042, + "step": 399 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.09012777438494002, + "learning_rate": 0.00019655284448939094, + "loss": 0.2959, + "step": 400 + }, + { + "epoch": 0.5276315789473685, + "grad_norm": 0.0960827279363291, + "learning_rate": 0.0001965128800366047, + "loss": 0.2929, + "step": 401 + }, + { + "epoch": 0.5289473684210526, + "grad_norm": 0.09300199527426295, + "learning_rate": 0.00019647268936422206, + "loss": 0.283, + "step": 402 + }, + { + "epoch": 0.5302631578947369, + "grad_norm": 0.09683305560772038, + "learning_rate": 0.00019643227256644716, + "loss": 0.3105, + "step": 403 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 0.0954490202804032, + "learning_rate": 0.00019639162973801426, + "loss": 0.3023, + "step": 404 + }, + { + "epoch": 0.5328947368421053, + "grad_norm": 0.09612701147402002, + "learning_rate": 0.00019635076097418734, + "loss": 0.3176, + "step": 405 + }, + { + "epoch": 0.5342105263157895, + "grad_norm": 0.09104963315940301, + "learning_rate": 0.00019630966637076004, + "loss": 0.2997, + "step": 406 + }, + { + "epoch": 0.5355263157894737, + "grad_norm": 0.0986012989783705, + "learning_rate": 0.00019626834602405523, + "loss": 0.2964, + "step": 407 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 0.10336828609236494, + "learning_rate": 0.00019622680003092503, + "loss": 0.2886, + "step": 408 + }, + { + "epoch": 0.5381578947368421, + "grad_norm": 0.09141302481564861, + "learning_rate": 0.00019618502848875045, + "loss": 0.2898, + "step": 409 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.09811357001705759, + "learning_rate": 0.00019614303149544102, + "loss": 0.312, + "step": 410 + }, + { + "epoch": 0.5407894736842105, + "grad_norm": 0.09760884292596834, + "learning_rate": 0.00019610080914943492, + "loss": 0.31, + "step": 411 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 0.09299566319312146, + "learning_rate": 0.0001960583615496984, + "loss": 0.2892, + "step": 412 + }, + { + "epoch": 0.5434210526315789, + "grad_norm": 0.09898383275023284, + "learning_rate": 0.0001960156887957257, + "loss": 0.3005, + "step": 413 + }, + { + "epoch": 0.5447368421052632, + "grad_norm": 0.10036363231456787, + "learning_rate": 0.0001959727909875389, + "loss": 0.2882, + "step": 414 + }, + { + "epoch": 0.5460526315789473, + "grad_norm": 0.08979092246398529, + "learning_rate": 0.00019592966822568753, + "loss": 0.301, + "step": 415 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.09422502905792629, + "learning_rate": 0.00019588632061124837, + "loss": 0.2851, + "step": 416 + }, + { + "epoch": 0.5486842105263158, + "grad_norm": 0.08769883690329003, + "learning_rate": 0.0001958427482458253, + "loss": 0.2874, + "step": 417 + }, + { + "epoch": 0.55, + "grad_norm": 0.10126724866744835, + "learning_rate": 0.0001957989512315489, + "loss": 0.3068, + "step": 418 + }, + { + "epoch": 0.5513157894736842, + "grad_norm": 0.09755145401929033, + "learning_rate": 0.00019575492967107642, + "loss": 0.3072, + "step": 419 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.09089099824025165, + "learning_rate": 0.00019571068366759143, + "loss": 0.2891, + "step": 420 + }, + { + "epoch": 0.5539473684210526, + "grad_norm": 0.0961002056223822, + "learning_rate": 0.00019566621332480348, + "loss": 0.2935, + "step": 421 + }, + { + "epoch": 0.5552631578947368, + "grad_norm": 0.09426174969645366, + "learning_rate": 0.00019562151874694803, + "loss": 0.2925, + "step": 422 + }, + { + "epoch": 0.5565789473684211, + "grad_norm": 0.09928190504185619, + "learning_rate": 0.00019557660003878614, + "loss": 0.3055, + "step": 423 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 0.09531727011576784, + "learning_rate": 0.00019553145730560415, + "loss": 0.2916, + "step": 424 + }, + { + "epoch": 0.5592105263157895, + "grad_norm": 0.09655059432651882, + "learning_rate": 0.00019548609065321356, + "loss": 0.3058, + "step": 425 + }, + { + "epoch": 0.5605263157894737, + "grad_norm": 0.09707096390323401, + "learning_rate": 0.00019544050018795075, + "loss": 0.2873, + "step": 426 + }, + { + "epoch": 0.5618421052631579, + "grad_norm": 0.09428953984002672, + "learning_rate": 0.00019539468601667662, + "loss": 0.298, + "step": 427 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 0.09658263944025584, + "learning_rate": 0.0001953486482467764, + "loss": 0.3004, + "step": 428 + }, + { + "epoch": 0.5644736842105263, + "grad_norm": 0.10052098718968268, + "learning_rate": 0.00019530238698615957, + "loss": 0.2975, + "step": 429 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.09350068437861885, + "learning_rate": 0.00019525590234325933, + "loss": 0.2977, + "step": 430 + }, + { + "epoch": 0.5671052631578948, + "grad_norm": 0.09265833701591007, + "learning_rate": 0.00019520919442703245, + "loss": 0.2941, + "step": 431 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.0914948295571144, + "learning_rate": 0.0001951622633469592, + "loss": 0.3032, + "step": 432 + }, + { + "epoch": 0.5697368421052632, + "grad_norm": 0.09538108350867988, + "learning_rate": 0.00019511510921304273, + "loss": 0.3057, + "step": 433 + }, + { + "epoch": 0.5710526315789474, + "grad_norm": 0.09388517597881102, + "learning_rate": 0.00019506773213580917, + "loss": 0.2974, + "step": 434 + }, + { + "epoch": 0.5723684210526315, + "grad_norm": 0.09853399504304618, + "learning_rate": 0.00019502013222630712, + "loss": 0.2831, + "step": 435 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 0.09228736628869874, + "learning_rate": 0.00019497230959610756, + "loss": 0.2885, + "step": 436 + }, + { + "epoch": 0.575, + "grad_norm": 0.09786678573184258, + "learning_rate": 0.0001949242643573034, + "loss": 0.3082, + "step": 437 + }, + { + "epoch": 0.5763157894736842, + "grad_norm": 0.09190204992934663, + "learning_rate": 0.00019487599662250943, + "loss": 0.2974, + "step": 438 + }, + { + "epoch": 0.5776315789473684, + "grad_norm": 0.09838875478146845, + "learning_rate": 0.00019482750650486193, + "loss": 0.2869, + "step": 439 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.09080464891287887, + "learning_rate": 0.00019477879411801844, + "loss": 0.2999, + "step": 440 + }, + { + "epoch": 0.5802631578947368, + "grad_norm": 0.094818410464559, + "learning_rate": 0.0001947298595761574, + "loss": 0.2948, + "step": 441 + }, + { + "epoch": 0.5815789473684211, + "grad_norm": 0.09229949003285352, + "learning_rate": 0.00019468070299397808, + "loss": 0.2851, + "step": 442 + }, + { + "epoch": 0.5828947368421052, + "grad_norm": 0.0946200275878873, + "learning_rate": 0.0001946313244867002, + "loss": 0.3035, + "step": 443 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 0.0972452681474162, + "learning_rate": 0.00019458172417006347, + "loss": 0.2921, + "step": 444 + }, + { + "epoch": 0.5855263157894737, + "grad_norm": 0.09168097969668783, + "learning_rate": 0.00019453190216032776, + "loss": 0.2965, + "step": 445 + }, + { + "epoch": 0.5868421052631579, + "grad_norm": 0.09238240620410484, + "learning_rate": 0.00019448185857427242, + "loss": 0.2965, + "step": 446 + }, + { + "epoch": 0.5881578947368421, + "grad_norm": 0.10748289837029415, + "learning_rate": 0.00019443159352919623, + "loss": 0.2912, + "step": 447 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.09871636341616816, + "learning_rate": 0.00019438110714291694, + "loss": 0.3111, + "step": 448 + }, + { + "epoch": 0.5907894736842105, + "grad_norm": 0.08861315777205786, + "learning_rate": 0.00019433039953377127, + "loss": 0.2891, + "step": 449 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.09570467370902319, + "learning_rate": 0.00019427947082061432, + "loss": 0.3046, + "step": 450 + }, + { + "epoch": 0.593421052631579, + "grad_norm": 0.0882411637366609, + "learning_rate": 0.00019422832112281962, + "loss": 0.2948, + "step": 451 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 0.08681692074605786, + "learning_rate": 0.00019417695056027844, + "loss": 0.2975, + "step": 452 + }, + { + "epoch": 0.5960526315789474, + "grad_norm": 0.0894965926644007, + "learning_rate": 0.00019412535925339997, + "loss": 0.2982, + "step": 453 + }, + { + "epoch": 0.5973684210526315, + "grad_norm": 0.08949478378197007, + "learning_rate": 0.00019407354732311064, + "loss": 0.2923, + "step": 454 + }, + { + "epoch": 0.5986842105263158, + "grad_norm": 0.0952879006180906, + "learning_rate": 0.0001940215148908541, + "loss": 0.2959, + "step": 455 + }, + { + "epoch": 0.6, + "grad_norm": 0.09341215938159328, + "learning_rate": 0.00019396926207859084, + "loss": 0.2987, + "step": 456 + }, + { + "epoch": 0.6013157894736842, + "grad_norm": 0.08989810174624617, + "learning_rate": 0.00019391678900879786, + "loss": 0.2991, + "step": 457 + }, + { + "epoch": 0.6026315789473684, + "grad_norm": 0.09052527717801154, + "learning_rate": 0.00019386409580446844, + "loss": 0.2841, + "step": 458 + }, + { + "epoch": 0.6039473684210527, + "grad_norm": 0.09246686105449765, + "learning_rate": 0.00019381118258911186, + "loss": 0.2998, + "step": 459 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.09632503586933118, + "learning_rate": 0.00019375804948675306, + "loss": 0.3075, + "step": 460 + }, + { + "epoch": 0.6065789473684211, + "grad_norm": 0.09476157150253976, + "learning_rate": 0.00019370469662193248, + "loss": 0.2981, + "step": 461 + }, + { + "epoch": 0.6078947368421053, + "grad_norm": 0.08742707617370972, + "learning_rate": 0.0001936511241197055, + "loss": 0.2805, + "step": 462 + }, + { + "epoch": 0.6092105263157894, + "grad_norm": 0.09283003181432853, + "learning_rate": 0.00019359733210564244, + "loss": 0.2955, + "step": 463 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.09458543714491262, + "learning_rate": 0.0001935433207058281, + "loss": 0.2949, + "step": 464 + }, + { + "epoch": 0.6118421052631579, + "grad_norm": 0.08784515458042345, + "learning_rate": 0.00019348909004686152, + "loss": 0.3033, + "step": 465 + }, + { + "epoch": 0.6131578947368421, + "grad_norm": 0.08813027640751597, + "learning_rate": 0.00019343464025585563, + "loss": 0.2949, + "step": 466 + }, + { + "epoch": 0.6144736842105263, + "grad_norm": 0.09671795564332149, + "learning_rate": 0.00019337997146043708, + "loss": 0.3022, + "step": 467 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 0.09555526827290024, + "learning_rate": 0.0001933250837887457, + "loss": 0.2956, + "step": 468 + }, + { + "epoch": 0.6171052631578947, + "grad_norm": 0.08589539494059092, + "learning_rate": 0.00019326997736943455, + "loss": 0.3052, + "step": 469 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.08999975458012179, + "learning_rate": 0.00019321465233166924, + "loss": 0.2949, + "step": 470 + }, + { + "epoch": 0.6197368421052631, + "grad_norm": 0.09506140617593234, + "learning_rate": 0.0001931591088051279, + "loss": 0.3151, + "step": 471 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 0.09611958378062155, + "learning_rate": 0.00019310334692000075, + "loss": 0.2907, + "step": 472 + }, + { + "epoch": 0.6223684210526316, + "grad_norm": 0.09078308388388881, + "learning_rate": 0.00019304736680698988, + "loss": 0.2969, + "step": 473 + }, + { + "epoch": 0.6236842105263158, + "grad_norm": 0.09080240341624422, + "learning_rate": 0.0001929911685973088, + "loss": 0.2969, + "step": 474 + }, + { + "epoch": 0.625, + "grad_norm": 0.08939149414674773, + "learning_rate": 0.00019293475242268223, + "loss": 0.2849, + "step": 475 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 0.08769153124531658, + "learning_rate": 0.00019287811841534595, + "loss": 0.2857, + "step": 476 + }, + { + "epoch": 0.6276315789473684, + "grad_norm": 0.09345365417023815, + "learning_rate": 0.00019282126670804614, + "loss": 0.3044, + "step": 477 + }, + { + "epoch": 0.6289473684210526, + "grad_norm": 0.09137801449382149, + "learning_rate": 0.00019276419743403933, + "loss": 0.2993, + "step": 478 + }, + { + "epoch": 0.6302631578947369, + "grad_norm": 0.090592310213129, + "learning_rate": 0.00019270691072709195, + "loss": 0.3005, + "step": 479 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.09174534803218672, + "learning_rate": 0.00019264940672148018, + "loss": 0.3059, + "step": 480 + }, + { + "epoch": 0.6328947368421053, + "grad_norm": 0.0871835682901987, + "learning_rate": 0.00019259168555198948, + "loss": 0.286, + "step": 481 + }, + { + "epoch": 0.6342105263157894, + "grad_norm": 0.0892410180390742, + "learning_rate": 0.0001925337473539143, + "loss": 0.2788, + "step": 482 + }, + { + "epoch": 0.6355263157894737, + "grad_norm": 0.09248088501840468, + "learning_rate": 0.00019247559226305785, + "loss": 0.2948, + "step": 483 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 0.09098010321941165, + "learning_rate": 0.00019241722041573166, + "loss": 0.3028, + "step": 484 + }, + { + "epoch": 0.6381578947368421, + "grad_norm": 0.0896570234030572, + "learning_rate": 0.00019235863194875532, + "loss": 0.2832, + "step": 485 + }, + { + "epoch": 0.6394736842105263, + "grad_norm": 0.08779926165652006, + "learning_rate": 0.0001922998269994563, + "loss": 0.2987, + "step": 486 + }, + { + "epoch": 0.6407894736842106, + "grad_norm": 0.09365677051236368, + "learning_rate": 0.00019224080570566927, + "loss": 0.2912, + "step": 487 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 0.08927856303791634, + "learning_rate": 0.0001921815682057362, + "loss": 0.2966, + "step": 488 + }, + { + "epoch": 0.6434210526315789, + "grad_norm": 0.0879241771326983, + "learning_rate": 0.00019212211463850567, + "loss": 0.284, + "step": 489 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.0878734974007894, + "learning_rate": 0.00019206244514333282, + "loss": 0.2834, + "step": 490 + }, + { + "epoch": 0.6460526315789473, + "grad_norm": 0.08899613109416632, + "learning_rate": 0.00019200255986007885, + "loss": 0.2905, + "step": 491 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 0.0868302957136876, + "learning_rate": 0.0001919424589291108, + "loss": 0.2994, + "step": 492 + }, + { + "epoch": 0.6486842105263158, + "grad_norm": 0.08873110421371189, + "learning_rate": 0.0001918821424913011, + "loss": 0.3054, + "step": 493 + }, + { + "epoch": 0.65, + "grad_norm": 0.08729340226342107, + "learning_rate": 0.00019182161068802741, + "loss": 0.2926, + "step": 494 + }, + { + "epoch": 0.6513157894736842, + "grad_norm": 0.09075104025668673, + "learning_rate": 0.00019176086366117211, + "loss": 0.304, + "step": 495 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 0.0893794254125347, + "learning_rate": 0.0001916999015531221, + "loss": 0.2917, + "step": 496 + }, + { + "epoch": 0.6539473684210526, + "grad_norm": 0.089792074117038, + "learning_rate": 0.00019163872450676835, + "loss": 0.2812, + "step": 497 + }, + { + "epoch": 0.6552631578947369, + "grad_norm": 0.09280308668273284, + "learning_rate": 0.00019157733266550575, + "loss": 0.3024, + "step": 498 + }, + { + "epoch": 0.656578947368421, + "grad_norm": 0.0871128187479411, + "learning_rate": 0.00019151572617323253, + "loss": 0.29, + "step": 499 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.08870708451193264, + "learning_rate": 0.00019145390517435012, + "loss": 0.3034, + "step": 500 + }, + { + "epoch": 0.6592105263157895, + "grad_norm": 0.08720360293850979, + "learning_rate": 0.00019139186981376267, + "loss": 0.2844, + "step": 501 + }, + { + "epoch": 0.6605263157894737, + "grad_norm": 0.08832729646265089, + "learning_rate": 0.0001913296202368769, + "loss": 0.2999, + "step": 502 + }, + { + "epoch": 0.6618421052631579, + "grad_norm": 0.08817831223979289, + "learning_rate": 0.0001912671565896015, + "loss": 0.3034, + "step": 503 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 0.08919226453530668, + "learning_rate": 0.00019120447901834706, + "loss": 0.2865, + "step": 504 + }, + { + "epoch": 0.6644736842105263, + "grad_norm": 0.09256732471786443, + "learning_rate": 0.00019114158767002547, + "loss": 0.2887, + "step": 505 + }, + { + "epoch": 0.6657894736842105, + "grad_norm": 0.08807553839373092, + "learning_rate": 0.00019107848269204976, + "loss": 0.2864, + "step": 506 + }, + { + "epoch": 0.6671052631578948, + "grad_norm": 0.08590339271381157, + "learning_rate": 0.00019101516423233368, + "loss": 0.2878, + "step": 507 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 0.08651840397545302, + "learning_rate": 0.00019095163243929142, + "loss": 0.3005, + "step": 508 + }, + { + "epoch": 0.6697368421052632, + "grad_norm": 0.08955034533153189, + "learning_rate": 0.00019088788746183714, + "loss": 0.2906, + "step": 509 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.08824857512769824, + "learning_rate": 0.00019082392944938466, + "loss": 0.2826, + "step": 510 + }, + { + "epoch": 0.6723684210526316, + "grad_norm": 0.08981332868291347, + "learning_rate": 0.00019075975855184724, + "loss": 0.2791, + "step": 511 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 0.08872229343602653, + "learning_rate": 0.0001906953749196371, + "loss": 0.2941, + "step": 512 + }, + { + "epoch": 0.675, + "grad_norm": 0.08604148465349523, + "learning_rate": 0.000190630778703665, + "loss": 0.2898, + "step": 513 + }, + { + "epoch": 0.6763157894736842, + "grad_norm": 0.09068748827778476, + "learning_rate": 0.00019056597005534013, + "loss": 0.3052, + "step": 514 + }, + { + "epoch": 0.6776315789473685, + "grad_norm": 0.09179119755745117, + "learning_rate": 0.00019050094912656952, + "loss": 0.3034, + "step": 515 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 0.08427846310926745, + "learning_rate": 0.00019043571606975777, + "loss": 0.2948, + "step": 516 + }, + { + "epoch": 0.6802631578947368, + "grad_norm": 0.08476965472606288, + "learning_rate": 0.00019037027103780668, + "loss": 0.2883, + "step": 517 + }, + { + "epoch": 0.6815789473684211, + "grad_norm": 0.08421580836098888, + "learning_rate": 0.00019030461418411497, + "loss": 0.2775, + "step": 518 + }, + { + "epoch": 0.6828947368421052, + "grad_norm": 0.08618030098765003, + "learning_rate": 0.00019023874566257784, + "loss": 0.2771, + "step": 519 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.08571261541013994, + "learning_rate": 0.00019017266562758659, + "loss": 0.289, + "step": 520 + }, + { + "epoch": 0.6855263157894737, + "grad_norm": 0.09462804464406739, + "learning_rate": 0.00019010637423402823, + "loss": 0.31, + "step": 521 + }, + { + "epoch": 0.6868421052631579, + "grad_norm": 0.09220006978220673, + "learning_rate": 0.00019003987163728535, + "loss": 0.2898, + "step": 522 + }, + { + "epoch": 0.6881578947368421, + "grad_norm": 0.08687665486527996, + "learning_rate": 0.00018997315799323548, + "loss": 0.2914, + "step": 523 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 0.08629627380926978, + "learning_rate": 0.00018990623345825083, + "loss": 0.2875, + "step": 524 + }, + { + "epoch": 0.6907894736842105, + "grad_norm": 0.08990734481501246, + "learning_rate": 0.0001898390981891979, + "loss": 0.2935, + "step": 525 + }, + { + "epoch": 0.6921052631578948, + "grad_norm": 0.09107845137785629, + "learning_rate": 0.00018977175234343723, + "loss": 0.293, + "step": 526 + }, + { + "epoch": 0.6934210526315789, + "grad_norm": 0.08746893451432518, + "learning_rate": 0.00018970419607882284, + "loss": 0.2834, + "step": 527 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.08510400660112896, + "learning_rate": 0.00018963642955370201, + "loss": 0.2836, + "step": 528 + }, + { + "epoch": 0.6960526315789474, + "grad_norm": 0.08902379266326126, + "learning_rate": 0.00018956845292691487, + "loss": 0.2918, + "step": 529 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.08716702459764675, + "learning_rate": 0.00018950026635779397, + "loss": 0.3018, + "step": 530 + }, + { + "epoch": 0.6986842105263158, + "grad_norm": 0.0860349723743795, + "learning_rate": 0.00018943187000616395, + "loss": 0.2862, + "step": 531 + }, + { + "epoch": 0.7, + "grad_norm": 0.08640233925578623, + "learning_rate": 0.00018936326403234125, + "loss": 0.281, + "step": 532 + }, + { + "epoch": 0.7013157894736842, + "grad_norm": 0.08958858699111341, + "learning_rate": 0.0001892944485971335, + "loss": 0.2835, + "step": 533 + }, + { + "epoch": 0.7026315789473684, + "grad_norm": 0.09155021097028584, + "learning_rate": 0.0001892254238618394, + "loss": 0.2871, + "step": 534 + }, + { + "epoch": 0.7039473684210527, + "grad_norm": 0.08704346353009886, + "learning_rate": 0.00018915618998824825, + "loss": 0.2773, + "step": 535 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 0.09429002348494249, + "learning_rate": 0.00018908674713863952, + "loss": 0.2995, + "step": 536 + }, + { + "epoch": 0.7065789473684211, + "grad_norm": 0.0872284448725193, + "learning_rate": 0.00018901709547578245, + "loss": 0.29, + "step": 537 + }, + { + "epoch": 0.7078947368421052, + "grad_norm": 0.08323277604666501, + "learning_rate": 0.00018894723516293583, + "loss": 0.2924, + "step": 538 + }, + { + "epoch": 0.7092105263157895, + "grad_norm": 0.09209955844526282, + "learning_rate": 0.00018887716636384745, + "loss": 0.3093, + "step": 539 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.0928458401326009, + "learning_rate": 0.00018880688924275378, + "loss": 0.2898, + "step": 540 + }, + { + "epoch": 0.7118421052631579, + "grad_norm": 0.08447656093725353, + "learning_rate": 0.00018873640396437958, + "loss": 0.2998, + "step": 541 + }, + { + "epoch": 0.7131578947368421, + "grad_norm": 0.08914403154111118, + "learning_rate": 0.00018866571069393753, + "loss": 0.2975, + "step": 542 + }, + { + "epoch": 0.7144736842105263, + "grad_norm": 0.08061793607147465, + "learning_rate": 0.0001885948095971278, + "loss": 0.2693, + "step": 543 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 0.08081926207565744, + "learning_rate": 0.0001885237008401378, + "loss": 0.2833, + "step": 544 + }, + { + "epoch": 0.7171052631578947, + "grad_norm": 0.08814474168945655, + "learning_rate": 0.00018845238458964155, + "loss": 0.2867, + "step": 545 + }, + { + "epoch": 0.718421052631579, + "grad_norm": 0.08581179650874697, + "learning_rate": 0.00018838086101279945, + "loss": 0.2966, + "step": 546 + }, + { + "epoch": 0.7197368421052631, + "grad_norm": 0.08824691850575504, + "learning_rate": 0.0001883091302772579, + "loss": 0.2855, + "step": 547 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 0.08188688143306905, + "learning_rate": 0.0001882371925511488, + "loss": 0.2913, + "step": 548 + }, + { + "epoch": 0.7223684210526315, + "grad_norm": 0.08584104136037612, + "learning_rate": 0.00018816504800308934, + "loss": 0.2907, + "step": 549 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.08410484967989623, + "learning_rate": 0.00018809269680218136, + "loss": 0.2745, + "step": 550 + }, + { + "epoch": 0.725, + "grad_norm": 0.08587157095386123, + "learning_rate": 0.00018802013911801112, + "loss": 0.2881, + "step": 551 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 0.09288984345534493, + "learning_rate": 0.0001879473751206489, + "loss": 0.3067, + "step": 552 + }, + { + "epoch": 0.7276315789473684, + "grad_norm": 0.08697284865649188, + "learning_rate": 0.00018787440498064856, + "loss": 0.2919, + "step": 553 + }, + { + "epoch": 0.7289473684210527, + "grad_norm": 0.08793299552682735, + "learning_rate": 0.00018780122886904709, + "loss": 0.3029, + "step": 554 + }, + { + "epoch": 0.7302631578947368, + "grad_norm": 0.08461397738242368, + "learning_rate": 0.0001877278469573643, + "loss": 0.2977, + "step": 555 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.0874714643672974, + "learning_rate": 0.00018765425941760238, + "loss": 0.2928, + "step": 556 + }, + { + "epoch": 0.7328947368421053, + "grad_norm": 0.0873090646029582, + "learning_rate": 0.0001875804664222455, + "loss": 0.3066, + "step": 557 + }, + { + "epoch": 0.7342105263157894, + "grad_norm": 0.08379654737914938, + "learning_rate": 0.00018750646814425938, + "loss": 0.2977, + "step": 558 + }, + { + "epoch": 0.7355263157894737, + "grad_norm": 0.09314788862619253, + "learning_rate": 0.00018743226475709094, + "loss": 0.2963, + "step": 559 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.08588334887643348, + "learning_rate": 0.00018735785643466784, + "loss": 0.2878, + "step": 560 + }, + { + "epoch": 0.7381578947368421, + "grad_norm": 0.08163329639613695, + "learning_rate": 0.00018728324335139814, + "loss": 0.2991, + "step": 561 + }, + { + "epoch": 0.7394736842105263, + "grad_norm": 0.08439090485689141, + "learning_rate": 0.00018720842568216978, + "loss": 0.274, + "step": 562 + }, + { + "epoch": 0.7407894736842106, + "grad_norm": 0.09031678503562862, + "learning_rate": 0.0001871334036023503, + "loss": 0.2942, + "step": 563 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 0.0812973217567514, + "learning_rate": 0.00018705817728778624, + "loss": 0.2725, + "step": 564 + }, + { + "epoch": 0.743421052631579, + "grad_norm": 0.08201157379247533, + "learning_rate": 0.00018698274691480302, + "loss": 0.2712, + "step": 565 + }, + { + "epoch": 0.7447368421052631, + "grad_norm": 0.08733882187748677, + "learning_rate": 0.00018690711266020426, + "loss": 0.2841, + "step": 566 + }, + { + "epoch": 0.7460526315789474, + "grad_norm": 0.08741424395823033, + "learning_rate": 0.0001868312747012715, + "loss": 0.2763, + "step": 567 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 0.08660511543001677, + "learning_rate": 0.00018675523321576371, + "loss": 0.2847, + "step": 568 + }, + { + "epoch": 0.7486842105263158, + "grad_norm": 0.08668938841950703, + "learning_rate": 0.00018667898838191694, + "loss": 0.2821, + "step": 569 + }, + { + "epoch": 0.75, + "grad_norm": 0.08397772374383645, + "learning_rate": 0.00018660254037844388, + "loss": 0.2919, + "step": 570 + }, + { + "epoch": 0.7513157894736842, + "grad_norm": 0.0890009207591882, + "learning_rate": 0.0001865258893845334, + "loss": 0.2935, + "step": 571 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 0.08615071748725678, + "learning_rate": 0.00018644903557985025, + "loss": 0.2862, + "step": 572 + }, + { + "epoch": 0.7539473684210526, + "grad_norm": 0.08035895214007202, + "learning_rate": 0.00018637197914453445, + "loss": 0.2915, + "step": 573 + }, + { + "epoch": 0.7552631578947369, + "grad_norm": 0.08394144603894776, + "learning_rate": 0.000186294720259201, + "loss": 0.2905, + "step": 574 + }, + { + "epoch": 0.756578947368421, + "grad_norm": 0.08681582101826608, + "learning_rate": 0.0001862172591049395, + "loss": 0.2927, + "step": 575 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 0.08027258031476615, + "learning_rate": 0.00018613959586331362, + "loss": 0.2717, + "step": 576 + }, + { + "epoch": 0.7592105263157894, + "grad_norm": 0.0832571328383153, + "learning_rate": 0.0001860617307163606, + "loss": 0.2877, + "step": 577 + }, + { + "epoch": 0.7605263157894737, + "grad_norm": 0.0855055635944409, + "learning_rate": 0.0001859836638465911, + "loss": 0.2879, + "step": 578 + }, + { + "epoch": 0.7618421052631579, + "grad_norm": 0.08455171659738935, + "learning_rate": 0.00018590539543698854, + "loss": 0.2946, + "step": 579 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.08740787324026139, + "learning_rate": 0.00018582692567100867, + "loss": 0.2835, + "step": 580 + }, + { + "epoch": 0.7644736842105263, + "grad_norm": 0.08908515189955798, + "learning_rate": 0.00018574825473257925, + "loss": 0.2917, + "step": 581 + }, + { + "epoch": 0.7657894736842106, + "grad_norm": 0.0856923697970573, + "learning_rate": 0.00018566938280609966, + "loss": 0.2794, + "step": 582 + }, + { + "epoch": 0.7671052631578947, + "grad_norm": 0.08400580488406503, + "learning_rate": 0.00018559031007644024, + "loss": 0.2764, + "step": 583 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 0.08599213131358657, + "learning_rate": 0.00018551103672894206, + "loss": 0.2902, + "step": 584 + }, + { + "epoch": 0.7697368421052632, + "grad_norm": 0.08612768497888425, + "learning_rate": 0.0001854315629494165, + "loss": 0.2913, + "step": 585 + }, + { + "epoch": 0.7710526315789473, + "grad_norm": 0.08848005818883788, + "learning_rate": 0.0001853518889241446, + "loss": 0.2913, + "step": 586 + }, + { + "epoch": 0.7723684210526316, + "grad_norm": 0.08443059614838172, + "learning_rate": 0.0001852720148398769, + "loss": 0.283, + "step": 587 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 0.08864582533090447, + "learning_rate": 0.00018519194088383273, + "loss": 0.2888, + "step": 588 + }, + { + "epoch": 0.775, + "grad_norm": 0.08840947193889603, + "learning_rate": 0.00018511166724369997, + "loss": 0.2931, + "step": 589 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.08363366619678043, + "learning_rate": 0.0001850311941076346, + "loss": 0.2782, + "step": 590 + }, + { + "epoch": 0.7776315789473685, + "grad_norm": 0.08538148462925309, + "learning_rate": 0.00018495052166426015, + "loss": 0.2856, + "step": 591 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 0.08343535715617335, + "learning_rate": 0.00018486965010266725, + "loss": 0.2847, + "step": 592 + }, + { + "epoch": 0.7802631578947369, + "grad_norm": 0.08230524033132103, + "learning_rate": 0.00018478857961241337, + "loss": 0.2878, + "step": 593 + }, + { + "epoch": 0.781578947368421, + "grad_norm": 0.08479401897743853, + "learning_rate": 0.0001847073103835222, + "loss": 0.2873, + "step": 594 + }, + { + "epoch": 0.7828947368421053, + "grad_norm": 0.08524719297922047, + "learning_rate": 0.00018462584260648323, + "loss": 0.2845, + "step": 595 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 0.08448036993692473, + "learning_rate": 0.0001845441764722514, + "loss": 0.288, + "step": 596 + }, + { + "epoch": 0.7855263157894737, + "grad_norm": 0.08740687637602888, + "learning_rate": 0.0001844623121722465, + "loss": 0.2753, + "step": 597 + }, + { + "epoch": 0.7868421052631579, + "grad_norm": 0.08303399012001453, + "learning_rate": 0.0001843802498983529, + "loss": 0.2875, + "step": 598 + }, + { + "epoch": 0.7881578947368421, + "grad_norm": 0.08523557992775858, + "learning_rate": 0.00018429798984291896, + "loss": 0.2843, + "step": 599 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.0882338916613198, + "learning_rate": 0.00018421553219875658, + "loss": 0.3044, + "step": 600 + }, + { + "epoch": 0.7907894736842105, + "grad_norm": 0.08386004069859475, + "learning_rate": 0.00018413287715914089, + "loss": 0.2744, + "step": 601 + }, + { + "epoch": 0.7921052631578948, + "grad_norm": 0.0860906320981426, + "learning_rate": 0.00018405002491780968, + "loss": 0.2904, + "step": 602 + }, + { + "epoch": 0.7934210526315789, + "grad_norm": 0.08412547381525294, + "learning_rate": 0.00018396697566896286, + "loss": 0.2944, + "step": 603 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 0.08160487708739002, + "learning_rate": 0.00018388372960726228, + "loss": 0.2894, + "step": 604 + }, + { + "epoch": 0.7960526315789473, + "grad_norm": 0.07789675625414566, + "learning_rate": 0.00018380028692783096, + "loss": 0.289, + "step": 605 + }, + { + "epoch": 0.7973684210526316, + "grad_norm": 0.08326388068355157, + "learning_rate": 0.00018371664782625287, + "loss": 0.2757, + "step": 606 + }, + { + "epoch": 0.7986842105263158, + "grad_norm": 0.0813892959155161, + "learning_rate": 0.00018363281249857233, + "loss": 0.2805, + "step": 607 + }, + { + "epoch": 0.8, + "grad_norm": 0.08338896150212435, + "learning_rate": 0.00018354878114129367, + "loss": 0.2761, + "step": 608 + }, + { + "epoch": 0.8013157894736842, + "grad_norm": 0.08472324714205545, + "learning_rate": 0.00018346455395138058, + "loss": 0.2814, + "step": 609 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.08513461063729397, + "learning_rate": 0.00018338013112625587, + "loss": 0.2952, + "step": 610 + }, + { + "epoch": 0.8039473684210526, + "grad_norm": 0.08100863304109138, + "learning_rate": 0.00018329551286380087, + "loss": 0.2756, + "step": 611 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 0.0817298921819726, + "learning_rate": 0.00018321069936235503, + "loss": 0.2788, + "step": 612 + }, + { + "epoch": 0.8065789473684211, + "grad_norm": 0.08139556434473312, + "learning_rate": 0.00018312569082071535, + "loss": 0.2855, + "step": 613 + }, + { + "epoch": 0.8078947368421052, + "grad_norm": 0.08426481750684653, + "learning_rate": 0.0001830404874381361, + "loss": 0.2932, + "step": 614 + }, + { + "epoch": 0.8092105263157895, + "grad_norm": 0.08338650300983878, + "learning_rate": 0.00018295508941432815, + "loss": 0.291, + "step": 615 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 0.0841355163206272, + "learning_rate": 0.00018286949694945866, + "loss": 0.2905, + "step": 616 + }, + { + "epoch": 0.8118421052631579, + "grad_norm": 0.08411824810347067, + "learning_rate": 0.0001827837102441505, + "loss": 0.2992, + "step": 617 + }, + { + "epoch": 0.8131578947368421, + "grad_norm": 0.08090022262513084, + "learning_rate": 0.00018269772949948182, + "loss": 0.2906, + "step": 618 + }, + { + "epoch": 0.8144736842105263, + "grad_norm": 0.0788680440518934, + "learning_rate": 0.00018261155491698568, + "loss": 0.2939, + "step": 619 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.07833398942854092, + "learning_rate": 0.00018252518669864936, + "loss": 0.2873, + "step": 620 + }, + { + "epoch": 0.8171052631578948, + "grad_norm": 0.08646064241825135, + "learning_rate": 0.00018243862504691407, + "loss": 0.2831, + "step": 621 + }, + { + "epoch": 0.8184210526315789, + "grad_norm": 0.07939171423489742, + "learning_rate": 0.00018235187016467442, + "loss": 0.2831, + "step": 622 + }, + { + "epoch": 0.8197368421052632, + "grad_norm": 0.08346477115007297, + "learning_rate": 0.0001822649222552779, + "loss": 0.2832, + "step": 623 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.08777211185850274, + "learning_rate": 0.0001821777815225245, + "loss": 0.2946, + "step": 624 + }, + { + "epoch": 0.8223684210526315, + "grad_norm": 0.08176020142698397, + "learning_rate": 0.00018209044817066617, + "loss": 0.2808, + "step": 625 + }, + { + "epoch": 0.8236842105263158, + "grad_norm": 0.083353664452084, + "learning_rate": 0.00018200292240440623, + "loss": 0.2964, + "step": 626 + }, + { + "epoch": 0.825, + "grad_norm": 0.08320257861919898, + "learning_rate": 0.0001819152044288992, + "loss": 0.2687, + "step": 627 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 0.08315187924191537, + "learning_rate": 0.00018182729444974992, + "loss": 0.2911, + "step": 628 + }, + { + "epoch": 0.8276315789473684, + "grad_norm": 0.08465028554778517, + "learning_rate": 0.00018173919267301344, + "loss": 0.2849, + "step": 629 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.0808599203291463, + "learning_rate": 0.0001816508993051943, + "loss": 0.2842, + "step": 630 + }, + { + "epoch": 0.8302631578947368, + "grad_norm": 0.08319880407928407, + "learning_rate": 0.0001815624145532461, + "loss": 0.2878, + "step": 631 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 0.08454873694804513, + "learning_rate": 0.00018147373862457107, + "loss": 0.2977, + "step": 632 + }, + { + "epoch": 0.8328947368421052, + "grad_norm": 0.08522249341627423, + "learning_rate": 0.0001813848717270195, + "loss": 0.2962, + "step": 633 + }, + { + "epoch": 0.8342105263157895, + "grad_norm": 0.08804872368012215, + "learning_rate": 0.00018129581406888936, + "loss": 0.283, + "step": 634 + }, + { + "epoch": 0.8355263157894737, + "grad_norm": 0.08019331668231523, + "learning_rate": 0.00018120656585892572, + "loss": 0.2856, + "step": 635 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 0.08839395034159263, + "learning_rate": 0.00018111712730632022, + "loss": 0.3064, + "step": 636 + }, + { + "epoch": 0.8381578947368421, + "grad_norm": 0.08267930670342614, + "learning_rate": 0.00018102749862071083, + "loss": 0.2884, + "step": 637 + }, + { + "epoch": 0.8394736842105263, + "grad_norm": 0.08334284753944939, + "learning_rate": 0.00018093768001218094, + "loss": 0.2882, + "step": 638 + }, + { + "epoch": 0.8407894736842105, + "grad_norm": 0.08293903985279776, + "learning_rate": 0.00018084767169125932, + "loss": 0.2855, + "step": 639 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.08224010769882795, + "learning_rate": 0.0001807574738689193, + "loss": 0.3051, + "step": 640 + }, + { + "epoch": 0.843421052631579, + "grad_norm": 0.07771382955899404, + "learning_rate": 0.00018066708675657837, + "loss": 0.255, + "step": 641 + }, + { + "epoch": 0.8447368421052631, + "grad_norm": 0.08276207696773076, + "learning_rate": 0.00018057651056609784, + "loss": 0.2851, + "step": 642 + }, + { + "epoch": 0.8460526315789474, + "grad_norm": 0.08138572227367649, + "learning_rate": 0.000180485745509782, + "loss": 0.2726, + "step": 643 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 0.07991337682516308, + "learning_rate": 0.000180394791800378, + "loss": 0.2795, + "step": 644 + }, + { + "epoch": 0.8486842105263158, + "grad_norm": 0.08368290547552978, + "learning_rate": 0.0001803036496510752, + "loss": 0.2876, + "step": 645 + }, + { + "epoch": 0.85, + "grad_norm": 0.08602046286102522, + "learning_rate": 0.0001802123192755044, + "loss": 0.2899, + "step": 646 + }, + { + "epoch": 0.8513157894736842, + "grad_norm": 0.08247602682736774, + "learning_rate": 0.00018012080088773786, + "loss": 0.289, + "step": 647 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 0.08076494362770659, + "learning_rate": 0.00018002909470228842, + "loss": 0.2953, + "step": 648 + }, + { + "epoch": 0.8539473684210527, + "grad_norm": 0.08213269492727011, + "learning_rate": 0.0001799372009341091, + "loss": 0.2834, + "step": 649 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.07906596825371759, + "learning_rate": 0.00017984511979859263, + "loss": 0.2884, + "step": 650 + }, + { + "epoch": 0.8565789473684211, + "grad_norm": 0.08280102631376626, + "learning_rate": 0.0001797528515115709, + "loss": 0.2898, + "step": 651 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 0.08195845362740756, + "learning_rate": 0.00017966039628931446, + "loss": 0.2811, + "step": 652 + }, + { + "epoch": 0.8592105263157894, + "grad_norm": 0.08275884521115351, + "learning_rate": 0.00017956775434853201, + "loss": 0.2809, + "step": 653 + }, + { + "epoch": 0.8605263157894737, + "grad_norm": 0.0823561522762983, + "learning_rate": 0.00017947492590637, + "loss": 0.2733, + "step": 654 + }, + { + "epoch": 0.8618421052631579, + "grad_norm": 0.08408575804402002, + "learning_rate": 0.00017938191118041185, + "loss": 0.2921, + "step": 655 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 0.08263210898936545, + "learning_rate": 0.00017928871038867784, + "loss": 0.2896, + "step": 656 + }, + { + "epoch": 0.8644736842105263, + "grad_norm": 0.08282963133814529, + "learning_rate": 0.00017919532374962416, + "loss": 0.267, + "step": 657 + }, + { + "epoch": 0.8657894736842106, + "grad_norm": 0.08456674605493757, + "learning_rate": 0.00017910175148214274, + "loss": 0.2846, + "step": 658 + }, + { + "epoch": 0.8671052631578947, + "grad_norm": 0.08295032078815928, + "learning_rate": 0.00017900799380556065, + "loss": 0.2957, + "step": 659 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.08407937334549097, + "learning_rate": 0.00017891405093963938, + "loss": 0.2801, + "step": 660 + }, + { + "epoch": 0.8697368421052631, + "grad_norm": 0.08152847255098901, + "learning_rate": 0.00017881992310457461, + "loss": 0.287, + "step": 661 + }, + { + "epoch": 0.8710526315789474, + "grad_norm": 0.07846383233153922, + "learning_rate": 0.00017872561052099562, + "loss": 0.2808, + "step": 662 + }, + { + "epoch": 0.8723684210526316, + "grad_norm": 0.08455839714844739, + "learning_rate": 0.00017863111340996458, + "loss": 0.2887, + "step": 663 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 0.08214963388362358, + "learning_rate": 0.00017853643199297633, + "loss": 0.2826, + "step": 664 + }, + { + "epoch": 0.875, + "grad_norm": 0.08555672825696439, + "learning_rate": 0.00017844156649195759, + "loss": 0.2933, + "step": 665 + }, + { + "epoch": 0.8763157894736842, + "grad_norm": 0.08460096862590989, + "learning_rate": 0.00017834651712926662, + "loss": 0.2826, + "step": 666 + }, + { + "epoch": 0.8776315789473684, + "grad_norm": 0.08252349078451936, + "learning_rate": 0.00017825128412769266, + "loss": 0.2958, + "step": 667 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 0.08712916060067398, + "learning_rate": 0.00017815586771045535, + "loss": 0.3058, + "step": 668 + }, + { + "epoch": 0.8802631578947369, + "grad_norm": 0.08180591687568112, + "learning_rate": 0.00017806026810120423, + "loss": 0.2895, + "step": 669 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.07876973529889683, + "learning_rate": 0.00017796448552401825, + "loss": 0.286, + "step": 670 + }, + { + "epoch": 0.8828947368421053, + "grad_norm": 0.07992980593985875, + "learning_rate": 0.00017786852020340525, + "loss": 0.2861, + "step": 671 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 0.08596031959873009, + "learning_rate": 0.0001777723723643014, + "loss": 0.2956, + "step": 672 + }, + { + "epoch": 0.8855263157894737, + "grad_norm": 0.08250501195390463, + "learning_rate": 0.00017767604223207064, + "loss": 0.2925, + "step": 673 + }, + { + "epoch": 0.8868421052631579, + "grad_norm": 0.08065929962541825, + "learning_rate": 0.00017757953003250422, + "loss": 0.2751, + "step": 674 + }, + { + "epoch": 0.8881578947368421, + "grad_norm": 0.079685175906907, + "learning_rate": 0.00017748283599182014, + "loss": 0.2713, + "step": 675 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 0.08275584920700671, + "learning_rate": 0.0001773859603366626, + "loss": 0.2805, + "step": 676 + }, + { + "epoch": 0.8907894736842106, + "grad_norm": 0.08237909937314071, + "learning_rate": 0.00017728890329410157, + "loss": 0.2769, + "step": 677 + }, + { + "epoch": 0.8921052631578947, + "grad_norm": 0.08027830306430328, + "learning_rate": 0.0001771916650916321, + "loss": 0.2815, + "step": 678 + }, + { + "epoch": 0.8934210526315789, + "grad_norm": 0.07965207648580828, + "learning_rate": 0.00017709424595717388, + "loss": 0.2767, + "step": 679 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.08200170949634054, + "learning_rate": 0.00017699664611907072, + "loss": 0.2671, + "step": 680 + }, + { + "epoch": 0.8960526315789473, + "grad_norm": 0.0837945547867222, + "learning_rate": 0.00017689886580608998, + "loss": 0.2846, + "step": 681 + }, + { + "epoch": 0.8973684210526316, + "grad_norm": 0.08287411520644314, + "learning_rate": 0.00017680090524742204, + "loss": 0.2797, + "step": 682 + }, + { + "epoch": 0.8986842105263158, + "grad_norm": 0.08346778301448879, + "learning_rate": 0.0001767027646726797, + "loss": 0.2851, + "step": 683 + }, + { + "epoch": 0.9, + "grad_norm": 0.08078331578360252, + "learning_rate": 0.0001766044443118978, + "loss": 0.2758, + "step": 684 + }, + { + "epoch": 0.9013157894736842, + "grad_norm": 0.0823050633372224, + "learning_rate": 0.0001765059443955326, + "loss": 0.2896, + "step": 685 + }, + { + "epoch": 0.9026315789473685, + "grad_norm": 0.08085978291636725, + "learning_rate": 0.00017640726515446103, + "loss": 0.2752, + "step": 686 + }, + { + "epoch": 0.9039473684210526, + "grad_norm": 0.08633725156118303, + "learning_rate": 0.00017630840681998066, + "loss": 0.2885, + "step": 687 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 0.08134705054643286, + "learning_rate": 0.00017620936962380856, + "loss": 0.2741, + "step": 688 + }, + { + "epoch": 0.906578947368421, + "grad_norm": 0.08218959378945977, + "learning_rate": 0.0001761101537980812, + "loss": 0.2932, + "step": 689 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.08050157591340845, + "learning_rate": 0.00017601075957535364, + "loss": 0.2801, + "step": 690 + }, + { + "epoch": 0.9092105263157895, + "grad_norm": 0.07842237436882951, + "learning_rate": 0.00017591118718859923, + "loss": 0.2672, + "step": 691 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 0.08141690192884726, + "learning_rate": 0.00017581143687120875, + "loss": 0.2875, + "step": 692 + }, + { + "epoch": 0.9118421052631579, + "grad_norm": 0.0841178887567793, + "learning_rate": 0.00017571150885699023, + "loss": 0.288, + "step": 693 + }, + { + "epoch": 0.9131578947368421, + "grad_norm": 0.08199498281628458, + "learning_rate": 0.00017561140338016802, + "loss": 0.2908, + "step": 694 + }, + { + "epoch": 0.9144736842105263, + "grad_norm": 0.07861915918731534, + "learning_rate": 0.00017551112067538255, + "loss": 0.2714, + "step": 695 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 0.0831092654603713, + "learning_rate": 0.00017541066097768963, + "loss": 0.2829, + "step": 696 + }, + { + "epoch": 0.9171052631578948, + "grad_norm": 0.08057002540174062, + "learning_rate": 0.00017531002452255993, + "loss": 0.2938, + "step": 697 + }, + { + "epoch": 0.9184210526315789, + "grad_norm": 0.08109895602300159, + "learning_rate": 0.00017520921154587843, + "loss": 0.2913, + "step": 698 + }, + { + "epoch": 0.9197368421052632, + "grad_norm": 0.08102699248656946, + "learning_rate": 0.00017510822228394385, + "loss": 0.2725, + "step": 699 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.07787423365882763, + "learning_rate": 0.0001750070569734681, + "loss": 0.2741, + "step": 700 + }, + { + "epoch": 0.9223684210526316, + "grad_norm": 0.07824132469372493, + "learning_rate": 0.00017490571585157576, + "loss": 0.2723, + "step": 701 + }, + { + "epoch": 0.9236842105263158, + "grad_norm": 0.08382351085436456, + "learning_rate": 0.00017480419915580356, + "loss": 0.284, + "step": 702 + }, + { + "epoch": 0.925, + "grad_norm": 0.0807625674700622, + "learning_rate": 0.0001747025071240996, + "loss": 0.2749, + "step": 703 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 0.0816515881967252, + "learning_rate": 0.00017460063999482316, + "loss": 0.2789, + "step": 704 + }, + { + "epoch": 0.9276315789473685, + "grad_norm": 0.07777785520204356, + "learning_rate": 0.00017449859800674371, + "loss": 0.2713, + "step": 705 + }, + { + "epoch": 0.9289473684210526, + "grad_norm": 0.08076226646853048, + "learning_rate": 0.0001743963813990408, + "loss": 0.2953, + "step": 706 + }, + { + "epoch": 0.9302631578947368, + "grad_norm": 0.07973740349278101, + "learning_rate": 0.00017429399041130313, + "loss": 0.2662, + "step": 707 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 0.08456169813288093, + "learning_rate": 0.00017419142528352817, + "loss": 0.2859, + "step": 708 + }, + { + "epoch": 0.9328947368421052, + "grad_norm": 0.08035481674307465, + "learning_rate": 0.0001740886862561216, + "loss": 0.2809, + "step": 709 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.08259040712172755, + "learning_rate": 0.00017398577356989665, + "loss": 0.3024, + "step": 710 + }, + { + "epoch": 0.9355263157894737, + "grad_norm": 0.08153299418934971, + "learning_rate": 0.0001738826874660737, + "loss": 0.2871, + "step": 711 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 0.08013584006531711, + "learning_rate": 0.00017377942818627942, + "loss": 0.279, + "step": 712 + }, + { + "epoch": 0.9381578947368421, + "grad_norm": 0.08047700820955582, + "learning_rate": 0.00017367599597254655, + "loss": 0.2801, + "step": 713 + }, + { + "epoch": 0.9394736842105263, + "grad_norm": 0.07885052618166923, + "learning_rate": 0.00017357239106731317, + "loss": 0.2899, + "step": 714 + }, + { + "epoch": 0.9407894736842105, + "grad_norm": 0.07929961550190592, + "learning_rate": 0.0001734686137134221, + "loss": 0.2796, + "step": 715 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 0.08339513330893553, + "learning_rate": 0.00017336466415412028, + "loss": 0.2742, + "step": 716 + }, + { + "epoch": 0.9434210526315789, + "grad_norm": 0.07669794587650931, + "learning_rate": 0.00017326054263305847, + "loss": 0.2718, + "step": 717 + }, + { + "epoch": 0.9447368421052632, + "grad_norm": 0.0799990017084231, + "learning_rate": 0.00017315624939429037, + "loss": 0.287, + "step": 718 + }, + { + "epoch": 0.9460526315789474, + "grad_norm": 0.07817929739263474, + "learning_rate": 0.0001730517846822722, + "loss": 0.2883, + "step": 719 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.0795663214179471, + "learning_rate": 0.0001729471487418621, + "loss": 0.2693, + "step": 720 + }, + { + "epoch": 0.9486842105263158, + "grad_norm": 0.0837605598262032, + "learning_rate": 0.00017284234181831956, + "loss": 0.2771, + "step": 721 + }, + { + "epoch": 0.95, + "grad_norm": 0.08193295646472974, + "learning_rate": 0.00017273736415730488, + "loss": 0.3021, + "step": 722 + }, + { + "epoch": 0.9513157894736842, + "grad_norm": 0.08213211517441393, + "learning_rate": 0.00017263221600487852, + "loss": 0.2696, + "step": 723 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 0.08608126209062855, + "learning_rate": 0.0001725268976075005, + "loss": 0.2891, + "step": 724 + }, + { + "epoch": 0.9539473684210527, + "grad_norm": 0.07916611709429071, + "learning_rate": 0.00017242140921203003, + "loss": 0.2833, + "step": 725 + }, + { + "epoch": 0.9552631578947368, + "grad_norm": 0.08221381743802868, + "learning_rate": 0.00017231575106572467, + "loss": 0.2953, + "step": 726 + }, + { + "epoch": 0.9565789473684211, + "grad_norm": 0.07838487546103713, + "learning_rate": 0.0001722099234162399, + "loss": 0.2854, + "step": 727 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 0.08166198075045744, + "learning_rate": 0.0001721039265116285, + "loss": 0.2818, + "step": 728 + }, + { + "epoch": 0.9592105263157895, + "grad_norm": 0.08220017081684095, + "learning_rate": 0.00017199776060033997, + "loss": 0.2761, + "step": 729 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.08134420494867958, + "learning_rate": 0.00017189142593121993, + "loss": 0.2738, + "step": 730 + }, + { + "epoch": 0.9618421052631579, + "grad_norm": 0.08157316013668467, + "learning_rate": 0.00017178492275350958, + "loss": 0.2637, + "step": 731 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 0.0850556774542164, + "learning_rate": 0.00017167825131684513, + "loss": 0.292, + "step": 732 + }, + { + "epoch": 0.9644736842105263, + "grad_norm": 0.08012953024736491, + "learning_rate": 0.00017157141187125713, + "loss": 0.2723, + "step": 733 + }, + { + "epoch": 0.9657894736842105, + "grad_norm": 0.07862483655819683, + "learning_rate": 0.00017146440466716991, + "loss": 0.2856, + "step": 734 + }, + { + "epoch": 0.9671052631578947, + "grad_norm": 0.08165975689746587, + "learning_rate": 0.00017135722995540107, + "loss": 0.2729, + "step": 735 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 0.07988073177015712, + "learning_rate": 0.00017124988798716083, + "loss": 0.2822, + "step": 736 + }, + { + "epoch": 0.9697368421052631, + "grad_norm": 0.08138761359544663, + "learning_rate": 0.00017114237901405134, + "loss": 0.2642, + "step": 737 + }, + { + "epoch": 0.9710526315789474, + "grad_norm": 0.08079811328305973, + "learning_rate": 0.0001710347032880664, + "loss": 0.2843, + "step": 738 + }, + { + "epoch": 0.9723684210526315, + "grad_norm": 0.08408921154912564, + "learning_rate": 0.00017092686106159053, + "loss": 0.2924, + "step": 739 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.0805385190287914, + "learning_rate": 0.00017081885258739846, + "loss": 0.2823, + "step": 740 + }, + { + "epoch": 0.975, + "grad_norm": 0.08475021934413124, + "learning_rate": 0.00017071067811865476, + "loss": 0.2923, + "step": 741 + }, + { + "epoch": 0.9763157894736842, + "grad_norm": 0.07950603456214735, + "learning_rate": 0.00017060233790891296, + "loss": 0.275, + "step": 742 + }, + { + "epoch": 0.9776315789473684, + "grad_norm": 0.07785517335290744, + "learning_rate": 0.0001704938322121151, + "loss": 0.2831, + "step": 743 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 0.07960926560941854, + "learning_rate": 0.00017038516128259115, + "loss": 0.2871, + "step": 744 + }, + { + "epoch": 0.9802631578947368, + "grad_norm": 0.08293841281790956, + "learning_rate": 0.00017027632537505832, + "loss": 0.2753, + "step": 745 + }, + { + "epoch": 0.9815789473684211, + "grad_norm": 0.07747002473316371, + "learning_rate": 0.00017016732474462056, + "loss": 0.2839, + "step": 746 + }, + { + "epoch": 0.9828947368421053, + "grad_norm": 0.08046473275537684, + "learning_rate": 0.00017005815964676787, + "loss": 0.2768, + "step": 747 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 0.08364061752605101, + "learning_rate": 0.00016994883033737582, + "loss": 0.2931, + "step": 748 + }, + { + "epoch": 0.9855263157894737, + "grad_norm": 0.08050775331309325, + "learning_rate": 0.0001698393370727048, + "loss": 0.2893, + "step": 749 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.08002147081513335, + "learning_rate": 0.00016972968010939954, + "loss": 0.2848, + "step": 750 + }, + { + "epoch": 0.9881578947368421, + "grad_norm": 0.07808855249540297, + "learning_rate": 0.0001696198597044885, + "loss": 0.2846, + "step": 751 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.07784201379245488, + "learning_rate": 0.00016950987611538324, + "loss": 0.2838, + "step": 752 + }, + { + "epoch": 0.9907894736842106, + "grad_norm": 0.07950126160436796, + "learning_rate": 0.0001693997295998777, + "loss": 0.2933, + "step": 753 + }, + { + "epoch": 0.9921052631578947, + "grad_norm": 0.08013885766603035, + "learning_rate": 0.0001692894204161478, + "loss": 0.2703, + "step": 754 + }, + { + "epoch": 0.993421052631579, + "grad_norm": 0.08277409481218478, + "learning_rate": 0.00016917894882275075, + "loss": 0.2824, + "step": 755 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 0.0816211725278549, + "learning_rate": 0.00016906831507862443, + "loss": 0.2781, + "step": 756 + }, + { + "epoch": 0.9960526315789474, + "grad_norm": 0.08101924988182123, + "learning_rate": 0.00016895751944308679, + "loss": 0.284, + "step": 757 + }, + { + "epoch": 0.9973684210526316, + "grad_norm": 0.08179919265807176, + "learning_rate": 0.00016884656217583518, + "loss": 0.2847, + "step": 758 + }, + { + "epoch": 0.9986842105263158, + "grad_norm": 0.07951765088912952, + "learning_rate": 0.00016873544353694588, + "loss": 0.2869, + "step": 759 + }, + { + "epoch": 1.0, + "grad_norm": 0.07809652892973841, + "learning_rate": 0.0001686241637868734, + "loss": 0.2809, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 0.2806478440761566, + "eval_runtime": 142.699, + "eval_samples_per_second": 35.866, + "eval_steps_per_second": 1.121, + "step": 760 + }, + { + "epoch": 1.0013157894736842, + "grad_norm": 0.08089780903924765, + "learning_rate": 0.0001685127231864498, + "loss": 0.273, + "step": 761 + }, + { + "epoch": 1.0026315789473683, + "grad_norm": 0.0791344049462135, + "learning_rate": 0.00016840112199688432, + "loss": 0.2746, + "step": 762 + }, + { + "epoch": 1.0039473684210527, + "grad_norm": 0.07798719813102083, + "learning_rate": 0.00016828936047976248, + "loss": 0.2745, + "step": 763 + }, + { + "epoch": 1.0052631578947369, + "grad_norm": 0.08227033417121575, + "learning_rate": 0.00016817743889704565, + "loss": 0.2617, + "step": 764 + }, + { + "epoch": 1.006578947368421, + "grad_norm": 0.08639411451972336, + "learning_rate": 0.00016806535751107037, + "loss": 0.2674, + "step": 765 + }, + { + "epoch": 1.0078947368421052, + "grad_norm": 0.08367139936296988, + "learning_rate": 0.00016795311658454777, + "loss": 0.279, + "step": 766 + }, + { + "epoch": 1.0092105263157896, + "grad_norm": 0.08766002313298543, + "learning_rate": 0.00016784071638056285, + "loss": 0.283, + "step": 767 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 0.08427710263543763, + "learning_rate": 0.00016772815716257412, + "loss": 0.2663, + "step": 768 + }, + { + "epoch": 1.0118421052631579, + "grad_norm": 0.07801065155963043, + "learning_rate": 0.0001676154391944126, + "loss": 0.2559, + "step": 769 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.07698562715479776, + "learning_rate": 0.00016750256274028152, + "loss": 0.2748, + "step": 770 + }, + { + "epoch": 1.0144736842105264, + "grad_norm": 0.08054717072786537, + "learning_rate": 0.0001673895280647556, + "loss": 0.2719, + "step": 771 + }, + { + "epoch": 1.0157894736842106, + "grad_norm": 0.08399901308359886, + "learning_rate": 0.0001672763354327804, + "loss": 0.2735, + "step": 772 + }, + { + "epoch": 1.0171052631578947, + "grad_norm": 0.07991338180666516, + "learning_rate": 0.0001671629851096717, + "loss": 0.2688, + "step": 773 + }, + { + "epoch": 1.018421052631579, + "grad_norm": 0.0817891282862518, + "learning_rate": 0.00016704947736111492, + "loss": 0.2695, + "step": 774 + }, + { + "epoch": 1.019736842105263, + "grad_norm": 0.084530202074618, + "learning_rate": 0.00016693581245316442, + "loss": 0.2667, + "step": 775 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 0.08294234750514276, + "learning_rate": 0.00016682199065224307, + "loss": 0.2728, + "step": 776 + }, + { + "epoch": 1.0223684210526316, + "grad_norm": 0.0832171550305921, + "learning_rate": 0.00016670801222514134, + "loss": 0.2778, + "step": 777 + }, + { + "epoch": 1.0236842105263158, + "grad_norm": 0.08255803178852009, + "learning_rate": 0.00016659387743901685, + "loss": 0.2617, + "step": 778 + }, + { + "epoch": 1.025, + "grad_norm": 0.08205321961071378, + "learning_rate": 0.00016647958656139378, + "loss": 0.2717, + "step": 779 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.08258890569995453, + "learning_rate": 0.00016636513986016213, + "loss": 0.2658, + "step": 780 + }, + { + "epoch": 1.0276315789473685, + "grad_norm": 0.08071843114802557, + "learning_rate": 0.0001662505376035772, + "loss": 0.2613, + "step": 781 + }, + { + "epoch": 1.0289473684210526, + "grad_norm": 0.07886987321903878, + "learning_rate": 0.00016613578006025872, + "loss": 0.257, + "step": 782 + }, + { + "epoch": 1.0302631578947368, + "grad_norm": 0.07877983241946228, + "learning_rate": 0.00016602086749919063, + "loss": 0.269, + "step": 783 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 0.08280631625375447, + "learning_rate": 0.0001659058001897201, + "loss": 0.2687, + "step": 784 + }, + { + "epoch": 1.0328947368421053, + "grad_norm": 0.08033744101706376, + "learning_rate": 0.00016579057840155703, + "loss": 0.2676, + "step": 785 + }, + { + "epoch": 1.0342105263157895, + "grad_norm": 0.07853625991786962, + "learning_rate": 0.00016567520240477344, + "loss": 0.2475, + "step": 786 + }, + { + "epoch": 1.0355263157894736, + "grad_norm": 0.08321225528586546, + "learning_rate": 0.00016555967246980276, + "loss": 0.2733, + "step": 787 + }, + { + "epoch": 1.0368421052631578, + "grad_norm": 0.08271091060501498, + "learning_rate": 0.00016544398886743933, + "loss": 0.2669, + "step": 788 + }, + { + "epoch": 1.0381578947368422, + "grad_norm": 0.08480813500178941, + "learning_rate": 0.00016532815186883748, + "loss": 0.2702, + "step": 789 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.08423735368240373, + "learning_rate": 0.0001652121617455113, + "loss": 0.266, + "step": 790 + }, + { + "epoch": 1.0407894736842105, + "grad_norm": 0.08270665791388272, + "learning_rate": 0.00016509601876933374, + "loss": 0.2748, + "step": 791 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 0.08074688262914587, + "learning_rate": 0.000164979723212536, + "loss": 0.2681, + "step": 792 + }, + { + "epoch": 1.043421052631579, + "grad_norm": 0.08329297167987026, + "learning_rate": 0.0001648632753477068, + "loss": 0.2737, + "step": 793 + }, + { + "epoch": 1.0447368421052632, + "grad_norm": 0.08195503131845851, + "learning_rate": 0.0001647466754477921, + "loss": 0.2605, + "step": 794 + }, + { + "epoch": 1.0460526315789473, + "grad_norm": 0.08219664275017928, + "learning_rate": 0.00016462992378609407, + "loss": 0.2726, + "step": 795 + }, + { + "epoch": 1.0473684210526315, + "grad_norm": 0.081155677466978, + "learning_rate": 0.00016451302063627066, + "loss": 0.2683, + "step": 796 + }, + { + "epoch": 1.0486842105263159, + "grad_norm": 0.07978496536582978, + "learning_rate": 0.0001643959662723348, + "loss": 0.2639, + "step": 797 + }, + { + "epoch": 1.05, + "grad_norm": 0.07736573715270778, + "learning_rate": 0.00016427876096865394, + "loss": 0.2523, + "step": 798 + }, + { + "epoch": 1.0513157894736842, + "grad_norm": 0.08461912280741266, + "learning_rate": 0.0001641614049999493, + "loss": 0.269, + "step": 799 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.08431583636176626, + "learning_rate": 0.00016404389864129533, + "loss": 0.2563, + "step": 800 + }, + { + "epoch": 1.0539473684210527, + "grad_norm": 0.08694565310464801, + "learning_rate": 0.00016392624216811879, + "loss": 0.2778, + "step": 801 + }, + { + "epoch": 1.055263157894737, + "grad_norm": 0.08352264819913631, + "learning_rate": 0.00016380843585619845, + "loss": 0.2742, + "step": 802 + }, + { + "epoch": 1.056578947368421, + "grad_norm": 0.08421963392115288, + "learning_rate": 0.0001636904799816643, + "loss": 0.271, + "step": 803 + }, + { + "epoch": 1.0578947368421052, + "grad_norm": 0.07881294236501658, + "learning_rate": 0.00016357237482099684, + "loss": 0.2599, + "step": 804 + }, + { + "epoch": 1.0592105263157894, + "grad_norm": 0.08602162394576658, + "learning_rate": 0.0001634541206510264, + "loss": 0.2729, + "step": 805 + }, + { + "epoch": 1.0605263157894738, + "grad_norm": 0.08348451901525375, + "learning_rate": 0.00016333571774893285, + "loss": 0.2753, + "step": 806 + }, + { + "epoch": 1.061842105263158, + "grad_norm": 0.0818737526419465, + "learning_rate": 0.00016321716639224434, + "loss": 0.2715, + "step": 807 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 0.08286281227398601, + "learning_rate": 0.00016309846685883726, + "loss": 0.2675, + "step": 808 + }, + { + "epoch": 1.0644736842105262, + "grad_norm": 0.07966839196252047, + "learning_rate": 0.00016297961942693512, + "loss": 0.2558, + "step": 809 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.08414413297914877, + "learning_rate": 0.0001628606243751082, + "loss": 0.2844, + "step": 810 + }, + { + "epoch": 1.0671052631578948, + "grad_norm": 0.08555085876337333, + "learning_rate": 0.00016274148198227282, + "loss": 0.2851, + "step": 811 + }, + { + "epoch": 1.068421052631579, + "grad_norm": 0.08082783425810476, + "learning_rate": 0.00016262219252769064, + "loss": 0.2646, + "step": 812 + }, + { + "epoch": 1.069736842105263, + "grad_norm": 0.08197732764454009, + "learning_rate": 0.00016250275629096786, + "loss": 0.2779, + "step": 813 + }, + { + "epoch": 1.0710526315789473, + "grad_norm": 0.08526782512435523, + "learning_rate": 0.00016238317355205494, + "loss": 0.2799, + "step": 814 + }, + { + "epoch": 1.0723684210526316, + "grad_norm": 0.07823583584470825, + "learning_rate": 0.00016226344459124566, + "loss": 0.2643, + "step": 815 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 0.08603151669795409, + "learning_rate": 0.00016214356968917648, + "loss": 0.273, + "step": 816 + }, + { + "epoch": 1.075, + "grad_norm": 0.08851720457464476, + "learning_rate": 0.000162023549126826, + "loss": 0.2721, + "step": 817 + }, + { + "epoch": 1.0763157894736841, + "grad_norm": 0.08133793366760693, + "learning_rate": 0.00016190338318551427, + "loss": 0.255, + "step": 818 + }, + { + "epoch": 1.0776315789473685, + "grad_norm": 0.082819100026463, + "learning_rate": 0.00016178307214690193, + "loss": 0.2624, + "step": 819 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.08530141351749654, + "learning_rate": 0.00016166261629298995, + "loss": 0.2659, + "step": 820 + }, + { + "epoch": 1.0802631578947368, + "grad_norm": 0.08366548952005705, + "learning_rate": 0.00016154201590611852, + "loss": 0.2627, + "step": 821 + }, + { + "epoch": 1.081578947368421, + "grad_norm": 0.08224916068858729, + "learning_rate": 0.0001614212712689668, + "loss": 0.2702, + "step": 822 + }, + { + "epoch": 1.0828947368421054, + "grad_norm": 0.08321699362162292, + "learning_rate": 0.0001613003826645519, + "loss": 0.2668, + "step": 823 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 0.07940176654017943, + "learning_rate": 0.0001611793503762285, + "loss": 0.2666, + "step": 824 + }, + { + "epoch": 1.0855263157894737, + "grad_norm": 0.0810039782214103, + "learning_rate": 0.00016105817468768798, + "loss": 0.2748, + "step": 825 + }, + { + "epoch": 1.0868421052631578, + "grad_norm": 0.08080799943317177, + "learning_rate": 0.00016093685588295786, + "loss": 0.2664, + "step": 826 + }, + { + "epoch": 1.0881578947368422, + "grad_norm": 0.07963385945239661, + "learning_rate": 0.00016081539424640118, + "loss": 0.279, + "step": 827 + }, + { + "epoch": 1.0894736842105264, + "grad_norm": 0.08180594063442842, + "learning_rate": 0.00016069379006271566, + "loss": 0.2735, + "step": 828 + }, + { + "epoch": 1.0907894736842105, + "grad_norm": 0.08134763460779691, + "learning_rate": 0.00016057204361693327, + "loss": 0.2666, + "step": 829 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.08433241545175918, + "learning_rate": 0.0001604501551944193, + "loss": 0.2582, + "step": 830 + }, + { + "epoch": 1.0934210526315788, + "grad_norm": 0.08109132578451536, + "learning_rate": 0.0001603281250808719, + "loss": 0.2648, + "step": 831 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 0.09007590757575104, + "learning_rate": 0.00016020595356232135, + "loss": 0.2681, + "step": 832 + }, + { + "epoch": 1.0960526315789474, + "grad_norm": 0.08677922191238775, + "learning_rate": 0.00016008364092512926, + "loss": 0.2761, + "step": 833 + }, + { + "epoch": 1.0973684210526315, + "grad_norm": 0.08575921288664821, + "learning_rate": 0.00015996118745598817, + "loss": 0.2696, + "step": 834 + }, + { + "epoch": 1.0986842105263157, + "grad_norm": 0.08041317041434254, + "learning_rate": 0.00015983859344192061, + "loss": 0.2689, + "step": 835 + }, + { + "epoch": 1.1, + "grad_norm": 0.08065596311704078, + "learning_rate": 0.00015971585917027862, + "loss": 0.2687, + "step": 836 + }, + { + "epoch": 1.1013157894736842, + "grad_norm": 0.0790113706176156, + "learning_rate": 0.00015959298492874288, + "loss": 0.2555, + "step": 837 + }, + { + "epoch": 1.1026315789473684, + "grad_norm": 0.08012262373177113, + "learning_rate": 0.0001594699710053223, + "loss": 0.2685, + "step": 838 + }, + { + "epoch": 1.1039473684210526, + "grad_norm": 0.08504138731346632, + "learning_rate": 0.00015934681768835297, + "loss": 0.2805, + "step": 839 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.08095254397891562, + "learning_rate": 0.00015922352526649803, + "loss": 0.2709, + "step": 840 + }, + { + "epoch": 1.106578947368421, + "grad_norm": 0.08674994051804193, + "learning_rate": 0.00015910009402874631, + "loss": 0.2806, + "step": 841 + }, + { + "epoch": 1.1078947368421053, + "grad_norm": 0.09194758435288344, + "learning_rate": 0.0001589765242644124, + "loss": 0.2736, + "step": 842 + }, + { + "epoch": 1.1092105263157894, + "grad_norm": 0.08318187378912749, + "learning_rate": 0.00015885281626313517, + "loss": 0.2606, + "step": 843 + }, + { + "epoch": 1.1105263157894736, + "grad_norm": 0.0825846308272865, + "learning_rate": 0.00015872897031487791, + "loss": 0.2628, + "step": 844 + }, + { + "epoch": 1.111842105263158, + "grad_norm": 0.08112434803943486, + "learning_rate": 0.00015860498670992691, + "loss": 0.2793, + "step": 845 + }, + { + "epoch": 1.1131578947368421, + "grad_norm": 0.08537425779356113, + "learning_rate": 0.00015848086573889137, + "loss": 0.2779, + "step": 846 + }, + { + "epoch": 1.1144736842105263, + "grad_norm": 0.08315694519509176, + "learning_rate": 0.00015835660769270232, + "loss": 0.2719, + "step": 847 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 0.08130780896158304, + "learning_rate": 0.00015823221286261215, + "loss": 0.2653, + "step": 848 + }, + { + "epoch": 1.1171052631578948, + "grad_norm": 0.08219491647444532, + "learning_rate": 0.00015810768154019385, + "loss": 0.2765, + "step": 849 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.08240059160474296, + "learning_rate": 0.0001579830140173403, + "loss": 0.2737, + "step": 850 + }, + { + "epoch": 1.1197368421052631, + "grad_norm": 0.083577356578667, + "learning_rate": 0.00015785821058626366, + "loss": 0.2661, + "step": 851 + }, + { + "epoch": 1.1210526315789473, + "grad_norm": 0.07834322980894085, + "learning_rate": 0.00015773327153949465, + "loss": 0.2729, + "step": 852 + }, + { + "epoch": 1.1223684210526317, + "grad_norm": 0.08346738623736531, + "learning_rate": 0.00015760819716988187, + "loss": 0.275, + "step": 853 + }, + { + "epoch": 1.1236842105263158, + "grad_norm": 0.07884897152428384, + "learning_rate": 0.00015748298777059112, + "loss": 0.2805, + "step": 854 + }, + { + "epoch": 1.125, + "grad_norm": 0.08020439719046484, + "learning_rate": 0.0001573576436351046, + "loss": 0.2587, + "step": 855 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 0.08327214298097156, + "learning_rate": 0.0001572321650572205, + "loss": 0.2626, + "step": 856 + }, + { + "epoch": 1.1276315789473683, + "grad_norm": 0.08372000718554913, + "learning_rate": 0.00015710655233105194, + "loss": 0.2743, + "step": 857 + }, + { + "epoch": 1.1289473684210527, + "grad_norm": 0.08197882426728544, + "learning_rate": 0.00015698080575102661, + "loss": 0.2657, + "step": 858 + }, + { + "epoch": 1.1302631578947369, + "grad_norm": 0.08271785330561393, + "learning_rate": 0.00015685492561188594, + "loss": 0.2696, + "step": 859 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.08153102919965115, + "learning_rate": 0.00015672891220868432, + "loss": 0.2627, + "step": 860 + }, + { + "epoch": 1.1328947368421052, + "grad_norm": 0.08273222452553464, + "learning_rate": 0.00015660276583678853, + "loss": 0.2742, + "step": 861 + }, + { + "epoch": 1.1342105263157896, + "grad_norm": 0.08149761866108174, + "learning_rate": 0.0001564764867918771, + "loss": 0.284, + "step": 862 + }, + { + "epoch": 1.1355263157894737, + "grad_norm": 0.07732037718328716, + "learning_rate": 0.0001563500753699395, + "loss": 0.2559, + "step": 863 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 0.08033074721619064, + "learning_rate": 0.00015622353186727544, + "loss": 0.2651, + "step": 864 + }, + { + "epoch": 1.138157894736842, + "grad_norm": 0.07988808525323399, + "learning_rate": 0.0001560968565804942, + "loss": 0.2676, + "step": 865 + }, + { + "epoch": 1.1394736842105262, + "grad_norm": 0.07997455834407387, + "learning_rate": 0.00015597004980651407, + "loss": 0.2597, + "step": 866 + }, + { + "epoch": 1.1407894736842106, + "grad_norm": 0.07985601487062838, + "learning_rate": 0.0001558431118425614, + "loss": 0.257, + "step": 867 + }, + { + "epoch": 1.1421052631578947, + "grad_norm": 0.08269154375775314, + "learning_rate": 0.0001557160429861702, + "loss": 0.2807, + "step": 868 + }, + { + "epoch": 1.143421052631579, + "grad_norm": 0.08357702138558408, + "learning_rate": 0.00015558884353518107, + "loss": 0.2767, + "step": 869 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.08414408194028052, + "learning_rate": 0.00015546151378774086, + "loss": 0.274, + "step": 870 + }, + { + "epoch": 1.1460526315789474, + "grad_norm": 0.08701268358835322, + "learning_rate": 0.00015533405404230188, + "loss": 0.2811, + "step": 871 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 0.08262282790058861, + "learning_rate": 0.000155206464597621, + "loss": 0.2731, + "step": 872 + }, + { + "epoch": 1.1486842105263158, + "grad_norm": 0.08408606653507734, + "learning_rate": 0.00015507874575275917, + "loss": 0.251, + "step": 873 + }, + { + "epoch": 1.15, + "grad_norm": 0.08049323373059812, + "learning_rate": 0.0001549508978070806, + "loss": 0.2715, + "step": 874 + }, + { + "epoch": 1.1513157894736843, + "grad_norm": 0.07972519186854046, + "learning_rate": 0.0001548229210602522, + "loss": 0.2671, + "step": 875 + }, + { + "epoch": 1.1526315789473685, + "grad_norm": 0.07882804213132247, + "learning_rate": 0.00015469481581224272, + "loss": 0.2619, + "step": 876 + }, + { + "epoch": 1.1539473684210526, + "grad_norm": 0.0798538301353752, + "learning_rate": 0.00015456658236332203, + "loss": 0.2736, + "step": 877 + }, + { + "epoch": 1.1552631578947368, + "grad_norm": 0.08315715083917044, + "learning_rate": 0.00015443822101406064, + "loss": 0.2664, + "step": 878 + }, + { + "epoch": 1.1565789473684212, + "grad_norm": 0.08372404146892015, + "learning_rate": 0.00015430973206532878, + "loss": 0.2686, + "step": 879 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.08175706218650852, + "learning_rate": 0.00015418111581829574, + "loss": 0.2622, + "step": 880 + }, + { + "epoch": 1.1592105263157895, + "grad_norm": 0.07882686056716785, + "learning_rate": 0.00015405237257442924, + "loss": 0.269, + "step": 881 + }, + { + "epoch": 1.1605263157894736, + "grad_norm": 0.07950928567287992, + "learning_rate": 0.0001539235026354946, + "loss": 0.2628, + "step": 882 + }, + { + "epoch": 1.1618421052631578, + "grad_norm": 0.0850340095145124, + "learning_rate": 0.00015379450630355424, + "loss": 0.2791, + "step": 883 + }, + { + "epoch": 1.1631578947368422, + "grad_norm": 0.0941613058127184, + "learning_rate": 0.0001536653838809667, + "loss": 0.279, + "step": 884 + }, + { + "epoch": 1.1644736842105263, + "grad_norm": 0.08251843104601717, + "learning_rate": 0.00015353613567038607, + "loss": 0.2859, + "step": 885 + }, + { + "epoch": 1.1657894736842105, + "grad_norm": 0.08038962867293699, + "learning_rate": 0.0001534067619747614, + "loss": 0.2747, + "step": 886 + }, + { + "epoch": 1.1671052631578946, + "grad_norm": 0.0794561388070862, + "learning_rate": 0.00015327726309733572, + "loss": 0.2686, + "step": 887 + }, + { + "epoch": 1.168421052631579, + "grad_norm": 0.07884245258276419, + "learning_rate": 0.0001531476393416456, + "loss": 0.2672, + "step": 888 + }, + { + "epoch": 1.1697368421052632, + "grad_norm": 0.08136001457411983, + "learning_rate": 0.00015301789101152026, + "loss": 0.2672, + "step": 889 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.08416373900750189, + "learning_rate": 0.00015288801841108093, + "loss": 0.2823, + "step": 890 + }, + { + "epoch": 1.1723684210526315, + "grad_norm": 0.0820367167436313, + "learning_rate": 0.0001527580218447401, + "loss": 0.2578, + "step": 891 + }, + { + "epoch": 1.1736842105263159, + "grad_norm": 0.08309627633941746, + "learning_rate": 0.0001526279016172008, + "loss": 0.267, + "step": 892 + }, + { + "epoch": 1.175, + "grad_norm": 0.08128917587224013, + "learning_rate": 0.000152497658033456, + "loss": 0.2561, + "step": 893 + }, + { + "epoch": 1.1763157894736842, + "grad_norm": 0.08178638667577875, + "learning_rate": 0.00015236729139878782, + "loss": 0.2768, + "step": 894 + }, + { + "epoch": 1.1776315789473684, + "grad_norm": 0.0801661892105483, + "learning_rate": 0.0001522368020187666, + "loss": 0.2543, + "step": 895 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 0.08213132733512518, + "learning_rate": 0.00015210619019925066, + "loss": 0.2636, + "step": 896 + }, + { + "epoch": 1.180263157894737, + "grad_norm": 0.08080981772989361, + "learning_rate": 0.00015197545624638504, + "loss": 0.267, + "step": 897 + }, + { + "epoch": 1.181578947368421, + "grad_norm": 0.07879728180506462, + "learning_rate": 0.00015184460046660137, + "loss": 0.2622, + "step": 898 + }, + { + "epoch": 1.1828947368421052, + "grad_norm": 0.07930675022154604, + "learning_rate": 0.00015171362316661652, + "loss": 0.2635, + "step": 899 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.0827303133693508, + "learning_rate": 0.00015158252465343242, + "loss": 0.2731, + "step": 900 + }, + { + "epoch": 1.1855263157894738, + "grad_norm": 0.08133340547715927, + "learning_rate": 0.00015145130523433492, + "loss": 0.2691, + "step": 901 + }, + { + "epoch": 1.186842105263158, + "grad_norm": 0.08320419791917552, + "learning_rate": 0.00015131996521689352, + "loss": 0.2831, + "step": 902 + }, + { + "epoch": 1.188157894736842, + "grad_norm": 0.078583719040117, + "learning_rate": 0.00015118850490896012, + "loss": 0.2605, + "step": 903 + }, + { + "epoch": 1.1894736842105262, + "grad_norm": 0.07987941146339982, + "learning_rate": 0.00015105692461866874, + "loss": 0.271, + "step": 904 + }, + { + "epoch": 1.1907894736842106, + "grad_norm": 0.08065385126525215, + "learning_rate": 0.0001509252246544346, + "loss": 0.2748, + "step": 905 + }, + { + "epoch": 1.1921052631578948, + "grad_norm": 0.08045853100208103, + "learning_rate": 0.00015079340532495343, + "loss": 0.2793, + "step": 906 + }, + { + "epoch": 1.193421052631579, + "grad_norm": 0.07925415373688983, + "learning_rate": 0.00015066146693920072, + "loss": 0.2625, + "step": 907 + }, + { + "epoch": 1.194736842105263, + "grad_norm": 0.08327329284089614, + "learning_rate": 0.000150529409806431, + "loss": 0.2679, + "step": 908 + }, + { + "epoch": 1.1960526315789473, + "grad_norm": 0.08369037313818105, + "learning_rate": 0.0001503972342361772, + "loss": 0.2729, + "step": 909 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.08698590728820178, + "learning_rate": 0.00015026494053824982, + "loss": 0.2792, + "step": 910 + }, + { + "epoch": 1.1986842105263158, + "grad_norm": 0.08234237296181915, + "learning_rate": 0.0001501325290227362, + "loss": 0.2723, + "step": 911 + }, + { + "epoch": 1.2, + "grad_norm": 0.08018132065204943, + "learning_rate": 0.00015000000000000001, + "loss": 0.2582, + "step": 912 + }, + { + "epoch": 1.2013157894736841, + "grad_norm": 0.08029223756152773, + "learning_rate": 0.0001498673537806801, + "loss": 0.2605, + "step": 913 + }, + { + "epoch": 1.2026315789473685, + "grad_norm": 0.07906404673525916, + "learning_rate": 0.00014973459067569022, + "loss": 0.2691, + "step": 914 + }, + { + "epoch": 1.2039473684210527, + "grad_norm": 0.0838278381246816, + "learning_rate": 0.00014960171099621795, + "loss": 0.2697, + "step": 915 + }, + { + "epoch": 1.2052631578947368, + "grad_norm": 0.08289248581844369, + "learning_rate": 0.00014946871505372425, + "loss": 0.2789, + "step": 916 + }, + { + "epoch": 1.206578947368421, + "grad_norm": 0.08065473943197897, + "learning_rate": 0.0001493356031599425, + "loss": 0.2741, + "step": 917 + }, + { + "epoch": 1.2078947368421054, + "grad_norm": 0.08222068877129116, + "learning_rate": 0.00014920237562687785, + "loss": 0.268, + "step": 918 + }, + { + "epoch": 1.2092105263157895, + "grad_norm": 0.0819456615892803, + "learning_rate": 0.00014906903276680654, + "loss": 0.267, + "step": 919 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.08178515229941181, + "learning_rate": 0.00014893557489227517, + "loss": 0.2711, + "step": 920 + }, + { + "epoch": 1.2118421052631578, + "grad_norm": 0.08167974669157664, + "learning_rate": 0.00014880200231609983, + "loss": 0.2623, + "step": 921 + }, + { + "epoch": 1.2131578947368422, + "grad_norm": 0.07644033108650616, + "learning_rate": 0.00014866831535136554, + "loss": 0.2592, + "step": 922 + }, + { + "epoch": 1.2144736842105264, + "grad_norm": 0.08247670698556289, + "learning_rate": 0.00014853451431142537, + "loss": 0.2563, + "step": 923 + }, + { + "epoch": 1.2157894736842105, + "grad_norm": 0.08121309159730847, + "learning_rate": 0.0001484005995098999, + "loss": 0.2651, + "step": 924 + }, + { + "epoch": 1.2171052631578947, + "grad_norm": 0.0813185941928777, + "learning_rate": 0.0001482665712606762, + "loss": 0.2654, + "step": 925 + }, + { + "epoch": 1.2184210526315788, + "grad_norm": 0.07992066628575906, + "learning_rate": 0.00014813242987790734, + "loss": 0.2601, + "step": 926 + }, + { + "epoch": 1.2197368421052632, + "grad_norm": 0.08348022762772563, + "learning_rate": 0.00014799817567601157, + "loss": 0.2698, + "step": 927 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 0.08164215203446346, + "learning_rate": 0.0001478638089696716, + "loss": 0.261, + "step": 928 + }, + { + "epoch": 1.2223684210526315, + "grad_norm": 0.08053428830419256, + "learning_rate": 0.00014772933007383372, + "loss": 0.271, + "step": 929 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.0855332022802314, + "learning_rate": 0.00014759473930370736, + "loss": 0.2806, + "step": 930 + }, + { + "epoch": 1.225, + "grad_norm": 0.07841552340055463, + "learning_rate": 0.00014746003697476404, + "loss": 0.2562, + "step": 931 + }, + { + "epoch": 1.2263157894736842, + "grad_norm": 0.08012970553706716, + "learning_rate": 0.00014732522340273684, + "loss": 0.2653, + "step": 932 + }, + { + "epoch": 1.2276315789473684, + "grad_norm": 0.08013137552721004, + "learning_rate": 0.00014719029890361955, + "loss": 0.2614, + "step": 933 + }, + { + "epoch": 1.2289473684210526, + "grad_norm": 0.08076902086359555, + "learning_rate": 0.000147055263793666, + "loss": 0.2726, + "step": 934 + }, + { + "epoch": 1.2302631578947367, + "grad_norm": 0.07816751188820396, + "learning_rate": 0.0001469201183893892, + "loss": 0.2697, + "step": 935 + }, + { + "epoch": 1.231578947368421, + "grad_norm": 0.07958991637658457, + "learning_rate": 0.0001467848630075608, + "loss": 0.2718, + "step": 936 + }, + { + "epoch": 1.2328947368421053, + "grad_norm": 0.07907729388333563, + "learning_rate": 0.00014664949796521013, + "loss": 0.2542, + "step": 937 + }, + { + "epoch": 1.2342105263157894, + "grad_norm": 0.08337394014823166, + "learning_rate": 0.00014651402357962367, + "loss": 0.2634, + "step": 938 + }, + { + "epoch": 1.2355263157894738, + "grad_norm": 0.07981852744305896, + "learning_rate": 0.00014637844016834406, + "loss": 0.2669, + "step": 939 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.07805563922628504, + "learning_rate": 0.00014624274804916958, + "loss": 0.2604, + "step": 940 + }, + { + "epoch": 1.2381578947368421, + "grad_norm": 0.07875476925853715, + "learning_rate": 0.00014610694754015326, + "loss": 0.2605, + "step": 941 + }, + { + "epoch": 1.2394736842105263, + "grad_norm": 0.08212032555073835, + "learning_rate": 0.00014597103895960226, + "loss": 0.2608, + "step": 942 + }, + { + "epoch": 1.2407894736842104, + "grad_norm": 0.08193583484443283, + "learning_rate": 0.00014583502262607696, + "loss": 0.2671, + "step": 943 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 0.08018625970236232, + "learning_rate": 0.00014569889885839037, + "loss": 0.2482, + "step": 944 + }, + { + "epoch": 1.243421052631579, + "grad_norm": 0.08080487708321592, + "learning_rate": 0.00014556266797560732, + "loss": 0.2581, + "step": 945 + }, + { + "epoch": 1.2447368421052631, + "grad_norm": 0.08127791305985531, + "learning_rate": 0.00014542633029704366, + "loss": 0.2645, + "step": 946 + }, + { + "epoch": 1.2460526315789473, + "grad_norm": 0.0789744299917556, + "learning_rate": 0.00014528988614226563, + "loss": 0.258, + "step": 947 + }, + { + "epoch": 1.2473684210526317, + "grad_norm": 0.08138536304735684, + "learning_rate": 0.00014515333583108896, + "loss": 0.2602, + "step": 948 + }, + { + "epoch": 1.2486842105263158, + "grad_norm": 0.08347086931054207, + "learning_rate": 0.00014501667968357825, + "loss": 0.2737, + "step": 949 + }, + { + "epoch": 1.25, + "grad_norm": 0.07961359280977227, + "learning_rate": 0.00014487991802004623, + "loss": 0.269, + "step": 950 + }, + { + "epoch": 1.2513157894736842, + "grad_norm": 0.08326297319920578, + "learning_rate": 0.00014474305116105284, + "loss": 0.2627, + "step": 951 + }, + { + "epoch": 1.2526315789473683, + "grad_norm": 0.08085513791177462, + "learning_rate": 0.00014460607942740468, + "loss": 0.2679, + "step": 952 + }, + { + "epoch": 1.2539473684210527, + "grad_norm": 0.07868241470750258, + "learning_rate": 0.00014446900314015411, + "loss": 0.2699, + "step": 953 + }, + { + "epoch": 1.2552631578947369, + "grad_norm": 0.0836320162619728, + "learning_rate": 0.0001443318226205986, + "loss": 0.2792, + "step": 954 + }, + { + "epoch": 1.256578947368421, + "grad_norm": 0.08451883775470095, + "learning_rate": 0.00014419453819027988, + "loss": 0.2739, + "step": 955 + }, + { + "epoch": 1.2578947368421054, + "grad_norm": 0.08034662053541705, + "learning_rate": 0.00014405715017098335, + "loss": 0.263, + "step": 956 + }, + { + "epoch": 1.2592105263157896, + "grad_norm": 0.07928448098795568, + "learning_rate": 0.00014391965888473703, + "loss": 0.2543, + "step": 957 + }, + { + "epoch": 1.2605263157894737, + "grad_norm": 0.08392429826580132, + "learning_rate": 0.0001437820646538112, + "loss": 0.2775, + "step": 958 + }, + { + "epoch": 1.2618421052631579, + "grad_norm": 0.07940539289961981, + "learning_rate": 0.00014364436780071727, + "loss": 0.2685, + "step": 959 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.08036645877207896, + "learning_rate": 0.00014350656864820733, + "loss": 0.2683, + "step": 960 + }, + { + "epoch": 1.2644736842105262, + "grad_norm": 0.07812397442869837, + "learning_rate": 0.0001433686675192731, + "loss": 0.2639, + "step": 961 + }, + { + "epoch": 1.2657894736842106, + "grad_norm": 0.07914775180391949, + "learning_rate": 0.00014323066473714542, + "loss": 0.2612, + "step": 962 + }, + { + "epoch": 1.2671052631578947, + "grad_norm": 0.07880507910786637, + "learning_rate": 0.00014309256062529344, + "loss": 0.269, + "step": 963 + }, + { + "epoch": 1.268421052631579, + "grad_norm": 0.08099065681511952, + "learning_rate": 0.0001429543555074237, + "loss": 0.2649, + "step": 964 + }, + { + "epoch": 1.2697368421052633, + "grad_norm": 0.07889461321025316, + "learning_rate": 0.00014281604970747954, + "loss": 0.2618, + "step": 965 + }, + { + "epoch": 1.2710526315789474, + "grad_norm": 0.07976753992813429, + "learning_rate": 0.00014267764354964038, + "loss": 0.2706, + "step": 966 + }, + { + "epoch": 1.2723684210526316, + "grad_norm": 0.07872266626170443, + "learning_rate": 0.00014253913735832075, + "loss": 0.2642, + "step": 967 + }, + { + "epoch": 1.2736842105263158, + "grad_norm": 0.08023355760105522, + "learning_rate": 0.00014240053145816967, + "loss": 0.2568, + "step": 968 + }, + { + "epoch": 1.275, + "grad_norm": 0.0810413485993457, + "learning_rate": 0.00014226182617406996, + "loss": 0.2529, + "step": 969 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.08212814628024637, + "learning_rate": 0.00014212302183113732, + "loss": 0.2629, + "step": 970 + }, + { + "epoch": 1.2776315789473685, + "grad_norm": 0.08059136593229341, + "learning_rate": 0.00014198411875471955, + "loss": 0.2642, + "step": 971 + }, + { + "epoch": 1.2789473684210526, + "grad_norm": 0.08140236020647386, + "learning_rate": 0.00014184511727039612, + "loss": 0.2562, + "step": 972 + }, + { + "epoch": 1.2802631578947368, + "grad_norm": 0.08437007742270067, + "learning_rate": 0.00014170601770397692, + "loss": 0.2745, + "step": 973 + }, + { + "epoch": 1.2815789473684212, + "grad_norm": 0.08380982717641713, + "learning_rate": 0.00014156682038150183, + "loss": 0.2759, + "step": 974 + }, + { + "epoch": 1.2828947368421053, + "grad_norm": 0.0841772070431119, + "learning_rate": 0.00014142752562923988, + "loss": 0.2753, + "step": 975 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 0.08106806985375134, + "learning_rate": 0.0001412881337736885, + "loss": 0.2767, + "step": 976 + }, + { + "epoch": 1.2855263157894736, + "grad_norm": 0.0811005519720408, + "learning_rate": 0.00014114864514157258, + "loss": 0.2683, + "step": 977 + }, + { + "epoch": 1.2868421052631578, + "grad_norm": 0.08017428089067932, + "learning_rate": 0.00014100906005984403, + "loss": 0.2723, + "step": 978 + }, + { + "epoch": 1.2881578947368422, + "grad_norm": 0.07478870745109191, + "learning_rate": 0.00014086937885568067, + "loss": 0.2593, + "step": 979 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.07742432071526005, + "learning_rate": 0.00014072960185648577, + "loss": 0.2631, + "step": 980 + }, + { + "epoch": 1.2907894736842105, + "grad_norm": 0.07794666505896067, + "learning_rate": 0.000140589729389887, + "loss": 0.2644, + "step": 981 + }, + { + "epoch": 1.2921052631578949, + "grad_norm": 0.07946767921967093, + "learning_rate": 0.0001404497617837359, + "loss": 0.2609, + "step": 982 + }, + { + "epoch": 1.293421052631579, + "grad_norm": 0.08082944776271564, + "learning_rate": 0.00014030969936610697, + "loss": 0.2734, + "step": 983 + }, + { + "epoch": 1.2947368421052632, + "grad_norm": 0.08360749789540052, + "learning_rate": 0.00014016954246529696, + "loss": 0.263, + "step": 984 + }, + { + "epoch": 1.2960526315789473, + "grad_norm": 0.08375341936105894, + "learning_rate": 0.00014002929140982404, + "loss": 0.2639, + "step": 985 + }, + { + "epoch": 1.2973684210526315, + "grad_norm": 0.08336578005654308, + "learning_rate": 0.00013988894652842713, + "loss": 0.2777, + "step": 986 + }, + { + "epoch": 1.2986842105263157, + "grad_norm": 0.07859275037760449, + "learning_rate": 0.00013974850815006503, + "loss": 0.2609, + "step": 987 + }, + { + "epoch": 1.3, + "grad_norm": 0.07972935132468843, + "learning_rate": 0.0001396079766039157, + "loss": 0.2593, + "step": 988 + }, + { + "epoch": 1.3013157894736842, + "grad_norm": 0.07852889738678417, + "learning_rate": 0.0001394673522193755, + "loss": 0.2651, + "step": 989 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.08131207293814138, + "learning_rate": 0.0001393266353260583, + "loss": 0.2636, + "step": 990 + }, + { + "epoch": 1.3039473684210527, + "grad_norm": 0.08085167622175171, + "learning_rate": 0.00013918582625379501, + "loss": 0.2581, + "step": 991 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 0.08310768520116221, + "learning_rate": 0.00013904492533263244, + "loss": 0.2804, + "step": 992 + }, + { + "epoch": 1.306578947368421, + "grad_norm": 0.0835988231304127, + "learning_rate": 0.0001389039328928326, + "loss": 0.2764, + "step": 993 + }, + { + "epoch": 1.3078947368421052, + "grad_norm": 0.08043076516824947, + "learning_rate": 0.0001387628492648723, + "loss": 0.2643, + "step": 994 + }, + { + "epoch": 1.3092105263157894, + "grad_norm": 0.08124742205143656, + "learning_rate": 0.00013862167477944184, + "loss": 0.2779, + "step": 995 + }, + { + "epoch": 1.3105263157894738, + "grad_norm": 0.07855989514944883, + "learning_rate": 0.00013848040976744457, + "loss": 0.2641, + "step": 996 + }, + { + "epoch": 1.311842105263158, + "grad_norm": 0.08038296983596055, + "learning_rate": 0.00013833905455999603, + "loss": 0.2611, + "step": 997 + }, + { + "epoch": 1.313157894736842, + "grad_norm": 0.07754716363033522, + "learning_rate": 0.0001381976094884232, + "loss": 0.2565, + "step": 998 + }, + { + "epoch": 1.3144736842105262, + "grad_norm": 0.0771054502214774, + "learning_rate": 0.00013805607488426362, + "loss": 0.2667, + "step": 999 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.08092253741118172, + "learning_rate": 0.00013791445107926478, + "loss": 0.2715, + "step": 1000 + }, + { + "epoch": 1.3171052631578948, + "grad_norm": 0.08392681822232567, + "learning_rate": 0.00013777273840538312, + "loss": 0.2696, + "step": 1001 + }, + { + "epoch": 1.318421052631579, + "grad_norm": 0.08088919172679297, + "learning_rate": 0.00013763093719478358, + "loss": 0.262, + "step": 1002 + }, + { + "epoch": 1.319736842105263, + "grad_norm": 0.07966639118780608, + "learning_rate": 0.00013748904777983838, + "loss": 0.2632, + "step": 1003 + }, + { + "epoch": 1.3210526315789473, + "grad_norm": 0.07949592188633048, + "learning_rate": 0.00013734707049312673, + "loss": 0.2513, + "step": 1004 + }, + { + "epoch": 1.3223684210526316, + "grad_norm": 0.078682262346058, + "learning_rate": 0.00013720500566743362, + "loss": 0.2567, + "step": 1005 + }, + { + "epoch": 1.3236842105263158, + "grad_norm": 0.07844946772992165, + "learning_rate": 0.00013706285363574932, + "loss": 0.2743, + "step": 1006 + }, + { + "epoch": 1.325, + "grad_norm": 0.08305320278558118, + "learning_rate": 0.00013692061473126845, + "loss": 0.2623, + "step": 1007 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 0.08139900273296097, + "learning_rate": 0.00013677828928738934, + "loss": 0.269, + "step": 1008 + }, + { + "epoch": 1.3276315789473685, + "grad_norm": 0.0839606346260752, + "learning_rate": 0.00013663587763771304, + "loss": 0.2671, + "step": 1009 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.08150284952029307, + "learning_rate": 0.0001364933801160428, + "loss": 0.2568, + "step": 1010 + }, + { + "epoch": 1.3302631578947368, + "grad_norm": 0.07748260538941765, + "learning_rate": 0.00013635079705638298, + "loss": 0.2423, + "step": 1011 + }, + { + "epoch": 1.331578947368421, + "grad_norm": 0.07721123482181354, + "learning_rate": 0.00013620812879293863, + "loss": 0.2621, + "step": 1012 + }, + { + "epoch": 1.3328947368421051, + "grad_norm": 0.08010424676918745, + "learning_rate": 0.00013606537566011434, + "loss": 0.2643, + "step": 1013 + }, + { + "epoch": 1.3342105263157895, + "grad_norm": 0.07968708879421524, + "learning_rate": 0.00013592253799251376, + "loss": 0.2677, + "step": 1014 + }, + { + "epoch": 1.3355263157894737, + "grad_norm": 0.07865457628121457, + "learning_rate": 0.00013577961612493852, + "loss": 0.2646, + "step": 1015 + }, + { + "epoch": 1.3368421052631578, + "grad_norm": 0.07821958051935139, + "learning_rate": 0.00013563661039238785, + "loss": 0.2605, + "step": 1016 + }, + { + "epoch": 1.3381578947368422, + "grad_norm": 0.07970724490030985, + "learning_rate": 0.00013549352113005728, + "loss": 0.2591, + "step": 1017 + }, + { + "epoch": 1.3394736842105264, + "grad_norm": 0.08021459794844776, + "learning_rate": 0.00013535034867333837, + "loss": 0.2684, + "step": 1018 + }, + { + "epoch": 1.3407894736842105, + "grad_norm": 0.08161684018773531, + "learning_rate": 0.00013520709335781752, + "loss": 0.2579, + "step": 1019 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.0780970363121174, + "learning_rate": 0.00013506375551927547, + "loss": 0.258, + "step": 1020 + }, + { + "epoch": 1.3434210526315788, + "grad_norm": 0.08023973869702199, + "learning_rate": 0.00013492033549368618, + "loss": 0.2758, + "step": 1021 + }, + { + "epoch": 1.3447368421052632, + "grad_norm": 0.081261194750605, + "learning_rate": 0.00013477683361721657, + "loss": 0.2579, + "step": 1022 + }, + { + "epoch": 1.3460526315789474, + "grad_norm": 0.07810232344712505, + "learning_rate": 0.00013463325022622507, + "loss": 0.2597, + "step": 1023 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 0.07656812032411678, + "learning_rate": 0.00013448958565726144, + "loss": 0.2497, + "step": 1024 + }, + { + "epoch": 1.3486842105263157, + "grad_norm": 0.07962640765722634, + "learning_rate": 0.00013434584024706554, + "loss": 0.2703, + "step": 1025 + }, + { + "epoch": 1.35, + "grad_norm": 0.07859791053686446, + "learning_rate": 0.00013420201433256689, + "loss": 0.2643, + "step": 1026 + }, + { + "epoch": 1.3513157894736842, + "grad_norm": 0.0785017433196338, + "learning_rate": 0.00013405810825088351, + "loss": 0.2554, + "step": 1027 + }, + { + "epoch": 1.3526315789473684, + "grad_norm": 0.07927097278835697, + "learning_rate": 0.00013391412233932149, + "loss": 0.2645, + "step": 1028 + }, + { + "epoch": 1.3539473684210526, + "grad_norm": 0.08028070891011724, + "learning_rate": 0.0001337700569353739, + "loss": 0.259, + "step": 1029 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.07903933780122868, + "learning_rate": 0.0001336259123767203, + "loss": 0.2687, + "step": 1030 + }, + { + "epoch": 1.356578947368421, + "grad_norm": 0.08789904311515143, + "learning_rate": 0.00013348168900122559, + "loss": 0.2653, + "step": 1031 + }, + { + "epoch": 1.3578947368421053, + "grad_norm": 0.0762050013831853, + "learning_rate": 0.00013333738714693956, + "loss": 0.2526, + "step": 1032 + }, + { + "epoch": 1.3592105263157894, + "grad_norm": 0.08083459753449757, + "learning_rate": 0.00013319300715209587, + "loss": 0.2687, + "step": 1033 + }, + { + "epoch": 1.3605263157894738, + "grad_norm": 0.0771191494993079, + "learning_rate": 0.00013304854935511143, + "loss": 0.2659, + "step": 1034 + }, + { + "epoch": 1.361842105263158, + "grad_norm": 0.0810612005537591, + "learning_rate": 0.00013290401409458532, + "loss": 0.2587, + "step": 1035 + }, + { + "epoch": 1.3631578947368421, + "grad_norm": 0.08077515208209074, + "learning_rate": 0.00013275940170929843, + "loss": 0.2763, + "step": 1036 + }, + { + "epoch": 1.3644736842105263, + "grad_norm": 0.0795071805895556, + "learning_rate": 0.00013261471253821227, + "loss": 0.2803, + "step": 1037 + }, + { + "epoch": 1.3657894736842104, + "grad_norm": 0.0783867946653792, + "learning_rate": 0.00013246994692046836, + "loss": 0.2659, + "step": 1038 + }, + { + "epoch": 1.3671052631578946, + "grad_norm": 0.07705812242711711, + "learning_rate": 0.0001323251051953874, + "loss": 0.2626, + "step": 1039 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.07981528649898714, + "learning_rate": 0.00013218018770246858, + "loss": 0.2652, + "step": 1040 + }, + { + "epoch": 1.3697368421052631, + "grad_norm": 0.07580835271521758, + "learning_rate": 0.00013203519478138852, + "loss": 0.2593, + "step": 1041 + }, + { + "epoch": 1.3710526315789473, + "grad_norm": 0.0786805131510188, + "learning_rate": 0.00013189012677200073, + "loss": 0.2583, + "step": 1042 + }, + { + "epoch": 1.3723684210526317, + "grad_norm": 0.07872461104976057, + "learning_rate": 0.00013174498401433474, + "loss": 0.2542, + "step": 1043 + }, + { + "epoch": 1.3736842105263158, + "grad_norm": 0.08393125286330598, + "learning_rate": 0.00013159976684859527, + "loss": 0.2717, + "step": 1044 + }, + { + "epoch": 1.375, + "grad_norm": 0.08038226441935513, + "learning_rate": 0.00013145447561516138, + "loss": 0.2538, + "step": 1045 + }, + { + "epoch": 1.3763157894736842, + "grad_norm": 0.08104956663295539, + "learning_rate": 0.00013130911065458584, + "loss": 0.2666, + "step": 1046 + }, + { + "epoch": 1.3776315789473683, + "grad_norm": 0.08149965352071284, + "learning_rate": 0.00013116367230759415, + "loss": 0.2741, + "step": 1047 + }, + { + "epoch": 1.3789473684210527, + "grad_norm": 0.0800773120109665, + "learning_rate": 0.00013101816091508388, + "loss": 0.2621, + "step": 1048 + }, + { + "epoch": 1.3802631578947369, + "grad_norm": 0.07883317602840334, + "learning_rate": 0.00013087257681812376, + "loss": 0.2627, + "step": 1049 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.07808540918916572, + "learning_rate": 0.00013072692035795305, + "loss": 0.2661, + "step": 1050 + }, + { + "epoch": 1.3828947368421054, + "grad_norm": 0.07784582555862558, + "learning_rate": 0.00013058119187598047, + "loss": 0.2605, + "step": 1051 + }, + { + "epoch": 1.3842105263157896, + "grad_norm": 0.0789115552020285, + "learning_rate": 0.0001304353917137836, + "loss": 0.2591, + "step": 1052 + }, + { + "epoch": 1.3855263157894737, + "grad_norm": 0.07707043429238686, + "learning_rate": 0.00013028952021310812, + "loss": 0.2602, + "step": 1053 + }, + { + "epoch": 1.3868421052631579, + "grad_norm": 0.0764917703073864, + "learning_rate": 0.00013014357771586686, + "loss": 0.252, + "step": 1054 + }, + { + "epoch": 1.388157894736842, + "grad_norm": 0.08152267179709696, + "learning_rate": 0.000129997564564139, + "loss": 0.2761, + "step": 1055 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 0.07488321865899004, + "learning_rate": 0.00012985148110016947, + "loss": 0.2557, + "step": 1056 + }, + { + "epoch": 1.3907894736842106, + "grad_norm": 0.08223478003170398, + "learning_rate": 0.00012970532766636787, + "loss": 0.2758, + "step": 1057 + }, + { + "epoch": 1.3921052631578947, + "grad_norm": 0.08146309512224036, + "learning_rate": 0.00012955910460530788, + "loss": 0.2631, + "step": 1058 + }, + { + "epoch": 1.393421052631579, + "grad_norm": 0.08083129097727351, + "learning_rate": 0.00012941281225972636, + "loss": 0.2848, + "step": 1059 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.07757651778516991, + "learning_rate": 0.0001292664509725226, + "loss": 0.2524, + "step": 1060 + }, + { + "epoch": 1.3960526315789474, + "grad_norm": 0.0763538974448064, + "learning_rate": 0.0001291200210867574, + "loss": 0.257, + "step": 1061 + }, + { + "epoch": 1.3973684210526316, + "grad_norm": 0.07882164295488549, + "learning_rate": 0.0001289735229456525, + "loss": 0.2687, + "step": 1062 + }, + { + "epoch": 1.3986842105263158, + "grad_norm": 0.07892589978626854, + "learning_rate": 0.0001288269568925894, + "loss": 0.2704, + "step": 1063 + }, + { + "epoch": 1.4, + "grad_norm": 0.07820774930633734, + "learning_rate": 0.00012868032327110904, + "loss": 0.2619, + "step": 1064 + }, + { + "epoch": 1.4013157894736843, + "grad_norm": 0.07815937274547557, + "learning_rate": 0.00012853362242491053, + "loss": 0.2717, + "step": 1065 + }, + { + "epoch": 1.4026315789473685, + "grad_norm": 0.07522447260364416, + "learning_rate": 0.0001283868546978507, + "loss": 0.2554, + "step": 1066 + }, + { + "epoch": 1.4039473684210526, + "grad_norm": 0.08070192461804411, + "learning_rate": 0.00012824002043394298, + "loss": 0.2754, + "step": 1067 + }, + { + "epoch": 1.4052631578947368, + "grad_norm": 0.0803973317378383, + "learning_rate": 0.00012809311997735696, + "loss": 0.2766, + "step": 1068 + }, + { + "epoch": 1.4065789473684212, + "grad_norm": 0.07866656181841848, + "learning_rate": 0.00012794615367241717, + "loss": 0.2668, + "step": 1069 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.0794268932038752, + "learning_rate": 0.00012779912186360268, + "loss": 0.2675, + "step": 1070 + }, + { + "epoch": 1.4092105263157895, + "grad_norm": 0.07900705819829199, + "learning_rate": 0.0001276520248955459, + "loss": 0.2635, + "step": 1071 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 0.07918211928299614, + "learning_rate": 0.00012750486311303218, + "loss": 0.262, + "step": 1072 + }, + { + "epoch": 1.4118421052631578, + "grad_norm": 0.0819965398370594, + "learning_rate": 0.00012735763686099862, + "loss": 0.2602, + "step": 1073 + }, + { + "epoch": 1.4131578947368422, + "grad_norm": 0.07831915554583863, + "learning_rate": 0.00012721034648453353, + "loss": 0.2628, + "step": 1074 + }, + { + "epoch": 1.4144736842105263, + "grad_norm": 0.07786089842983143, + "learning_rate": 0.00012706299232887543, + "loss": 0.2561, + "step": 1075 + }, + { + "epoch": 1.4157894736842105, + "grad_norm": 0.07971967408570595, + "learning_rate": 0.00012691557473941243, + "loss": 0.2656, + "step": 1076 + }, + { + "epoch": 1.4171052631578949, + "grad_norm": 0.07801118181055779, + "learning_rate": 0.00012676809406168133, + "loss": 0.2652, + "step": 1077 + }, + { + "epoch": 1.418421052631579, + "grad_norm": 0.08515813484361862, + "learning_rate": 0.00012662055064136668, + "loss": 0.2753, + "step": 1078 + }, + { + "epoch": 1.4197368421052632, + "grad_norm": 0.08068756136226636, + "learning_rate": 0.00012647294482430024, + "loss": 0.2634, + "step": 1079 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.0794228149168458, + "learning_rate": 0.00012632527695645993, + "loss": 0.2613, + "step": 1080 + }, + { + "epoch": 1.4223684210526315, + "grad_norm": 0.07796131697715634, + "learning_rate": 0.00012617754738396915, + "loss": 0.2665, + "step": 1081 + }, + { + "epoch": 1.4236842105263157, + "grad_norm": 0.07971630587719376, + "learning_rate": 0.00012602975645309593, + "loss": 0.2631, + "step": 1082 + }, + { + "epoch": 1.425, + "grad_norm": 0.07712944683367323, + "learning_rate": 0.00012588190451025207, + "loss": 0.2509, + "step": 1083 + }, + { + "epoch": 1.4263157894736842, + "grad_norm": 0.08097034232881713, + "learning_rate": 0.0001257339919019925, + "loss": 0.2649, + "step": 1084 + }, + { + "epoch": 1.4276315789473684, + "grad_norm": 0.07618596280882184, + "learning_rate": 0.0001255860189750142, + "loss": 0.2613, + "step": 1085 + }, + { + "epoch": 1.4289473684210527, + "grad_norm": 0.08305767349447553, + "learning_rate": 0.00012543798607615565, + "loss": 0.2758, + "step": 1086 + }, + { + "epoch": 1.430263157894737, + "grad_norm": 0.08146614433553916, + "learning_rate": 0.0001252898935523958, + "loss": 0.2819, + "step": 1087 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 0.07805110779507855, + "learning_rate": 0.00012514174175085345, + "loss": 0.2649, + "step": 1088 + }, + { + "epoch": 1.4328947368421052, + "grad_norm": 0.07969018389253195, + "learning_rate": 0.0001249935310187863, + "loss": 0.2637, + "step": 1089 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.07956536699437335, + "learning_rate": 0.00012484526170359012, + "loss": 0.2651, + "step": 1090 + }, + { + "epoch": 1.4355263157894738, + "grad_norm": 0.07836456115097726, + "learning_rate": 0.0001246969341527981, + "loss": 0.2645, + "step": 1091 + }, + { + "epoch": 1.436842105263158, + "grad_norm": 0.07976260454428451, + "learning_rate": 0.00012454854871407994, + "loss": 0.2824, + "step": 1092 + }, + { + "epoch": 1.438157894736842, + "grad_norm": 0.07726676671750074, + "learning_rate": 0.00012440010573524086, + "loss": 0.2632, + "step": 1093 + }, + { + "epoch": 1.4394736842105262, + "grad_norm": 0.07821021819439651, + "learning_rate": 0.00012425160556422114, + "loss": 0.2731, + "step": 1094 + }, + { + "epoch": 1.4407894736842106, + "grad_norm": 0.08013778051504493, + "learning_rate": 0.00012410304854909495, + "loss": 0.2559, + "step": 1095 + }, + { + "epoch": 1.4421052631578948, + "grad_norm": 0.08401500576489536, + "learning_rate": 0.0001239544350380699, + "loss": 0.2719, + "step": 1096 + }, + { + "epoch": 1.443421052631579, + "grad_norm": 0.08434562563421157, + "learning_rate": 0.0001238057653794858, + "loss": 0.269, + "step": 1097 + }, + { + "epoch": 1.444736842105263, + "grad_norm": 0.07770950324688412, + "learning_rate": 0.00012365703992181425, + "loss": 0.2623, + "step": 1098 + }, + { + "epoch": 1.4460526315789473, + "grad_norm": 0.08224554301801708, + "learning_rate": 0.0001235082590136575, + "loss": 0.2808, + "step": 1099 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.07921911039298107, + "learning_rate": 0.00012335942300374788, + "loss": 0.2598, + "step": 1100 + }, + { + "epoch": 1.4486842105263158, + "grad_norm": 0.08061258248097088, + "learning_rate": 0.0001232105322409468, + "loss": 0.2674, + "step": 1101 + }, + { + "epoch": 1.45, + "grad_norm": 0.0777388780233582, + "learning_rate": 0.00012306158707424403, + "loss": 0.2498, + "step": 1102 + }, + { + "epoch": 1.4513157894736843, + "grad_norm": 0.07794347014147819, + "learning_rate": 0.00012291258785275687, + "loss": 0.2569, + "step": 1103 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 0.08178650020126337, + "learning_rate": 0.00012276353492572935, + "loss": 0.2798, + "step": 1104 + }, + { + "epoch": 1.4539473684210527, + "grad_norm": 0.08037719214824916, + "learning_rate": 0.0001226144286425313, + "loss": 0.2684, + "step": 1105 + }, + { + "epoch": 1.4552631578947368, + "grad_norm": 0.0763595502941657, + "learning_rate": 0.00012246526935265768, + "loss": 0.2483, + "step": 1106 + }, + { + "epoch": 1.456578947368421, + "grad_norm": 0.07688959299684457, + "learning_rate": 0.00012231605740572766, + "loss": 0.2524, + "step": 1107 + }, + { + "epoch": 1.4578947368421051, + "grad_norm": 0.07894151742409149, + "learning_rate": 0.00012216679315148386, + "loss": 0.2642, + "step": 1108 + }, + { + "epoch": 1.4592105263157895, + "grad_norm": 0.07777205049498395, + "learning_rate": 0.00012201747693979151, + "loss": 0.2499, + "step": 1109 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.08207208250132972, + "learning_rate": 0.0001218681091206376, + "loss": 0.2751, + "step": 1110 + }, + { + "epoch": 1.4618421052631578, + "grad_norm": 0.07844860347132009, + "learning_rate": 0.00012171869004413006, + "loss": 0.2602, + "step": 1111 + }, + { + "epoch": 1.4631578947368422, + "grad_norm": 0.07856698522588018, + "learning_rate": 0.00012156922006049702, + "loss": 0.2658, + "step": 1112 + }, + { + "epoch": 1.4644736842105264, + "grad_norm": 0.07684231974080626, + "learning_rate": 0.00012141969952008591, + "loss": 0.2519, + "step": 1113 + }, + { + "epoch": 1.4657894736842105, + "grad_norm": 0.07956756231471081, + "learning_rate": 0.00012127012877336266, + "loss": 0.2587, + "step": 1114 + }, + { + "epoch": 1.4671052631578947, + "grad_norm": 0.07797778646499881, + "learning_rate": 0.00012112050817091087, + "loss": 0.2435, + "step": 1115 + }, + { + "epoch": 1.4684210526315788, + "grad_norm": 0.07789957155018702, + "learning_rate": 0.00012097083806343103, + "loss": 0.2675, + "step": 1116 + }, + { + "epoch": 1.4697368421052632, + "grad_norm": 0.08090700227859492, + "learning_rate": 0.00012082111880173965, + "loss": 0.2772, + "step": 1117 + }, + { + "epoch": 1.4710526315789474, + "grad_norm": 0.07661313410176111, + "learning_rate": 0.0001206713507367684, + "loss": 0.2571, + "step": 1118 + }, + { + "epoch": 1.4723684210526315, + "grad_norm": 0.07907971896935535, + "learning_rate": 0.00012052153421956342, + "loss": 0.2649, + "step": 1119 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.07996349059046486, + "learning_rate": 0.00012037166960128443, + "loss": 0.2536, + "step": 1120 + }, + { + "epoch": 1.475, + "grad_norm": 0.08235866500979575, + "learning_rate": 0.00012022175723320381, + "loss": 0.2778, + "step": 1121 + }, + { + "epoch": 1.4763157894736842, + "grad_norm": 0.08262772185376702, + "learning_rate": 0.00012007179746670592, + "loss": 0.2717, + "step": 1122 + }, + { + "epoch": 1.4776315789473684, + "grad_norm": 0.07931030156939642, + "learning_rate": 0.00011992179065328618, + "loss": 0.2556, + "step": 1123 + }, + { + "epoch": 1.4789473684210526, + "grad_norm": 0.0818492117725006, + "learning_rate": 0.00011977173714455034, + "loss": 0.2734, + "step": 1124 + }, + { + "epoch": 1.4802631578947367, + "grad_norm": 0.08007910950748007, + "learning_rate": 0.0001196216372922136, + "loss": 0.2591, + "step": 1125 + }, + { + "epoch": 1.481578947368421, + "grad_norm": 0.07912376020739983, + "learning_rate": 0.00011947149144809969, + "loss": 0.2573, + "step": 1126 + }, + { + "epoch": 1.4828947368421053, + "grad_norm": 0.07919295195700186, + "learning_rate": 0.00011932129996414023, + "loss": 0.2796, + "step": 1127 + }, + { + "epoch": 1.4842105263157894, + "grad_norm": 0.0768053198795628, + "learning_rate": 0.00011917106319237386, + "loss": 0.258, + "step": 1128 + }, + { + "epoch": 1.4855263157894738, + "grad_norm": 0.0751828643806364, + "learning_rate": 0.00011902078148494525, + "loss": 0.2482, + "step": 1129 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.07573342946120709, + "learning_rate": 0.00011887045519410442, + "loss": 0.2515, + "step": 1130 + }, + { + "epoch": 1.4881578947368421, + "grad_norm": 0.08033169720613252, + "learning_rate": 0.00011872008467220599, + "loss": 0.2727, + "step": 1131 + }, + { + "epoch": 1.4894736842105263, + "grad_norm": 0.07910729035555242, + "learning_rate": 0.00011856967027170818, + "loss": 0.2753, + "step": 1132 + }, + { + "epoch": 1.4907894736842104, + "grad_norm": 0.07820164568049928, + "learning_rate": 0.00011841921234517206, + "loss": 0.2612, + "step": 1133 + }, + { + "epoch": 1.4921052631578946, + "grad_norm": 0.07934456848043618, + "learning_rate": 0.00011826871124526071, + "loss": 0.2619, + "step": 1134 + }, + { + "epoch": 1.493421052631579, + "grad_norm": 0.08088874374294254, + "learning_rate": 0.00011811816732473841, + "loss": 0.2582, + "step": 1135 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 0.07877981008540878, + "learning_rate": 0.00011796758093646989, + "loss": 0.2548, + "step": 1136 + }, + { + "epoch": 1.4960526315789473, + "grad_norm": 0.07864927828591978, + "learning_rate": 0.00011781695243341932, + "loss": 0.2587, + "step": 1137 + }, + { + "epoch": 1.4973684210526317, + "grad_norm": 0.08496699143061613, + "learning_rate": 0.0001176662821686496, + "loss": 0.2769, + "step": 1138 + }, + { + "epoch": 1.4986842105263158, + "grad_norm": 0.07790722036110928, + "learning_rate": 0.00011751557049532153, + "loss": 0.2625, + "step": 1139 + }, + { + "epoch": 1.5, + "grad_norm": 0.07732825022547576, + "learning_rate": 0.00011736481776669306, + "loss": 0.2595, + "step": 1140 + }, + { + "epoch": 1.5013157894736842, + "grad_norm": 0.07839345817113182, + "learning_rate": 0.00011721402433611818, + "loss": 0.264, + "step": 1141 + }, + { + "epoch": 1.5026315789473683, + "grad_norm": 0.0797985656438268, + "learning_rate": 0.00011706319055704642, + "loss": 0.2762, + "step": 1142 + }, + { + "epoch": 1.5039473684210525, + "grad_norm": 0.07859356519091114, + "learning_rate": 0.00011691231678302187, + "loss": 0.2589, + "step": 1143 + }, + { + "epoch": 1.5052631578947369, + "grad_norm": 0.08132159915933083, + "learning_rate": 0.00011676140336768236, + "loss": 0.2676, + "step": 1144 + }, + { + "epoch": 1.506578947368421, + "grad_norm": 0.07821055809299678, + "learning_rate": 0.0001166104506647586, + "loss": 0.2647, + "step": 1145 + }, + { + "epoch": 1.5078947368421054, + "grad_norm": 0.07762701396147464, + "learning_rate": 0.00011645945902807341, + "loss": 0.2646, + "step": 1146 + }, + { + "epoch": 1.5092105263157896, + "grad_norm": 0.07496175871914876, + "learning_rate": 0.00011630842881154085, + "loss": 0.2576, + "step": 1147 + }, + { + "epoch": 1.5105263157894737, + "grad_norm": 0.0784666282944514, + "learning_rate": 0.00011615736036916549, + "loss": 0.2679, + "step": 1148 + }, + { + "epoch": 1.5118421052631579, + "grad_norm": 0.07722067192546046, + "learning_rate": 0.0001160062540550414, + "loss": 0.2639, + "step": 1149 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.08059485240498777, + "learning_rate": 0.00011585511022335142, + "loss": 0.2764, + "step": 1150 + }, + { + "epoch": 1.5144736842105262, + "grad_norm": 0.08129025875363483, + "learning_rate": 0.00011570392922836644, + "loss": 0.2714, + "step": 1151 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 0.07486275958049285, + "learning_rate": 0.00011555271142444433, + "loss": 0.2517, + "step": 1152 + }, + { + "epoch": 1.5171052631578947, + "grad_norm": 0.08143946348918624, + "learning_rate": 0.0001154014571660293, + "loss": 0.2747, + "step": 1153 + }, + { + "epoch": 1.518421052631579, + "grad_norm": 0.07923671682954142, + "learning_rate": 0.00011525016680765102, + "loss": 0.2485, + "step": 1154 + }, + { + "epoch": 1.5197368421052633, + "grad_norm": 0.08011623724583099, + "learning_rate": 0.00011509884070392369, + "loss": 0.2656, + "step": 1155 + }, + { + "epoch": 1.5210526315789474, + "grad_norm": 0.07946415372441243, + "learning_rate": 0.00011494747920954545, + "loss": 0.2572, + "step": 1156 + }, + { + "epoch": 1.5223684210526316, + "grad_norm": 0.0812375151138501, + "learning_rate": 0.00011479608267929722, + "loss": 0.2697, + "step": 1157 + }, + { + "epoch": 1.5236842105263158, + "grad_norm": 0.07578132261810436, + "learning_rate": 0.00011464465146804217, + "loss": 0.2411, + "step": 1158 + }, + { + "epoch": 1.525, + "grad_norm": 0.07784132005694626, + "learning_rate": 0.00011449318593072466, + "loss": 0.2678, + "step": 1159 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.07909385203173394, + "learning_rate": 0.00011434168642236964, + "loss": 0.2754, + "step": 1160 + }, + { + "epoch": 1.5276315789473685, + "grad_norm": 0.0776880493894959, + "learning_rate": 0.00011419015329808157, + "loss": 0.2655, + "step": 1161 + }, + { + "epoch": 1.5289473684210526, + "grad_norm": 0.07765848299877728, + "learning_rate": 0.00011403858691304373, + "loss": 0.2607, + "step": 1162 + }, + { + "epoch": 1.530263157894737, + "grad_norm": 0.07646804543626935, + "learning_rate": 0.00011388698762251732, + "loss": 0.2596, + "step": 1163 + }, + { + "epoch": 1.5315789473684212, + "grad_norm": 0.07740933938489439, + "learning_rate": 0.00011373535578184082, + "loss": 0.2707, + "step": 1164 + }, + { + "epoch": 1.5328947368421053, + "grad_norm": 0.07996711741187337, + "learning_rate": 0.00011358369174642887, + "loss": 0.2664, + "step": 1165 + }, + { + "epoch": 1.5342105263157895, + "grad_norm": 0.07987614614098584, + "learning_rate": 0.00011343199587177155, + "loss": 0.2714, + "step": 1166 + }, + { + "epoch": 1.5355263157894736, + "grad_norm": 0.07887783815136196, + "learning_rate": 0.00011328026851343367, + "loss": 0.2548, + "step": 1167 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 0.07775100523637381, + "learning_rate": 0.00011312851002705383, + "loss": 0.2587, + "step": 1168 + }, + { + "epoch": 1.538157894736842, + "grad_norm": 0.08179602367804785, + "learning_rate": 0.00011297672076834348, + "loss": 0.2743, + "step": 1169 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.08166670313368278, + "learning_rate": 0.00011282490109308633, + "loss": 0.2743, + "step": 1170 + }, + { + "epoch": 1.5407894736842105, + "grad_norm": 0.07926586858824067, + "learning_rate": 0.00011267305135713726, + "loss": 0.2532, + "step": 1171 + }, + { + "epoch": 1.5421052631578949, + "grad_norm": 0.07935859462462387, + "learning_rate": 0.00011252117191642175, + "loss": 0.2586, + "step": 1172 + }, + { + "epoch": 1.543421052631579, + "grad_norm": 0.07771700777076371, + "learning_rate": 0.00011236926312693479, + "loss": 0.2556, + "step": 1173 + }, + { + "epoch": 1.5447368421052632, + "grad_norm": 0.07944739071127381, + "learning_rate": 0.00011221732534474019, + "loss": 0.2669, + "step": 1174 + }, + { + "epoch": 1.5460526315789473, + "grad_norm": 0.07926172086790113, + "learning_rate": 0.00011206535892596975, + "loss": 0.2578, + "step": 1175 + }, + { + "epoch": 1.5473684210526315, + "grad_norm": 0.07836482244209485, + "learning_rate": 0.00011191336422682237, + "loss": 0.266, + "step": 1176 + }, + { + "epoch": 1.5486842105263157, + "grad_norm": 0.08060052790786207, + "learning_rate": 0.00011176134160356327, + "loss": 0.27, + "step": 1177 + }, + { + "epoch": 1.55, + "grad_norm": 0.07857242666783919, + "learning_rate": 0.00011160929141252303, + "loss": 0.2561, + "step": 1178 + }, + { + "epoch": 1.5513157894736842, + "grad_norm": 0.0762732065222906, + "learning_rate": 0.00011145721401009694, + "loss": 0.265, + "step": 1179 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.07588799278326497, + "learning_rate": 0.00011130510975274409, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 1.5539473684210527, + "grad_norm": 0.0782353560349947, + "learning_rate": 0.00011115297899698638, + "loss": 0.2622, + "step": 1181 + }, + { + "epoch": 1.555263157894737, + "grad_norm": 0.0779137347020748, + "learning_rate": 0.00011100082209940795, + "loss": 0.2581, + "step": 1182 + }, + { + "epoch": 1.556578947368421, + "grad_norm": 0.07626200398110206, + "learning_rate": 0.00011084863941665415, + "loss": 0.2525, + "step": 1183 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 0.08075601397948919, + "learning_rate": 0.00011069643130543084, + "loss": 0.2589, + "step": 1184 + }, + { + "epoch": 1.5592105263157894, + "grad_norm": 0.07890719633469666, + "learning_rate": 0.00011054419812250338, + "loss": 0.2609, + "step": 1185 + }, + { + "epoch": 1.5605263157894735, + "grad_norm": 0.0789865706647385, + "learning_rate": 0.00011039194022469597, + "loss": 0.2515, + "step": 1186 + }, + { + "epoch": 1.561842105263158, + "grad_norm": 0.0796846545034331, + "learning_rate": 0.0001102396579688907, + "loss": 0.2687, + "step": 1187 + }, + { + "epoch": 1.563157894736842, + "grad_norm": 0.07938699909977037, + "learning_rate": 0.00011008735171202684, + "loss": 0.2636, + "step": 1188 + }, + { + "epoch": 1.5644736842105265, + "grad_norm": 0.0810811457705607, + "learning_rate": 0.00010993502181109978, + "loss": 0.2544, + "step": 1189 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.0811343931888347, + "learning_rate": 0.0001097826686231604, + "loss": 0.274, + "step": 1190 + }, + { + "epoch": 1.5671052631578948, + "grad_norm": 0.0786799724878539, + "learning_rate": 0.00010963029250531418, + "loss": 0.2619, + "step": 1191 + }, + { + "epoch": 1.568421052631579, + "grad_norm": 0.07801305259558834, + "learning_rate": 0.00010947789381472035, + "loss": 0.2575, + "step": 1192 + }, + { + "epoch": 1.569736842105263, + "grad_norm": 0.08019971785855351, + "learning_rate": 0.00010932547290859103, + "loss": 0.2592, + "step": 1193 + }, + { + "epoch": 1.5710526315789473, + "grad_norm": 0.07693908653004418, + "learning_rate": 0.00010917303014419036, + "loss": 0.2665, + "step": 1194 + }, + { + "epoch": 1.5723684210526314, + "grad_norm": 0.07646993423396128, + "learning_rate": 0.00010902056587883378, + "loss": 0.245, + "step": 1195 + }, + { + "epoch": 1.5736842105263158, + "grad_norm": 0.080428348690664, + "learning_rate": 0.00010886808046988717, + "loss": 0.2588, + "step": 1196 + }, + { + "epoch": 1.575, + "grad_norm": 0.07877667825439123, + "learning_rate": 0.00010871557427476583, + "loss": 0.2688, + "step": 1197 + }, + { + "epoch": 1.5763157894736843, + "grad_norm": 0.07769524509043618, + "learning_rate": 0.0001085630476509339, + "loss": 0.2588, + "step": 1198 + }, + { + "epoch": 1.5776315789473685, + "grad_norm": 0.08089328257237792, + "learning_rate": 0.00010841050095590335, + "loss": 0.2643, + "step": 1199 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.07938294687717415, + "learning_rate": 0.00010825793454723325, + "loss": 0.2678, + "step": 1200 + }, + { + "epoch": 1.5802631578947368, + "grad_norm": 0.07853030205890081, + "learning_rate": 0.0001081053487825288, + "loss": 0.2606, + "step": 1201 + }, + { + "epoch": 1.581578947368421, + "grad_norm": 0.07774175526579997, + "learning_rate": 0.00010795274401944058, + "loss": 0.2582, + "step": 1202 + }, + { + "epoch": 1.5828947368421051, + "grad_norm": 0.07578431858186546, + "learning_rate": 0.00010780012061566378, + "loss": 0.2568, + "step": 1203 + }, + { + "epoch": 1.5842105263157895, + "grad_norm": 0.07606000180404633, + "learning_rate": 0.00010764747892893723, + "loss": 0.2646, + "step": 1204 + }, + { + "epoch": 1.5855263157894737, + "grad_norm": 0.07591132081458318, + "learning_rate": 0.0001074948193170426, + "loss": 0.2517, + "step": 1205 + }, + { + "epoch": 1.586842105263158, + "grad_norm": 0.0781464844924248, + "learning_rate": 0.00010734214213780354, + "loss": 0.2491, + "step": 1206 + }, + { + "epoch": 1.5881578947368422, + "grad_norm": 0.07902701850224841, + "learning_rate": 0.000107189447749085, + "loss": 0.2653, + "step": 1207 + }, + { + "epoch": 1.5894736842105264, + "grad_norm": 0.08084483037179403, + "learning_rate": 0.00010703673650879218, + "loss": 0.2625, + "step": 1208 + }, + { + "epoch": 1.5907894736842105, + "grad_norm": 0.0773678893977578, + "learning_rate": 0.00010688400877486978, + "loss": 0.2563, + "step": 1209 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 0.07876897624596318, + "learning_rate": 0.00010673126490530112, + "loss": 0.2649, + "step": 1210 + }, + { + "epoch": 1.5934210526315788, + "grad_norm": 0.0804198373742344, + "learning_rate": 0.00010657850525810748, + "loss": 0.268, + "step": 1211 + }, + { + "epoch": 1.594736842105263, + "grad_norm": 0.08109356740988712, + "learning_rate": 0.00010642573019134703, + "loss": 0.2634, + "step": 1212 + }, + { + "epoch": 1.5960526315789474, + "grad_norm": 0.08129604963653887, + "learning_rate": 0.00010627294006311404, + "loss": 0.2852, + "step": 1213 + }, + { + "epoch": 1.5973684210526315, + "grad_norm": 0.07643522766740891, + "learning_rate": 0.00010612013523153812, + "loss": 0.261, + "step": 1214 + }, + { + "epoch": 1.598684210526316, + "grad_norm": 0.07695348511929585, + "learning_rate": 0.0001059673160547834, + "loss": 0.2651, + "step": 1215 + }, + { + "epoch": 1.6, + "grad_norm": 0.07630900188908298, + "learning_rate": 0.00010581448289104758, + "loss": 0.2633, + "step": 1216 + }, + { + "epoch": 1.6013157894736842, + "grad_norm": 0.07437930134164347, + "learning_rate": 0.00010566163609856117, + "loss": 0.2575, + "step": 1217 + }, + { + "epoch": 1.6026315789473684, + "grad_norm": 0.0780918664132209, + "learning_rate": 0.00010550877603558655, + "loss": 0.2647, + "step": 1218 + }, + { + "epoch": 1.6039473684210526, + "grad_norm": 0.07895904000053701, + "learning_rate": 0.00010535590306041732, + "loss": 0.2657, + "step": 1219 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.0782823182484623, + "learning_rate": 0.00010520301753137724, + "loss": 0.2539, + "step": 1220 + }, + { + "epoch": 1.606578947368421, + "grad_norm": 0.07836129489018223, + "learning_rate": 0.00010505011980681962, + "loss": 0.2623, + "step": 1221 + }, + { + "epoch": 1.6078947368421053, + "grad_norm": 0.07921294679317484, + "learning_rate": 0.00010489721024512618, + "loss": 0.2574, + "step": 1222 + }, + { + "epoch": 1.6092105263157894, + "grad_norm": 0.08253466147497233, + "learning_rate": 0.00010474428920470654, + "loss": 0.2626, + "step": 1223 + }, + { + "epoch": 1.6105263157894738, + "grad_norm": 0.0783379793278089, + "learning_rate": 0.00010459135704399718, + "loss": 0.2565, + "step": 1224 + }, + { + "epoch": 1.611842105263158, + "grad_norm": 0.0789556340245482, + "learning_rate": 0.00010443841412146065, + "loss": 0.2732, + "step": 1225 + }, + { + "epoch": 1.6131578947368421, + "grad_norm": 0.08284705629363337, + "learning_rate": 0.00010428546079558463, + "loss": 0.2784, + "step": 1226 + }, + { + "epoch": 1.6144736842105263, + "grad_norm": 0.07926378933409738, + "learning_rate": 0.00010413249742488131, + "loss": 0.2626, + "step": 1227 + }, + { + "epoch": 1.6157894736842104, + "grad_norm": 0.07646962136820679, + "learning_rate": 0.00010397952436788642, + "loss": 0.2576, + "step": 1228 + }, + { + "epoch": 1.6171052631578946, + "grad_norm": 0.07459202591852203, + "learning_rate": 0.00010382654198315834, + "loss": 0.2544, + "step": 1229 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.07728933092230707, + "learning_rate": 0.00010367355062927726, + "loss": 0.2597, + "step": 1230 + }, + { + "epoch": 1.6197368421052631, + "grad_norm": 0.0740833624188333, + "learning_rate": 0.00010352055066484449, + "loss": 0.2415, + "step": 1231 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 0.07917560120385768, + "learning_rate": 0.00010336754244848157, + "loss": 0.2615, + "step": 1232 + }, + { + "epoch": 1.6223684210526317, + "grad_norm": 0.08063325789327866, + "learning_rate": 0.00010321452633882922, + "loss": 0.2741, + "step": 1233 + }, + { + "epoch": 1.6236842105263158, + "grad_norm": 0.07913725813766033, + "learning_rate": 0.00010306150269454675, + "loss": 0.2658, + "step": 1234 + }, + { + "epoch": 1.625, + "grad_norm": 0.07936038810014337, + "learning_rate": 0.00010290847187431113, + "loss": 0.2618, + "step": 1235 + }, + { + "epoch": 1.6263157894736842, + "grad_norm": 0.07924213689691284, + "learning_rate": 0.00010275543423681621, + "loss": 0.2614, + "step": 1236 + }, + { + "epoch": 1.6276315789473683, + "grad_norm": 0.0771038728604427, + "learning_rate": 0.0001026023901407717, + "loss": 0.2477, + "step": 1237 + }, + { + "epoch": 1.6289473684210525, + "grad_norm": 0.0770735907526327, + "learning_rate": 0.00010244933994490249, + "loss": 0.2548, + "step": 1238 + }, + { + "epoch": 1.6302631578947369, + "grad_norm": 0.07674015928306312, + "learning_rate": 0.0001022962840079478, + "loss": 0.2547, + "step": 1239 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.07857136632734509, + "learning_rate": 0.00010214322268866032, + "loss": 0.2547, + "step": 1240 + }, + { + "epoch": 1.6328947368421054, + "grad_norm": 0.08094746714498062, + "learning_rate": 0.00010199015634580528, + "loss": 0.2612, + "step": 1241 + }, + { + "epoch": 1.6342105263157896, + "grad_norm": 0.07859907086130548, + "learning_rate": 0.00010183708533815974, + "loss": 0.2629, + "step": 1242 + }, + { + "epoch": 1.6355263157894737, + "grad_norm": 0.07847032246484141, + "learning_rate": 0.0001016840100245117, + "loss": 0.2587, + "step": 1243 + }, + { + "epoch": 1.6368421052631579, + "grad_norm": 0.0785820431028293, + "learning_rate": 0.00010153093076365923, + "loss": 0.2562, + "step": 1244 + }, + { + "epoch": 1.638157894736842, + "grad_norm": 0.07896059641278598, + "learning_rate": 0.00010137784791440965, + "loss": 0.2572, + "step": 1245 + }, + { + "epoch": 1.6394736842105262, + "grad_norm": 0.07921831824560291, + "learning_rate": 0.00010122476183557869, + "loss": 0.2727, + "step": 1246 + }, + { + "epoch": 1.6407894736842106, + "grad_norm": 0.07588261624391224, + "learning_rate": 0.00010107167288598967, + "loss": 0.2505, + "step": 1247 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 0.07634558208956514, + "learning_rate": 0.00010091858142447265, + "loss": 0.2536, + "step": 1248 + }, + { + "epoch": 1.643421052631579, + "grad_norm": 0.07689177759409037, + "learning_rate": 0.00010076548780986352, + "loss": 0.2603, + "step": 1249 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.07734671917849004, + "learning_rate": 0.00010061239240100327, + "loss": 0.2473, + "step": 1250 + }, + { + "epoch": 1.6460526315789474, + "grad_norm": 0.07734454423729424, + "learning_rate": 0.00010045929555673705, + "loss": 0.2687, + "step": 1251 + }, + { + "epoch": 1.6473684210526316, + "grad_norm": 0.0778519121851718, + "learning_rate": 0.00010030619763591347, + "loss": 0.2659, + "step": 1252 + }, + { + "epoch": 1.6486842105263158, + "grad_norm": 0.07985594655530279, + "learning_rate": 0.00010015309899738355, + "loss": 0.269, + "step": 1253 + }, + { + "epoch": 1.65, + "grad_norm": 0.07695791648645144, + "learning_rate": 0.0001, + "loss": 0.2552, + "step": 1254 + }, + { + "epoch": 1.651315789473684, + "grad_norm": 0.07817327760226638, + "learning_rate": 9.984690100261648e-05, + "loss": 0.2493, + "step": 1255 + }, + { + "epoch": 1.6526315789473685, + "grad_norm": 0.07787762754684492, + "learning_rate": 9.969380236408656e-05, + "loss": 0.2553, + "step": 1256 + }, + { + "epoch": 1.6539473684210526, + "grad_norm": 0.08135305782817635, + "learning_rate": 9.954070444326293e-05, + "loss": 0.2505, + "step": 1257 + }, + { + "epoch": 1.655263157894737, + "grad_norm": 0.07789141129556089, + "learning_rate": 9.938760759899674e-05, + "loss": 0.2527, + "step": 1258 + }, + { + "epoch": 1.6565789473684212, + "grad_norm": 0.07824485146455672, + "learning_rate": 9.923451219013651e-05, + "loss": 0.2603, + "step": 1259 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.07858931038856287, + "learning_rate": 9.908141857552737e-05, + "loss": 0.2631, + "step": 1260 + }, + { + "epoch": 1.6592105263157895, + "grad_norm": 0.07644569879074736, + "learning_rate": 9.892832711401036e-05, + "loss": 0.2436, + "step": 1261 + }, + { + "epoch": 1.6605263157894736, + "grad_norm": 0.08015020052581098, + "learning_rate": 9.877523816442133e-05, + "loss": 0.2733, + "step": 1262 + }, + { + "epoch": 1.6618421052631578, + "grad_norm": 0.07973057338959542, + "learning_rate": 9.862215208559037e-05, + "loss": 0.2595, + "step": 1263 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 0.07569295606836161, + "learning_rate": 9.846906923634079e-05, + "loss": 0.2425, + "step": 1264 + }, + { + "epoch": 1.6644736842105263, + "grad_norm": 0.07591010855329165, + "learning_rate": 9.831598997548831e-05, + "loss": 0.247, + "step": 1265 + }, + { + "epoch": 1.6657894736842105, + "grad_norm": 0.07750100103712705, + "learning_rate": 9.816291466184026e-05, + "loss": 0.2475, + "step": 1266 + }, + { + "epoch": 1.6671052631578949, + "grad_norm": 0.07706201585723965, + "learning_rate": 9.800984365419475e-05, + "loss": 0.2611, + "step": 1267 + }, + { + "epoch": 1.668421052631579, + "grad_norm": 0.07793486034207057, + "learning_rate": 9.78567773113397e-05, + "loss": 0.263, + "step": 1268 + }, + { + "epoch": 1.6697368421052632, + "grad_norm": 0.079099268943506, + "learning_rate": 9.770371599205222e-05, + "loss": 0.2578, + "step": 1269 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.07871797278447953, + "learning_rate": 9.755066005509753e-05, + "loss": 0.2578, + "step": 1270 + }, + { + "epoch": 1.6723684210526315, + "grad_norm": 0.0811458532081561, + "learning_rate": 9.739760985922832e-05, + "loss": 0.2579, + "step": 1271 + }, + { + "epoch": 1.6736842105263157, + "grad_norm": 0.07824873316859218, + "learning_rate": 9.724456576318381e-05, + "loss": 0.2754, + "step": 1272 + }, + { + "epoch": 1.675, + "grad_norm": 0.07647687339497086, + "learning_rate": 9.709152812568886e-05, + "loss": 0.2575, + "step": 1273 + }, + { + "epoch": 1.6763157894736842, + "grad_norm": 0.07842920421047767, + "learning_rate": 9.693849730545326e-05, + "loss": 0.248, + "step": 1274 + }, + { + "epoch": 1.6776315789473686, + "grad_norm": 0.0779966215397614, + "learning_rate": 9.678547366117083e-05, + "loss": 0.2669, + "step": 1275 + }, + { + "epoch": 1.6789473684210527, + "grad_norm": 0.07565197476267557, + "learning_rate": 9.663245755151846e-05, + "loss": 0.2553, + "step": 1276 + }, + { + "epoch": 1.680263157894737, + "grad_norm": 0.07727410664675037, + "learning_rate": 9.647944933515552e-05, + "loss": 0.2528, + "step": 1277 + }, + { + "epoch": 1.681578947368421, + "grad_norm": 0.076100890561838, + "learning_rate": 9.632644937072277e-05, + "loss": 0.2547, + "step": 1278 + }, + { + "epoch": 1.6828947368421052, + "grad_norm": 0.07998682208123038, + "learning_rate": 9.617345801684169e-05, + "loss": 0.2606, + "step": 1279 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.0766101279713053, + "learning_rate": 9.602047563211359e-05, + "loss": 0.2706, + "step": 1280 + }, + { + "epoch": 1.6855263157894735, + "grad_norm": 0.07714573829749818, + "learning_rate": 9.586750257511867e-05, + "loss": 0.2533, + "step": 1281 + }, + { + "epoch": 1.686842105263158, + "grad_norm": 0.0774718178209979, + "learning_rate": 9.571453920441538e-05, + "loss": 0.2541, + "step": 1282 + }, + { + "epoch": 1.688157894736842, + "grad_norm": 0.07724082139457383, + "learning_rate": 9.556158587853941e-05, + "loss": 0.2548, + "step": 1283 + }, + { + "epoch": 1.6894736842105265, + "grad_norm": 0.08138472870698339, + "learning_rate": 9.540864295600283e-05, + "loss": 0.2767, + "step": 1284 + }, + { + "epoch": 1.6907894736842106, + "grad_norm": 0.07668384972603022, + "learning_rate": 9.525571079529347e-05, + "loss": 0.2614, + "step": 1285 + }, + { + "epoch": 1.6921052631578948, + "grad_norm": 0.07631635135059317, + "learning_rate": 9.510278975487384e-05, + "loss": 0.2568, + "step": 1286 + }, + { + "epoch": 1.693421052631579, + "grad_norm": 0.07762293294175955, + "learning_rate": 9.49498801931804e-05, + "loss": 0.2608, + "step": 1287 + }, + { + "epoch": 1.694736842105263, + "grad_norm": 0.08091630947772432, + "learning_rate": 9.479698246862276e-05, + "loss": 0.2624, + "step": 1288 + }, + { + "epoch": 1.6960526315789473, + "grad_norm": 0.07844980001770621, + "learning_rate": 9.464409693958269e-05, + "loss": 0.2589, + "step": 1289 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.07835999657865879, + "learning_rate": 9.449122396441345e-05, + "loss": 0.2673, + "step": 1290 + }, + { + "epoch": 1.6986842105263158, + "grad_norm": 0.07660019194402308, + "learning_rate": 9.433836390143887e-05, + "loss": 0.2533, + "step": 1291 + }, + { + "epoch": 1.7, + "grad_norm": 0.0775129862715817, + "learning_rate": 9.418551710895243e-05, + "loss": 0.2526, + "step": 1292 + }, + { + "epoch": 1.7013157894736843, + "grad_norm": 0.07852001442550643, + "learning_rate": 9.403268394521662e-05, + "loss": 0.2536, + "step": 1293 + }, + { + "epoch": 1.7026315789473685, + "grad_norm": 0.07775416745660803, + "learning_rate": 9.38798647684619e-05, + "loss": 0.2568, + "step": 1294 + }, + { + "epoch": 1.7039473684210527, + "grad_norm": 0.0773123834839056, + "learning_rate": 9.372705993688599e-05, + "loss": 0.2487, + "step": 1295 + }, + { + "epoch": 1.7052631578947368, + "grad_norm": 0.07933332672618959, + "learning_rate": 9.357426980865301e-05, + "loss": 0.2546, + "step": 1296 + }, + { + "epoch": 1.706578947368421, + "grad_norm": 0.0758849825907164, + "learning_rate": 9.342149474189251e-05, + "loss": 0.2527, + "step": 1297 + }, + { + "epoch": 1.7078947368421051, + "grad_norm": 0.07768939168993196, + "learning_rate": 9.326873509469887e-05, + "loss": 0.2635, + "step": 1298 + }, + { + "epoch": 1.7092105263157895, + "grad_norm": 0.07733832118219917, + "learning_rate": 9.311599122513029e-05, + "loss": 0.2639, + "step": 1299 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.07871573341153681, + "learning_rate": 9.296326349120785e-05, + "loss": 0.2618, + "step": 1300 + }, + { + "epoch": 1.711842105263158, + "grad_norm": 0.07866153909260622, + "learning_rate": 9.281055225091503e-05, + "loss": 0.2594, + "step": 1301 + }, + { + "epoch": 1.7131578947368422, + "grad_norm": 0.07974140917973452, + "learning_rate": 9.265785786219647e-05, + "loss": 0.2668, + "step": 1302 + }, + { + "epoch": 1.7144736842105264, + "grad_norm": 0.08096294253599645, + "learning_rate": 9.250518068295744e-05, + "loss": 0.2652, + "step": 1303 + }, + { + "epoch": 1.7157894736842105, + "grad_norm": 0.0790015551619781, + "learning_rate": 9.235252107106279e-05, + "loss": 0.2657, + "step": 1304 + }, + { + "epoch": 1.7171052631578947, + "grad_norm": 0.07763615541336244, + "learning_rate": 9.219987938433621e-05, + "loss": 0.2629, + "step": 1305 + }, + { + "epoch": 1.7184210526315788, + "grad_norm": 0.07778669559237536, + "learning_rate": 9.204725598055942e-05, + "loss": 0.2636, + "step": 1306 + }, + { + "epoch": 1.719736842105263, + "grad_norm": 0.07939255811714764, + "learning_rate": 9.189465121747125e-05, + "loss": 0.2705, + "step": 1307 + }, + { + "epoch": 1.7210526315789474, + "grad_norm": 0.07960488700289318, + "learning_rate": 9.174206545276677e-05, + "loss": 0.2706, + "step": 1308 + }, + { + "epoch": 1.7223684210526315, + "grad_norm": 0.07723537042473928, + "learning_rate": 9.158949904409668e-05, + "loss": 0.2621, + "step": 1309 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.07861687285953221, + "learning_rate": 9.143695234906611e-05, + "loss": 0.2549, + "step": 1310 + }, + { + "epoch": 1.725, + "grad_norm": 0.07928259749395238, + "learning_rate": 9.128442572523417e-05, + "loss": 0.256, + "step": 1311 + }, + { + "epoch": 1.7263157894736842, + "grad_norm": 0.07977907458496993, + "learning_rate": 9.113191953011287e-05, + "loss": 0.2621, + "step": 1312 + }, + { + "epoch": 1.7276315789473684, + "grad_norm": 0.07908599126862541, + "learning_rate": 9.09794341211662e-05, + "loss": 0.2577, + "step": 1313 + }, + { + "epoch": 1.7289473684210526, + "grad_norm": 0.07804526868832733, + "learning_rate": 9.082696985580964e-05, + "loss": 0.2574, + "step": 1314 + }, + { + "epoch": 1.7302631578947367, + "grad_norm": 0.07671319689683427, + "learning_rate": 9.0674527091409e-05, + "loss": 0.2551, + "step": 1315 + }, + { + "epoch": 1.731578947368421, + "grad_norm": 0.07918675981206362, + "learning_rate": 9.052210618527966e-05, + "loss": 0.2478, + "step": 1316 + }, + { + "epoch": 1.7328947368421053, + "grad_norm": 0.07700717313469471, + "learning_rate": 9.036970749468584e-05, + "loss": 0.2484, + "step": 1317 + }, + { + "epoch": 1.7342105263157894, + "grad_norm": 0.07832287153888227, + "learning_rate": 9.021733137683962e-05, + "loss": 0.2604, + "step": 1318 + }, + { + "epoch": 1.7355263157894738, + "grad_norm": 0.076498834528713, + "learning_rate": 9.006497818890024e-05, + "loss": 0.2536, + "step": 1319 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.077411644809103, + "learning_rate": 8.991264828797319e-05, + "loss": 0.2578, + "step": 1320 + }, + { + "epoch": 1.7381578947368421, + "grad_norm": 0.07747513215833557, + "learning_rate": 8.97603420311093e-05, + "loss": 0.2574, + "step": 1321 + }, + { + "epoch": 1.7394736842105263, + "grad_norm": 0.07842803983473304, + "learning_rate": 8.960805977530404e-05, + "loss": 0.2606, + "step": 1322 + }, + { + "epoch": 1.7407894736842104, + "grad_norm": 0.07436646379437782, + "learning_rate": 8.945580187749666e-05, + "loss": 0.2487, + "step": 1323 + }, + { + "epoch": 1.7421052631578946, + "grad_norm": 0.07487978241239661, + "learning_rate": 8.930356869456919e-05, + "loss": 0.2532, + "step": 1324 + }, + { + "epoch": 1.743421052631579, + "grad_norm": 0.07966307306554148, + "learning_rate": 8.915136058334588e-05, + "loss": 0.2665, + "step": 1325 + }, + { + "epoch": 1.7447368421052631, + "grad_norm": 0.08106464265363358, + "learning_rate": 8.899917790059208e-05, + "loss": 0.2643, + "step": 1326 + }, + { + "epoch": 1.7460526315789475, + "grad_norm": 0.07593234358817623, + "learning_rate": 8.884702100301364e-05, + "loss": 0.2586, + "step": 1327 + }, + { + "epoch": 1.7473684210526317, + "grad_norm": 0.07609912132517359, + "learning_rate": 8.869489024725595e-05, + "loss": 0.2459, + "step": 1328 + }, + { + "epoch": 1.7486842105263158, + "grad_norm": 0.07772499392839312, + "learning_rate": 8.854278598990305e-05, + "loss": 0.2632, + "step": 1329 + }, + { + "epoch": 1.75, + "grad_norm": 0.07646456091007017, + "learning_rate": 8.839070858747697e-05, + "loss": 0.2526, + "step": 1330 + }, + { + "epoch": 1.7513157894736842, + "grad_norm": 0.07793051011581553, + "learning_rate": 8.823865839643677e-05, + "loss": 0.2714, + "step": 1331 + }, + { + "epoch": 1.7526315789473683, + "grad_norm": 0.07790374700766114, + "learning_rate": 8.808663577317764e-05, + "loss": 0.2605, + "step": 1332 + }, + { + "epoch": 1.7539473684210525, + "grad_norm": 0.0800107435876805, + "learning_rate": 8.793464107403028e-05, + "loss": 0.2595, + "step": 1333 + }, + { + "epoch": 1.7552631578947369, + "grad_norm": 0.08006102878272488, + "learning_rate": 8.778267465525985e-05, + "loss": 0.2667, + "step": 1334 + }, + { + "epoch": 1.756578947368421, + "grad_norm": 0.07794460119809767, + "learning_rate": 8.763073687306524e-05, + "loss": 0.2506, + "step": 1335 + }, + { + "epoch": 1.7578947368421054, + "grad_norm": 0.07759219048166072, + "learning_rate": 8.747882808357828e-05, + "loss": 0.2705, + "step": 1336 + }, + { + "epoch": 1.7592105263157896, + "grad_norm": 0.07712939424044654, + "learning_rate": 8.732694864286273e-05, + "loss": 0.2635, + "step": 1337 + }, + { + "epoch": 1.7605263157894737, + "grad_norm": 0.07743375242887887, + "learning_rate": 8.717509890691368e-05, + "loss": 0.2587, + "step": 1338 + }, + { + "epoch": 1.7618421052631579, + "grad_norm": 0.07860695060081709, + "learning_rate": 8.702327923165654e-05, + "loss": 0.2661, + "step": 1339 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.0792831934996456, + "learning_rate": 8.687148997294621e-05, + "loss": 0.2678, + "step": 1340 + }, + { + "epoch": 1.7644736842105262, + "grad_norm": 0.07976560061008468, + "learning_rate": 8.671973148656634e-05, + "loss": 0.2619, + "step": 1341 + }, + { + "epoch": 1.7657894736842106, + "grad_norm": 0.08106987042463963, + "learning_rate": 8.656800412822847e-05, + "loss": 0.2735, + "step": 1342 + }, + { + "epoch": 1.7671052631578947, + "grad_norm": 0.07818392747232804, + "learning_rate": 8.641630825357115e-05, + "loss": 0.2387, + "step": 1343 + }, + { + "epoch": 1.768421052631579, + "grad_norm": 0.07902258148724874, + "learning_rate": 8.626464421815919e-05, + "loss": 0.2524, + "step": 1344 + }, + { + "epoch": 1.7697368421052633, + "grad_norm": 0.07756812342250517, + "learning_rate": 8.611301237748267e-05, + "loss": 0.2621, + "step": 1345 + }, + { + "epoch": 1.7710526315789474, + "grad_norm": 0.07885482619098905, + "learning_rate": 8.596141308695628e-05, + "loss": 0.2544, + "step": 1346 + }, + { + "epoch": 1.7723684210526316, + "grad_norm": 0.08005110335934358, + "learning_rate": 8.580984670191848e-05, + "loss": 0.2539, + "step": 1347 + }, + { + "epoch": 1.7736842105263158, + "grad_norm": 0.07943085562482573, + "learning_rate": 8.565831357763039e-05, + "loss": 0.2622, + "step": 1348 + }, + { + "epoch": 1.775, + "grad_norm": 0.08121598066546899, + "learning_rate": 8.550681406927535e-05, + "loss": 0.2658, + "step": 1349 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.0773683018126476, + "learning_rate": 8.535534853195786e-05, + "loss": 0.2629, + "step": 1350 + }, + { + "epoch": 1.7776315789473685, + "grad_norm": 0.07743762444071942, + "learning_rate": 8.520391732070279e-05, + "loss": 0.2477, + "step": 1351 + }, + { + "epoch": 1.7789473684210526, + "grad_norm": 0.07738788021837957, + "learning_rate": 8.505252079045458e-05, + "loss": 0.2617, + "step": 1352 + }, + { + "epoch": 1.780263157894737, + "grad_norm": 0.07732362511743367, + "learning_rate": 8.490115929607631e-05, + "loss": 0.2562, + "step": 1353 + }, + { + "epoch": 1.7815789473684212, + "grad_norm": 0.07882812579817991, + "learning_rate": 8.474983319234899e-05, + "loss": 0.26, + "step": 1354 + }, + { + "epoch": 1.7828947368421053, + "grad_norm": 0.0764856203909363, + "learning_rate": 8.459854283397073e-05, + "loss": 0.2561, + "step": 1355 + }, + { + "epoch": 1.7842105263157895, + "grad_norm": 0.07723245518951646, + "learning_rate": 8.444728857555572e-05, + "loss": 0.2504, + "step": 1356 + }, + { + "epoch": 1.7855263157894736, + "grad_norm": 0.07746876461441092, + "learning_rate": 8.42960707716336e-05, + "loss": 0.247, + "step": 1357 + }, + { + "epoch": 1.7868421052631578, + "grad_norm": 0.07569577111860483, + "learning_rate": 8.414488977664859e-05, + "loss": 0.2409, + "step": 1358 + }, + { + "epoch": 1.788157894736842, + "grad_norm": 0.07919043138051154, + "learning_rate": 8.399374594495861e-05, + "loss": 0.2483, + "step": 1359 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.08072885452301694, + "learning_rate": 8.384263963083453e-05, + "loss": 0.2527, + "step": 1360 + }, + { + "epoch": 1.7907894736842105, + "grad_norm": 0.07843210339508665, + "learning_rate": 8.369157118845914e-05, + "loss": 0.2538, + "step": 1361 + }, + { + "epoch": 1.7921052631578949, + "grad_norm": 0.08063967474828551, + "learning_rate": 8.35405409719266e-05, + "loss": 0.2619, + "step": 1362 + }, + { + "epoch": 1.793421052631579, + "grad_norm": 0.07796033453134701, + "learning_rate": 8.338954933524144e-05, + "loss": 0.253, + "step": 1363 + }, + { + "epoch": 1.7947368421052632, + "grad_norm": 0.08080704163282365, + "learning_rate": 8.323859663231768e-05, + "loss": 0.2691, + "step": 1364 + }, + { + "epoch": 1.7960526315789473, + "grad_norm": 0.0786006883745561, + "learning_rate": 8.308768321697815e-05, + "loss": 0.263, + "step": 1365 + }, + { + "epoch": 1.7973684210526315, + "grad_norm": 0.07705721459554137, + "learning_rate": 8.293680944295359e-05, + "loss": 0.2486, + "step": 1366 + }, + { + "epoch": 1.7986842105263157, + "grad_norm": 0.07751983519025603, + "learning_rate": 8.278597566388184e-05, + "loss": 0.2578, + "step": 1367 + }, + { + "epoch": 1.8, + "grad_norm": 0.07892468926699212, + "learning_rate": 8.263518223330697e-05, + "loss": 0.2503, + "step": 1368 + }, + { + "epoch": 1.8013157894736842, + "grad_norm": 0.07633017824416555, + "learning_rate": 8.248442950467845e-05, + "loss": 0.2623, + "step": 1369 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.07718202591821682, + "learning_rate": 8.23337178313504e-05, + "loss": 0.2565, + "step": 1370 + }, + { + "epoch": 1.8039473684210527, + "grad_norm": 0.0807857739995025, + "learning_rate": 8.218304756658072e-05, + "loss": 0.2599, + "step": 1371 + }, + { + "epoch": 1.805263157894737, + "grad_norm": 0.0790429723121207, + "learning_rate": 8.203241906353014e-05, + "loss": 0.2577, + "step": 1372 + }, + { + "epoch": 1.806578947368421, + "grad_norm": 0.07931013296038247, + "learning_rate": 8.188183267526161e-05, + "loss": 0.2639, + "step": 1373 + }, + { + "epoch": 1.8078947368421052, + "grad_norm": 0.07877666664803827, + "learning_rate": 8.173128875473932e-05, + "loss": 0.2563, + "step": 1374 + }, + { + "epoch": 1.8092105263157894, + "grad_norm": 0.07782636186273496, + "learning_rate": 8.158078765482796e-05, + "loss": 0.2635, + "step": 1375 + }, + { + "epoch": 1.8105263157894735, + "grad_norm": 0.07632603628610292, + "learning_rate": 8.143032972829183e-05, + "loss": 0.2568, + "step": 1376 + }, + { + "epoch": 1.811842105263158, + "grad_norm": 0.07623514696597232, + "learning_rate": 8.127991532779401e-05, + "loss": 0.2498, + "step": 1377 + }, + { + "epoch": 1.813157894736842, + "grad_norm": 0.0777236902872697, + "learning_rate": 8.112954480589558e-05, + "loss": 0.2534, + "step": 1378 + }, + { + "epoch": 1.8144736842105265, + "grad_norm": 0.07656061477259994, + "learning_rate": 8.09792185150548e-05, + "loss": 0.26, + "step": 1379 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.0789133570152323, + "learning_rate": 8.082893680762619e-05, + "loss": 0.2604, + "step": 1380 + }, + { + "epoch": 1.8171052631578948, + "grad_norm": 0.07851474431282743, + "learning_rate": 8.067870003585978e-05, + "loss": 0.2663, + "step": 1381 + }, + { + "epoch": 1.818421052631579, + "grad_norm": 0.0789616006642964, + "learning_rate": 8.052850855190034e-05, + "loss": 0.2638, + "step": 1382 + }, + { + "epoch": 1.819736842105263, + "grad_norm": 0.07562051436427474, + "learning_rate": 8.037836270778642e-05, + "loss": 0.2512, + "step": 1383 + }, + { + "epoch": 1.8210526315789473, + "grad_norm": 0.08100206522865458, + "learning_rate": 8.022826285544968e-05, + "loss": 0.2474, + "step": 1384 + }, + { + "epoch": 1.8223684210526314, + "grad_norm": 0.0794818851301571, + "learning_rate": 8.007820934671383e-05, + "loss": 0.2713, + "step": 1385 + }, + { + "epoch": 1.8236842105263158, + "grad_norm": 0.07947535099199077, + "learning_rate": 7.992820253329409e-05, + "loss": 0.2589, + "step": 1386 + }, + { + "epoch": 1.825, + "grad_norm": 0.08012876631030338, + "learning_rate": 7.977824276679623e-05, + "loss": 0.2656, + "step": 1387 + }, + { + "epoch": 1.8263157894736843, + "grad_norm": 0.0778147716148084, + "learning_rate": 7.96283303987156e-05, + "loss": 0.2574, + "step": 1388 + }, + { + "epoch": 1.8276315789473685, + "grad_norm": 0.08108286483882464, + "learning_rate": 7.947846578043659e-05, + "loss": 0.2592, + "step": 1389 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.07898753250610759, + "learning_rate": 7.932864926323161e-05, + "loss": 0.2612, + "step": 1390 + }, + { + "epoch": 1.8302631578947368, + "grad_norm": 0.07750883606242706, + "learning_rate": 7.917888119826036e-05, + "loss": 0.2533, + "step": 1391 + }, + { + "epoch": 1.831578947368421, + "grad_norm": 0.08012157144092143, + "learning_rate": 7.902916193656898e-05, + "loss": 0.2664, + "step": 1392 + }, + { + "epoch": 1.8328947368421051, + "grad_norm": 0.07738904505017828, + "learning_rate": 7.887949182908912e-05, + "loss": 0.2599, + "step": 1393 + }, + { + "epoch": 1.8342105263157895, + "grad_norm": 0.07769928944177891, + "learning_rate": 7.872987122663733e-05, + "loss": 0.2624, + "step": 1394 + }, + { + "epoch": 1.8355263157894737, + "grad_norm": 0.08069668200882675, + "learning_rate": 7.858030047991411e-05, + "loss": 0.2589, + "step": 1395 + }, + { + "epoch": 1.836842105263158, + "grad_norm": 0.07833127833956029, + "learning_rate": 7.843077993950302e-05, + "loss": 0.2572, + "step": 1396 + }, + { + "epoch": 1.8381578947368422, + "grad_norm": 0.07971876020235151, + "learning_rate": 7.828130995586998e-05, + "loss": 0.2582, + "step": 1397 + }, + { + "epoch": 1.8394736842105264, + "grad_norm": 0.07885772994939849, + "learning_rate": 7.813189087936243e-05, + "loss": 0.2738, + "step": 1398 + }, + { + "epoch": 1.8407894736842105, + "grad_norm": 0.07736167226784646, + "learning_rate": 7.798252306020851e-05, + "loss": 0.2608, + "step": 1399 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.07630819815719928, + "learning_rate": 7.783320684851614e-05, + "loss": 0.2613, + "step": 1400 + }, + { + "epoch": 1.8434210526315788, + "grad_norm": 0.07834945368484907, + "learning_rate": 7.768394259427234e-05, + "loss": 0.2614, + "step": 1401 + }, + { + "epoch": 1.844736842105263, + "grad_norm": 0.07718022449050989, + "learning_rate": 7.753473064734232e-05, + "loss": 0.2704, + "step": 1402 + }, + { + "epoch": 1.8460526315789474, + "grad_norm": 0.07707441842552827, + "learning_rate": 7.738557135746873e-05, + "loss": 0.2576, + "step": 1403 + }, + { + "epoch": 1.8473684210526315, + "grad_norm": 0.07812383749123156, + "learning_rate": 7.72364650742707e-05, + "loss": 0.2635, + "step": 1404 + }, + { + "epoch": 1.848684210526316, + "grad_norm": 0.07972960299568377, + "learning_rate": 7.708741214724315e-05, + "loss": 0.2711, + "step": 1405 + }, + { + "epoch": 1.85, + "grad_norm": 0.07567077698895516, + "learning_rate": 7.693841292575598e-05, + "loss": 0.2583, + "step": 1406 + }, + { + "epoch": 1.8513157894736842, + "grad_norm": 0.07920973523092241, + "learning_rate": 7.678946775905324e-05, + "loss": 0.2565, + "step": 1407 + }, + { + "epoch": 1.8526315789473684, + "grad_norm": 0.07722915516418472, + "learning_rate": 7.664057699625214e-05, + "loss": 0.2526, + "step": 1408 + }, + { + "epoch": 1.8539473684210526, + "grad_norm": 0.08254943723021463, + "learning_rate": 7.649174098634251e-05, + "loss": 0.27, + "step": 1409 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.0778109023166561, + "learning_rate": 7.634296007818576e-05, + "loss": 0.2516, + "step": 1410 + }, + { + "epoch": 1.856578947368421, + "grad_norm": 0.07700945488788534, + "learning_rate": 7.619423462051423e-05, + "loss": 0.2646, + "step": 1411 + }, + { + "epoch": 1.8578947368421053, + "grad_norm": 0.07630139704348335, + "learning_rate": 7.604556496193015e-05, + "loss": 0.2538, + "step": 1412 + }, + { + "epoch": 1.8592105263157894, + "grad_norm": 0.07939635050751274, + "learning_rate": 7.589695145090506e-05, + "loss": 0.2667, + "step": 1413 + }, + { + "epoch": 1.8605263157894738, + "grad_norm": 0.07616939180006002, + "learning_rate": 7.57483944357789e-05, + "loss": 0.2529, + "step": 1414 + }, + { + "epoch": 1.861842105263158, + "grad_norm": 0.08117405461306933, + "learning_rate": 7.559989426475917e-05, + "loss": 0.2555, + "step": 1415 + }, + { + "epoch": 1.8631578947368421, + "grad_norm": 0.08059557719232159, + "learning_rate": 7.54514512859201e-05, + "loss": 0.2628, + "step": 1416 + }, + { + "epoch": 1.8644736842105263, + "grad_norm": 0.078588193817039, + "learning_rate": 7.530306584720188e-05, + "loss": 0.2615, + "step": 1417 + }, + { + "epoch": 1.8657894736842104, + "grad_norm": 0.07865927809600427, + "learning_rate": 7.515473829640987e-05, + "loss": 0.2588, + "step": 1418 + }, + { + "epoch": 1.8671052631578946, + "grad_norm": 0.07875855234210856, + "learning_rate": 7.500646898121373e-05, + "loss": 0.254, + "step": 1419 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.07994170191297952, + "learning_rate": 7.485825824914659e-05, + "loss": 0.2625, + "step": 1420 + }, + { + "epoch": 1.8697368421052631, + "grad_norm": 0.07669419084742685, + "learning_rate": 7.471010644760421e-05, + "loss": 0.2639, + "step": 1421 + }, + { + "epoch": 1.8710526315789475, + "grad_norm": 0.07671532012322799, + "learning_rate": 7.456201392384436e-05, + "loss": 0.2618, + "step": 1422 + }, + { + "epoch": 1.8723684210526317, + "grad_norm": 0.0760382420175465, + "learning_rate": 7.441398102498582e-05, + "loss": 0.2586, + "step": 1423 + }, + { + "epoch": 1.8736842105263158, + "grad_norm": 0.07479447004096319, + "learning_rate": 7.426600809800752e-05, + "loss": 0.2487, + "step": 1424 + }, + { + "epoch": 1.875, + "grad_norm": 0.07952054209905557, + "learning_rate": 7.411809548974792e-05, + "loss": 0.2694, + "step": 1425 + }, + { + "epoch": 1.8763157894736842, + "grad_norm": 0.0795536063574035, + "learning_rate": 7.397024354690408e-05, + "loss": 0.2575, + "step": 1426 + }, + { + "epoch": 1.8776315789473683, + "grad_norm": 0.07893539617767596, + "learning_rate": 7.382245261603088e-05, + "loss": 0.271, + "step": 1427 + }, + { + "epoch": 1.8789473684210525, + "grad_norm": 0.08003786520908833, + "learning_rate": 7.36747230435401e-05, + "loss": 0.2499, + "step": 1428 + }, + { + "epoch": 1.8802631578947369, + "grad_norm": 0.078791451136056, + "learning_rate": 7.352705517569977e-05, + "loss": 0.2674, + "step": 1429 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.0761073189754986, + "learning_rate": 7.337944935863333e-05, + "loss": 0.2561, + "step": 1430 + }, + { + "epoch": 1.8828947368421054, + "grad_norm": 0.07783417880514816, + "learning_rate": 7.32319059383187e-05, + "loss": 0.2566, + "step": 1431 + }, + { + "epoch": 1.8842105263157896, + "grad_norm": 0.07782790823841897, + "learning_rate": 7.308442526058756e-05, + "loss": 0.2681, + "step": 1432 + }, + { + "epoch": 1.8855263157894737, + "grad_norm": 0.07722518403441637, + "learning_rate": 7.293700767112458e-05, + "loss": 0.2581, + "step": 1433 + }, + { + "epoch": 1.8868421052631579, + "grad_norm": 0.07897906723081582, + "learning_rate": 7.278965351546648e-05, + "loss": 0.2618, + "step": 1434 + }, + { + "epoch": 1.888157894736842, + "grad_norm": 0.07503201774332066, + "learning_rate": 7.264236313900141e-05, + "loss": 0.2449, + "step": 1435 + }, + { + "epoch": 1.8894736842105262, + "grad_norm": 0.07528777047208206, + "learning_rate": 7.249513688696786e-05, + "loss": 0.2509, + "step": 1436 + }, + { + "epoch": 1.8907894736842106, + "grad_norm": 0.07701172257337681, + "learning_rate": 7.234797510445411e-05, + "loss": 0.2502, + "step": 1437 + }, + { + "epoch": 1.8921052631578947, + "grad_norm": 0.07610174570738121, + "learning_rate": 7.220087813639736e-05, + "loss": 0.2551, + "step": 1438 + }, + { + "epoch": 1.893421052631579, + "grad_norm": 0.07662368675063383, + "learning_rate": 7.205384632758285e-05, + "loss": 0.2611, + "step": 1439 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.07837947549366885, + "learning_rate": 7.190688002264308e-05, + "loss": 0.2587, + "step": 1440 + }, + { + "epoch": 1.8960526315789474, + "grad_norm": 0.07828653663313988, + "learning_rate": 7.175997956605701e-05, + "loss": 0.2542, + "step": 1441 + }, + { + "epoch": 1.8973684210526316, + "grad_norm": 0.07749211808620897, + "learning_rate": 7.161314530214931e-05, + "loss": 0.2554, + "step": 1442 + }, + { + "epoch": 1.8986842105263158, + "grad_norm": 0.07955661477515108, + "learning_rate": 7.146637757508949e-05, + "loss": 0.2465, + "step": 1443 + }, + { + "epoch": 1.9, + "grad_norm": 0.0780534289115777, + "learning_rate": 7.131967672889101e-05, + "loss": 0.2587, + "step": 1444 + }, + { + "epoch": 1.901315789473684, + "grad_norm": 0.08230240962537452, + "learning_rate": 7.117304310741062e-05, + "loss": 0.2676, + "step": 1445 + }, + { + "epoch": 1.9026315789473685, + "grad_norm": 0.07514699270244664, + "learning_rate": 7.102647705434756e-05, + "loss": 0.2515, + "step": 1446 + }, + { + "epoch": 1.9039473684210526, + "grad_norm": 0.07838555881596923, + "learning_rate": 7.087997891324262e-05, + "loss": 0.2676, + "step": 1447 + }, + { + "epoch": 1.905263157894737, + "grad_norm": 0.07794040393822359, + "learning_rate": 7.073354902747741e-05, + "loss": 0.2622, + "step": 1448 + }, + { + "epoch": 1.9065789473684212, + "grad_norm": 0.07793028180395223, + "learning_rate": 7.058718774027364e-05, + "loss": 0.2581, + "step": 1449 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.07468275561587644, + "learning_rate": 7.044089539469212e-05, + "loss": 0.2536, + "step": 1450 + }, + { + "epoch": 1.9092105263157895, + "grad_norm": 0.07736426080010585, + "learning_rate": 7.029467233363216e-05, + "loss": 0.2607, + "step": 1451 + }, + { + "epoch": 1.9105263157894736, + "grad_norm": 0.07788040509177954, + "learning_rate": 7.014851889983057e-05, + "loss": 0.2699, + "step": 1452 + }, + { + "epoch": 1.9118421052631578, + "grad_norm": 0.07482713534215896, + "learning_rate": 7.000243543586102e-05, + "loss": 0.2562, + "step": 1453 + }, + { + "epoch": 1.913157894736842, + "grad_norm": 0.07738656143600638, + "learning_rate": 6.985642228413316e-05, + "loss": 0.2589, + "step": 1454 + }, + { + "epoch": 1.9144736842105263, + "grad_norm": 0.0771729364408806, + "learning_rate": 6.971047978689189e-05, + "loss": 0.2651, + "step": 1455 + }, + { + "epoch": 1.9157894736842105, + "grad_norm": 0.07648368441271167, + "learning_rate": 6.95646082862164e-05, + "loss": 0.2568, + "step": 1456 + }, + { + "epoch": 1.9171052631578949, + "grad_norm": 0.07769870758338829, + "learning_rate": 6.941880812401956e-05, + "loss": 0.2537, + "step": 1457 + }, + { + "epoch": 1.918421052631579, + "grad_norm": 0.07903999465640997, + "learning_rate": 6.927307964204694e-05, + "loss": 0.2666, + "step": 1458 + }, + { + "epoch": 1.9197368421052632, + "grad_norm": 0.08045066676431731, + "learning_rate": 6.912742318187624e-05, + "loss": 0.2605, + "step": 1459 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.07627530893025287, + "learning_rate": 6.898183908491617e-05, + "loss": 0.255, + "step": 1460 + }, + { + "epoch": 1.9223684210526315, + "grad_norm": 0.07918035796815091, + "learning_rate": 6.883632769240589e-05, + "loss": 0.2565, + "step": 1461 + }, + { + "epoch": 1.9236842105263157, + "grad_norm": 0.079331461000813, + "learning_rate": 6.869088934541419e-05, + "loss": 0.2595, + "step": 1462 + }, + { + "epoch": 1.925, + "grad_norm": 0.07819193010414092, + "learning_rate": 6.854552438483865e-05, + "loss": 0.2602, + "step": 1463 + }, + { + "epoch": 1.9263157894736842, + "grad_norm": 0.08127166962507164, + "learning_rate": 6.840023315140475e-05, + "loss": 0.2563, + "step": 1464 + }, + { + "epoch": 1.9276315789473686, + "grad_norm": 0.08070961078490956, + "learning_rate": 6.825501598566525e-05, + "loss": 0.259, + "step": 1465 + }, + { + "epoch": 1.9289473684210527, + "grad_norm": 0.0766943472529489, + "learning_rate": 6.810987322799926e-05, + "loss": 0.2586, + "step": 1466 + }, + { + "epoch": 1.930263157894737, + "grad_norm": 0.08113329672328959, + "learning_rate": 6.79648052186115e-05, + "loss": 0.2425, + "step": 1467 + }, + { + "epoch": 1.931578947368421, + "grad_norm": 0.07603940923660328, + "learning_rate": 6.781981229753145e-05, + "loss": 0.2534, + "step": 1468 + }, + { + "epoch": 1.9328947368421052, + "grad_norm": 0.07646855224356787, + "learning_rate": 6.76748948046126e-05, + "loss": 0.2506, + "step": 1469 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.07477033930912226, + "learning_rate": 6.753005307953167e-05, + "loss": 0.2509, + "step": 1470 + }, + { + "epoch": 1.9355263157894735, + "grad_norm": 0.07717545781453299, + "learning_rate": 6.738528746178776e-05, + "loss": 0.2576, + "step": 1471 + }, + { + "epoch": 1.936842105263158, + "grad_norm": 0.07742807164222197, + "learning_rate": 6.724059829070158e-05, + "loss": 0.2647, + "step": 1472 + }, + { + "epoch": 1.938157894736842, + "grad_norm": 0.07798070076914343, + "learning_rate": 6.709598590541469e-05, + "loss": 0.2593, + "step": 1473 + }, + { + "epoch": 1.9394736842105265, + "grad_norm": 0.07717711873805973, + "learning_rate": 6.695145064488861e-05, + "loss": 0.2589, + "step": 1474 + }, + { + "epoch": 1.9407894736842106, + "grad_norm": 0.07645570062299177, + "learning_rate": 6.680699284790415e-05, + "loss": 0.2612, + "step": 1475 + }, + { + "epoch": 1.9421052631578948, + "grad_norm": 0.07709702657938383, + "learning_rate": 6.666261285306047e-05, + "loss": 0.2643, + "step": 1476 + }, + { + "epoch": 1.943421052631579, + "grad_norm": 0.07634109328627783, + "learning_rate": 6.651831099877444e-05, + "loss": 0.2468, + "step": 1477 + }, + { + "epoch": 1.944736842105263, + "grad_norm": 0.07783850878336081, + "learning_rate": 6.637408762327972e-05, + "loss": 0.2607, + "step": 1478 + }, + { + "epoch": 1.9460526315789473, + "grad_norm": 0.07820503016401602, + "learning_rate": 6.622994306462611e-05, + "loss": 0.2574, + "step": 1479 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.0796202316851204, + "learning_rate": 6.608587766067852e-05, + "loss": 0.2575, + "step": 1480 + }, + { + "epoch": 1.9486842105263158, + "grad_norm": 0.07605261401674518, + "learning_rate": 6.59418917491165e-05, + "loss": 0.2591, + "step": 1481 + }, + { + "epoch": 1.95, + "grad_norm": 0.07560659095541633, + "learning_rate": 6.579798566743314e-05, + "loss": 0.2526, + "step": 1482 + }, + { + "epoch": 1.9513157894736843, + "grad_norm": 0.07747930757786237, + "learning_rate": 6.565415975293448e-05, + "loss": 0.2631, + "step": 1483 + }, + { + "epoch": 1.9526315789473685, + "grad_norm": 0.07643184520906729, + "learning_rate": 6.551041434273861e-05, + "loss": 0.2508, + "step": 1484 + }, + { + "epoch": 1.9539473684210527, + "grad_norm": 0.0796433446523202, + "learning_rate": 6.536674977377496e-05, + "loss": 0.2543, + "step": 1485 + }, + { + "epoch": 1.9552631578947368, + "grad_norm": 0.07657952724716138, + "learning_rate": 6.522316638278347e-05, + "loss": 0.2552, + "step": 1486 + }, + { + "epoch": 1.956578947368421, + "grad_norm": 0.07677380083558996, + "learning_rate": 6.507966450631382e-05, + "loss": 0.2541, + "step": 1487 + }, + { + "epoch": 1.9578947368421051, + "grad_norm": 0.07595336702259202, + "learning_rate": 6.493624448072457e-05, + "loss": 0.2487, + "step": 1488 + }, + { + "epoch": 1.9592105263157895, + "grad_norm": 0.07721242087118127, + "learning_rate": 6.479290664218247e-05, + "loss": 0.2507, + "step": 1489 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.07755079832687756, + "learning_rate": 6.464965132666163e-05, + "loss": 0.2574, + "step": 1490 + }, + { + "epoch": 1.961842105263158, + "grad_norm": 0.07980082932294216, + "learning_rate": 6.450647886994272e-05, + "loss": 0.2704, + "step": 1491 + }, + { + "epoch": 1.9631578947368422, + "grad_norm": 0.07840280028957075, + "learning_rate": 6.43633896076122e-05, + "loss": 0.2565, + "step": 1492 + }, + { + "epoch": 1.9644736842105264, + "grad_norm": 0.07780809182938828, + "learning_rate": 6.422038387506149e-05, + "loss": 0.2626, + "step": 1493 + }, + { + "epoch": 1.9657894736842105, + "grad_norm": 0.07710732052984494, + "learning_rate": 6.407746200748628e-05, + "loss": 0.2565, + "step": 1494 + }, + { + "epoch": 1.9671052631578947, + "grad_norm": 0.07590458797290023, + "learning_rate": 6.393462433988569e-05, + "loss": 0.2547, + "step": 1495 + }, + { + "epoch": 1.9684210526315788, + "grad_norm": 0.07792407917489579, + "learning_rate": 6.379187120706138e-05, + "loss": 0.2466, + "step": 1496 + }, + { + "epoch": 1.969736842105263, + "grad_norm": 0.07781793554961416, + "learning_rate": 6.3649202943617e-05, + "loss": 0.266, + "step": 1497 + }, + { + "epoch": 1.9710526315789474, + "grad_norm": 0.07455889127205702, + "learning_rate": 6.350661988395723e-05, + "loss": 0.2458, + "step": 1498 + }, + { + "epoch": 1.9723684210526315, + "grad_norm": 0.07493719255336764, + "learning_rate": 6.336412236228697e-05, + "loss": 0.2565, + "step": 1499 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.07690525334306028, + "learning_rate": 6.322171071261071e-05, + "loss": 0.2613, + "step": 1500 + }, + { + "epoch": 1.975, + "grad_norm": 0.07579197410852237, + "learning_rate": 6.307938526873157e-05, + "loss": 0.2523, + "step": 1501 + }, + { + "epoch": 1.9763157894736842, + "grad_norm": 0.08008252829550877, + "learning_rate": 6.293714636425071e-05, + "loss": 0.2671, + "step": 1502 + }, + { + "epoch": 1.9776315789473684, + "grad_norm": 0.07781399370339455, + "learning_rate": 6.279499433256642e-05, + "loss": 0.2603, + "step": 1503 + }, + { + "epoch": 1.9789473684210526, + "grad_norm": 0.07813006074806125, + "learning_rate": 6.26529295068733e-05, + "loss": 0.2493, + "step": 1504 + }, + { + "epoch": 1.9802631578947367, + "grad_norm": 0.07894579521693036, + "learning_rate": 6.251095222016162e-05, + "loss": 0.2624, + "step": 1505 + }, + { + "epoch": 1.981578947368421, + "grad_norm": 0.07494640550743084, + "learning_rate": 6.236906280521646e-05, + "loss": 0.2409, + "step": 1506 + }, + { + "epoch": 1.9828947368421053, + "grad_norm": 0.07893332158347492, + "learning_rate": 6.22272615946169e-05, + "loss": 0.2629, + "step": 1507 + }, + { + "epoch": 1.9842105263157894, + "grad_norm": 0.07842421722733443, + "learning_rate": 6.208554892073528e-05, + "loss": 0.2504, + "step": 1508 + }, + { + "epoch": 1.9855263157894738, + "grad_norm": 0.08117542344228214, + "learning_rate": 6.19439251157364e-05, + "loss": 0.2653, + "step": 1509 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.07755918045728276, + "learning_rate": 6.180239051157681e-05, + "loss": 0.2558, + "step": 1510 + }, + { + "epoch": 1.9881578947368421, + "grad_norm": 0.07645794314850728, + "learning_rate": 6.166094544000398e-05, + "loss": 0.2559, + "step": 1511 + }, + { + "epoch": 1.9894736842105263, + "grad_norm": 0.07640733998010549, + "learning_rate": 6.151959023255545e-05, + "loss": 0.2537, + "step": 1512 + }, + { + "epoch": 1.9907894736842104, + "grad_norm": 0.0768529160978783, + "learning_rate": 6.137832522055817e-05, + "loss": 0.2601, + "step": 1513 + }, + { + "epoch": 1.9921052631578946, + "grad_norm": 0.07847739643759478, + "learning_rate": 6.123715073512772e-05, + "loss": 0.255, + "step": 1514 + }, + { + "epoch": 1.993421052631579, + "grad_norm": 0.07449855656212337, + "learning_rate": 6.109606710716741e-05, + "loss": 0.239, + "step": 1515 + }, + { + "epoch": 1.9947368421052631, + "grad_norm": 0.07846660941255253, + "learning_rate": 6.095507466736763e-05, + "loss": 0.2551, + "step": 1516 + }, + { + "epoch": 1.9960526315789475, + "grad_norm": 0.07789468698938513, + "learning_rate": 6.0814173746205e-05, + "loss": 0.2694, + "step": 1517 + }, + { + "epoch": 1.9973684210526317, + "grad_norm": 0.077200397264026, + "learning_rate": 6.067336467394169e-05, + "loss": 0.2589, + "step": 1518 + }, + { + "epoch": 1.9986842105263158, + "grad_norm": 0.08246944198719414, + "learning_rate": 6.0532647780624554e-05, + "loss": 0.2656, + "step": 1519 + }, + { + "epoch": 2.0, + "grad_norm": 0.07831394886097501, + "learning_rate": 6.039202339608432e-05, + "loss": 0.2656, + "step": 1520 + }, + { + "epoch": 2.0, + "eval_loss": 0.2646636962890625, + "eval_runtime": 136.4654, + "eval_samples_per_second": 37.504, + "eval_steps_per_second": 1.172, + "step": 1520 + }, + { + "epoch": 2.001315789473684, + "grad_norm": 0.07478380723960856, + "learning_rate": 6.025149184993498e-05, + "loss": 0.2524, + "step": 1521 + }, + { + "epoch": 2.0026315789473683, + "grad_norm": 0.07372772894034235, + "learning_rate": 6.011105347157289e-05, + "loss": 0.2399, + "step": 1522 + }, + { + "epoch": 2.0039473684210525, + "grad_norm": 0.07430175019575523, + "learning_rate": 5.9970708590175986e-05, + "loss": 0.2369, + "step": 1523 + }, + { + "epoch": 2.0052631578947366, + "grad_norm": 0.07626311971922609, + "learning_rate": 5.983045753470308e-05, + "loss": 0.2371, + "step": 1524 + }, + { + "epoch": 2.0065789473684212, + "grad_norm": 0.07792913324419348, + "learning_rate": 5.969030063389305e-05, + "loss": 0.2372, + "step": 1525 + }, + { + "epoch": 2.0078947368421054, + "grad_norm": 0.07819835291288194, + "learning_rate": 5.955023821626411e-05, + "loss": 0.2299, + "step": 1526 + }, + { + "epoch": 2.0092105263157896, + "grad_norm": 0.0782044620700486, + "learning_rate": 5.941027061011303e-05, + "loss": 0.243, + "step": 1527 + }, + { + "epoch": 2.0105263157894737, + "grad_norm": 0.08211154278042845, + "learning_rate": 5.927039814351426e-05, + "loss": 0.2432, + "step": 1528 + }, + { + "epoch": 2.011842105263158, + "grad_norm": 0.08053029149065172, + "learning_rate": 5.9130621144319334e-05, + "loss": 0.2349, + "step": 1529 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.08170774492319388, + "learning_rate": 5.8990939940156e-05, + "loss": 0.2455, + "step": 1530 + }, + { + "epoch": 2.014473684210526, + "grad_norm": 0.07949305041445266, + "learning_rate": 5.885135485842743e-05, + "loss": 0.2351, + "step": 1531 + }, + { + "epoch": 2.0157894736842104, + "grad_norm": 0.08110005829531224, + "learning_rate": 5.8711866226311553e-05, + "loss": 0.2579, + "step": 1532 + }, + { + "epoch": 2.017105263157895, + "grad_norm": 0.08333787912458002, + "learning_rate": 5.857247437076012e-05, + "loss": 0.2387, + "step": 1533 + }, + { + "epoch": 2.018421052631579, + "grad_norm": 0.08010741675929599, + "learning_rate": 5.843317961849818e-05, + "loss": 0.2299, + "step": 1534 + }, + { + "epoch": 2.0197368421052633, + "grad_norm": 0.08205136086453177, + "learning_rate": 5.829398229602312e-05, + "loss": 0.2422, + "step": 1535 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 0.08134779329404473, + "learning_rate": 5.8154882729603876e-05, + "loss": 0.2488, + "step": 1536 + }, + { + "epoch": 2.0223684210526316, + "grad_norm": 0.0793049391780135, + "learning_rate": 5.8015881245280436e-05, + "loss": 0.2404, + "step": 1537 + }, + { + "epoch": 2.0236842105263158, + "grad_norm": 0.07994738843779876, + "learning_rate": 5.787697816886273e-05, + "loss": 0.2309, + "step": 1538 + }, + { + "epoch": 2.025, + "grad_norm": 0.07995832117150292, + "learning_rate": 5.773817382593008e-05, + "loss": 0.2243, + "step": 1539 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.0805658044178454, + "learning_rate": 5.7599468541830356e-05, + "loss": 0.236, + "step": 1540 + }, + { + "epoch": 2.0276315789473682, + "grad_norm": 0.082233376699682, + "learning_rate": 5.7460862641679316e-05, + "loss": 0.2395, + "step": 1541 + }, + { + "epoch": 2.028947368421053, + "grad_norm": 0.08455600799602911, + "learning_rate": 5.732235645035964e-05, + "loss": 0.2527, + "step": 1542 + }, + { + "epoch": 2.030263157894737, + "grad_norm": 0.08166867758119097, + "learning_rate": 5.7183950292520473e-05, + "loss": 0.2247, + "step": 1543 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 0.08362795390398718, + "learning_rate": 5.7045644492576346e-05, + "loss": 0.2334, + "step": 1544 + }, + { + "epoch": 2.0328947368421053, + "grad_norm": 0.08759943221985574, + "learning_rate": 5.690743937470657e-05, + "loss": 0.25, + "step": 1545 + }, + { + "epoch": 2.0342105263157895, + "grad_norm": 0.0834958876411245, + "learning_rate": 5.676933526285457e-05, + "loss": 0.2416, + "step": 1546 + }, + { + "epoch": 2.0355263157894736, + "grad_norm": 0.08052792625700166, + "learning_rate": 5.663133248072692e-05, + "loss": 0.2363, + "step": 1547 + }, + { + "epoch": 2.036842105263158, + "grad_norm": 0.08077196722556565, + "learning_rate": 5.64934313517927e-05, + "loss": 0.2395, + "step": 1548 + }, + { + "epoch": 2.038157894736842, + "grad_norm": 0.08231061675764222, + "learning_rate": 5.635563219928275e-05, + "loss": 0.2467, + "step": 1549 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.08140686496008916, + "learning_rate": 5.62179353461888e-05, + "loss": 0.2399, + "step": 1550 + }, + { + "epoch": 2.0407894736842107, + "grad_norm": 0.08075751241578549, + "learning_rate": 5.608034111526298e-05, + "loss": 0.2262, + "step": 1551 + }, + { + "epoch": 2.042105263157895, + "grad_norm": 0.0814531147960288, + "learning_rate": 5.5942849829016695e-05, + "loss": 0.2292, + "step": 1552 + }, + { + "epoch": 2.043421052631579, + "grad_norm": 0.0807479054094009, + "learning_rate": 5.580546180972011e-05, + "loss": 0.243, + "step": 1553 + }, + { + "epoch": 2.044736842105263, + "grad_norm": 0.08012998797966009, + "learning_rate": 5.566817737940142e-05, + "loss": 0.232, + "step": 1554 + }, + { + "epoch": 2.0460526315789473, + "grad_norm": 0.08437656676259887, + "learning_rate": 5.553099685984591e-05, + "loss": 0.2499, + "step": 1555 + }, + { + "epoch": 2.0473684210526315, + "grad_norm": 0.08389268402272403, + "learning_rate": 5.5393920572595356e-05, + "loss": 0.2318, + "step": 1556 + }, + { + "epoch": 2.0486842105263157, + "grad_norm": 0.08099986813273759, + "learning_rate": 5.52569488389472e-05, + "loss": 0.2408, + "step": 1557 + }, + { + "epoch": 2.05, + "grad_norm": 0.08017638799598005, + "learning_rate": 5.5120081979953785e-05, + "loss": 0.2276, + "step": 1558 + }, + { + "epoch": 2.0513157894736844, + "grad_norm": 0.08313509302134009, + "learning_rate": 5.498332031642177e-05, + "loss": 0.2464, + "step": 1559 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.08458917823416191, + "learning_rate": 5.484666416891109e-05, + "loss": 0.2402, + "step": 1560 + }, + { + "epoch": 2.0539473684210527, + "grad_norm": 0.08235673729736151, + "learning_rate": 5.4710113857734394e-05, + "loss": 0.2316, + "step": 1561 + }, + { + "epoch": 2.055263157894737, + "grad_norm": 0.08137519978808737, + "learning_rate": 5.457366970295634e-05, + "loss": 0.2354, + "step": 1562 + }, + { + "epoch": 2.056578947368421, + "grad_norm": 0.08503575588624475, + "learning_rate": 5.4437332024392694e-05, + "loss": 0.2356, + "step": 1563 + }, + { + "epoch": 2.057894736842105, + "grad_norm": 0.08267486713807648, + "learning_rate": 5.430110114160964e-05, + "loss": 0.2492, + "step": 1564 + }, + { + "epoch": 2.0592105263157894, + "grad_norm": 0.08202736940009933, + "learning_rate": 5.416497737392308e-05, + "loss": 0.2418, + "step": 1565 + }, + { + "epoch": 2.0605263157894735, + "grad_norm": 0.0848040424518247, + "learning_rate": 5.402896104039776e-05, + "loss": 0.2385, + "step": 1566 + }, + { + "epoch": 2.0618421052631577, + "grad_norm": 0.08202439572712068, + "learning_rate": 5.389305245984675e-05, + "loss": 0.2411, + "step": 1567 + }, + { + "epoch": 2.0631578947368423, + "grad_norm": 0.08444110111591935, + "learning_rate": 5.375725195083046e-05, + "loss": 0.2365, + "step": 1568 + }, + { + "epoch": 2.0644736842105265, + "grad_norm": 0.08419157425499689, + "learning_rate": 5.362155983165594e-05, + "loss": 0.2472, + "step": 1569 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.08359202979261496, + "learning_rate": 5.3485976420376336e-05, + "loss": 0.2495, + "step": 1570 + }, + { + "epoch": 2.067105263157895, + "grad_norm": 0.08127010161214182, + "learning_rate": 5.335050203478988e-05, + "loss": 0.2319, + "step": 1571 + }, + { + "epoch": 2.068421052631579, + "grad_norm": 0.08168717674226678, + "learning_rate": 5.321513699243924e-05, + "loss": 0.2339, + "step": 1572 + }, + { + "epoch": 2.069736842105263, + "grad_norm": 0.08449279952945175, + "learning_rate": 5.307988161061085e-05, + "loss": 0.238, + "step": 1573 + }, + { + "epoch": 2.0710526315789473, + "grad_norm": 0.08225377668981279, + "learning_rate": 5.2944736206334034e-05, + "loss": 0.2409, + "step": 1574 + }, + { + "epoch": 2.0723684210526314, + "grad_norm": 0.0843365596375047, + "learning_rate": 5.280970109638047e-05, + "loss": 0.246, + "step": 1575 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 0.08534819131960646, + "learning_rate": 5.2674776597263186e-05, + "loss": 0.2487, + "step": 1576 + }, + { + "epoch": 2.075, + "grad_norm": 0.08083479176795237, + "learning_rate": 5.253996302523596e-05, + "loss": 0.2432, + "step": 1577 + }, + { + "epoch": 2.0763157894736843, + "grad_norm": 0.0813017134645149, + "learning_rate": 5.240526069629265e-05, + "loss": 0.2426, + "step": 1578 + }, + { + "epoch": 2.0776315789473685, + "grad_norm": 0.08318443053477804, + "learning_rate": 5.227066992616629e-05, + "loss": 0.2451, + "step": 1579 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.0833772227653302, + "learning_rate": 5.2136191030328455e-05, + "loss": 0.2351, + "step": 1580 + }, + { + "epoch": 2.080263157894737, + "grad_norm": 0.08342495654640115, + "learning_rate": 5.2001824323988455e-05, + "loss": 0.232, + "step": 1581 + }, + { + "epoch": 2.081578947368421, + "grad_norm": 0.08272035294268948, + "learning_rate": 5.1867570122092666e-05, + "loss": 0.2383, + "step": 1582 + }, + { + "epoch": 2.082894736842105, + "grad_norm": 0.08074034588106882, + "learning_rate": 5.173342873932383e-05, + "loss": 0.232, + "step": 1583 + }, + { + "epoch": 2.0842105263157893, + "grad_norm": 0.08255542214386506, + "learning_rate": 5.159940049010015e-05, + "loss": 0.2346, + "step": 1584 + }, + { + "epoch": 2.085526315789474, + "grad_norm": 0.08454238319499686, + "learning_rate": 5.146548568857462e-05, + "loss": 0.2421, + "step": 1585 + }, + { + "epoch": 2.086842105263158, + "grad_norm": 0.084726345772104, + "learning_rate": 5.133168464863449e-05, + "loss": 0.2387, + "step": 1586 + }, + { + "epoch": 2.088157894736842, + "grad_norm": 0.08212224275163124, + "learning_rate": 5.1197997683900214e-05, + "loss": 0.2352, + "step": 1587 + }, + { + "epoch": 2.0894736842105264, + "grad_norm": 0.08285955543488045, + "learning_rate": 5.106442510772489e-05, + "loss": 0.2246, + "step": 1588 + }, + { + "epoch": 2.0907894736842105, + "grad_norm": 0.08221363824415492, + "learning_rate": 5.0930967233193504e-05, + "loss": 0.2313, + "step": 1589 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.08240239248129916, + "learning_rate": 5.079762437312219e-05, + "loss": 0.2374, + "step": 1590 + }, + { + "epoch": 2.093421052631579, + "grad_norm": 0.0833112285945494, + "learning_rate": 5.066439684005755e-05, + "loss": 0.2354, + "step": 1591 + }, + { + "epoch": 2.094736842105263, + "grad_norm": 0.08005461008140058, + "learning_rate": 5.0531284946275784e-05, + "loss": 0.215, + "step": 1592 + }, + { + "epoch": 2.096052631578947, + "grad_norm": 0.08221155288324693, + "learning_rate": 5.039828900378204e-05, + "loss": 0.2298, + "step": 1593 + }, + { + "epoch": 2.0973684210526318, + "grad_norm": 0.08430361712412235, + "learning_rate": 5.02654093243098e-05, + "loss": 0.2391, + "step": 1594 + }, + { + "epoch": 2.098684210526316, + "grad_norm": 0.08646135396687035, + "learning_rate": 5.013264621931991e-05, + "loss": 0.2382, + "step": 1595 + }, + { + "epoch": 2.1, + "grad_norm": 0.08349700203559438, + "learning_rate": 5.000000000000002e-05, + "loss": 0.2353, + "step": 1596 + }, + { + "epoch": 2.1013157894736842, + "grad_norm": 0.08425984836395248, + "learning_rate": 4.986747097726381e-05, + "loss": 0.2298, + "step": 1597 + }, + { + "epoch": 2.1026315789473684, + "grad_norm": 0.08389516910120101, + "learning_rate": 4.97350594617502e-05, + "loss": 0.237, + "step": 1598 + }, + { + "epoch": 2.1039473684210526, + "grad_norm": 0.08297711753798366, + "learning_rate": 4.960276576382283e-05, + "loss": 0.2388, + "step": 1599 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.08353450960448826, + "learning_rate": 4.9470590193569044e-05, + "loss": 0.2448, + "step": 1600 + }, + { + "epoch": 2.106578947368421, + "grad_norm": 0.08489883369548581, + "learning_rate": 4.9338533060799306e-05, + "loss": 0.2468, + "step": 1601 + }, + { + "epoch": 2.1078947368421055, + "grad_norm": 0.08015122761732316, + "learning_rate": 4.920659467504659e-05, + "loss": 0.2311, + "step": 1602 + }, + { + "epoch": 2.1092105263157896, + "grad_norm": 0.0825490688742884, + "learning_rate": 4.907477534556542e-05, + "loss": 0.237, + "step": 1603 + }, + { + "epoch": 2.110526315789474, + "grad_norm": 0.08391033769643125, + "learning_rate": 4.894307538133129e-05, + "loss": 0.2384, + "step": 1604 + }, + { + "epoch": 2.111842105263158, + "grad_norm": 0.08420377491425757, + "learning_rate": 4.8811495091039926e-05, + "loss": 0.2365, + "step": 1605 + }, + { + "epoch": 2.113157894736842, + "grad_norm": 0.08279227752642704, + "learning_rate": 4.868003478310651e-05, + "loss": 0.2335, + "step": 1606 + }, + { + "epoch": 2.1144736842105263, + "grad_norm": 0.08322872043307299, + "learning_rate": 4.854869476566508e-05, + "loss": 0.2309, + "step": 1607 + }, + { + "epoch": 2.1157894736842104, + "grad_norm": 0.08569452755597119, + "learning_rate": 4.841747534656763e-05, + "loss": 0.2365, + "step": 1608 + }, + { + "epoch": 2.1171052631578946, + "grad_norm": 0.08195650643162714, + "learning_rate": 4.828637683338347e-05, + "loss": 0.2304, + "step": 1609 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.08713567384955483, + "learning_rate": 4.815539953339865e-05, + "loss": 0.2411, + "step": 1610 + }, + { + "epoch": 2.1197368421052634, + "grad_norm": 0.08545867868134706, + "learning_rate": 4.802454375361495e-05, + "loss": 0.2418, + "step": 1611 + }, + { + "epoch": 2.1210526315789475, + "grad_norm": 0.08523598311058483, + "learning_rate": 4.7893809800749403e-05, + "loss": 0.2328, + "step": 1612 + }, + { + "epoch": 2.1223684210526317, + "grad_norm": 0.08672971700799055, + "learning_rate": 4.776319798123344e-05, + "loss": 0.2357, + "step": 1613 + }, + { + "epoch": 2.123684210526316, + "grad_norm": 0.08589391450643363, + "learning_rate": 4.763270860121222e-05, + "loss": 0.2376, + "step": 1614 + }, + { + "epoch": 2.125, + "grad_norm": 0.08657771324163321, + "learning_rate": 4.7502341966544e-05, + "loss": 0.2409, + "step": 1615 + }, + { + "epoch": 2.126315789473684, + "grad_norm": 0.08553999858199614, + "learning_rate": 4.737209838279922e-05, + "loss": 0.2498, + "step": 1616 + }, + { + "epoch": 2.1276315789473683, + "grad_norm": 0.08346189155987148, + "learning_rate": 4.7241978155259925e-05, + "loss": 0.2382, + "step": 1617 + }, + { + "epoch": 2.1289473684210525, + "grad_norm": 0.0832168506244348, + "learning_rate": 4.7111981588919084e-05, + "loss": 0.2362, + "step": 1618 + }, + { + "epoch": 2.1302631578947366, + "grad_norm": 0.08400154849121748, + "learning_rate": 4.698210898847976e-05, + "loss": 0.2397, + "step": 1619 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.08285572712672996, + "learning_rate": 4.685236065835443e-05, + "loss": 0.2385, + "step": 1620 + }, + { + "epoch": 2.1328947368421054, + "grad_norm": 0.08317759957463747, + "learning_rate": 4.6722736902664334e-05, + "loss": 0.2406, + "step": 1621 + }, + { + "epoch": 2.1342105263157896, + "grad_norm": 0.08245858802078364, + "learning_rate": 4.659323802523864e-05, + "loss": 0.2391, + "step": 1622 + }, + { + "epoch": 2.1355263157894737, + "grad_norm": 0.08197930733916409, + "learning_rate": 4.646386432961396e-05, + "loss": 0.2319, + "step": 1623 + }, + { + "epoch": 2.136842105263158, + "grad_norm": 0.08493589285679196, + "learning_rate": 4.6334616119033356e-05, + "loss": 0.243, + "step": 1624 + }, + { + "epoch": 2.138157894736842, + "grad_norm": 0.08490735296967258, + "learning_rate": 4.6205493696445754e-05, + "loss": 0.236, + "step": 1625 + }, + { + "epoch": 2.139473684210526, + "grad_norm": 0.08540155245746917, + "learning_rate": 4.6076497364505386e-05, + "loss": 0.2517, + "step": 1626 + }, + { + "epoch": 2.1407894736842104, + "grad_norm": 0.08620766196769589, + "learning_rate": 4.594762742557078e-05, + "loss": 0.2317, + "step": 1627 + }, + { + "epoch": 2.1421052631578945, + "grad_norm": 0.08354790535329532, + "learning_rate": 4.5818884181704294e-05, + "loss": 0.2265, + "step": 1628 + }, + { + "epoch": 2.143421052631579, + "grad_norm": 0.08468876926659806, + "learning_rate": 4.569026793467126e-05, + "loss": 0.2323, + "step": 1629 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.08422922304528441, + "learning_rate": 4.5561778985939366e-05, + "loss": 0.231, + "step": 1630 + }, + { + "epoch": 2.1460526315789474, + "grad_norm": 0.08597404500241558, + "learning_rate": 4.543341763667799e-05, + "loss": 0.2455, + "step": 1631 + }, + { + "epoch": 2.1473684210526316, + "grad_norm": 0.08569530326388924, + "learning_rate": 4.530518418775733e-05, + "loss": 0.2374, + "step": 1632 + }, + { + "epoch": 2.1486842105263158, + "grad_norm": 0.08651611173999371, + "learning_rate": 4.5177078939747796e-05, + "loss": 0.2491, + "step": 1633 + }, + { + "epoch": 2.15, + "grad_norm": 0.08435793058285544, + "learning_rate": 4.50491021929194e-05, + "loss": 0.2414, + "step": 1634 + }, + { + "epoch": 2.151315789473684, + "grad_norm": 0.08452273206770455, + "learning_rate": 4.492125424724086e-05, + "loss": 0.2389, + "step": 1635 + }, + { + "epoch": 2.1526315789473682, + "grad_norm": 0.08400832223995384, + "learning_rate": 4.479353540237903e-05, + "loss": 0.2509, + "step": 1636 + }, + { + "epoch": 2.153947368421053, + "grad_norm": 0.0864191829115749, + "learning_rate": 4.466594595769814e-05, + "loss": 0.2546, + "step": 1637 + }, + { + "epoch": 2.155263157894737, + "grad_norm": 0.08469120634147317, + "learning_rate": 4.453848621225912e-05, + "loss": 0.2357, + "step": 1638 + }, + { + "epoch": 2.156578947368421, + "grad_norm": 0.08564901017872756, + "learning_rate": 4.441115646481896e-05, + "loss": 0.2367, + "step": 1639 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.08581568319788428, + "learning_rate": 4.4283957013829846e-05, + "loss": 0.238, + "step": 1640 + }, + { + "epoch": 2.1592105263157895, + "grad_norm": 0.08280728226775676, + "learning_rate": 4.415688815743858e-05, + "loss": 0.2296, + "step": 1641 + }, + { + "epoch": 2.1605263157894736, + "grad_norm": 0.0853460546951434, + "learning_rate": 4.402995019348595e-05, + "loss": 0.2387, + "step": 1642 + }, + { + "epoch": 2.161842105263158, + "grad_norm": 0.0866721985482825, + "learning_rate": 4.390314341950581e-05, + "loss": 0.2425, + "step": 1643 + }, + { + "epoch": 2.163157894736842, + "grad_norm": 0.08725371319117803, + "learning_rate": 4.3776468132724604e-05, + "loss": 0.242, + "step": 1644 + }, + { + "epoch": 2.1644736842105265, + "grad_norm": 0.08513410768610372, + "learning_rate": 4.3649924630060534e-05, + "loss": 0.2458, + "step": 1645 + }, + { + "epoch": 2.1657894736842107, + "grad_norm": 0.08350902892663929, + "learning_rate": 4.35235132081229e-05, + "loss": 0.2373, + "step": 1646 + }, + { + "epoch": 2.167105263157895, + "grad_norm": 0.08213110851604064, + "learning_rate": 4.3397234163211483e-05, + "loss": 0.2244, + "step": 1647 + }, + { + "epoch": 2.168421052631579, + "grad_norm": 0.08674873213326942, + "learning_rate": 4.3271087791315734e-05, + "loss": 0.2425, + "step": 1648 + }, + { + "epoch": 2.169736842105263, + "grad_norm": 0.08588867781512083, + "learning_rate": 4.314507438811407e-05, + "loss": 0.2368, + "step": 1649 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.08387668032160955, + "learning_rate": 4.301919424897338e-05, + "loss": 0.2397, + "step": 1650 + }, + { + "epoch": 2.1723684210526315, + "grad_norm": 0.08260437607051827, + "learning_rate": 4.289344766894807e-05, + "loss": 0.2323, + "step": 1651 + }, + { + "epoch": 2.1736842105263157, + "grad_norm": 0.0833246040235028, + "learning_rate": 4.276783494277954e-05, + "loss": 0.2313, + "step": 1652 + }, + { + "epoch": 2.175, + "grad_norm": 0.08291199475859631, + "learning_rate": 4.264235636489542e-05, + "loss": 0.2342, + "step": 1653 + }, + { + "epoch": 2.1763157894736844, + "grad_norm": 0.08695364030640872, + "learning_rate": 4.2517012229408905e-05, + "loss": 0.2534, + "step": 1654 + }, + { + "epoch": 2.1776315789473686, + "grad_norm": 0.08428816275057385, + "learning_rate": 4.2391802830118135e-05, + "loss": 0.2346, + "step": 1655 + }, + { + "epoch": 2.1789473684210527, + "grad_norm": 0.084207037568309, + "learning_rate": 4.2266728460505375e-05, + "loss": 0.2381, + "step": 1656 + }, + { + "epoch": 2.180263157894737, + "grad_norm": 0.0886796758669413, + "learning_rate": 4.2141789413736354e-05, + "loss": 0.2495, + "step": 1657 + }, + { + "epoch": 2.181578947368421, + "grad_norm": 0.08307977015154046, + "learning_rate": 4.201698598265973e-05, + "loss": 0.2327, + "step": 1658 + }, + { + "epoch": 2.182894736842105, + "grad_norm": 0.08244618117247395, + "learning_rate": 4.189231845980618e-05, + "loss": 0.2344, + "step": 1659 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.08471751852248358, + "learning_rate": 4.176778713738787e-05, + "loss": 0.2341, + "step": 1660 + }, + { + "epoch": 2.1855263157894735, + "grad_norm": 0.08538574209457075, + "learning_rate": 4.164339230729771e-05, + "loss": 0.2375, + "step": 1661 + }, + { + "epoch": 2.1868421052631577, + "grad_norm": 0.08739644816596194, + "learning_rate": 4.151913426110864e-05, + "loss": 0.2357, + "step": 1662 + }, + { + "epoch": 2.1881578947368423, + "grad_norm": 0.08456784004962821, + "learning_rate": 4.13950132900731e-05, + "loss": 0.2389, + "step": 1663 + }, + { + "epoch": 2.1894736842105265, + "grad_norm": 0.08259644860196363, + "learning_rate": 4.127102968512214e-05, + "loss": 0.2335, + "step": 1664 + }, + { + "epoch": 2.1907894736842106, + "grad_norm": 0.08038218499706021, + "learning_rate": 4.114718373686481e-05, + "loss": 0.233, + "step": 1665 + }, + { + "epoch": 2.192105263157895, + "grad_norm": 0.08297061732865985, + "learning_rate": 4.102347573558763e-05, + "loss": 0.226, + "step": 1666 + }, + { + "epoch": 2.193421052631579, + "grad_norm": 0.08863091885047723, + "learning_rate": 4.089990597125368e-05, + "loss": 0.2438, + "step": 1667 + }, + { + "epoch": 2.194736842105263, + "grad_norm": 0.08628873633777266, + "learning_rate": 4.077647473350201e-05, + "loss": 0.2361, + "step": 1668 + }, + { + "epoch": 2.1960526315789473, + "grad_norm": 0.08360640948058073, + "learning_rate": 4.065318231164704e-05, + "loss": 0.2417, + "step": 1669 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.08524943894134786, + "learning_rate": 4.053002899467774e-05, + "loss": 0.2387, + "step": 1670 + }, + { + "epoch": 2.1986842105263156, + "grad_norm": 0.08537007936359343, + "learning_rate": 4.040701507125712e-05, + "loss": 0.2408, + "step": 1671 + }, + { + "epoch": 2.2, + "grad_norm": 0.08300365174925911, + "learning_rate": 4.028414082972141e-05, + "loss": 0.226, + "step": 1672 + }, + { + "epoch": 2.2013157894736843, + "grad_norm": 0.08790060045013368, + "learning_rate": 4.016140655807936e-05, + "loss": 0.2507, + "step": 1673 + }, + { + "epoch": 2.2026315789473685, + "grad_norm": 0.0831197113798778, + "learning_rate": 4.003881254401183e-05, + "loss": 0.2343, + "step": 1674 + }, + { + "epoch": 2.2039473684210527, + "grad_norm": 0.08118714142897289, + "learning_rate": 3.991635907487076e-05, + "loss": 0.2265, + "step": 1675 + }, + { + "epoch": 2.205263157894737, + "grad_norm": 0.08836758200078478, + "learning_rate": 3.97940464376787e-05, + "loss": 0.2539, + "step": 1676 + }, + { + "epoch": 2.206578947368421, + "grad_norm": 0.0829126388696676, + "learning_rate": 3.967187491912813e-05, + "loss": 0.2343, + "step": 1677 + }, + { + "epoch": 2.207894736842105, + "grad_norm": 0.08050172289656292, + "learning_rate": 3.9549844805580706e-05, + "loss": 0.2339, + "step": 1678 + }, + { + "epoch": 2.2092105263157893, + "grad_norm": 0.08394767599321681, + "learning_rate": 3.942795638306674e-05, + "loss": 0.2369, + "step": 1679 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.08711738632474018, + "learning_rate": 3.9306209937284346e-05, + "loss": 0.2419, + "step": 1680 + }, + { + "epoch": 2.211842105263158, + "grad_norm": 0.0814409575586451, + "learning_rate": 3.918460575359882e-05, + "loss": 0.2306, + "step": 1681 + }, + { + "epoch": 2.213157894736842, + "grad_norm": 0.08526205771283932, + "learning_rate": 3.906314411704215e-05, + "loss": 0.2374, + "step": 1682 + }, + { + "epoch": 2.2144736842105264, + "grad_norm": 0.08590081960012944, + "learning_rate": 3.8941825312312054e-05, + "loss": 0.2324, + "step": 1683 + }, + { + "epoch": 2.2157894736842105, + "grad_norm": 0.08577840811573154, + "learning_rate": 3.882064962377154e-05, + "loss": 0.2402, + "step": 1684 + }, + { + "epoch": 2.2171052631578947, + "grad_norm": 0.08559702425445263, + "learning_rate": 3.869961733544814e-05, + "loss": 0.2402, + "step": 1685 + }, + { + "epoch": 2.218421052631579, + "grad_norm": 0.08436964025823426, + "learning_rate": 3.857872873103322e-05, + "loss": 0.2415, + "step": 1686 + }, + { + "epoch": 2.219736842105263, + "grad_norm": 0.08396240106431019, + "learning_rate": 3.845798409388149e-05, + "loss": 0.2302, + "step": 1687 + }, + { + "epoch": 2.221052631578947, + "grad_norm": 0.08629968011623863, + "learning_rate": 3.83373837070101e-05, + "loss": 0.2358, + "step": 1688 + }, + { + "epoch": 2.2223684210526318, + "grad_norm": 0.0838099558892786, + "learning_rate": 3.821692785309807e-05, + "loss": 0.2334, + "step": 1689 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.08557035987280985, + "learning_rate": 3.809661681448576e-05, + "loss": 0.2311, + "step": 1690 + }, + { + "epoch": 2.225, + "grad_norm": 0.08591483983457045, + "learning_rate": 3.7976450873174005e-05, + "loss": 0.2421, + "step": 1691 + }, + { + "epoch": 2.2263157894736842, + "grad_norm": 0.08701077906025538, + "learning_rate": 3.7856430310823545e-05, + "loss": 0.2451, + "step": 1692 + }, + { + "epoch": 2.2276315789473684, + "grad_norm": 0.08152777587376067, + "learning_rate": 3.773655540875438e-05, + "loss": 0.2335, + "step": 1693 + }, + { + "epoch": 2.2289473684210526, + "grad_norm": 0.0879771188843296, + "learning_rate": 3.7616826447945066e-05, + "loss": 0.2404, + "step": 1694 + }, + { + "epoch": 2.2302631578947367, + "grad_norm": 0.08415415636312946, + "learning_rate": 3.749724370903216e-05, + "loss": 0.2355, + "step": 1695 + }, + { + "epoch": 2.231578947368421, + "grad_norm": 0.08756860364801405, + "learning_rate": 3.737780747230941e-05, + "loss": 0.2378, + "step": 1696 + }, + { + "epoch": 2.2328947368421055, + "grad_norm": 0.08454219628982126, + "learning_rate": 3.725851801772715e-05, + "loss": 0.2445, + "step": 1697 + }, + { + "epoch": 2.2342105263157896, + "grad_norm": 0.0869901636562866, + "learning_rate": 3.713937562489179e-05, + "loss": 0.2296, + "step": 1698 + }, + { + "epoch": 2.235526315789474, + "grad_norm": 0.08630652964552962, + "learning_rate": 3.702038057306492e-05, + "loss": 0.2302, + "step": 1699 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.08659038929154979, + "learning_rate": 3.69015331411628e-05, + "loss": 0.2351, + "step": 1700 + }, + { + "epoch": 2.238157894736842, + "grad_norm": 0.08649330336962248, + "learning_rate": 3.678283360775571e-05, + "loss": 0.2362, + "step": 1701 + }, + { + "epoch": 2.2394736842105263, + "grad_norm": 0.08424707415501553, + "learning_rate": 3.6664282251067184e-05, + "loss": 0.2378, + "step": 1702 + }, + { + "epoch": 2.2407894736842104, + "grad_norm": 0.08338435013878512, + "learning_rate": 3.65458793489736e-05, + "loss": 0.2336, + "step": 1703 + }, + { + "epoch": 2.2421052631578946, + "grad_norm": 0.08719248023927356, + "learning_rate": 3.642762517900322e-05, + "loss": 0.2426, + "step": 1704 + }, + { + "epoch": 2.2434210526315788, + "grad_norm": 0.08436003020889046, + "learning_rate": 3.6309520018335705e-05, + "loss": 0.2301, + "step": 1705 + }, + { + "epoch": 2.2447368421052634, + "grad_norm": 0.08728623567451708, + "learning_rate": 3.619156414380156e-05, + "loss": 0.2462, + "step": 1706 + }, + { + "epoch": 2.2460526315789475, + "grad_norm": 0.08273816077006978, + "learning_rate": 3.607375783188125e-05, + "loss": 0.2433, + "step": 1707 + }, + { + "epoch": 2.2473684210526317, + "grad_norm": 0.08380037356624727, + "learning_rate": 3.595610135870472e-05, + "loss": 0.2419, + "step": 1708 + }, + { + "epoch": 2.248684210526316, + "grad_norm": 0.08599999255981298, + "learning_rate": 3.583859500005071e-05, + "loss": 0.2459, + "step": 1709 + }, + { + "epoch": 2.25, + "grad_norm": 0.0857928049153299, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.2332, + "step": 1710 + }, + { + "epoch": 2.251315789473684, + "grad_norm": 0.08563816332335225, + "learning_rate": 3.560403372766522e-05, + "loss": 0.239, + "step": 1711 + }, + { + "epoch": 2.2526315789473683, + "grad_norm": 0.08648184240333989, + "learning_rate": 3.548697936372937e-05, + "loss": 0.2369, + "step": 1712 + }, + { + "epoch": 2.2539473684210525, + "grad_norm": 0.08405093979550926, + "learning_rate": 3.53700762139059e-05, + "loss": 0.2484, + "step": 1713 + }, + { + "epoch": 2.2552631578947366, + "grad_norm": 0.08629123371639234, + "learning_rate": 3.525332455220789e-05, + "loss": 0.2526, + "step": 1714 + }, + { + "epoch": 2.2565789473684212, + "grad_norm": 0.08603955512267374, + "learning_rate": 3.5136724652293206e-05, + "loss": 0.2363, + "step": 1715 + }, + { + "epoch": 2.2578947368421054, + "grad_norm": 0.08357218310355902, + "learning_rate": 3.5020276787464056e-05, + "loss": 0.2379, + "step": 1716 + }, + { + "epoch": 2.2592105263157896, + "grad_norm": 0.08178687870308674, + "learning_rate": 3.490398123066628e-05, + "loss": 0.2342, + "step": 1717 + }, + { + "epoch": 2.2605263157894737, + "grad_norm": 0.08618629426674392, + "learning_rate": 3.4787838254488694e-05, + "loss": 0.2436, + "step": 1718 + }, + { + "epoch": 2.261842105263158, + "grad_norm": 0.08654678035853532, + "learning_rate": 3.4671848131162544e-05, + "loss": 0.2413, + "step": 1719 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.08154103451407675, + "learning_rate": 3.455601113256073e-05, + "loss": 0.2445, + "step": 1720 + }, + { + "epoch": 2.264473684210526, + "grad_norm": 0.08283767549879928, + "learning_rate": 3.444032753019723e-05, + "loss": 0.235, + "step": 1721 + }, + { + "epoch": 2.2657894736842104, + "grad_norm": 0.08402970526889068, + "learning_rate": 3.4324797595226565e-05, + "loss": 0.2337, + "step": 1722 + }, + { + "epoch": 2.2671052631578945, + "grad_norm": 0.08237791548613392, + "learning_rate": 3.420942159844298e-05, + "loss": 0.2352, + "step": 1723 + }, + { + "epoch": 2.268421052631579, + "grad_norm": 0.08344712694132657, + "learning_rate": 3.4094199810279924e-05, + "loss": 0.236, + "step": 1724 + }, + { + "epoch": 2.2697368421052633, + "grad_norm": 0.08799849742749684, + "learning_rate": 3.3979132500809405e-05, + "loss": 0.2481, + "step": 1725 + }, + { + "epoch": 2.2710526315789474, + "grad_norm": 0.08588090804811839, + "learning_rate": 3.386421993974129e-05, + "loss": 0.2364, + "step": 1726 + }, + { + "epoch": 2.2723684210526316, + "grad_norm": 0.0838270871173491, + "learning_rate": 3.3749462396422846e-05, + "loss": 0.228, + "step": 1727 + }, + { + "epoch": 2.2736842105263158, + "grad_norm": 0.08845792107580112, + "learning_rate": 3.363486013983788e-05, + "loss": 0.2437, + "step": 1728 + }, + { + "epoch": 2.275, + "grad_norm": 0.08775832799442573, + "learning_rate": 3.352041343860621e-05, + "loss": 0.2433, + "step": 1729 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.08852510695897718, + "learning_rate": 3.340612256098316e-05, + "loss": 0.2528, + "step": 1730 + }, + { + "epoch": 2.2776315789473682, + "grad_norm": 0.09072365256400772, + "learning_rate": 3.329198777485869e-05, + "loss": 0.235, + "step": 1731 + }, + { + "epoch": 2.2789473684210524, + "grad_norm": 0.08478552332865018, + "learning_rate": 3.317800934775696e-05, + "loss": 0.2419, + "step": 1732 + }, + { + "epoch": 2.280263157894737, + "grad_norm": 0.08510050487380844, + "learning_rate": 3.30641875468356e-05, + "loss": 0.2382, + "step": 1733 + }, + { + "epoch": 2.281578947368421, + "grad_norm": 0.08450032284933855, + "learning_rate": 3.2950522638885106e-05, + "loss": 0.2411, + "step": 1734 + }, + { + "epoch": 2.2828947368421053, + "grad_norm": 0.0847645250276902, + "learning_rate": 3.283701489032832e-05, + "loss": 0.2289, + "step": 1735 + }, + { + "epoch": 2.2842105263157895, + "grad_norm": 0.08500704575747163, + "learning_rate": 3.2723664567219626e-05, + "loss": 0.2362, + "step": 1736 + }, + { + "epoch": 2.2855263157894736, + "grad_norm": 0.08540028460798461, + "learning_rate": 3.261047193524439e-05, + "loss": 0.2441, + "step": 1737 + }, + { + "epoch": 2.286842105263158, + "grad_norm": 0.08482142626182317, + "learning_rate": 3.249743725971849e-05, + "loss": 0.2339, + "step": 1738 + }, + { + "epoch": 2.288157894736842, + "grad_norm": 0.08455145109428938, + "learning_rate": 3.238456080558743e-05, + "loss": 0.2364, + "step": 1739 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.08608434293062465, + "learning_rate": 3.227184283742591e-05, + "loss": 0.2427, + "step": 1740 + }, + { + "epoch": 2.2907894736842107, + "grad_norm": 0.08525626191346178, + "learning_rate": 3.2159283619437155e-05, + "loss": 0.2447, + "step": 1741 + }, + { + "epoch": 2.292105263157895, + "grad_norm": 0.0867443524413285, + "learning_rate": 3.2046883415452246e-05, + "loss": 0.2297, + "step": 1742 + }, + { + "epoch": 2.293421052631579, + "grad_norm": 0.08427294151144223, + "learning_rate": 3.193464248892964e-05, + "loss": 0.2466, + "step": 1743 + }, + { + "epoch": 2.294736842105263, + "grad_norm": 0.0849042097519616, + "learning_rate": 3.182256110295437e-05, + "loss": 0.2402, + "step": 1744 + }, + { + "epoch": 2.2960526315789473, + "grad_norm": 0.08773216559618434, + "learning_rate": 3.171063952023753e-05, + "loss": 0.238, + "step": 1745 + }, + { + "epoch": 2.2973684210526315, + "grad_norm": 0.08514375299514274, + "learning_rate": 3.159887800311569e-05, + "loss": 0.2495, + "step": 1746 + }, + { + "epoch": 2.2986842105263157, + "grad_norm": 0.08221174413038916, + "learning_rate": 3.148727681355022e-05, + "loss": 0.2327, + "step": 1747 + }, + { + "epoch": 2.3, + "grad_norm": 0.08292212985659095, + "learning_rate": 3.137583621312665e-05, + "loss": 0.2265, + "step": 1748 + }, + { + "epoch": 2.3013157894736844, + "grad_norm": 0.08536339352006025, + "learning_rate": 3.126455646305416e-05, + "loss": 0.2316, + "step": 1749 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.083425418514592, + "learning_rate": 3.115343782416483e-05, + "loss": 0.2405, + "step": 1750 + }, + { + "epoch": 2.3039473684210527, + "grad_norm": 0.08635976351926793, + "learning_rate": 3.1042480556913224e-05, + "loss": 0.234, + "step": 1751 + }, + { + "epoch": 2.305263157894737, + "grad_norm": 0.08648353037146853, + "learning_rate": 3.093168492137557e-05, + "loss": 0.222, + "step": 1752 + }, + { + "epoch": 2.306578947368421, + "grad_norm": 0.08421631985285294, + "learning_rate": 3.082105117724923e-05, + "loss": 0.2301, + "step": 1753 + }, + { + "epoch": 2.307894736842105, + "grad_norm": 0.08776858485770223, + "learning_rate": 3.071057958385221e-05, + "loss": 0.236, + "step": 1754 + }, + { + "epoch": 2.3092105263157894, + "grad_norm": 0.08692453013595573, + "learning_rate": 3.0600270400122335e-05, + "loss": 0.2379, + "step": 1755 + }, + { + "epoch": 2.3105263157894735, + "grad_norm": 0.08408758438701626, + "learning_rate": 3.0490123884616796e-05, + "loss": 0.2397, + "step": 1756 + }, + { + "epoch": 2.3118421052631577, + "grad_norm": 0.08561383977421695, + "learning_rate": 3.0380140295511516e-05, + "loss": 0.2362, + "step": 1757 + }, + { + "epoch": 2.3131578947368423, + "grad_norm": 0.08523323066228368, + "learning_rate": 3.0270319890600462e-05, + "loss": 0.2373, + "step": 1758 + }, + { + "epoch": 2.3144736842105265, + "grad_norm": 0.08384268173924093, + "learning_rate": 3.0160662927295225e-05, + "loss": 0.2221, + "step": 1759 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.08377079729896948, + "learning_rate": 3.0051169662624225e-05, + "loss": 0.2305, + "step": 1760 + }, + { + "epoch": 2.317105263157895, + "grad_norm": 0.08434931689091195, + "learning_rate": 2.994184035323213e-05, + "loss": 0.2264, + "step": 1761 + }, + { + "epoch": 2.318421052631579, + "grad_norm": 0.08485228565126582, + "learning_rate": 2.983267525537945e-05, + "loss": 0.2407, + "step": 1762 + }, + { + "epoch": 2.319736842105263, + "grad_norm": 0.08458549101284824, + "learning_rate": 2.9723674624941688e-05, + "loss": 0.2397, + "step": 1763 + }, + { + "epoch": 2.3210526315789473, + "grad_norm": 0.08376501835808486, + "learning_rate": 2.9614838717408867e-05, + "loss": 0.2353, + "step": 1764 + }, + { + "epoch": 2.3223684210526314, + "grad_norm": 0.08574531246445859, + "learning_rate": 2.950616778788492e-05, + "loss": 0.2465, + "step": 1765 + }, + { + "epoch": 2.3236842105263156, + "grad_norm": 0.08775545996621473, + "learning_rate": 2.9397662091087054e-05, + "loss": 0.2464, + "step": 1766 + }, + { + "epoch": 2.325, + "grad_norm": 0.0818496814028781, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.2204, + "step": 1767 + }, + { + "epoch": 2.3263157894736843, + "grad_norm": 0.08595948586857366, + "learning_rate": 2.9181147412601562e-05, + "loss": 0.2329, + "step": 1768 + }, + { + "epoch": 2.3276315789473685, + "grad_norm": 0.08376848170186316, + "learning_rate": 2.9073138938409495e-05, + "loss": 0.2416, + "step": 1769 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.08594662597858775, + "learning_rate": 2.89652967119336e-05, + "loss": 0.2439, + "step": 1770 + }, + { + "epoch": 2.330263157894737, + "grad_norm": 0.08548685030438735, + "learning_rate": 2.8857620985948652e-05, + "loss": 0.2466, + "step": 1771 + }, + { + "epoch": 2.331578947368421, + "grad_norm": 0.08440124314126071, + "learning_rate": 2.8750112012839214e-05, + "loss": 0.2318, + "step": 1772 + }, + { + "epoch": 2.332894736842105, + "grad_norm": 0.08563767620958296, + "learning_rate": 2.8642770044598966e-05, + "loss": 0.2386, + "step": 1773 + }, + { + "epoch": 2.3342105263157893, + "grad_norm": 0.08413261927528606, + "learning_rate": 2.8535595332830102e-05, + "loss": 0.2412, + "step": 1774 + }, + { + "epoch": 2.3355263157894735, + "grad_norm": 0.08430887282829173, + "learning_rate": 2.8428588128742894e-05, + "loss": 0.2433, + "step": 1775 + }, + { + "epoch": 2.336842105263158, + "grad_norm": 0.08421978093619426, + "learning_rate": 2.8321748683154893e-05, + "loss": 0.2266, + "step": 1776 + }, + { + "epoch": 2.338157894736842, + "grad_norm": 0.08367626319071061, + "learning_rate": 2.8215077246490417e-05, + "loss": 0.232, + "step": 1777 + }, + { + "epoch": 2.3394736842105264, + "grad_norm": 0.0852480773032482, + "learning_rate": 2.810857406878009e-05, + "loss": 0.2431, + "step": 1778 + }, + { + "epoch": 2.3407894736842105, + "grad_norm": 0.08752279412974256, + "learning_rate": 2.800223939966007e-05, + "loss": 0.2392, + "step": 1779 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.08525332318728705, + "learning_rate": 2.789607348837153e-05, + "loss": 0.2261, + "step": 1780 + }, + { + "epoch": 2.343421052631579, + "grad_norm": 0.08501669465697845, + "learning_rate": 2.7790076583760126e-05, + "loss": 0.236, + "step": 1781 + }, + { + "epoch": 2.344736842105263, + "grad_norm": 0.08586501169000936, + "learning_rate": 2.7684248934275325e-05, + "loss": 0.2369, + "step": 1782 + }, + { + "epoch": 2.3460526315789476, + "grad_norm": 0.08330526797227158, + "learning_rate": 2.757859078796997e-05, + "loss": 0.2426, + "step": 1783 + }, + { + "epoch": 2.3473684210526318, + "grad_norm": 0.08682682618319912, + "learning_rate": 2.7473102392499518e-05, + "loss": 0.2405, + "step": 1784 + }, + { + "epoch": 2.348684210526316, + "grad_norm": 0.08681140965019736, + "learning_rate": 2.73677839951215e-05, + "loss": 0.2438, + "step": 1785 + }, + { + "epoch": 2.35, + "grad_norm": 0.08557189396659094, + "learning_rate": 2.7262635842695127e-05, + "loss": 0.232, + "step": 1786 + }, + { + "epoch": 2.3513157894736842, + "grad_norm": 0.08357509250185297, + "learning_rate": 2.7157658181680457e-05, + "loss": 0.2391, + "step": 1787 + }, + { + "epoch": 2.3526315789473684, + "grad_norm": 0.0863037240334069, + "learning_rate": 2.7052851258137935e-05, + "loss": 0.2385, + "step": 1788 + }, + { + "epoch": 2.3539473684210526, + "grad_norm": 0.08781831219541067, + "learning_rate": 2.6948215317727844e-05, + "loss": 0.2332, + "step": 1789 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.08657991059589279, + "learning_rate": 2.684375060570965e-05, + "loss": 0.2386, + "step": 1790 + }, + { + "epoch": 2.356578947368421, + "grad_norm": 0.08748510488863394, + "learning_rate": 2.6739457366941543e-05, + "loss": 0.2449, + "step": 1791 + }, + { + "epoch": 2.3578947368421055, + "grad_norm": 0.08693207168372159, + "learning_rate": 2.6635335845879737e-05, + "loss": 0.2294, + "step": 1792 + }, + { + "epoch": 2.3592105263157896, + "grad_norm": 0.0867096331580623, + "learning_rate": 2.653138628657793e-05, + "loss": 0.2375, + "step": 1793 + }, + { + "epoch": 2.360526315789474, + "grad_norm": 0.09001793260204524, + "learning_rate": 2.6427608932686843e-05, + "loss": 0.2479, + "step": 1794 + }, + { + "epoch": 2.361842105263158, + "grad_norm": 0.08716676015459905, + "learning_rate": 2.6324004027453464e-05, + "loss": 0.2371, + "step": 1795 + }, + { + "epoch": 2.363157894736842, + "grad_norm": 0.08978495624040321, + "learning_rate": 2.622057181372063e-05, + "loss": 0.2468, + "step": 1796 + }, + { + "epoch": 2.3644736842105263, + "grad_norm": 0.08299772256887776, + "learning_rate": 2.6117312533926362e-05, + "loss": 0.2336, + "step": 1797 + }, + { + "epoch": 2.3657894736842104, + "grad_norm": 0.08607109549303381, + "learning_rate": 2.601422643010335e-05, + "loss": 0.2424, + "step": 1798 + }, + { + "epoch": 2.3671052631578946, + "grad_norm": 0.08507498429800535, + "learning_rate": 2.5911313743878418e-05, + "loss": 0.2385, + "step": 1799 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.08326170794330705, + "learning_rate": 2.5808574716471856e-05, + "loss": 0.2419, + "step": 1800 + }, + { + "epoch": 2.3697368421052634, + "grad_norm": 0.08536319126880634, + "learning_rate": 2.570600958869689e-05, + "loss": 0.2337, + "step": 1801 + }, + { + "epoch": 2.3710526315789475, + "grad_norm": 0.0832285855296963, + "learning_rate": 2.5603618600959223e-05, + "loss": 0.2262, + "step": 1802 + }, + { + "epoch": 2.3723684210526317, + "grad_norm": 0.08748256088729627, + "learning_rate": 2.55014019932563e-05, + "loss": 0.238, + "step": 1803 + }, + { + "epoch": 2.373684210526316, + "grad_norm": 0.0845403743408863, + "learning_rate": 2.5399360005176886e-05, + "loss": 0.2342, + "step": 1804 + }, + { + "epoch": 2.375, + "grad_norm": 0.08306478491889778, + "learning_rate": 2.529749287590042e-05, + "loss": 0.236, + "step": 1805 + }, + { + "epoch": 2.376315789473684, + "grad_norm": 0.08506256922386096, + "learning_rate": 2.519580084419646e-05, + "loss": 0.2364, + "step": 1806 + }, + { + "epoch": 2.3776315789473683, + "grad_norm": 0.0821793026467122, + "learning_rate": 2.509428414842424e-05, + "loss": 0.2241, + "step": 1807 + }, + { + "epoch": 2.3789473684210525, + "grad_norm": 0.08556195534896847, + "learning_rate": 2.4992943026531935e-05, + "loss": 0.2365, + "step": 1808 + }, + { + "epoch": 2.3802631578947366, + "grad_norm": 0.08705793209497273, + "learning_rate": 2.4891777716056176e-05, + "loss": 0.2526, + "step": 1809 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.08705293536857067, + "learning_rate": 2.4790788454121584e-05, + "loss": 0.2361, + "step": 1810 + }, + { + "epoch": 2.3828947368421054, + "grad_norm": 0.08488258217639373, + "learning_rate": 2.4689975477440086e-05, + "loss": 0.2273, + "step": 1811 + }, + { + "epoch": 2.3842105263157896, + "grad_norm": 0.09044220228981603, + "learning_rate": 2.4589339022310386e-05, + "loss": 0.2532, + "step": 1812 + }, + { + "epoch": 2.3855263157894737, + "grad_norm": 0.0858040156345174, + "learning_rate": 2.4488879324617474e-05, + "loss": 0.2395, + "step": 1813 + }, + { + "epoch": 2.386842105263158, + "grad_norm": 0.08540339576774533, + "learning_rate": 2.4388596619831993e-05, + "loss": 0.2393, + "step": 1814 + }, + { + "epoch": 2.388157894736842, + "grad_norm": 0.08684868336556462, + "learning_rate": 2.4288491143009795e-05, + "loss": 0.2362, + "step": 1815 + }, + { + "epoch": 2.389473684210526, + "grad_norm": 0.086881482942677, + "learning_rate": 2.4188563128791254e-05, + "loss": 0.2352, + "step": 1816 + }, + { + "epoch": 2.3907894736842104, + "grad_norm": 0.08698301322688715, + "learning_rate": 2.4088812811400773e-05, + "loss": 0.2366, + "step": 1817 + }, + { + "epoch": 2.3921052631578945, + "grad_norm": 0.08860609435080301, + "learning_rate": 2.3989240424646355e-05, + "loss": 0.2579, + "step": 1818 + }, + { + "epoch": 2.393421052631579, + "grad_norm": 0.08501020836673648, + "learning_rate": 2.388984620191883e-05, + "loss": 0.2305, + "step": 1819 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.08510368650070274, + "learning_rate": 2.379063037619146e-05, + "loss": 0.247, + "step": 1820 + }, + { + "epoch": 2.3960526315789474, + "grad_norm": 0.0829406506692919, + "learning_rate": 2.3691593180019366e-05, + "loss": 0.2278, + "step": 1821 + }, + { + "epoch": 2.3973684210526316, + "grad_norm": 0.0867391319954807, + "learning_rate": 2.3592734845538956e-05, + "loss": 0.2336, + "step": 1822 + }, + { + "epoch": 2.3986842105263158, + "grad_norm": 0.08541140823531897, + "learning_rate": 2.3494055604467447e-05, + "loss": 0.2492, + "step": 1823 + }, + { + "epoch": 2.4, + "grad_norm": 0.08302602754112574, + "learning_rate": 2.339555568810221e-05, + "loss": 0.2373, + "step": 1824 + }, + { + "epoch": 2.401315789473684, + "grad_norm": 0.08577260892930051, + "learning_rate": 2.32972353273203e-05, + "loss": 0.2419, + "step": 1825 + }, + { + "epoch": 2.4026315789473682, + "grad_norm": 0.08774292891845863, + "learning_rate": 2.319909475257799e-05, + "loss": 0.2405, + "step": 1826 + }, + { + "epoch": 2.4039473684210524, + "grad_norm": 0.08550920787272853, + "learning_rate": 2.3101134193910024e-05, + "loss": 0.2434, + "step": 1827 + }, + { + "epoch": 2.405263157894737, + "grad_norm": 0.08426612023984284, + "learning_rate": 2.300335388092929e-05, + "loss": 0.2234, + "step": 1828 + }, + { + "epoch": 2.406578947368421, + "grad_norm": 0.0848436411883103, + "learning_rate": 2.2905754042826143e-05, + "loss": 0.2377, + "step": 1829 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.08552189669750118, + "learning_rate": 2.2808334908367914e-05, + "loss": 0.243, + "step": 1830 + }, + { + "epoch": 2.4092105263157895, + "grad_norm": 0.08572236225629354, + "learning_rate": 2.271109670589844e-05, + "loss": 0.2398, + "step": 1831 + }, + { + "epoch": 2.4105263157894736, + "grad_norm": 0.08511556223223665, + "learning_rate": 2.2614039663337417e-05, + "loss": 0.225, + "step": 1832 + }, + { + "epoch": 2.411842105263158, + "grad_norm": 0.08510989348259673, + "learning_rate": 2.2517164008179882e-05, + "loss": 0.2393, + "step": 1833 + }, + { + "epoch": 2.413157894736842, + "grad_norm": 0.08621647138074637, + "learning_rate": 2.2420469967495793e-05, + "loss": 0.2492, + "step": 1834 + }, + { + "epoch": 2.4144736842105265, + "grad_norm": 0.08627144976057537, + "learning_rate": 2.232395776792938e-05, + "loss": 0.2351, + "step": 1835 + }, + { + "epoch": 2.4157894736842107, + "grad_norm": 0.08449582717174542, + "learning_rate": 2.222762763569862e-05, + "loss": 0.2331, + "step": 1836 + }, + { + "epoch": 2.417105263157895, + "grad_norm": 0.08533743090482276, + "learning_rate": 2.2131479796594767e-05, + "loss": 0.2356, + "step": 1837 + }, + { + "epoch": 2.418421052631579, + "grad_norm": 0.08874843333228837, + "learning_rate": 2.2035514475981756e-05, + "loss": 0.242, + "step": 1838 + }, + { + "epoch": 2.419736842105263, + "grad_norm": 0.08529644318540644, + "learning_rate": 2.1939731898795802e-05, + "loss": 0.2239, + "step": 1839 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.08594783838232867, + "learning_rate": 2.184413228954468e-05, + "loss": 0.2411, + "step": 1840 + }, + { + "epoch": 2.4223684210526315, + "grad_norm": 0.09034497841922952, + "learning_rate": 2.1748715872307345e-05, + "loss": 0.2437, + "step": 1841 + }, + { + "epoch": 2.4236842105263157, + "grad_norm": 0.0864543032403873, + "learning_rate": 2.165348287073339e-05, + "loss": 0.2497, + "step": 1842 + }, + { + "epoch": 2.425, + "grad_norm": 0.08423651720196439, + "learning_rate": 2.155843350804243e-05, + "loss": 0.233, + "step": 1843 + }, + { + "epoch": 2.4263157894736844, + "grad_norm": 0.08476088806860947, + "learning_rate": 2.1463568007023704e-05, + "loss": 0.2338, + "step": 1844 + }, + { + "epoch": 2.4276315789473686, + "grad_norm": 0.08560695927731492, + "learning_rate": 2.1368886590035443e-05, + "loss": 0.2474, + "step": 1845 + }, + { + "epoch": 2.4289473684210527, + "grad_norm": 0.08571656417086615, + "learning_rate": 2.1274389479004397e-05, + "loss": 0.2343, + "step": 1846 + }, + { + "epoch": 2.430263157894737, + "grad_norm": 0.08484405706280464, + "learning_rate": 2.1180076895425395e-05, + "loss": 0.2358, + "step": 1847 + }, + { + "epoch": 2.431578947368421, + "grad_norm": 0.09045256351090854, + "learning_rate": 2.1085949060360654e-05, + "loss": 0.241, + "step": 1848 + }, + { + "epoch": 2.432894736842105, + "grad_norm": 0.08339645141686791, + "learning_rate": 2.0992006194439372e-05, + "loss": 0.2417, + "step": 1849 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.08857069700987559, + "learning_rate": 2.0898248517857256e-05, + "loss": 0.2281, + "step": 1850 + }, + { + "epoch": 2.4355263157894735, + "grad_norm": 0.08727223363474927, + "learning_rate": 2.0804676250375867e-05, + "loss": 0.2408, + "step": 1851 + }, + { + "epoch": 2.4368421052631577, + "grad_norm": 0.08636083640113454, + "learning_rate": 2.0711289611322204e-05, + "loss": 0.2357, + "step": 1852 + }, + { + "epoch": 2.4381578947368423, + "grad_norm": 0.08601921483257845, + "learning_rate": 2.0618088819588167e-05, + "loss": 0.2362, + "step": 1853 + }, + { + "epoch": 2.4394736842105265, + "grad_norm": 0.08573635555006953, + "learning_rate": 2.0525074093630036e-05, + "loss": 0.2382, + "step": 1854 + }, + { + "epoch": 2.4407894736842106, + "grad_norm": 0.08431162925824574, + "learning_rate": 2.0432245651467995e-05, + "loss": 0.2336, + "step": 1855 + }, + { + "epoch": 2.442105263157895, + "grad_norm": 0.08468414851359736, + "learning_rate": 2.033960371068557e-05, + "loss": 0.2217, + "step": 1856 + }, + { + "epoch": 2.443421052631579, + "grad_norm": 0.08472832758486493, + "learning_rate": 2.02471484884291e-05, + "loss": 0.2378, + "step": 1857 + }, + { + "epoch": 2.444736842105263, + "grad_norm": 0.08936877382897962, + "learning_rate": 2.0154880201407367e-05, + "loss": 0.2449, + "step": 1858 + }, + { + "epoch": 2.4460526315789473, + "grad_norm": 0.0841329121430106, + "learning_rate": 2.0062799065890904e-05, + "loss": 0.2318, + "step": 1859 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.08665657400105448, + "learning_rate": 1.9970905297711606e-05, + "loss": 0.2404, + "step": 1860 + }, + { + "epoch": 2.4486842105263156, + "grad_norm": 0.08891275836559771, + "learning_rate": 1.987919911226217e-05, + "loss": 0.237, + "step": 1861 + }, + { + "epoch": 2.45, + "grad_norm": 0.08586405744449396, + "learning_rate": 1.9787680724495617e-05, + "loss": 0.2316, + "step": 1862 + }, + { + "epoch": 2.4513157894736843, + "grad_norm": 0.08602563906517424, + "learning_rate": 1.969635034892485e-05, + "loss": 0.2357, + "step": 1863 + }, + { + "epoch": 2.4526315789473685, + "grad_norm": 0.08605411678673527, + "learning_rate": 1.9605208199621995e-05, + "loss": 0.2368, + "step": 1864 + }, + { + "epoch": 2.4539473684210527, + "grad_norm": 0.0847811039119867, + "learning_rate": 1.9514254490218e-05, + "loss": 0.2278, + "step": 1865 + }, + { + "epoch": 2.455263157894737, + "grad_norm": 0.089108249737477, + "learning_rate": 1.9423489433902186e-05, + "loss": 0.2421, + "step": 1866 + }, + { + "epoch": 2.456578947368421, + "grad_norm": 0.08853862535589128, + "learning_rate": 1.9332913243421634e-05, + "loss": 0.2346, + "step": 1867 + }, + { + "epoch": 2.457894736842105, + "grad_norm": 0.08624747222459084, + "learning_rate": 1.924252613108073e-05, + "loss": 0.2385, + "step": 1868 + }, + { + "epoch": 2.4592105263157893, + "grad_norm": 0.08633100421050345, + "learning_rate": 1.9152328308740707e-05, + "loss": 0.2281, + "step": 1869 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.0823894382169209, + "learning_rate": 1.9062319987819067e-05, + "loss": 0.2313, + "step": 1870 + }, + { + "epoch": 2.461842105263158, + "grad_norm": 0.08514975111544096, + "learning_rate": 1.897250137928921e-05, + "loss": 0.2301, + "step": 1871 + }, + { + "epoch": 2.463157894736842, + "grad_norm": 0.08573391873752553, + "learning_rate": 1.888287269367979e-05, + "loss": 0.2253, + "step": 1872 + }, + { + "epoch": 2.4644736842105264, + "grad_norm": 0.08763954109623806, + "learning_rate": 1.8793434141074295e-05, + "loss": 0.2446, + "step": 1873 + }, + { + "epoch": 2.4657894736842105, + "grad_norm": 0.0848193021797019, + "learning_rate": 1.870418593111064e-05, + "loss": 0.2421, + "step": 1874 + }, + { + "epoch": 2.4671052631578947, + "grad_norm": 0.08299936916001775, + "learning_rate": 1.861512827298051e-05, + "loss": 0.2377, + "step": 1875 + }, + { + "epoch": 2.468421052631579, + "grad_norm": 0.08656696401202549, + "learning_rate": 1.8526261375428955e-05, + "loss": 0.2453, + "step": 1876 + }, + { + "epoch": 2.469736842105263, + "grad_norm": 0.0845516003224581, + "learning_rate": 1.8437585446753925e-05, + "loss": 0.2264, + "step": 1877 + }, + { + "epoch": 2.4710526315789476, + "grad_norm": 0.08820431143873694, + "learning_rate": 1.834910069480571e-05, + "loss": 0.2553, + "step": 1878 + }, + { + "epoch": 2.4723684210526318, + "grad_norm": 0.086680366075887, + "learning_rate": 1.826080732698656e-05, + "loss": 0.2332, + "step": 1879 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.08757999419788688, + "learning_rate": 1.8172705550250092e-05, + "loss": 0.2365, + "step": 1880 + }, + { + "epoch": 2.475, + "grad_norm": 0.0819273273150161, + "learning_rate": 1.808479557110081e-05, + "loss": 0.2222, + "step": 1881 + }, + { + "epoch": 2.4763157894736842, + "grad_norm": 0.08333326519630053, + "learning_rate": 1.799707759559376e-05, + "loss": 0.2393, + "step": 1882 + }, + { + "epoch": 2.4776315789473684, + "grad_norm": 0.08611320760984689, + "learning_rate": 1.790955182933385e-05, + "loss": 0.2268, + "step": 1883 + }, + { + "epoch": 2.4789473684210526, + "grad_norm": 0.08633530140680602, + "learning_rate": 1.7822218477475494e-05, + "loss": 0.236, + "step": 1884 + }, + { + "epoch": 2.4802631578947367, + "grad_norm": 0.08652135837244138, + "learning_rate": 1.7735077744722107e-05, + "loss": 0.2351, + "step": 1885 + }, + { + "epoch": 2.481578947368421, + "grad_norm": 0.08815435855670466, + "learning_rate": 1.7648129835325587e-05, + "loss": 0.2347, + "step": 1886 + }, + { + "epoch": 2.4828947368421055, + "grad_norm": 0.0879883933592944, + "learning_rate": 1.756137495308594e-05, + "loss": 0.2328, + "step": 1887 + }, + { + "epoch": 2.4842105263157896, + "grad_norm": 0.08643787173876677, + "learning_rate": 1.7474813301350666e-05, + "loss": 0.2387, + "step": 1888 + }, + { + "epoch": 2.485526315789474, + "grad_norm": 0.08893052591767589, + "learning_rate": 1.7388445083014325e-05, + "loss": 0.2428, + "step": 1889 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.08536802381106222, + "learning_rate": 1.7302270500518182e-05, + "loss": 0.2375, + "step": 1890 + }, + { + "epoch": 2.488157894736842, + "grad_norm": 0.08703152842078288, + "learning_rate": 1.7216289755849525e-05, + "loss": 0.2308, + "step": 1891 + }, + { + "epoch": 2.4894736842105263, + "grad_norm": 0.0836327066485613, + "learning_rate": 1.7130503050541368e-05, + "loss": 0.2363, + "step": 1892 + }, + { + "epoch": 2.4907894736842104, + "grad_norm": 0.08521480449342837, + "learning_rate": 1.704491058567187e-05, + "loss": 0.2371, + "step": 1893 + }, + { + "epoch": 2.4921052631578946, + "grad_norm": 0.08544333339590796, + "learning_rate": 1.6959512561863912e-05, + "loss": 0.2376, + "step": 1894 + }, + { + "epoch": 2.4934210526315788, + "grad_norm": 0.08419989528831981, + "learning_rate": 1.6874309179284664e-05, + "loss": 0.233, + "step": 1895 + }, + { + "epoch": 2.4947368421052634, + "grad_norm": 0.08743178517536454, + "learning_rate": 1.6789300637645e-05, + "loss": 0.24, + "step": 1896 + }, + { + "epoch": 2.4960526315789475, + "grad_norm": 0.08627446891742664, + "learning_rate": 1.670448713619913e-05, + "loss": 0.2418, + "step": 1897 + }, + { + "epoch": 2.4973684210526317, + "grad_norm": 0.086520690921376, + "learning_rate": 1.6619868873744147e-05, + "loss": 0.2333, + "step": 1898 + }, + { + "epoch": 2.498684210526316, + "grad_norm": 0.0890053931742651, + "learning_rate": 1.653544604861945e-05, + "loss": 0.245, + "step": 1899 + }, + { + "epoch": 2.5, + "grad_norm": 0.08795310211457955, + "learning_rate": 1.6451218858706374e-05, + "loss": 0.2424, + "step": 1900 + }, + { + "epoch": 2.501315789473684, + "grad_norm": 0.08768860420319245, + "learning_rate": 1.6367187501427685e-05, + "loss": 0.2409, + "step": 1901 + }, + { + "epoch": 2.5026315789473683, + "grad_norm": 0.08433281148019468, + "learning_rate": 1.6283352173747145e-05, + "loss": 0.2332, + "step": 1902 + }, + { + "epoch": 2.5039473684210525, + "grad_norm": 0.08869485067176774, + "learning_rate": 1.6199713072169053e-05, + "loss": 0.247, + "step": 1903 + }, + { + "epoch": 2.5052631578947366, + "grad_norm": 0.08535593567657823, + "learning_rate": 1.6116270392737754e-05, + "loss": 0.2331, + "step": 1904 + }, + { + "epoch": 2.5065789473684212, + "grad_norm": 0.08375689041638204, + "learning_rate": 1.6033024331037138e-05, + "loss": 0.2275, + "step": 1905 + }, + { + "epoch": 2.5078947368421054, + "grad_norm": 0.08478044933594917, + "learning_rate": 1.5949975082190337e-05, + "loss": 0.2339, + "step": 1906 + }, + { + "epoch": 2.5092105263157896, + "grad_norm": 0.08731008917029157, + "learning_rate": 1.5867122840859117e-05, + "loss": 0.2345, + "step": 1907 + }, + { + "epoch": 2.5105263157894737, + "grad_norm": 0.0867834465835224, + "learning_rate": 1.578446780124344e-05, + "loss": 0.2306, + "step": 1908 + }, + { + "epoch": 2.511842105263158, + "grad_norm": 0.08484337640412591, + "learning_rate": 1.570201015708108e-05, + "loss": 0.2421, + "step": 1909 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.08676705310204419, + "learning_rate": 1.5619750101647114e-05, + "loss": 0.2431, + "step": 1910 + }, + { + "epoch": 2.514473684210526, + "grad_norm": 0.08560817073143388, + "learning_rate": 1.553768782775351e-05, + "loss": 0.2333, + "step": 1911 + }, + { + "epoch": 2.515789473684211, + "grad_norm": 0.08577547460347472, + "learning_rate": 1.5455823527748626e-05, + "loss": 0.2372, + "step": 1912 + }, + { + "epoch": 2.5171052631578945, + "grad_norm": 0.08601962847067497, + "learning_rate": 1.5374157393516764e-05, + "loss": 0.2334, + "step": 1913 + }, + { + "epoch": 2.518421052631579, + "grad_norm": 0.08618206990680746, + "learning_rate": 1.5292689616477806e-05, + "loss": 0.2355, + "step": 1914 + }, + { + "epoch": 2.5197368421052633, + "grad_norm": 0.08561559936263621, + "learning_rate": 1.5211420387586638e-05, + "loss": 0.2397, + "step": 1915 + }, + { + "epoch": 2.5210526315789474, + "grad_norm": 0.08569149401048583, + "learning_rate": 1.5130349897332763e-05, + "loss": 0.2385, + "step": 1916 + }, + { + "epoch": 2.5223684210526316, + "grad_norm": 0.08528987545012465, + "learning_rate": 1.5049478335739886e-05, + "loss": 0.2422, + "step": 1917 + }, + { + "epoch": 2.5236842105263158, + "grad_norm": 0.08573039796052453, + "learning_rate": 1.49688058923654e-05, + "loss": 0.2346, + "step": 1918 + }, + { + "epoch": 2.525, + "grad_norm": 0.08483858064921189, + "learning_rate": 1.4888332756300027e-05, + "loss": 0.2335, + "step": 1919 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.08618837736943744, + "learning_rate": 1.4808059116167305e-05, + "loss": 0.2321, + "step": 1920 + }, + { + "epoch": 2.5276315789473687, + "grad_norm": 0.08306460927365213, + "learning_rate": 1.4727985160123114e-05, + "loss": 0.2322, + "step": 1921 + }, + { + "epoch": 2.5289473684210524, + "grad_norm": 0.08847498270917728, + "learning_rate": 1.4648111075855398e-05, + "loss": 0.238, + "step": 1922 + }, + { + "epoch": 2.530263157894737, + "grad_norm": 0.08664647691974588, + "learning_rate": 1.4568437050583517e-05, + "loss": 0.2329, + "step": 1923 + }, + { + "epoch": 2.531578947368421, + "grad_norm": 0.08925646785401711, + "learning_rate": 1.4488963271057943e-05, + "loss": 0.2447, + "step": 1924 + }, + { + "epoch": 2.5328947368421053, + "grad_norm": 0.0858422178404279, + "learning_rate": 1.44096899235598e-05, + "loss": 0.2339, + "step": 1925 + }, + { + "epoch": 2.5342105263157895, + "grad_norm": 0.085698190947571, + "learning_rate": 1.4330617193900364e-05, + "loss": 0.2329, + "step": 1926 + }, + { + "epoch": 2.5355263157894736, + "grad_norm": 0.08849510463317915, + "learning_rate": 1.4251745267420757e-05, + "loss": 0.236, + "step": 1927 + }, + { + "epoch": 2.536842105263158, + "grad_norm": 0.08642342384169846, + "learning_rate": 1.4173074328991377e-05, + "loss": 0.2292, + "step": 1928 + }, + { + "epoch": 2.538157894736842, + "grad_norm": 0.08593496440244164, + "learning_rate": 1.4094604563011472e-05, + "loss": 0.2308, + "step": 1929 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.08537576961097464, + "learning_rate": 1.4016336153408893e-05, + "loss": 0.2318, + "step": 1930 + }, + { + "epoch": 2.5407894736842103, + "grad_norm": 0.08435630547007106, + "learning_rate": 1.3938269283639394e-05, + "loss": 0.2244, + "step": 1931 + }, + { + "epoch": 2.542105263157895, + "grad_norm": 0.0894275649268176, + "learning_rate": 1.3860404136686411e-05, + "loss": 0.2386, + "step": 1932 + }, + { + "epoch": 2.543421052631579, + "grad_norm": 0.08731027043438779, + "learning_rate": 1.3782740895060497e-05, + "loss": 0.2438, + "step": 1933 + }, + { + "epoch": 2.544736842105263, + "grad_norm": 0.08677749818948599, + "learning_rate": 1.3705279740798993e-05, + "loss": 0.2383, + "step": 1934 + }, + { + "epoch": 2.5460526315789473, + "grad_norm": 0.08591491161614231, + "learning_rate": 1.3628020855465572e-05, + "loss": 0.2387, + "step": 1935 + }, + { + "epoch": 2.5473684210526315, + "grad_norm": 0.08575789115259161, + "learning_rate": 1.355096442014977e-05, + "loss": 0.2325, + "step": 1936 + }, + { + "epoch": 2.5486842105263157, + "grad_norm": 0.08651010282289677, + "learning_rate": 1.3474110615466583e-05, + "loss": 0.2279, + "step": 1937 + }, + { + "epoch": 2.55, + "grad_norm": 0.08869408109954365, + "learning_rate": 1.339745962155613e-05, + "loss": 0.2351, + "step": 1938 + }, + { + "epoch": 2.5513157894736844, + "grad_norm": 0.08792931672329181, + "learning_rate": 1.3321011618083079e-05, + "loss": 0.2439, + "step": 1939 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.08816455279822999, + "learning_rate": 1.3244766784236307e-05, + "loss": 0.2429, + "step": 1940 + }, + { + "epoch": 2.5539473684210527, + "grad_norm": 0.08652728797542152, + "learning_rate": 1.3168725298728524e-05, + "loss": 0.2322, + "step": 1941 + }, + { + "epoch": 2.555263157894737, + "grad_norm": 0.08646822510404321, + "learning_rate": 1.3092887339795734e-05, + "loss": 0.2448, + "step": 1942 + }, + { + "epoch": 2.556578947368421, + "grad_norm": 0.08827748125160723, + "learning_rate": 1.3017253085196979e-05, + "loss": 0.2432, + "step": 1943 + }, + { + "epoch": 2.557894736842105, + "grad_norm": 0.087400695177381, + "learning_rate": 1.294182271221377e-05, + "loss": 0.242, + "step": 1944 + }, + { + "epoch": 2.5592105263157894, + "grad_norm": 0.08476067656041161, + "learning_rate": 1.2866596397649721e-05, + "loss": 0.2332, + "step": 1945 + }, + { + "epoch": 2.5605263157894735, + "grad_norm": 0.08400036515218953, + "learning_rate": 1.2791574317830213e-05, + "loss": 0.2335, + "step": 1946 + }, + { + "epoch": 2.5618421052631577, + "grad_norm": 0.08714320110806623, + "learning_rate": 1.2716756648601857e-05, + "loss": 0.2384, + "step": 1947 + }, + { + "epoch": 2.5631578947368423, + "grad_norm": 0.08718032193532914, + "learning_rate": 1.2642143565332154e-05, + "loss": 0.2363, + "step": 1948 + }, + { + "epoch": 2.5644736842105265, + "grad_norm": 0.08459275476172294, + "learning_rate": 1.2567735242909074e-05, + "loss": 0.2226, + "step": 1949 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.08586849969458207, + "learning_rate": 1.2493531855740625e-05, + "loss": 0.2247, + "step": 1950 + }, + { + "epoch": 2.567105263157895, + "grad_norm": 0.08846213083149775, + "learning_rate": 1.2419533577754528e-05, + "loss": 0.2333, + "step": 1951 + }, + { + "epoch": 2.568421052631579, + "grad_norm": 0.08643152306184454, + "learning_rate": 1.2345740582397648e-05, + "loss": 0.2371, + "step": 1952 + }, + { + "epoch": 2.569736842105263, + "grad_norm": 0.08661338412832961, + "learning_rate": 1.2272153042635704e-05, + "loss": 0.2434, + "step": 1953 + }, + { + "epoch": 2.5710526315789473, + "grad_norm": 0.08897240710261847, + "learning_rate": 1.2198771130952913e-05, + "loss": 0.2376, + "step": 1954 + }, + { + "epoch": 2.5723684210526314, + "grad_norm": 0.08622184838413748, + "learning_rate": 1.2125595019351443e-05, + "loss": 0.2307, + "step": 1955 + }, + { + "epoch": 2.5736842105263156, + "grad_norm": 0.0907063460852498, + "learning_rate": 1.2052624879351104e-05, + "loss": 0.2313, + "step": 1956 + }, + { + "epoch": 2.575, + "grad_norm": 0.08625315493284626, + "learning_rate": 1.1979860881988902e-05, + "loss": 0.2383, + "step": 1957 + }, + { + "epoch": 2.5763157894736843, + "grad_norm": 0.08964244854263959, + "learning_rate": 1.1907303197818665e-05, + "loss": 0.2394, + "step": 1958 + }, + { + "epoch": 2.5776315789473685, + "grad_norm": 0.08330532747614758, + "learning_rate": 1.183495199691068e-05, + "loss": 0.2215, + "step": 1959 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.0884851429344919, + "learning_rate": 1.176280744885121e-05, + "loss": 0.2423, + "step": 1960 + }, + { + "epoch": 2.580263157894737, + "grad_norm": 0.08980407385361433, + "learning_rate": 1.1690869722742126e-05, + "loss": 0.2393, + "step": 1961 + }, + { + "epoch": 2.581578947368421, + "grad_norm": 0.08634666525024143, + "learning_rate": 1.1619138987200562e-05, + "loss": 0.2287, + "step": 1962 + }, + { + "epoch": 2.582894736842105, + "grad_norm": 0.08589844097584014, + "learning_rate": 1.154761541035847e-05, + "loss": 0.2353, + "step": 1963 + }, + { + "epoch": 2.5842105263157897, + "grad_norm": 0.08695619199252902, + "learning_rate": 1.1476299159862203e-05, + "loss": 0.2424, + "step": 1964 + }, + { + "epoch": 2.5855263157894735, + "grad_norm": 0.08435657255170038, + "learning_rate": 1.1405190402872202e-05, + "loss": 0.2308, + "step": 1965 + }, + { + "epoch": 2.586842105263158, + "grad_norm": 0.08681956736459753, + "learning_rate": 1.1334289306062495e-05, + "loss": 0.2387, + "step": 1966 + }, + { + "epoch": 2.588157894736842, + "grad_norm": 0.08650224803441879, + "learning_rate": 1.126359603562045e-05, + "loss": 0.2369, + "step": 1967 + }, + { + "epoch": 2.5894736842105264, + "grad_norm": 0.08441575091576375, + "learning_rate": 1.119311075724625e-05, + "loss": 0.2195, + "step": 1968 + }, + { + "epoch": 2.5907894736842105, + "grad_norm": 0.08371697252642783, + "learning_rate": 1.1122833636152563e-05, + "loss": 0.2174, + "step": 1969 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.0868737302053953, + "learning_rate": 1.1052764837064178e-05, + "loss": 0.2283, + "step": 1970 + }, + { + "epoch": 2.593421052631579, + "grad_norm": 0.08687526871015291, + "learning_rate": 1.0982904524217551e-05, + "loss": 0.2291, + "step": 1971 + }, + { + "epoch": 2.594736842105263, + "grad_norm": 0.08813319422928485, + "learning_rate": 1.09132528613605e-05, + "loss": 0.2449, + "step": 1972 + }, + { + "epoch": 2.5960526315789476, + "grad_norm": 0.08856476761244737, + "learning_rate": 1.0843810011751766e-05, + "loss": 0.2422, + "step": 1973 + }, + { + "epoch": 2.5973684210526313, + "grad_norm": 0.08769322323788943, + "learning_rate": 1.0774576138160597e-05, + "loss": 0.245, + "step": 1974 + }, + { + "epoch": 2.598684210526316, + "grad_norm": 0.0854386832496123, + "learning_rate": 1.070555140286652e-05, + "loss": 0.2269, + "step": 1975 + }, + { + "epoch": 2.6, + "grad_norm": 0.08380866243911338, + "learning_rate": 1.0636735967658784e-05, + "loss": 0.239, + "step": 1976 + }, + { + "epoch": 2.6013157894736842, + "grad_norm": 0.08479138359498807, + "learning_rate": 1.056812999383604e-05, + "loss": 0.2321, + "step": 1977 + }, + { + "epoch": 2.6026315789473684, + "grad_norm": 0.08709294911680168, + "learning_rate": 1.0499733642206033e-05, + "loss": 0.239, + "step": 1978 + }, + { + "epoch": 2.6039473684210526, + "grad_norm": 0.08505697002044242, + "learning_rate": 1.0431547073085135e-05, + "loss": 0.2314, + "step": 1979 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.08770454123258202, + "learning_rate": 1.0363570446297999e-05, + "loss": 0.2455, + "step": 1980 + }, + { + "epoch": 2.606578947368421, + "grad_norm": 0.08477772700319303, + "learning_rate": 1.0295803921177182e-05, + "loss": 0.2277, + "step": 1981 + }, + { + "epoch": 2.6078947368421055, + "grad_norm": 0.08575690009081015, + "learning_rate": 1.0228247656562795e-05, + "loss": 0.2352, + "step": 1982 + }, + { + "epoch": 2.609210526315789, + "grad_norm": 0.0852766947310931, + "learning_rate": 1.0160901810802115e-05, + "loss": 0.2379, + "step": 1983 + }, + { + "epoch": 2.610526315789474, + "grad_norm": 0.08360204366625848, + "learning_rate": 1.0093766541749205e-05, + "loss": 0.2266, + "step": 1984 + }, + { + "epoch": 2.611842105263158, + "grad_norm": 0.08424512848837429, + "learning_rate": 1.0026842006764526e-05, + "loss": 0.2307, + "step": 1985 + }, + { + "epoch": 2.613157894736842, + "grad_norm": 0.08406138879126103, + "learning_rate": 9.960128362714637e-06, + "loss": 0.2293, + "step": 1986 + }, + { + "epoch": 2.6144736842105263, + "grad_norm": 0.08644831073662886, + "learning_rate": 9.89362576597177e-06, + "loss": 0.2334, + "step": 1987 + }, + { + "epoch": 2.6157894736842104, + "grad_norm": 0.08623901530925424, + "learning_rate": 9.827334372413444e-06, + "loss": 0.2358, + "step": 1988 + }, + { + "epoch": 2.6171052631578946, + "grad_norm": 0.08269859922491032, + "learning_rate": 9.761254337422176e-06, + "loss": 0.2167, + "step": 1989 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.08644342262890786, + "learning_rate": 9.695385815885016e-06, + "loss": 0.2301, + "step": 1990 + }, + { + "epoch": 2.6197368421052634, + "grad_norm": 0.08629959299387181, + "learning_rate": 9.629728962193318e-06, + "loss": 0.2289, + "step": 1991 + }, + { + "epoch": 2.6210526315789475, + "grad_norm": 0.08512850244099619, + "learning_rate": 9.564283930242257e-06, + "loss": 0.2251, + "step": 1992 + }, + { + "epoch": 2.6223684210526317, + "grad_norm": 0.0855143103873754, + "learning_rate": 9.499050873430482e-06, + "loss": 0.2253, + "step": 1993 + }, + { + "epoch": 2.623684210526316, + "grad_norm": 0.08812972174025184, + "learning_rate": 9.434029944659872e-06, + "loss": 0.2227, + "step": 1994 + }, + { + "epoch": 2.625, + "grad_norm": 0.08923681105940687, + "learning_rate": 9.369221296335006e-06, + "loss": 0.2519, + "step": 1995 + }, + { + "epoch": 2.626315789473684, + "grad_norm": 0.08888942368903012, + "learning_rate": 9.30462508036294e-06, + "loss": 0.2451, + "step": 1996 + }, + { + "epoch": 2.6276315789473683, + "grad_norm": 0.08691289008570331, + "learning_rate": 9.240241448152787e-06, + "loss": 0.235, + "step": 1997 + }, + { + "epoch": 2.6289473684210525, + "grad_norm": 0.08587149284301165, + "learning_rate": 9.176070550615378e-06, + "loss": 0.2273, + "step": 1998 + }, + { + "epoch": 2.6302631578947366, + "grad_norm": 0.08822548058662702, + "learning_rate": 9.112112538162898e-06, + "loss": 0.2356, + "step": 1999 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.08348460888767743, + "learning_rate": 9.048367560708604e-06, + "loss": 0.2297, + "step": 2000 + }, + { + "epoch": 2.6328947368421054, + "grad_norm": 0.08551510433915722, + "learning_rate": 8.98483576766631e-06, + "loss": 0.2324, + "step": 2001 + }, + { + "epoch": 2.6342105263157896, + "grad_norm": 0.08388165924617051, + "learning_rate": 8.921517307950255e-06, + "loss": 0.229, + "step": 2002 + }, + { + "epoch": 2.6355263157894737, + "grad_norm": 0.08896318131104192, + "learning_rate": 8.858412329974552e-06, + "loss": 0.2385, + "step": 2003 + }, + { + "epoch": 2.636842105263158, + "grad_norm": 0.09015074541095214, + "learning_rate": 8.795520981652961e-06, + "loss": 0.2392, + "step": 2004 + }, + { + "epoch": 2.638157894736842, + "grad_norm": 0.08696268311755063, + "learning_rate": 8.732843410398506e-06, + "loss": 0.2294, + "step": 2005 + }, + { + "epoch": 2.639473684210526, + "grad_norm": 0.08769627213456496, + "learning_rate": 8.670379763123126e-06, + "loss": 0.2384, + "step": 2006 + }, + { + "epoch": 2.640789473684211, + "grad_norm": 0.08643115429762364, + "learning_rate": 8.608130186237329e-06, + "loss": 0.25, + "step": 2007 + }, + { + "epoch": 2.6421052631578945, + "grad_norm": 0.08746848122982008, + "learning_rate": 8.546094825649908e-06, + "loss": 0.2398, + "step": 2008 + }, + { + "epoch": 2.643421052631579, + "grad_norm": 0.08700832769044184, + "learning_rate": 8.484273826767474e-06, + "loss": 0.2315, + "step": 2009 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.08453786644008071, + "learning_rate": 8.422667334494249e-06, + "loss": 0.2285, + "step": 2010 + }, + { + "epoch": 2.6460526315789474, + "grad_norm": 0.08586615693833513, + "learning_rate": 8.361275493231646e-06, + "loss": 0.2364, + "step": 2011 + }, + { + "epoch": 2.6473684210526316, + "grad_norm": 0.0870712891159542, + "learning_rate": 8.300098446877923e-06, + "loss": 0.2265, + "step": 2012 + }, + { + "epoch": 2.6486842105263158, + "grad_norm": 0.08440192942964624, + "learning_rate": 8.239136338827903e-06, + "loss": 0.2291, + "step": 2013 + }, + { + "epoch": 2.65, + "grad_norm": 0.08705307636244689, + "learning_rate": 8.178389311972612e-06, + "loss": 0.2392, + "step": 2014 + }, + { + "epoch": 2.651315789473684, + "grad_norm": 0.08894269433728565, + "learning_rate": 8.1178575086989e-06, + "loss": 0.2315, + "step": 2015 + }, + { + "epoch": 2.6526315789473687, + "grad_norm": 0.08919405616773922, + "learning_rate": 8.05754107088923e-06, + "loss": 0.2429, + "step": 2016 + }, + { + "epoch": 2.6539473684210524, + "grad_norm": 0.08652861044002905, + "learning_rate": 7.997440139921152e-06, + "loss": 0.2386, + "step": 2017 + }, + { + "epoch": 2.655263157894737, + "grad_norm": 0.08615918194801953, + "learning_rate": 7.937554856667196e-06, + "loss": 0.2275, + "step": 2018 + }, + { + "epoch": 2.656578947368421, + "grad_norm": 0.0867406788792665, + "learning_rate": 7.877885361494353e-06, + "loss": 0.2294, + "step": 2019 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.08726072526321839, + "learning_rate": 7.818431794263836e-06, + "loss": 0.236, + "step": 2020 + }, + { + "epoch": 2.6592105263157895, + "grad_norm": 0.08547170774404624, + "learning_rate": 7.759194294330751e-06, + "loss": 0.2416, + "step": 2021 + }, + { + "epoch": 2.6605263157894736, + "grad_norm": 0.08743733901669834, + "learning_rate": 7.700173000543742e-06, + "loss": 0.2409, + "step": 2022 + }, + { + "epoch": 2.661842105263158, + "grad_norm": 0.08978412000343398, + "learning_rate": 7.641368051244679e-06, + "loss": 0.2303, + "step": 2023 + }, + { + "epoch": 2.663157894736842, + "grad_norm": 0.08595515849525466, + "learning_rate": 7.582779584268373e-06, + "loss": 0.2392, + "step": 2024 + }, + { + "epoch": 2.6644736842105265, + "grad_norm": 0.08931169378561088, + "learning_rate": 7.524407736942174e-06, + "loss": 0.236, + "step": 2025 + }, + { + "epoch": 2.6657894736842103, + "grad_norm": 0.08550626450400388, + "learning_rate": 7.466252646085703e-06, + "loss": 0.2351, + "step": 2026 + }, + { + "epoch": 2.667105263157895, + "grad_norm": 0.08787743584655944, + "learning_rate": 7.4083144480105335e-06, + "loss": 0.2464, + "step": 2027 + }, + { + "epoch": 2.668421052631579, + "grad_norm": 0.0875076538218403, + "learning_rate": 7.350593278519824e-06, + "loss": 0.238, + "step": 2028 + }, + { + "epoch": 2.669736842105263, + "grad_norm": 0.09154668653402206, + "learning_rate": 7.2930892729080716e-06, + "loss": 0.2381, + "step": 2029 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.08720851842242237, + "learning_rate": 7.235802565960714e-06, + "loss": 0.2394, + "step": 2030 + }, + { + "epoch": 2.6723684210526315, + "grad_norm": 0.08749018793861221, + "learning_rate": 7.178733291953865e-06, + "loss": 0.2331, + "step": 2031 + }, + { + "epoch": 2.6736842105263157, + "grad_norm": 0.08792096320231124, + "learning_rate": 7.121881584654056e-06, + "loss": 0.2415, + "step": 2032 + }, + { + "epoch": 2.675, + "grad_norm": 0.08722269899883195, + "learning_rate": 7.0652475773177464e-06, + "loss": 0.236, + "step": 2033 + }, + { + "epoch": 2.6763157894736844, + "grad_norm": 0.08769168784492533, + "learning_rate": 7.00883140269123e-06, + "loss": 0.2409, + "step": 2034 + }, + { + "epoch": 2.6776315789473686, + "grad_norm": 0.09065018938662237, + "learning_rate": 6.95263319301015e-06, + "loss": 0.244, + "step": 2035 + }, + { + "epoch": 2.6789473684210527, + "grad_norm": 0.08628478345157022, + "learning_rate": 6.896653079999249e-06, + "loss": 0.2418, + "step": 2036 + }, + { + "epoch": 2.680263157894737, + "grad_norm": 0.0847424030869481, + "learning_rate": 6.840891194872112e-06, + "loss": 0.2367, + "step": 2037 + }, + { + "epoch": 2.681578947368421, + "grad_norm": 0.09082411672610778, + "learning_rate": 6.785347668330777e-06, + "loss": 0.2496, + "step": 2038 + }, + { + "epoch": 2.682894736842105, + "grad_norm": 0.08829134391849804, + "learning_rate": 6.730022630565458e-06, + "loss": 0.242, + "step": 2039 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.08183719511764015, + "learning_rate": 6.674916211254289e-06, + "loss": 0.2239, + "step": 2040 + }, + { + "epoch": 2.6855263157894735, + "grad_norm": 0.08448377337525598, + "learning_rate": 6.620028539562939e-06, + "loss": 0.2306, + "step": 2041 + }, + { + "epoch": 2.6868421052631577, + "grad_norm": 0.0850861974045535, + "learning_rate": 6.565359744144373e-06, + "loss": 0.2434, + "step": 2042 + }, + { + "epoch": 2.6881578947368423, + "grad_norm": 0.08652193833655933, + "learning_rate": 6.510909953138511e-06, + "loss": 0.2399, + "step": 2043 + }, + { + "epoch": 2.6894736842105265, + "grad_norm": 0.08571622313764427, + "learning_rate": 6.45667929417193e-06, + "loss": 0.2168, + "step": 2044 + }, + { + "epoch": 2.6907894736842106, + "grad_norm": 0.0891461015547161, + "learning_rate": 6.402667894357595e-06, + "loss": 0.2358, + "step": 2045 + }, + { + "epoch": 2.692105263157895, + "grad_norm": 0.08690415491550392, + "learning_rate": 6.3488758802945354e-06, + "loss": 0.2306, + "step": 2046 + }, + { + "epoch": 2.693421052631579, + "grad_norm": 0.08712160901508374, + "learning_rate": 6.2953033780675406e-06, + "loss": 0.2435, + "step": 2047 + }, + { + "epoch": 2.694736842105263, + "grad_norm": 0.08600309013612613, + "learning_rate": 6.2419505132469305e-06, + "loss": 0.2348, + "step": 2048 + }, + { + "epoch": 2.6960526315789473, + "grad_norm": 0.08761511265084414, + "learning_rate": 6.188817410888148e-06, + "loss": 0.2362, + "step": 2049 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.08786468959375783, + "learning_rate": 6.1359041955315725e-06, + "loss": 0.228, + "step": 2050 + }, + { + "epoch": 2.6986842105263156, + "grad_norm": 0.08424404719132456, + "learning_rate": 6.083210991202148e-06, + "loss": 0.2309, + "step": 2051 + }, + { + "epoch": 2.7, + "grad_norm": 0.08855984411695608, + "learning_rate": 6.030737921409169e-06, + "loss": 0.2481, + "step": 2052 + }, + { + "epoch": 2.7013157894736843, + "grad_norm": 0.08622212781661236, + "learning_rate": 5.978485109145904e-06, + "loss": 0.2316, + "step": 2053 + }, + { + "epoch": 2.7026315789473685, + "grad_norm": 0.0877254168652046, + "learning_rate": 5.926452676889383e-06, + "loss": 0.2442, + "step": 2054 + }, + { + "epoch": 2.7039473684210527, + "grad_norm": 0.08959699248737611, + "learning_rate": 5.8746407466000464e-06, + "loss": 0.2334, + "step": 2055 + }, + { + "epoch": 2.705263157894737, + "grad_norm": 0.08448005747326008, + "learning_rate": 5.823049439721561e-06, + "loss": 0.2246, + "step": 2056 + }, + { + "epoch": 2.706578947368421, + "grad_norm": 0.087622602007786, + "learning_rate": 5.771678877180408e-06, + "loss": 0.2301, + "step": 2057 + }, + { + "epoch": 2.707894736842105, + "grad_norm": 0.08772663242966719, + "learning_rate": 5.720529179385659e-06, + "loss": 0.2335, + "step": 2058 + }, + { + "epoch": 2.7092105263157897, + "grad_norm": 0.08569105342372876, + "learning_rate": 5.669600466228742e-06, + "loss": 0.2334, + "step": 2059 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.08468267671093112, + "learning_rate": 5.618892857083069e-06, + "loss": 0.2261, + "step": 2060 + }, + { + "epoch": 2.711842105263158, + "grad_norm": 0.08830354382676614, + "learning_rate": 5.568406470803799e-06, + "loss": 0.2395, + "step": 2061 + }, + { + "epoch": 2.713157894736842, + "grad_norm": 0.08578516507839022, + "learning_rate": 5.518141425727586e-06, + "loss": 0.228, + "step": 2062 + }, + { + "epoch": 2.7144736842105264, + "grad_norm": 0.08452732187319882, + "learning_rate": 5.468097839672237e-06, + "loss": 0.2139, + "step": 2063 + }, + { + "epoch": 2.7157894736842105, + "grad_norm": 0.08925179167431409, + "learning_rate": 5.418275829936537e-06, + "loss": 0.2473, + "step": 2064 + }, + { + "epoch": 2.7171052631578947, + "grad_norm": 0.08802009007215943, + "learning_rate": 5.3686755132998475e-06, + "loss": 0.2354, + "step": 2065 + }, + { + "epoch": 2.718421052631579, + "grad_norm": 0.08638926510806848, + "learning_rate": 5.319297006021917e-06, + "loss": 0.2276, + "step": 2066 + }, + { + "epoch": 2.719736842105263, + "grad_norm": 0.08649040265863565, + "learning_rate": 5.270140423842607e-06, + "loss": 0.2319, + "step": 2067 + }, + { + "epoch": 2.7210526315789476, + "grad_norm": 0.09027369123729026, + "learning_rate": 5.221205881981595e-06, + "loss": 0.2362, + "step": 2068 + }, + { + "epoch": 2.7223684210526313, + "grad_norm": 0.09174442002171228, + "learning_rate": 5.1724934951380755e-06, + "loss": 0.241, + "step": 2069 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.08685169046569569, + "learning_rate": 5.124003377490582e-06, + "loss": 0.2389, + "step": 2070 + }, + { + "epoch": 2.725, + "grad_norm": 0.08724300958525942, + "learning_rate": 5.075735642696611e-06, + "loss": 0.2284, + "step": 2071 + }, + { + "epoch": 2.7263157894736842, + "grad_norm": 0.08498489763962802, + "learning_rate": 5.02769040389246e-06, + "loss": 0.2242, + "step": 2072 + }, + { + "epoch": 2.7276315789473684, + "grad_norm": 0.08637681029702944, + "learning_rate": 4.979867773692881e-06, + "loss": 0.2304, + "step": 2073 + }, + { + "epoch": 2.7289473684210526, + "grad_norm": 0.08534225285490787, + "learning_rate": 4.932267864190832e-06, + "loss": 0.2241, + "step": 2074 + }, + { + "epoch": 2.7302631578947367, + "grad_norm": 0.08306302707156428, + "learning_rate": 4.884890786957264e-06, + "loss": 0.2215, + "step": 2075 + }, + { + "epoch": 2.731578947368421, + "grad_norm": 0.08910155948565507, + "learning_rate": 4.8377366530408254e-06, + "loss": 0.2356, + "step": 2076 + }, + { + "epoch": 2.7328947368421055, + "grad_norm": 0.0864142099489445, + "learning_rate": 4.790805572967549e-06, + "loss": 0.2376, + "step": 2077 + }, + { + "epoch": 2.734210526315789, + "grad_norm": 0.08777769993987874, + "learning_rate": 4.744097656740709e-06, + "loss": 0.2296, + "step": 2078 + }, + { + "epoch": 2.735526315789474, + "grad_norm": 0.08927992176441508, + "learning_rate": 4.697613013840441e-06, + "loss": 0.245, + "step": 2079 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.08952627138575911, + "learning_rate": 4.65135175322361e-06, + "loss": 0.2361, + "step": 2080 + }, + { + "epoch": 2.738157894736842, + "grad_norm": 0.0887680485842762, + "learning_rate": 4.605313983323423e-06, + "loss": 0.2384, + "step": 2081 + }, + { + "epoch": 2.7394736842105263, + "grad_norm": 0.08612832481286653, + "learning_rate": 4.559499812049251e-06, + "loss": 0.2299, + "step": 2082 + }, + { + "epoch": 2.7407894736842104, + "grad_norm": 0.08792908730829178, + "learning_rate": 4.513909346786427e-06, + "loss": 0.24, + "step": 2083 + }, + { + "epoch": 2.7421052631578946, + "grad_norm": 0.08726287355056869, + "learning_rate": 4.468542694395861e-06, + "loss": 0.2355, + "step": 2084 + }, + { + "epoch": 2.7434210526315788, + "grad_norm": 0.08377835577746928, + "learning_rate": 4.423399961213892e-06, + "loss": 0.2242, + "step": 2085 + }, + { + "epoch": 2.7447368421052634, + "grad_norm": 0.08736709748947386, + "learning_rate": 4.378481253051991e-06, + "loss": 0.2313, + "step": 2086 + }, + { + "epoch": 2.7460526315789475, + "grad_norm": 0.08840579315321198, + "learning_rate": 4.333786675196539e-06, + "loss": 0.2494, + "step": 2087 + }, + { + "epoch": 2.7473684210526317, + "grad_norm": 0.08786933648344666, + "learning_rate": 4.2893163324085885e-06, + "loss": 0.2391, + "step": 2088 + }, + { + "epoch": 2.748684210526316, + "grad_norm": 0.0864833290624945, + "learning_rate": 4.245070328923584e-06, + "loss": 0.2368, + "step": 2089 + }, + { + "epoch": 2.75, + "grad_norm": 0.08633401270649953, + "learning_rate": 4.20104876845111e-06, + "loss": 0.2371, + "step": 2090 + }, + { + "epoch": 2.751315789473684, + "grad_norm": 0.0872957150222657, + "learning_rate": 4.1572517541747294e-06, + "loss": 0.2411, + "step": 2091 + }, + { + "epoch": 2.7526315789473683, + "grad_norm": 0.08503214350266516, + "learning_rate": 4.1136793887516345e-06, + "loss": 0.2208, + "step": 2092 + }, + { + "epoch": 2.7539473684210525, + "grad_norm": 0.08709188346280049, + "learning_rate": 4.070331774312486e-06, + "loss": 0.2518, + "step": 2093 + }, + { + "epoch": 2.7552631578947366, + "grad_norm": 0.08684645991108973, + "learning_rate": 4.027209012461108e-06, + "loss": 0.2442, + "step": 2094 + }, + { + "epoch": 2.7565789473684212, + "grad_norm": 0.0857591344523911, + "learning_rate": 3.9843112042743045e-06, + "loss": 0.2321, + "step": 2095 + }, + { + "epoch": 2.7578947368421054, + "grad_norm": 0.08806771319356137, + "learning_rate": 3.941638450301644e-06, + "loss": 0.2497, + "step": 2096 + }, + { + "epoch": 2.7592105263157896, + "grad_norm": 0.08562267017890308, + "learning_rate": 3.899190850565115e-06, + "loss": 0.2305, + "step": 2097 + }, + { + "epoch": 2.7605263157894737, + "grad_norm": 0.0875801485743485, + "learning_rate": 3.856968504558989e-06, + "loss": 0.2461, + "step": 2098 + }, + { + "epoch": 2.761842105263158, + "grad_norm": 0.086786356254571, + "learning_rate": 3.814971511249576e-06, + "loss": 0.2399, + "step": 2099 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.08869720806043141, + "learning_rate": 3.7731999690749585e-06, + "loss": 0.2418, + "step": 2100 + }, + { + "epoch": 2.764473684210526, + "grad_norm": 0.08740038989320052, + "learning_rate": 3.731653975944782e-06, + "loss": 0.2313, + "step": 2101 + }, + { + "epoch": 2.765789473684211, + "grad_norm": 0.08421195141482454, + "learning_rate": 3.690333629239995e-06, + "loss": 0.2452, + "step": 2102 + }, + { + "epoch": 2.7671052631578945, + "grad_norm": 0.08786793240306752, + "learning_rate": 3.6492390258126673e-06, + "loss": 0.2447, + "step": 2103 + }, + { + "epoch": 2.768421052631579, + "grad_norm": 0.08677843897373576, + "learning_rate": 3.6083702619857605e-06, + "loss": 0.2302, + "step": 2104 + }, + { + "epoch": 2.7697368421052633, + "grad_norm": 0.08762759404913377, + "learning_rate": 3.567727433552859e-06, + "loss": 0.2398, + "step": 2105 + }, + { + "epoch": 2.7710526315789474, + "grad_norm": 0.08696670352877477, + "learning_rate": 3.5273106357779585e-06, + "loss": 0.2303, + "step": 2106 + }, + { + "epoch": 2.7723684210526316, + "grad_norm": 0.08376635095287631, + "learning_rate": 3.4871199633953024e-06, + "loss": 0.2312, + "step": 2107 + }, + { + "epoch": 2.7736842105263158, + "grad_norm": 0.08618277832613587, + "learning_rate": 3.447155510609057e-06, + "loss": 0.2297, + "step": 2108 + }, + { + "epoch": 2.775, + "grad_norm": 0.0846951369483359, + "learning_rate": 3.40741737109318e-06, + "loss": 0.2277, + "step": 2109 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.08571322444230144, + "learning_rate": 3.367905637991142e-06, + "loss": 0.235, + "step": 2110 + }, + { + "epoch": 2.7776315789473687, + "grad_norm": 0.08587956669273608, + "learning_rate": 3.328620403915761e-06, + "loss": 0.2309, + "step": 2111 + }, + { + "epoch": 2.7789473684210524, + "grad_norm": 0.08783109845186413, + "learning_rate": 3.2895617609489336e-06, + "loss": 0.2297, + "step": 2112 + }, + { + "epoch": 2.780263157894737, + "grad_norm": 0.08554797023694241, + "learning_rate": 3.2507298006414497e-06, + "loss": 0.235, + "step": 2113 + }, + { + "epoch": 2.781578947368421, + "grad_norm": 0.08680930677926671, + "learning_rate": 3.212124614012768e-06, + "loss": 0.2211, + "step": 2114 + }, + { + "epoch": 2.7828947368421053, + "grad_norm": 0.08766866674749912, + "learning_rate": 3.1737462915508277e-06, + "loss": 0.2337, + "step": 2115 + }, + { + "epoch": 2.7842105263157895, + "grad_norm": 0.08826029132090077, + "learning_rate": 3.135594923211771e-06, + "loss": 0.2422, + "step": 2116 + }, + { + "epoch": 2.7855263157894736, + "grad_norm": 0.08636371899570172, + "learning_rate": 3.0976705984198106e-06, + "loss": 0.2338, + "step": 2117 + }, + { + "epoch": 2.786842105263158, + "grad_norm": 0.09050217986657871, + "learning_rate": 3.059973406066963e-06, + "loss": 0.228, + "step": 2118 + }, + { + "epoch": 2.788157894736842, + "grad_norm": 0.08503847802029245, + "learning_rate": 3.02250343451288e-06, + "loss": 0.2271, + "step": 2119 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.08703828648807459, + "learning_rate": 2.9852607715846193e-06, + "loss": 0.2366, + "step": 2120 + }, + { + "epoch": 2.7907894736842103, + "grad_norm": 0.08829271415423992, + "learning_rate": 2.948245504576419e-06, + "loss": 0.2426, + "step": 2121 + }, + { + "epoch": 2.792105263157895, + "grad_norm": 0.08927594751571939, + "learning_rate": 2.9114577202495553e-06, + "loss": 0.2404, + "step": 2122 + }, + { + "epoch": 2.793421052631579, + "grad_norm": 0.08694095520332024, + "learning_rate": 2.874897504832075e-06, + "loss": 0.2378, + "step": 2123 + }, + { + "epoch": 2.794736842105263, + "grad_norm": 0.08685404680967446, + "learning_rate": 2.838564944018618e-06, + "loss": 0.2321, + "step": 2124 + }, + { + "epoch": 2.7960526315789473, + "grad_norm": 0.08768289772849402, + "learning_rate": 2.802460122970241e-06, + "loss": 0.241, + "step": 2125 + }, + { + "epoch": 2.7973684210526315, + "grad_norm": 0.08607902569363003, + "learning_rate": 2.7665831263141593e-06, + "loss": 0.233, + "step": 2126 + }, + { + "epoch": 2.7986842105263157, + "grad_norm": 0.08510692543767813, + "learning_rate": 2.730934038143607e-06, + "loss": 0.2356, + "step": 2127 + }, + { + "epoch": 2.8, + "grad_norm": 0.08529239376866304, + "learning_rate": 2.6955129420176196e-06, + "loss": 0.2368, + "step": 2128 + }, + { + "epoch": 2.8013157894736844, + "grad_norm": 0.08535080650525781, + "learning_rate": 2.6603199209608187e-06, + "loss": 0.225, + "step": 2129 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.08544159810069368, + "learning_rate": 2.6253550574632303e-06, + "loss": 0.2278, + "step": 2130 + }, + { + "epoch": 2.8039473684210527, + "grad_norm": 0.0864151284971542, + "learning_rate": 2.5906184334801297e-06, + "loss": 0.24, + "step": 2131 + }, + { + "epoch": 2.805263157894737, + "grad_norm": 0.08574975605717337, + "learning_rate": 2.556110130431788e-06, + "loss": 0.2266, + "step": 2132 + }, + { + "epoch": 2.806578947368421, + "grad_norm": 0.0857571071786596, + "learning_rate": 2.5218302292032816e-06, + "loss": 0.234, + "step": 2133 + }, + { + "epoch": 2.807894736842105, + "grad_norm": 0.08886112337240165, + "learning_rate": 2.487778810144381e-06, + "loss": 0.2369, + "step": 2134 + }, + { + "epoch": 2.8092105263157894, + "grad_norm": 0.08877261222424743, + "learning_rate": 2.4539559530692758e-06, + "loss": 0.2304, + "step": 2135 + }, + { + "epoch": 2.8105263157894735, + "grad_norm": 0.08399044671282317, + "learning_rate": 2.420361737256438e-06, + "loss": 0.2282, + "step": 2136 + }, + { + "epoch": 2.8118421052631577, + "grad_norm": 0.08240687984849308, + "learning_rate": 2.3869962414484137e-06, + "loss": 0.2231, + "step": 2137 + }, + { + "epoch": 2.8131578947368423, + "grad_norm": 0.08767361332290152, + "learning_rate": 2.353859543851644e-06, + "loss": 0.2296, + "step": 2138 + }, + { + "epoch": 2.8144736842105265, + "grad_norm": 0.08790627521525603, + "learning_rate": 2.3209517221362777e-06, + "loss": 0.2293, + "step": 2139 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.08622647934206558, + "learning_rate": 2.288272853436013e-06, + "loss": 0.2325, + "step": 2140 + }, + { + "epoch": 2.817105263157895, + "grad_norm": 0.08713384140492961, + "learning_rate": 2.2558230143478797e-06, + "loss": 0.2276, + "step": 2141 + }, + { + "epoch": 2.818421052631579, + "grad_norm": 0.08415303494815114, + "learning_rate": 2.22360228093208e-06, + "loss": 0.221, + "step": 2142 + }, + { + "epoch": 2.819736842105263, + "grad_norm": 0.08558597543600141, + "learning_rate": 2.1916107287118015e-06, + "loss": 0.2293, + "step": 2143 + }, + { + "epoch": 2.8210526315789473, + "grad_norm": 0.08465218463761441, + "learning_rate": 2.1598484326730837e-06, + "loss": 0.2314, + "step": 2144 + }, + { + "epoch": 2.8223684210526314, + "grad_norm": 0.08774203550326841, + "learning_rate": 2.128315467264552e-06, + "loss": 0.2359, + "step": 2145 + }, + { + "epoch": 2.8236842105263156, + "grad_norm": 0.08602355712846993, + "learning_rate": 2.097011906397339e-06, + "loss": 0.2414, + "step": 2146 + }, + { + "epoch": 2.825, + "grad_norm": 0.09054024554229902, + "learning_rate": 2.0659378234448525e-06, + "loss": 0.2335, + "step": 2147 + }, + { + "epoch": 2.8263157894736843, + "grad_norm": 0.08533131567031418, + "learning_rate": 2.035093291242607e-06, + "loss": 0.2343, + "step": 2148 + }, + { + "epoch": 2.8276315789473685, + "grad_norm": 0.0884644294438792, + "learning_rate": 2.004478382088093e-06, + "loss": 0.2345, + "step": 2149 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.08947476508746073, + "learning_rate": 1.974093167740565e-06, + "loss": 0.2335, + "step": 2150 + }, + { + "epoch": 2.830263157894737, + "grad_norm": 0.086504450962513, + "learning_rate": 1.943937719420863e-06, + "loss": 0.2414, + "step": 2151 + }, + { + "epoch": 2.831578947368421, + "grad_norm": 0.08547620700378816, + "learning_rate": 1.914012107811336e-06, + "loss": 0.2304, + "step": 2152 + }, + { + "epoch": 2.832894736842105, + "grad_norm": 0.08406046395791139, + "learning_rate": 1.8843164030555527e-06, + "loss": 0.2224, + "step": 2153 + }, + { + "epoch": 2.8342105263157897, + "grad_norm": 0.0858958830300312, + "learning_rate": 1.8548506747582129e-06, + "loss": 0.2299, + "step": 2154 + }, + { + "epoch": 2.8355263157894735, + "grad_norm": 0.08713981466515258, + "learning_rate": 1.8256149919849807e-06, + "loss": 0.24, + "step": 2155 + }, + { + "epoch": 2.836842105263158, + "grad_norm": 0.08722563464111789, + "learning_rate": 1.7966094232622855e-06, + "loss": 0.2369, + "step": 2156 + }, + { + "epoch": 2.838157894736842, + "grad_norm": 0.08782546599587424, + "learning_rate": 1.7678340365772206e-06, + "loss": 0.2339, + "step": 2157 + }, + { + "epoch": 2.8394736842105264, + "grad_norm": 0.08820898856927462, + "learning_rate": 1.7392888993773005e-06, + "loss": 0.2294, + "step": 2158 + }, + { + "epoch": 2.8407894736842105, + "grad_norm": 0.08559547137262101, + "learning_rate": 1.7109740785703933e-06, + "loss": 0.233, + "step": 2159 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.08651933564372974, + "learning_rate": 1.6828896405244988e-06, + "loss": 0.2326, + "step": 2160 + }, + { + "epoch": 2.843421052631579, + "grad_norm": 0.08423235088748676, + "learning_rate": 1.6550356510676268e-06, + "loss": 0.2314, + "step": 2161 + }, + { + "epoch": 2.844736842105263, + "grad_norm": 0.08646992121393281, + "learning_rate": 1.6274121754876082e-06, + "loss": 0.2358, + "step": 2162 + }, + { + "epoch": 2.8460526315789476, + "grad_norm": 0.08897157982270418, + "learning_rate": 1.6000192785320057e-06, + "loss": 0.2421, + "step": 2163 + }, + { + "epoch": 2.8473684210526313, + "grad_norm": 0.08753330536214494, + "learning_rate": 1.572857024407881e-06, + "loss": 0.2367, + "step": 2164 + }, + { + "epoch": 2.848684210526316, + "grad_norm": 0.08917423233859062, + "learning_rate": 1.5459254767817066e-06, + "loss": 0.2337, + "step": 2165 + }, + { + "epoch": 2.85, + "grad_norm": 0.08718844159110516, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.2316, + "step": 2166 + }, + { + "epoch": 2.8513157894736842, + "grad_norm": 0.08666099472117271, + "learning_rate": 1.4927547529851371e-06, + "loss": 0.2145, + "step": 2167 + }, + { + "epoch": 2.8526315789473684, + "grad_norm": 0.08442460380559827, + "learning_rate": 1.466515701443294e-06, + "loss": 0.2332, + "step": 2168 + }, + { + "epoch": 2.8539473684210526, + "grad_norm": 0.08449859016985016, + "learning_rate": 1.4405076056561828e-06, + "loss": 0.2266, + "step": 2169 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.08545748600057546, + "learning_rate": 1.4147305265850175e-06, + "loss": 0.2298, + "step": 2170 + }, + { + "epoch": 2.856578947368421, + "grad_norm": 0.08929470679144376, + "learning_rate": 1.3891845246495228e-06, + "loss": 0.2343, + "step": 2171 + }, + { + "epoch": 2.8578947368421055, + "grad_norm": 0.08668972660138131, + "learning_rate": 1.3638696597277679e-06, + "loss": 0.2423, + "step": 2172 + }, + { + "epoch": 2.859210526315789, + "grad_norm": 0.08262056937095345, + "learning_rate": 1.3387859911560664e-06, + "loss": 0.2285, + "step": 2173 + }, + { + "epoch": 2.860526315789474, + "grad_norm": 0.08830815458424673, + "learning_rate": 1.3139335777288208e-06, + "loss": 0.2334, + "step": 2174 + }, + { + "epoch": 2.861842105263158, + "grad_norm": 0.08675825238016463, + "learning_rate": 1.28931247769839e-06, + "loss": 0.2382, + "step": 2175 + }, + { + "epoch": 2.863157894736842, + "grad_norm": 0.08893171231303189, + "learning_rate": 1.2649227487749548e-06, + "loss": 0.2336, + "step": 2176 + }, + { + "epoch": 2.8644736842105263, + "grad_norm": 0.08924178196183445, + "learning_rate": 1.2407644481263858e-06, + "loss": 0.2408, + "step": 2177 + }, + { + "epoch": 2.8657894736842104, + "grad_norm": 0.08715372943476031, + "learning_rate": 1.216837632378065e-06, + "loss": 0.2315, + "step": 2178 + }, + { + "epoch": 2.8671052631578946, + "grad_norm": 0.08660959792084139, + "learning_rate": 1.1931423576128197e-06, + "loss": 0.2416, + "step": 2179 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.08853921306085291, + "learning_rate": 1.1696786793707781e-06, + "loss": 0.2335, + "step": 2180 + }, + { + "epoch": 2.8697368421052634, + "grad_norm": 0.08640447512392578, + "learning_rate": 1.146446652649169e-06, + "loss": 0.2358, + "step": 2181 + }, + { + "epoch": 2.8710526315789475, + "grad_norm": 0.08705591508512092, + "learning_rate": 1.1234463319022893e-06, + "loss": 0.2315, + "step": 2182 + }, + { + "epoch": 2.8723684210526317, + "grad_norm": 0.08602413068650967, + "learning_rate": 1.100677771041314e-06, + "loss": 0.2235, + "step": 2183 + }, + { + "epoch": 2.873684210526316, + "grad_norm": 0.08684507106024413, + "learning_rate": 1.0781410234342094e-06, + "loss": 0.24, + "step": 2184 + }, + { + "epoch": 2.875, + "grad_norm": 0.09031170729723671, + "learning_rate": 1.055836141905553e-06, + "loss": 0.2338, + "step": 2185 + }, + { + "epoch": 2.876315789473684, + "grad_norm": 0.08646029200839189, + "learning_rate": 1.0337631787364687e-06, + "loss": 0.2222, + "step": 2186 + }, + { + "epoch": 2.8776315789473683, + "grad_norm": 0.08740568430137426, + "learning_rate": 1.0119221856644712e-06, + "loss": 0.2352, + "step": 2187 + }, + { + "epoch": 2.8789473684210525, + "grad_norm": 0.08586656965987803, + "learning_rate": 9.90313213883376e-07, + "loss": 0.2263, + "step": 2188 + }, + { + "epoch": 2.8802631578947366, + "grad_norm": 0.08991958053706495, + "learning_rate": 9.689363140431118e-07, + "loss": 0.2266, + "step": 2189 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.08767297790926798, + "learning_rate": 9.477915362496758e-07, + "loss": 0.2441, + "step": 2190 + }, + { + "epoch": 2.8828947368421054, + "grad_norm": 0.0898797640991881, + "learning_rate": 9.268789300649894e-07, + "loss": 0.2401, + "step": 2191 + }, + { + "epoch": 2.8842105263157896, + "grad_norm": 0.08612849691104628, + "learning_rate": 9.061985445067756e-07, + "loss": 0.2281, + "step": 2192 + }, + { + "epoch": 2.8855263157894737, + "grad_norm": 0.08949331264552439, + "learning_rate": 8.857504280484375e-07, + "loss": 0.2547, + "step": 2193 + }, + { + "epoch": 2.886842105263158, + "grad_norm": 0.0851227718307587, + "learning_rate": 8.65534628618958e-07, + "loss": 0.2297, + "step": 2194 + }, + { + "epoch": 2.888157894736842, + "grad_norm": 0.08827554977188867, + "learning_rate": 8.455511936028004e-07, + "loss": 0.2411, + "step": 2195 + }, + { + "epoch": 2.889473684210526, + "grad_norm": 0.08846981462555736, + "learning_rate": 8.258001698397744e-07, + "loss": 0.2389, + "step": 2196 + }, + { + "epoch": 2.890789473684211, + "grad_norm": 0.08664150903305488, + "learning_rate": 8.062816036249143e-07, + "loss": 0.2345, + "step": 2197 + }, + { + "epoch": 2.8921052631578945, + "grad_norm": 0.0854396346836948, + "learning_rate": 7.86995540708424e-07, + "loss": 0.2318, + "step": 2198 + }, + { + "epoch": 2.893421052631579, + "grad_norm": 0.08695987453269646, + "learning_rate": 7.679420262954984e-07, + "loss": 0.2424, + "step": 2199 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.08752689227211352, + "learning_rate": 7.491211050462798e-07, + "loss": 0.2304, + "step": 2200 + }, + { + "epoch": 2.8960526315789474, + "grad_norm": 0.08666285275563744, + "learning_rate": 7.305328210757356e-07, + "loss": 0.2299, + "step": 2201 + }, + { + "epoch": 2.8973684210526316, + "grad_norm": 0.08663314353560102, + "learning_rate": 7.121772179535135e-07, + "loss": 0.2424, + "step": 2202 + }, + { + "epoch": 2.8986842105263158, + "grad_norm": 0.08458000982804612, + "learning_rate": 6.94054338703909e-07, + "loss": 0.2316, + "step": 2203 + }, + { + "epoch": 2.9, + "grad_norm": 0.08821934440133324, + "learning_rate": 6.761642258056978e-07, + "loss": 0.2276, + "step": 2204 + }, + { + "epoch": 2.901315789473684, + "grad_norm": 0.08815511451469815, + "learning_rate": 6.585069211921035e-07, + "loss": 0.2379, + "step": 2205 + }, + { + "epoch": 2.9026315789473687, + "grad_norm": 0.08529196165876289, + "learning_rate": 6.410824662506198e-07, + "loss": 0.2296, + "step": 2206 + }, + { + "epoch": 2.9039473684210524, + "grad_norm": 0.08451900774747255, + "learning_rate": 6.238909018229766e-07, + "loss": 0.2269, + "step": 2207 + }, + { + "epoch": 2.905263157894737, + "grad_norm": 0.08696348620201358, + "learning_rate": 6.069322682050516e-07, + "loss": 0.2324, + "step": 2208 + }, + { + "epoch": 2.906578947368421, + "grad_norm": 0.08828910381160303, + "learning_rate": 5.902066051467037e-07, + "loss": 0.2357, + "step": 2209 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.08483736507293088, + "learning_rate": 5.737139518517509e-07, + "loss": 0.2296, + "step": 2210 + }, + { + "epoch": 2.9092105263157895, + "grad_norm": 0.08743099417381292, + "learning_rate": 5.57454346977837e-07, + "loss": 0.24, + "step": 2211 + }, + { + "epoch": 2.9105263157894736, + "grad_norm": 0.08916802794242618, + "learning_rate": 5.414278286363761e-07, + "loss": 0.2304, + "step": 2212 + }, + { + "epoch": 2.911842105263158, + "grad_norm": 0.08828220707425087, + "learning_rate": 5.256344343924302e-07, + "loss": 0.2419, + "step": 2213 + }, + { + "epoch": 2.913157894736842, + "grad_norm": 0.08659025242576524, + "learning_rate": 5.10074201264632e-07, + "loss": 0.2166, + "step": 2214 + }, + { + "epoch": 2.9144736842105265, + "grad_norm": 0.08525185958778574, + "learning_rate": 4.947471657251068e-07, + "loss": 0.2368, + "step": 2215 + }, + { + "epoch": 2.9157894736842103, + "grad_norm": 0.08692863494788813, + "learning_rate": 4.796533636993727e-07, + "loss": 0.23, + "step": 2216 + }, + { + "epoch": 2.917105263157895, + "grad_norm": 0.08819398487450925, + "learning_rate": 4.647928305662852e-07, + "loss": 0.2321, + "step": 2217 + }, + { + "epoch": 2.918421052631579, + "grad_norm": 0.08936510229812145, + "learning_rate": 4.501656011579036e-07, + "loss": 0.2468, + "step": 2218 + }, + { + "epoch": 2.919736842105263, + "grad_norm": 0.08708818005334107, + "learning_rate": 4.3577170975945826e-07, + "loss": 0.2322, + "step": 2219 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.08388217210536535, + "learning_rate": 4.216111901092501e-07, + "loss": 0.2267, + "step": 2220 + }, + { + "epoch": 2.9223684210526315, + "grad_norm": 0.08876373477705209, + "learning_rate": 4.0768407539857333e-07, + "loss": 0.2337, + "step": 2221 + }, + { + "epoch": 2.9236842105263157, + "grad_norm": 0.08918972976359953, + "learning_rate": 3.9399039827162643e-07, + "loss": 0.2458, + "step": 2222 + }, + { + "epoch": 2.925, + "grad_norm": 0.08612029906467318, + "learning_rate": 3.805301908254455e-07, + "loss": 0.2243, + "step": 2223 + }, + { + "epoch": 2.9263157894736844, + "grad_norm": 0.08294679916711059, + "learning_rate": 3.6730348460985996e-07, + "loss": 0.2184, + "step": 2224 + }, + { + "epoch": 2.9276315789473686, + "grad_norm": 0.08752162048000231, + "learning_rate": 3.543103106273371e-07, + "loss": 0.2457, + "step": 2225 + }, + { + "epoch": 2.9289473684210527, + "grad_norm": 0.0859025435788662, + "learning_rate": 3.415506993330153e-07, + "loss": 0.2346, + "step": 2226 + }, + { + "epoch": 2.930263157894737, + "grad_norm": 0.08793876747475723, + "learning_rate": 3.2902468063453763e-07, + "loss": 0.2391, + "step": 2227 + }, + { + "epoch": 2.931578947368421, + "grad_norm": 0.08667819600458836, + "learning_rate": 3.1673228389204055e-07, + "loss": 0.2288, + "step": 2228 + }, + { + "epoch": 2.932894736842105, + "grad_norm": 0.08786519232774889, + "learning_rate": 3.046735379180543e-07, + "loss": 0.2341, + "step": 2229 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.0901967616524326, + "learning_rate": 2.9284847097746923e-07, + "loss": 0.2291, + "step": 2230 + }, + { + "epoch": 2.9355263157894735, + "grad_norm": 0.08704968172092865, + "learning_rate": 2.81257110787414e-07, + "loss": 0.234, + "step": 2231 + }, + { + "epoch": 2.9368421052631577, + "grad_norm": 0.08690111626726336, + "learning_rate": 2.6989948451726643e-07, + "loss": 0.2284, + "step": 2232 + }, + { + "epoch": 2.9381578947368423, + "grad_norm": 0.08848680506181238, + "learning_rate": 2.587756187885204e-07, + "loss": 0.2313, + "step": 2233 + }, + { + "epoch": 2.9394736842105265, + "grad_norm": 0.08596311834903651, + "learning_rate": 2.4788553967474147e-07, + "loss": 0.2361, + "step": 2234 + }, + { + "epoch": 2.9407894736842106, + "grad_norm": 0.08490908394075776, + "learning_rate": 2.372292727015557e-07, + "loss": 0.2353, + "step": 2235 + }, + { + "epoch": 2.942105263157895, + "grad_norm": 0.08292706658982998, + "learning_rate": 2.2680684284650533e-07, + "loss": 0.2309, + "step": 2236 + }, + { + "epoch": 2.943421052631579, + "grad_norm": 0.08480952928040106, + "learning_rate": 2.1661827453905992e-07, + "loss": 0.2213, + "step": 2237 + }, + { + "epoch": 2.944736842105263, + "grad_norm": 0.08635227023585973, + "learning_rate": 2.066635916605386e-07, + "loss": 0.2353, + "step": 2238 + }, + { + "epoch": 2.9460526315789473, + "grad_norm": 0.08781565402914938, + "learning_rate": 1.9694281754401024e-07, + "loss": 0.2361, + "step": 2239 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.08776480608964969, + "learning_rate": 1.8745597497433765e-07, + "loss": 0.2415, + "step": 2240 + }, + { + "epoch": 2.9486842105263156, + "grad_norm": 0.08905978886837364, + "learning_rate": 1.782030861880113e-07, + "loss": 0.2308, + "step": 2241 + }, + { + "epoch": 2.95, + "grad_norm": 0.08890792718531305, + "learning_rate": 1.6918417287318245e-07, + "loss": 0.2415, + "step": 2242 + }, + { + "epoch": 2.9513157894736843, + "grad_norm": 0.08905016422570772, + "learning_rate": 1.603992561695522e-07, + "loss": 0.2378, + "step": 2243 + }, + { + "epoch": 2.9526315789473685, + "grad_norm": 0.08893795963848528, + "learning_rate": 1.518483566683826e-07, + "loss": 0.2435, + "step": 2244 + }, + { + "epoch": 2.9539473684210527, + "grad_norm": 0.08778434845315054, + "learning_rate": 1.4353149441237445e-07, + "loss": 0.2378, + "step": 2245 + }, + { + "epoch": 2.955263157894737, + "grad_norm": 0.08482607245940768, + "learning_rate": 1.3544868889571182e-07, + "loss": 0.2206, + "step": 2246 + }, + { + "epoch": 2.956578947368421, + "grad_norm": 0.08524277713713192, + "learning_rate": 1.2759995906392874e-07, + "loss": 0.2235, + "step": 2247 + }, + { + "epoch": 2.957894736842105, + "grad_norm": 0.08760984571309041, + "learning_rate": 1.199853233138981e-07, + "loss": 0.2301, + "step": 2248 + }, + { + "epoch": 2.9592105263157897, + "grad_norm": 0.08864338113540858, + "learning_rate": 1.1260479949382064e-07, + "loss": 0.241, + "step": 2249 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.0860770469372616, + "learning_rate": 1.0545840490313596e-07, + "loss": 0.238, + "step": 2250 + }, + { + "epoch": 2.961842105263158, + "grad_norm": 0.08309356073768617, + "learning_rate": 9.854615629250053e-08, + "loss": 0.2216, + "step": 2251 + }, + { + "epoch": 2.963157894736842, + "grad_norm": 0.08693365254726673, + "learning_rate": 9.186806986376529e-08, + "loss": 0.229, + "step": 2252 + }, + { + "epoch": 2.9644736842105264, + "grad_norm": 0.08720773613369974, + "learning_rate": 8.542416126989805e-08, + "loss": 0.2323, + "step": 2253 + }, + { + "epoch": 2.9657894736842105, + "grad_norm": 0.089536393572043, + "learning_rate": 7.921444561498348e-08, + "loss": 0.2369, + "step": 2254 + }, + { + "epoch": 2.9671052631578947, + "grad_norm": 0.08652678030587639, + "learning_rate": 7.323893745416755e-08, + "loss": 0.2398, + "step": 2255 + }, + { + "epoch": 2.968421052631579, + "grad_norm": 0.08812917248570945, + "learning_rate": 6.749765079363534e-08, + "loss": 0.2262, + "step": 2256 + }, + { + "epoch": 2.969736842105263, + "grad_norm": 0.08805384258384784, + "learning_rate": 6.19905990905667e-08, + "loss": 0.2264, + "step": 2257 + }, + { + "epoch": 2.9710526315789476, + "grad_norm": 0.0845281113628984, + "learning_rate": 5.6717795253113935e-08, + "loss": 0.2296, + "step": 2258 + }, + { + "epoch": 2.9723684210526313, + "grad_norm": 0.08972586281120497, + "learning_rate": 5.167925164037968e-08, + "loss": 0.2444, + "step": 2259 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.086226803286329, + "learning_rate": 4.687498006236135e-08, + "loss": 0.2377, + "step": 2260 + }, + { + "epoch": 2.975, + "grad_norm": 0.08827582008385458, + "learning_rate": 4.230499177994007e-08, + "loss": 0.2275, + "step": 2261 + }, + { + "epoch": 2.9763157894736842, + "grad_norm": 0.08805311064331064, + "learning_rate": 3.796929750485845e-08, + "loss": 0.2453, + "step": 2262 + }, + { + "epoch": 2.9776315789473684, + "grad_norm": 0.08707716445589411, + "learning_rate": 3.386790739968726e-08, + "loss": 0.2484, + "step": 2263 + }, + { + "epoch": 2.9789473684210526, + "grad_norm": 0.08937493080093048, + "learning_rate": 3.000083107780327e-08, + "loss": 0.2341, + "step": 2264 + }, + { + "epoch": 2.9802631578947367, + "grad_norm": 0.08847213274992714, + "learning_rate": 2.6368077603367015e-08, + "loss": 0.2345, + "step": 2265 + }, + { + "epoch": 2.981578947368421, + "grad_norm": 0.08752540842663672, + "learning_rate": 2.2969655491311693e-08, + "loss": 0.231, + "step": 2266 + }, + { + "epoch": 2.9828947368421055, + "grad_norm": 0.08820844196032962, + "learning_rate": 1.980557270729877e-08, + "loss": 0.2403, + "step": 2267 + }, + { + "epoch": 2.984210526315789, + "grad_norm": 0.08848035935218941, + "learning_rate": 1.687583666772907e-08, + "loss": 0.2475, + "step": 2268 + }, + { + "epoch": 2.985526315789474, + "grad_norm": 0.08684909478055165, + "learning_rate": 1.418045423968728e-08, + "loss": 0.2276, + "step": 2269 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.08857801328002969, + "learning_rate": 1.1719431740997433e-08, + "loss": 0.2351, + "step": 2270 + }, + { + "epoch": 2.988157894736842, + "grad_norm": 0.08626632234298558, + "learning_rate": 9.49277494008971e-09, + "loss": 0.2371, + "step": 2271 + }, + { + "epoch": 2.9894736842105263, + "grad_norm": 0.08631453252565319, + "learning_rate": 7.500489056133652e-09, + "loss": 0.241, + "step": 2272 + }, + { + "epoch": 2.9907894736842104, + "grad_norm": 0.08837237273152578, + "learning_rate": 5.742578758882733e-09, + "loss": 0.2304, + "step": 2273 + }, + { + "epoch": 2.9921052631578946, + "grad_norm": 0.0862791762664004, + "learning_rate": 4.219048168763174e-09, + "loss": 0.2425, + "step": 2274 + }, + { + "epoch": 2.9934210526315788, + "grad_norm": 0.08930817551049701, + "learning_rate": 2.9299008568406396e-09, + "loss": 0.2402, + "step": 2275 + }, + { + "epoch": 2.9947368421052634, + "grad_norm": 0.08856125619943322, + "learning_rate": 1.8751398447758306e-09, + "loss": 0.2358, + "step": 2276 + }, + { + "epoch": 2.9960526315789475, + "grad_norm": 0.08640583110457638, + "learning_rate": 1.0547676048688892e-09, + "loss": 0.2259, + "step": 2277 + }, + { + "epoch": 2.9973684210526317, + "grad_norm": 0.08504843454486558, + "learning_rate": 4.687860599927873e-10, + "loss": 0.2402, + "step": 2278 + }, + { + "epoch": 2.998684210526316, + "grad_norm": 0.08642252232900641, + "learning_rate": 1.1719658367104202e-10, + "loss": 0.2366, + "step": 2279 + }, + { + "epoch": 3.0, + "grad_norm": 0.08478813908452047, + "learning_rate": 0.0, + "loss": 0.2337, + "step": 2280 + }, + { + "epoch": 3.0, + "eval_loss": 0.2625243365764618, + "eval_runtime": 136.232, + "eval_samples_per_second": 37.568, + "eval_steps_per_second": 1.174, + "step": 2280 + }, + { + "epoch": 3.0, + "step": 2280, + "total_flos": 6.730550114202419e+17, + "train_loss": 0.28347440996583095, + "train_runtime": 21673.3063, + "train_samples_per_second": 13.46, + "train_steps_per_second": 0.105 + } + ], + "logging_steps": 1, + "max_steps": 2280, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.730550114202419e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}