diff --git "a/checkpoint-1464/trainer_state.json" "b/checkpoint-1464/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1464/trainer_state.json" @@ -0,0 +1,10313 @@ +{ + "best_metric": 0.36407509446144104, + "best_model_checkpoint": "cbb-3b/checkpoint-1464", + "epoch": 3.997269624573379, + "eval_steps": 500, + "global_step": 1464, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0027303754266211604, + "grad_norm": 0.7549694776535034, + "learning_rate": 1.360544217687075e-06, + "loss": 1.2225, + "step": 1 + }, + { + "epoch": 0.005460750853242321, + "grad_norm": 0.7538214325904846, + "learning_rate": 2.72108843537415e-06, + "loss": 1.2103, + "step": 2 + }, + { + "epoch": 0.008191126279863481, + "grad_norm": 0.7328954935073853, + "learning_rate": 4.081632653061224e-06, + "loss": 1.1858, + "step": 3 + }, + { + "epoch": 0.010921501706484642, + "grad_norm": 0.7359272837638855, + "learning_rate": 5.4421768707483e-06, + "loss": 1.1885, + "step": 4 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 0.740386426448822, + "learning_rate": 6.802721088435375e-06, + "loss": 1.1781, + "step": 5 + }, + { + "epoch": 0.016382252559726963, + "grad_norm": 0.6984951496124268, + "learning_rate": 8.163265306122448e-06, + "loss": 1.1395, + "step": 6 + }, + { + "epoch": 0.01911262798634812, + "grad_norm": 0.6689624786376953, + "learning_rate": 9.523809523809523e-06, + "loss": 1.137, + "step": 7 + }, + { + "epoch": 0.021843003412969283, + "grad_norm": 0.6134174466133118, + "learning_rate": 1.08843537414966e-05, + "loss": 1.1531, + "step": 8 + }, + { + "epoch": 0.024573378839590442, + "grad_norm": 0.5647606253623962, + "learning_rate": 1.2244897959183674e-05, + "loss": 1.1201, + "step": 9 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 0.541833221912384, + "learning_rate": 1.360544217687075e-05, + "loss": 1.0989, + "step": 10 + }, + { + "epoch": 0.030034129692832763, + "grad_norm": 0.4785626232624054, + "learning_rate": 1.4965986394557824e-05, + "loss": 1.0664, + "step": 11 + }, + { + "epoch": 0.032764505119453925, + "grad_norm": 0.42421552538871765, + "learning_rate": 1.6326530612244897e-05, + "loss": 1.057, + "step": 12 + }, + { + "epoch": 0.03549488054607509, + "grad_norm": 0.384870707988739, + "learning_rate": 1.7687074829931973e-05, + "loss": 0.9794, + "step": 13 + }, + { + "epoch": 0.03822525597269624, + "grad_norm": 0.31449463963508606, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.9485, + "step": 14 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 0.29094135761260986, + "learning_rate": 2.0408163265306123e-05, + "loss": 0.9581, + "step": 15 + }, + { + "epoch": 0.04368600682593857, + "grad_norm": 0.2500893771648407, + "learning_rate": 2.17687074829932e-05, + "loss": 0.9363, + "step": 16 + }, + { + "epoch": 0.04641638225255973, + "grad_norm": 0.2445881962776184, + "learning_rate": 2.3129251700680275e-05, + "loss": 0.9186, + "step": 17 + }, + { + "epoch": 0.049146757679180884, + "grad_norm": 0.2477860301733017, + "learning_rate": 2.448979591836735e-05, + "loss": 0.9099, + "step": 18 + }, + { + "epoch": 0.05187713310580205, + "grad_norm": 0.24853268265724182, + "learning_rate": 2.5850340136054425e-05, + "loss": 0.912, + "step": 19 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 0.22501873970031738, + "learning_rate": 2.72108843537415e-05, + "loss": 0.8836, + "step": 20 + }, + { + "epoch": 0.05733788395904437, + "grad_norm": 0.21223071217536926, + "learning_rate": 2.857142857142857e-05, + "loss": 0.8651, + "step": 21 + }, + { + "epoch": 0.060068259385665526, + "grad_norm": 0.20172430574893951, + "learning_rate": 2.9931972789115647e-05, + "loss": 0.8393, + "step": 22 + }, + { + "epoch": 0.06279863481228669, + "grad_norm": 0.17902718484401703, + "learning_rate": 3.1292517006802724e-05, + "loss": 0.8033, + "step": 23 + }, + { + "epoch": 0.06552901023890785, + "grad_norm": 0.1813097447156906, + "learning_rate": 3.265306122448979e-05, + "loss": 0.8152, + "step": 24 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 0.19280143082141876, + "learning_rate": 3.401360544217687e-05, + "loss": 0.8051, + "step": 25 + }, + { + "epoch": 0.07098976109215017, + "grad_norm": 0.17157189548015594, + "learning_rate": 3.5374149659863946e-05, + "loss": 0.794, + "step": 26 + }, + { + "epoch": 0.07372013651877134, + "grad_norm": 0.1467738002538681, + "learning_rate": 3.673469387755102e-05, + "loss": 0.7874, + "step": 27 + }, + { + "epoch": 0.07645051194539249, + "grad_norm": 0.13913457095623016, + "learning_rate": 3.809523809523809e-05, + "loss": 0.7519, + "step": 28 + }, + { + "epoch": 0.07918088737201365, + "grad_norm": 0.13179022073745728, + "learning_rate": 3.945578231292517e-05, + "loss": 0.76, + "step": 29 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 0.1376553773880005, + "learning_rate": 4.0816326530612245e-05, + "loss": 0.7369, + "step": 30 + }, + { + "epoch": 0.08464163822525597, + "grad_norm": 0.14040575921535492, + "learning_rate": 4.217687074829932e-05, + "loss": 0.7463, + "step": 31 + }, + { + "epoch": 0.08737201365187713, + "grad_norm": 0.13217338919639587, + "learning_rate": 4.35374149659864e-05, + "loss": 0.7298, + "step": 32 + }, + { + "epoch": 0.0901023890784983, + "grad_norm": 0.11285194754600525, + "learning_rate": 4.4897959183673474e-05, + "loss": 0.7134, + "step": 33 + }, + { + "epoch": 0.09283276450511946, + "grad_norm": 0.10098642110824585, + "learning_rate": 4.625850340136055e-05, + "loss": 0.7238, + "step": 34 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 0.10341370850801468, + "learning_rate": 4.761904761904762e-05, + "loss": 0.6908, + "step": 35 + }, + { + "epoch": 0.09829351535836177, + "grad_norm": 0.09662918746471405, + "learning_rate": 4.89795918367347e-05, + "loss": 0.7, + "step": 36 + }, + { + "epoch": 0.10102389078498293, + "grad_norm": 0.09548471122980118, + "learning_rate": 5.034013605442177e-05, + "loss": 0.7207, + "step": 37 + }, + { + "epoch": 0.1037542662116041, + "grad_norm": 0.09512269496917725, + "learning_rate": 5.170068027210885e-05, + "loss": 0.7016, + "step": 38 + }, + { + "epoch": 0.10648464163822526, + "grad_norm": 0.0912129282951355, + "learning_rate": 5.3061224489795926e-05, + "loss": 0.6891, + "step": 39 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 0.08661182224750519, + "learning_rate": 5.4421768707483e-05, + "loss": 0.6982, + "step": 40 + }, + { + "epoch": 0.11194539249146758, + "grad_norm": 0.09124922007322311, + "learning_rate": 5.5782312925170065e-05, + "loss": 0.7051, + "step": 41 + }, + { + "epoch": 0.11467576791808874, + "grad_norm": 0.09174500405788422, + "learning_rate": 5.714285714285714e-05, + "loss": 0.6978, + "step": 42 + }, + { + "epoch": 0.1174061433447099, + "grad_norm": 0.0679943636059761, + "learning_rate": 5.850340136054422e-05, + "loss": 0.6889, + "step": 43 + }, + { + "epoch": 0.12013651877133105, + "grad_norm": 0.07204238325357437, + "learning_rate": 5.9863945578231295e-05, + "loss": 0.704, + "step": 44 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 0.08089234679937363, + "learning_rate": 6.122448979591838e-05, + "loss": 0.6838, + "step": 45 + }, + { + "epoch": 0.12559726962457338, + "grad_norm": 0.09053023904561996, + "learning_rate": 6.258503401360545e-05, + "loss": 0.6754, + "step": 46 + }, + { + "epoch": 0.12832764505119454, + "grad_norm": 0.07513958215713501, + "learning_rate": 6.394557823129253e-05, + "loss": 0.6894, + "step": 47 + }, + { + "epoch": 0.1310580204778157, + "grad_norm": 0.07480401545763016, + "learning_rate": 6.530612244897959e-05, + "loss": 0.6809, + "step": 48 + }, + { + "epoch": 0.13378839590443686, + "grad_norm": 0.07617643475532532, + "learning_rate": 6.666666666666667e-05, + "loss": 0.697, + "step": 49 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 0.06744271516799927, + "learning_rate": 6.802721088435374e-05, + "loss": 0.6921, + "step": 50 + }, + { + "epoch": 0.1392491467576792, + "grad_norm": 0.07185206562280655, + "learning_rate": 6.938775510204082e-05, + "loss": 0.6536, + "step": 51 + }, + { + "epoch": 0.14197952218430035, + "grad_norm": 0.07255382090806961, + "learning_rate": 7.074829931972789e-05, + "loss": 0.653, + "step": 52 + }, + { + "epoch": 0.1447098976109215, + "grad_norm": 0.07474930584430695, + "learning_rate": 7.210884353741498e-05, + "loss": 0.6888, + "step": 53 + }, + { + "epoch": 0.14744027303754267, + "grad_norm": 0.0754467323422432, + "learning_rate": 7.346938775510205e-05, + "loss": 0.6818, + "step": 54 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 0.07726683467626572, + "learning_rate": 7.482993197278913e-05, + "loss": 0.6835, + "step": 55 + }, + { + "epoch": 0.15290102389078497, + "grad_norm": 0.07462974637746811, + "learning_rate": 7.619047619047618e-05, + "loss": 0.667, + "step": 56 + }, + { + "epoch": 0.15563139931740613, + "grad_norm": 0.06939647346735, + "learning_rate": 7.755102040816327e-05, + "loss": 0.6668, + "step": 57 + }, + { + "epoch": 0.1583617747440273, + "grad_norm": 0.08218149840831757, + "learning_rate": 7.891156462585034e-05, + "loss": 0.6762, + "step": 58 + }, + { + "epoch": 0.16109215017064846, + "grad_norm": 0.0838819146156311, + "learning_rate": 8.027210884353742e-05, + "loss": 0.6685, + "step": 59 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 0.07441603392362595, + "learning_rate": 8.163265306122449e-05, + "loss": 0.6573, + "step": 60 + }, + { + "epoch": 0.16655290102389078, + "grad_norm": 0.0746053010225296, + "learning_rate": 8.299319727891157e-05, + "loss": 0.6582, + "step": 61 + }, + { + "epoch": 0.16928327645051194, + "grad_norm": 0.08602144569158554, + "learning_rate": 8.435374149659864e-05, + "loss": 0.6547, + "step": 62 + }, + { + "epoch": 0.1720136518771331, + "grad_norm": 0.08236663043498993, + "learning_rate": 8.571428571428571e-05, + "loss": 0.6081, + "step": 63 + }, + { + "epoch": 0.17474402730375427, + "grad_norm": 0.08744888752698898, + "learning_rate": 8.70748299319728e-05, + "loss": 0.6576, + "step": 64 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 0.08321461081504822, + "learning_rate": 8.843537414965987e-05, + "loss": 0.6137, + "step": 65 + }, + { + "epoch": 0.1802047781569966, + "grad_norm": 0.08639347553253174, + "learning_rate": 8.979591836734695e-05, + "loss": 0.6579, + "step": 66 + }, + { + "epoch": 0.18293515358361775, + "grad_norm": 0.09154847264289856, + "learning_rate": 9.115646258503402e-05, + "loss": 0.6391, + "step": 67 + }, + { + "epoch": 0.18566552901023892, + "grad_norm": 0.1094379723072052, + "learning_rate": 9.25170068027211e-05, + "loss": 0.61, + "step": 68 + }, + { + "epoch": 0.18839590443686008, + "grad_norm": 0.11089900881052017, + "learning_rate": 9.387755102040817e-05, + "loss": 0.6452, + "step": 69 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 0.11615785956382751, + "learning_rate": 9.523809523809524e-05, + "loss": 0.6463, + "step": 70 + }, + { + "epoch": 0.19385665529010238, + "grad_norm": 0.08359086513519287, + "learning_rate": 9.659863945578231e-05, + "loss": 0.6364, + "step": 71 + }, + { + "epoch": 0.19658703071672354, + "grad_norm": 0.0885363295674324, + "learning_rate": 9.79591836734694e-05, + "loss": 0.6092, + "step": 72 + }, + { + "epoch": 0.1993174061433447, + "grad_norm": 0.09258115291595459, + "learning_rate": 9.931972789115646e-05, + "loss": 0.6229, + "step": 73 + }, + { + "epoch": 0.20204778156996586, + "grad_norm": 0.08969170600175858, + "learning_rate": 0.00010068027210884355, + "loss": 0.6173, + "step": 74 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 0.10124260932207108, + "learning_rate": 0.00010204081632653062, + "loss": 0.6414, + "step": 75 + }, + { + "epoch": 0.2075085324232082, + "grad_norm": 0.08671349287033081, + "learning_rate": 0.0001034013605442177, + "loss": 0.6145, + "step": 76 + }, + { + "epoch": 0.21023890784982935, + "grad_norm": 0.09684890508651733, + "learning_rate": 0.00010476190476190477, + "loss": 0.6262, + "step": 77 + }, + { + "epoch": 0.2129692832764505, + "grad_norm": 0.08690830320119858, + "learning_rate": 0.00010612244897959185, + "loss": 0.6316, + "step": 78 + }, + { + "epoch": 0.21569965870307167, + "grad_norm": 0.10457205027341843, + "learning_rate": 0.00010748299319727892, + "loss": 0.639, + "step": 79 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 0.10080841183662415, + "learning_rate": 0.000108843537414966, + "loss": 0.592, + "step": 80 + }, + { + "epoch": 0.221160409556314, + "grad_norm": 0.08858262002468109, + "learning_rate": 0.00011020408163265306, + "loss": 0.6471, + "step": 81 + }, + { + "epoch": 0.22389078498293516, + "grad_norm": 0.08708172291517258, + "learning_rate": 0.00011156462585034013, + "loss": 0.6222, + "step": 82 + }, + { + "epoch": 0.22662116040955632, + "grad_norm": 0.1075206995010376, + "learning_rate": 0.00011292517006802721, + "loss": 0.5961, + "step": 83 + }, + { + "epoch": 0.22935153583617748, + "grad_norm": 0.11788732558488846, + "learning_rate": 0.00011428571428571428, + "loss": 0.609, + "step": 84 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 0.0956830084323883, + "learning_rate": 0.00011564625850340137, + "loss": 0.6042, + "step": 85 + }, + { + "epoch": 0.2348122866894198, + "grad_norm": 0.09799174964427948, + "learning_rate": 0.00011700680272108844, + "loss": 0.6045, + "step": 86 + }, + { + "epoch": 0.23754266211604094, + "grad_norm": 0.09177012741565704, + "learning_rate": 0.00011836734693877552, + "loss": 0.6068, + "step": 87 + }, + { + "epoch": 0.2402730375426621, + "grad_norm": 0.10407502949237823, + "learning_rate": 0.00011972789115646259, + "loss": 0.5993, + "step": 88 + }, + { + "epoch": 0.24300341296928327, + "grad_norm": 0.1047271341085434, + "learning_rate": 0.00012108843537414967, + "loss": 0.6144, + "step": 89 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 0.0866198018193245, + "learning_rate": 0.00012244897959183676, + "loss": 0.6203, + "step": 90 + }, + { + "epoch": 0.2484641638225256, + "grad_norm": 0.09400323033332825, + "learning_rate": 0.0001238095238095238, + "loss": 0.6056, + "step": 91 + }, + { + "epoch": 0.25119453924914675, + "grad_norm": 0.0817628726363182, + "learning_rate": 0.0001251700680272109, + "loss": 0.5853, + "step": 92 + }, + { + "epoch": 0.25392491467576794, + "grad_norm": 0.09105788916349411, + "learning_rate": 0.00012653061224489798, + "loss": 0.5952, + "step": 93 + }, + { + "epoch": 0.2566552901023891, + "grad_norm": 0.09889201074838638, + "learning_rate": 0.00012789115646258506, + "loss": 0.5994, + "step": 94 + }, + { + "epoch": 0.2593856655290102, + "grad_norm": 0.09481444954872131, + "learning_rate": 0.00012925170068027212, + "loss": 0.5918, + "step": 95 + }, + { + "epoch": 0.2621160409556314, + "grad_norm": 0.11730329692363739, + "learning_rate": 0.00013061224489795917, + "loss": 0.592, + "step": 96 + }, + { + "epoch": 0.26484641638225254, + "grad_norm": 0.15733356773853302, + "learning_rate": 0.00013197278911564626, + "loss": 0.5636, + "step": 97 + }, + { + "epoch": 0.2675767918088737, + "grad_norm": 0.20819880068302155, + "learning_rate": 0.00013333333333333334, + "loss": 0.6101, + "step": 98 + }, + { + "epoch": 0.27030716723549486, + "grad_norm": 0.18305541574954987, + "learning_rate": 0.0001346938775510204, + "loss": 0.5814, + "step": 99 + }, + { + "epoch": 0.27303754266211605, + "grad_norm": 0.10316050797700882, + "learning_rate": 0.00013605442176870748, + "loss": 0.5871, + "step": 100 + }, + { + "epoch": 0.2757679180887372, + "grad_norm": 0.13305549323558807, + "learning_rate": 0.00013741496598639456, + "loss": 0.5846, + "step": 101 + }, + { + "epoch": 0.2784982935153584, + "grad_norm": 0.0950811356306076, + "learning_rate": 0.00013877551020408165, + "loss": 0.5711, + "step": 102 + }, + { + "epoch": 0.2812286689419795, + "grad_norm": 0.1198628693819046, + "learning_rate": 0.0001401360544217687, + "loss": 0.5914, + "step": 103 + }, + { + "epoch": 0.2839590443686007, + "grad_norm": 0.08809541165828705, + "learning_rate": 0.00014149659863945578, + "loss": 0.5872, + "step": 104 + }, + { + "epoch": 0.28668941979522183, + "grad_norm": 0.09801067411899567, + "learning_rate": 0.00014285714285714287, + "loss": 0.566, + "step": 105 + }, + { + "epoch": 0.289419795221843, + "grad_norm": 0.08766568452119827, + "learning_rate": 0.00014421768707482995, + "loss": 0.5808, + "step": 106 + }, + { + "epoch": 0.29215017064846416, + "grad_norm": 0.09133429825305939, + "learning_rate": 0.000145578231292517, + "loss": 0.6037, + "step": 107 + }, + { + "epoch": 0.29488054607508535, + "grad_norm": 0.09074072539806366, + "learning_rate": 0.0001469387755102041, + "loss": 0.5897, + "step": 108 + }, + { + "epoch": 0.2976109215017065, + "grad_norm": 0.08934789896011353, + "learning_rate": 0.00014829931972789117, + "loss": 0.5998, + "step": 109 + }, + { + "epoch": 0.3003412969283277, + "grad_norm": 0.08707176148891449, + "learning_rate": 0.00014965986394557826, + "loss": 0.5762, + "step": 110 + }, + { + "epoch": 0.3030716723549488, + "grad_norm": 0.0948200449347496, + "learning_rate": 0.0001510204081632653, + "loss": 0.5734, + "step": 111 + }, + { + "epoch": 0.30580204778156994, + "grad_norm": 0.08889783173799515, + "learning_rate": 0.00015238095238095237, + "loss": 0.5867, + "step": 112 + }, + { + "epoch": 0.30853242320819113, + "grad_norm": 0.08152323961257935, + "learning_rate": 0.00015374149659863945, + "loss": 0.5527, + "step": 113 + }, + { + "epoch": 0.31126279863481227, + "grad_norm": 0.09019389748573303, + "learning_rate": 0.00015510204081632654, + "loss": 0.6007, + "step": 114 + }, + { + "epoch": 0.31399317406143346, + "grad_norm": 0.08257456868886948, + "learning_rate": 0.00015646258503401362, + "loss": 0.5569, + "step": 115 + }, + { + "epoch": 0.3167235494880546, + "grad_norm": 0.08834348618984222, + "learning_rate": 0.00015782312925170067, + "loss": 0.6026, + "step": 116 + }, + { + "epoch": 0.3194539249146758, + "grad_norm": 0.08634665608406067, + "learning_rate": 0.00015918367346938776, + "loss": 0.5926, + "step": 117 + }, + { + "epoch": 0.3221843003412969, + "grad_norm": 0.07867719978094101, + "learning_rate": 0.00016054421768707484, + "loss": 0.5707, + "step": 118 + }, + { + "epoch": 0.3249146757679181, + "grad_norm": 0.09690061956644058, + "learning_rate": 0.00016190476190476192, + "loss": 0.5793, + "step": 119 + }, + { + "epoch": 0.32764505119453924, + "grad_norm": 0.08276376128196716, + "learning_rate": 0.00016326530612244898, + "loss": 0.5459, + "step": 120 + }, + { + "epoch": 0.33037542662116043, + "grad_norm": 0.09276240319013596, + "learning_rate": 0.00016462585034013606, + "loss": 0.5732, + "step": 121 + }, + { + "epoch": 0.33310580204778156, + "grad_norm": 0.0819844901561737, + "learning_rate": 0.00016598639455782315, + "loss": 0.5349, + "step": 122 + }, + { + "epoch": 0.33583617747440275, + "grad_norm": 0.08146791905164719, + "learning_rate": 0.00016734693877551023, + "loss": 0.5656, + "step": 123 + }, + { + "epoch": 0.3385665529010239, + "grad_norm": 0.0879024788737297, + "learning_rate": 0.00016870748299319729, + "loss": 0.5758, + "step": 124 + }, + { + "epoch": 0.3412969283276451, + "grad_norm": 0.07890356332063675, + "learning_rate": 0.00017006802721088434, + "loss": 0.5332, + "step": 125 + }, + { + "epoch": 0.3440273037542662, + "grad_norm": 0.10049955546855927, + "learning_rate": 0.00017142857142857143, + "loss": 0.5671, + "step": 126 + }, + { + "epoch": 0.34675767918088735, + "grad_norm": 0.09643971920013428, + "learning_rate": 0.0001727891156462585, + "loss": 0.5812, + "step": 127 + }, + { + "epoch": 0.34948805460750854, + "grad_norm": 0.08666185289621353, + "learning_rate": 0.0001741496598639456, + "loss": 0.5487, + "step": 128 + }, + { + "epoch": 0.35221843003412967, + "grad_norm": 0.1031438484787941, + "learning_rate": 0.00017551020408163265, + "loss": 0.5558, + "step": 129 + }, + { + "epoch": 0.35494880546075086, + "grad_norm": 0.09404855966567993, + "learning_rate": 0.00017687074829931973, + "loss": 0.5615, + "step": 130 + }, + { + "epoch": 0.357679180887372, + "grad_norm": 0.09127198159694672, + "learning_rate": 0.00017823129251700681, + "loss": 0.5656, + "step": 131 + }, + { + "epoch": 0.3604095563139932, + "grad_norm": 0.08694130182266235, + "learning_rate": 0.0001795918367346939, + "loss": 0.5379, + "step": 132 + }, + { + "epoch": 0.3631399317406143, + "grad_norm": 0.09511597454547882, + "learning_rate": 0.00018095238095238095, + "loss": 0.5535, + "step": 133 + }, + { + "epoch": 0.3658703071672355, + "grad_norm": 0.09129739552736282, + "learning_rate": 0.00018231292517006804, + "loss": 0.5678, + "step": 134 + }, + { + "epoch": 0.36860068259385664, + "grad_norm": 0.09248334169387817, + "learning_rate": 0.00018367346938775512, + "loss": 0.5574, + "step": 135 + }, + { + "epoch": 0.37133105802047783, + "grad_norm": 0.09906318038702011, + "learning_rate": 0.0001850340136054422, + "loss": 0.5499, + "step": 136 + }, + { + "epoch": 0.37406143344709897, + "grad_norm": 0.09928654134273529, + "learning_rate": 0.00018639455782312926, + "loss": 0.5413, + "step": 137 + }, + { + "epoch": 0.37679180887372016, + "grad_norm": 0.07559472322463989, + "learning_rate": 0.00018775510204081634, + "loss": 0.5475, + "step": 138 + }, + { + "epoch": 0.3795221843003413, + "grad_norm": 0.08408834040164948, + "learning_rate": 0.00018911564625850343, + "loss": 0.5432, + "step": 139 + }, + { + "epoch": 0.3822525597269625, + "grad_norm": 0.08800789713859558, + "learning_rate": 0.00019047619047619048, + "loss": 0.5587, + "step": 140 + }, + { + "epoch": 0.3849829351535836, + "grad_norm": 0.09994784742593765, + "learning_rate": 0.00019183673469387756, + "loss": 0.555, + "step": 141 + }, + { + "epoch": 0.38771331058020475, + "grad_norm": 0.07616768032312393, + "learning_rate": 0.00019319727891156462, + "loss": 0.5621, + "step": 142 + }, + { + "epoch": 0.39044368600682594, + "grad_norm": 0.10337202996015549, + "learning_rate": 0.0001945578231292517, + "loss": 0.5282, + "step": 143 + }, + { + "epoch": 0.3931740614334471, + "grad_norm": 0.08526328206062317, + "learning_rate": 0.0001959183673469388, + "loss": 0.5439, + "step": 144 + }, + { + "epoch": 0.39590443686006827, + "grad_norm": 0.10538353770971298, + "learning_rate": 0.00019727891156462587, + "loss": 0.5481, + "step": 145 + }, + { + "epoch": 0.3986348122866894, + "grad_norm": 0.07550521194934845, + "learning_rate": 0.00019863945578231293, + "loss": 0.5414, + "step": 146 + }, + { + "epoch": 0.4013651877133106, + "grad_norm": 0.10045620799064636, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 147 + }, + { + "epoch": 0.4040955631399317, + "grad_norm": 0.08987366408109665, + "learning_rate": 0.00019999971548969982, + "loss": 0.5417, + "step": 148 + }, + { + "epoch": 0.4068259385665529, + "grad_norm": 0.0801815390586853, + "learning_rate": 0.0001999988619604182, + "loss": 0.5275, + "step": 149 + }, + { + "epoch": 0.40955631399317405, + "grad_norm": 0.08214934170246124, + "learning_rate": 0.00019999743941701188, + "loss": 0.543, + "step": 150 + }, + { + "epoch": 0.41228668941979524, + "grad_norm": 0.08146006613969803, + "learning_rate": 0.00019999544786757545, + "loss": 0.5409, + "step": 151 + }, + { + "epoch": 0.4150170648464164, + "grad_norm": 0.08081945031881332, + "learning_rate": 0.00019999288732344122, + "loss": 0.5509, + "step": 152 + }, + { + "epoch": 0.41774744027303756, + "grad_norm": 0.09135357290506363, + "learning_rate": 0.0001999897577991792, + "loss": 0.518, + "step": 153 + }, + { + "epoch": 0.4204778156996587, + "grad_norm": 0.09191333502531052, + "learning_rate": 0.0001999860593125971, + "loss": 0.5276, + "step": 154 + }, + { + "epoch": 0.4232081911262799, + "grad_norm": 0.08375995606184006, + "learning_rate": 0.00019998179188473997, + "loss": 0.5319, + "step": 155 + }, + { + "epoch": 0.425938566552901, + "grad_norm": 0.08481922000646591, + "learning_rate": 0.00019997695553989042, + "loss": 0.5437, + "step": 156 + }, + { + "epoch": 0.4286689419795222, + "grad_norm": 0.08768640458583832, + "learning_rate": 0.00019997155030556822, + "loss": 0.5445, + "step": 157 + }, + { + "epoch": 0.43139931740614335, + "grad_norm": 0.08787625283002853, + "learning_rate": 0.00019996557621253027, + "loss": 0.5479, + "step": 158 + }, + { + "epoch": 0.4341296928327645, + "grad_norm": 0.09505843371152878, + "learning_rate": 0.0001999590332947704, + "loss": 0.5263, + "step": 159 + }, + { + "epoch": 0.43686006825938567, + "grad_norm": 0.10003377497196198, + "learning_rate": 0.00019995192158951919, + "loss": 0.5228, + "step": 160 + }, + { + "epoch": 0.4395904436860068, + "grad_norm": 0.0675501748919487, + "learning_rate": 0.00019994424113724363, + "loss": 0.4977, + "step": 161 + }, + { + "epoch": 0.442320819112628, + "grad_norm": 0.09747067093849182, + "learning_rate": 0.00019993599198164715, + "loss": 0.5161, + "step": 162 + }, + { + "epoch": 0.44505119453924913, + "grad_norm": 0.0837995857000351, + "learning_rate": 0.0001999271741696691, + "loss": 0.5243, + "step": 163 + }, + { + "epoch": 0.4477815699658703, + "grad_norm": 0.0793512687087059, + "learning_rate": 0.00019991778775148465, + "loss": 0.5141, + "step": 164 + }, + { + "epoch": 0.45051194539249145, + "grad_norm": 0.07802822440862656, + "learning_rate": 0.00019990783278050448, + "loss": 0.515, + "step": 165 + }, + { + "epoch": 0.45324232081911264, + "grad_norm": 0.08355724066495895, + "learning_rate": 0.0001998973093133744, + "loss": 0.5176, + "step": 166 + }, + { + "epoch": 0.4559726962457338, + "grad_norm": 0.08045308291912079, + "learning_rate": 0.00019988621740997512, + "loss": 0.5151, + "step": 167 + }, + { + "epoch": 0.45870307167235497, + "grad_norm": 0.07589907944202423, + "learning_rate": 0.00019987455713342187, + "loss": 0.5249, + "step": 168 + }, + { + "epoch": 0.4614334470989761, + "grad_norm": 0.08553771674633026, + "learning_rate": 0.000199862328550064, + "loss": 0.5485, + "step": 169 + }, + { + "epoch": 0.4641638225255973, + "grad_norm": 0.08599649369716644, + "learning_rate": 0.00019984953172948465, + "loss": 0.53, + "step": 170 + }, + { + "epoch": 0.4668941979522184, + "grad_norm": 0.06906479597091675, + "learning_rate": 0.0001998361667445004, + "loss": 0.5336, + "step": 171 + }, + { + "epoch": 0.4696245733788396, + "grad_norm": 0.07526392489671707, + "learning_rate": 0.00019982223367116076, + "loss": 0.5013, + "step": 172 + }, + { + "epoch": 0.47235494880546075, + "grad_norm": 0.0722610279917717, + "learning_rate": 0.00019980773258874778, + "loss": 0.5217, + "step": 173 + }, + { + "epoch": 0.4750853242320819, + "grad_norm": 0.0773632749915123, + "learning_rate": 0.00019979266357977564, + "loss": 0.5184, + "step": 174 + }, + { + "epoch": 0.4778156996587031, + "grad_norm": 0.07160216569900513, + "learning_rate": 0.00019977702672999007, + "loss": 0.5009, + "step": 175 + }, + { + "epoch": 0.4805460750853242, + "grad_norm": 0.0764177069067955, + "learning_rate": 0.00019976082212836793, + "loss": 0.5126, + "step": 176 + }, + { + "epoch": 0.4832764505119454, + "grad_norm": 0.07116773724555969, + "learning_rate": 0.0001997440498671168, + "loss": 0.514, + "step": 177 + }, + { + "epoch": 0.48600682593856653, + "grad_norm": 0.08402683585882187, + "learning_rate": 0.00019972671004167433, + "loss": 0.5133, + "step": 178 + }, + { + "epoch": 0.4887372013651877, + "grad_norm": 0.07286666333675385, + "learning_rate": 0.00019970880275070762, + "loss": 0.5221, + "step": 179 + }, + { + "epoch": 0.49146757679180886, + "grad_norm": 0.08641263097524643, + "learning_rate": 0.00019969032809611287, + "loss": 0.4959, + "step": 180 + }, + { + "epoch": 0.49419795221843005, + "grad_norm": 0.08849737048149109, + "learning_rate": 0.0001996712861830147, + "loss": 0.4952, + "step": 181 + }, + { + "epoch": 0.4969283276450512, + "grad_norm": 0.08661802858114243, + "learning_rate": 0.00019965167711976552, + "loss": 0.5023, + "step": 182 + }, + { + "epoch": 0.49965870307167237, + "grad_norm": 0.08355259150266647, + "learning_rate": 0.0001996315010179449, + "loss": 0.5235, + "step": 183 + }, + { + "epoch": 0.5023890784982935, + "grad_norm": 0.07524804770946503, + "learning_rate": 0.00019961075799235903, + "loss": 0.5143, + "step": 184 + }, + { + "epoch": 0.5051194539249146, + "grad_norm": 0.08126044273376465, + "learning_rate": 0.00019958944816104, + "loss": 0.496, + "step": 185 + }, + { + "epoch": 0.5078498293515359, + "grad_norm": 0.08320248872041702, + "learning_rate": 0.00019956757164524516, + "loss": 0.5106, + "step": 186 + }, + { + "epoch": 0.510580204778157, + "grad_norm": 0.07375509291887283, + "learning_rate": 0.00019954512856945632, + "loss": 0.4811, + "step": 187 + }, + { + "epoch": 0.5133105802047782, + "grad_norm": 0.07187776267528534, + "learning_rate": 0.00019952211906137932, + "loss": 0.5104, + "step": 188 + }, + { + "epoch": 0.5160409556313993, + "grad_norm": 0.07441398501396179, + "learning_rate": 0.00019949854325194294, + "loss": 0.5304, + "step": 189 + }, + { + "epoch": 0.5187713310580204, + "grad_norm": 0.07976701855659485, + "learning_rate": 0.00019947440127529836, + "loss": 0.4945, + "step": 190 + }, + { + "epoch": 0.5215017064846417, + "grad_norm": 0.07280328124761581, + "learning_rate": 0.00019944969326881845, + "loss": 0.4848, + "step": 191 + }, + { + "epoch": 0.5242320819112628, + "grad_norm": 0.07618428766727448, + "learning_rate": 0.00019942441937309684, + "loss": 0.4858, + "step": 192 + }, + { + "epoch": 0.5269624573378839, + "grad_norm": 0.0665225088596344, + "learning_rate": 0.00019939857973194717, + "loss": 0.4955, + "step": 193 + }, + { + "epoch": 0.5296928327645051, + "grad_norm": 0.08379194140434265, + "learning_rate": 0.0001993721744924024, + "loss": 0.5067, + "step": 194 + }, + { + "epoch": 0.5324232081911263, + "grad_norm": 0.07564423978328705, + "learning_rate": 0.00019934520380471372, + "loss": 0.5159, + "step": 195 + }, + { + "epoch": 0.5351535836177475, + "grad_norm": 0.07225633412599564, + "learning_rate": 0.0001993176678223499, + "loss": 0.5144, + "step": 196 + }, + { + "epoch": 0.5378839590443686, + "grad_norm": 0.07224252074956894, + "learning_rate": 0.0001992895667019964, + "loss": 0.4859, + "step": 197 + }, + { + "epoch": 0.5406143344709897, + "grad_norm": 0.079926997423172, + "learning_rate": 0.0001992609006035543, + "loss": 0.4872, + "step": 198 + }, + { + "epoch": 0.543344709897611, + "grad_norm": 0.08545151352882385, + "learning_rate": 0.0001992316696901397, + "loss": 0.5105, + "step": 199 + }, + { + "epoch": 0.5460750853242321, + "grad_norm": 0.08008193224668503, + "learning_rate": 0.00019920187412808248, + "loss": 0.4903, + "step": 200 + }, + { + "epoch": 0.5488054607508532, + "grad_norm": 0.06717066466808319, + "learning_rate": 0.0001991715140869255, + "loss": 0.5037, + "step": 201 + }, + { + "epoch": 0.5515358361774744, + "grad_norm": 0.08613338321447372, + "learning_rate": 0.00019914058973942368, + "loss": 0.4999, + "step": 202 + }, + { + "epoch": 0.5542662116040956, + "grad_norm": 0.07288234680891037, + "learning_rate": 0.00019910910126154293, + "loss": 0.5019, + "step": 203 + }, + { + "epoch": 0.5569965870307167, + "grad_norm": 0.07831370085477829, + "learning_rate": 0.00019907704883245916, + "loss": 0.4595, + "step": 204 + }, + { + "epoch": 0.5597269624573379, + "grad_norm": 0.0916525200009346, + "learning_rate": 0.00019904443263455728, + "loss": 0.4994, + "step": 205 + }, + { + "epoch": 0.562457337883959, + "grad_norm": 0.07431495934724808, + "learning_rate": 0.00019901125285343022, + "loss": 0.5059, + "step": 206 + }, + { + "epoch": 0.5651877133105802, + "grad_norm": 0.07864730060100555, + "learning_rate": 0.0001989775096778777, + "loss": 0.4824, + "step": 207 + }, + { + "epoch": 0.5679180887372014, + "grad_norm": 0.06928006559610367, + "learning_rate": 0.0001989432032999054, + "loss": 0.4887, + "step": 208 + }, + { + "epoch": 0.5706484641638225, + "grad_norm": 0.07330948859453201, + "learning_rate": 0.0001989083339147237, + "loss": 0.4804, + "step": 209 + }, + { + "epoch": 0.5733788395904437, + "grad_norm": 0.07905860990285873, + "learning_rate": 0.0001988729017207465, + "loss": 0.5126, + "step": 210 + }, + { + "epoch": 0.5761092150170648, + "grad_norm": 0.07062509655952454, + "learning_rate": 0.00019883690691959035, + "loss": 0.5063, + "step": 211 + }, + { + "epoch": 0.578839590443686, + "grad_norm": 0.071404367685318, + "learning_rate": 0.00019880034971607308, + "loss": 0.495, + "step": 212 + }, + { + "epoch": 0.5815699658703072, + "grad_norm": 0.0727284774184227, + "learning_rate": 0.00019876323031821266, + "loss": 0.4994, + "step": 213 + }, + { + "epoch": 0.5843003412969283, + "grad_norm": 0.07198608666658401, + "learning_rate": 0.00019872554893722618, + "loss": 0.4903, + "step": 214 + }, + { + "epoch": 0.5870307167235495, + "grad_norm": 0.07637451589107513, + "learning_rate": 0.0001986873057875284, + "loss": 0.5057, + "step": 215 + }, + { + "epoch": 0.5897610921501707, + "grad_norm": 0.06596951186656952, + "learning_rate": 0.00019864850108673073, + "loss": 0.4932, + "step": 216 + }, + { + "epoch": 0.5924914675767918, + "grad_norm": 0.06999579071998596, + "learning_rate": 0.0001986091350556399, + "loss": 0.4887, + "step": 217 + }, + { + "epoch": 0.595221843003413, + "grad_norm": 0.06687980890274048, + "learning_rate": 0.00019856920791825683, + "loss": 0.472, + "step": 218 + }, + { + "epoch": 0.5979522184300341, + "grad_norm": 0.07001427561044693, + "learning_rate": 0.00019852871990177503, + "loss": 0.4692, + "step": 219 + }, + { + "epoch": 0.6006825938566553, + "grad_norm": 0.06714101880788803, + "learning_rate": 0.00019848767123657976, + "loss": 0.4813, + "step": 220 + }, + { + "epoch": 0.6034129692832765, + "grad_norm": 0.07292049378156662, + "learning_rate": 0.0001984460621562463, + "loss": 0.4885, + "step": 221 + }, + { + "epoch": 0.6061433447098976, + "grad_norm": 0.06814104318618774, + "learning_rate": 0.00019840389289753896, + "loss": 0.4938, + "step": 222 + }, + { + "epoch": 0.6088737201365187, + "grad_norm": 0.06866355985403061, + "learning_rate": 0.00019836116370040944, + "loss": 0.4776, + "step": 223 + }, + { + "epoch": 0.6116040955631399, + "grad_norm": 0.07145702093839645, + "learning_rate": 0.00019831787480799568, + "loss": 0.4883, + "step": 224 + }, + { + "epoch": 0.6143344709897611, + "grad_norm": 0.06319977343082428, + "learning_rate": 0.00019827402646662047, + "loss": 0.4882, + "step": 225 + }, + { + "epoch": 0.6170648464163823, + "grad_norm": 0.08186688274145126, + "learning_rate": 0.0001982296189257898, + "loss": 0.4917, + "step": 226 + }, + { + "epoch": 0.6197952218430034, + "grad_norm": 0.06892900168895721, + "learning_rate": 0.00019818465243819184, + "loss": 0.4808, + "step": 227 + }, + { + "epoch": 0.6225255972696245, + "grad_norm": 0.0752168744802475, + "learning_rate": 0.00019813912725969509, + "loss": 0.4858, + "step": 228 + }, + { + "epoch": 0.6252559726962458, + "grad_norm": 0.08079662919044495, + "learning_rate": 0.0001980930436493472, + "loss": 0.5101, + "step": 229 + }, + { + "epoch": 0.6279863481228669, + "grad_norm": 0.0717153325676918, + "learning_rate": 0.00019804640186937343, + "loss": 0.4799, + "step": 230 + }, + { + "epoch": 0.630716723549488, + "grad_norm": 0.08962002396583557, + "learning_rate": 0.0001979992021851751, + "loss": 0.5067, + "step": 231 + }, + { + "epoch": 0.6334470989761092, + "grad_norm": 0.08904211223125458, + "learning_rate": 0.00019795144486532814, + "loss": 0.4725, + "step": 232 + }, + { + "epoch": 0.6361774744027304, + "grad_norm": 0.06842932850122452, + "learning_rate": 0.00019790313018158156, + "loss": 0.4996, + "step": 233 + }, + { + "epoch": 0.6389078498293516, + "grad_norm": 0.08361311256885529, + "learning_rate": 0.0001978542584088558, + "loss": 0.4945, + "step": 234 + }, + { + "epoch": 0.6416382252559727, + "grad_norm": 0.07219431549310684, + "learning_rate": 0.00019780482982524142, + "loss": 0.4488, + "step": 235 + }, + { + "epoch": 0.6443686006825938, + "grad_norm": 0.07717226445674896, + "learning_rate": 0.00019775484471199715, + "loss": 0.4814, + "step": 236 + }, + { + "epoch": 0.647098976109215, + "grad_norm": 0.07770105451345444, + "learning_rate": 0.0001977043033535486, + "loss": 0.4731, + "step": 237 + }, + { + "epoch": 0.6498293515358362, + "grad_norm": 0.06878919899463654, + "learning_rate": 0.00019765320603748655, + "loss": 0.4833, + "step": 238 + }, + { + "epoch": 0.6525597269624573, + "grad_norm": 0.07085343450307846, + "learning_rate": 0.0001976015530545652, + "loss": 0.4907, + "step": 239 + }, + { + "epoch": 0.6552901023890785, + "grad_norm": 0.07935165613889694, + "learning_rate": 0.0001975493446987007, + "loss": 0.4794, + "step": 240 + }, + { + "epoch": 0.6580204778156996, + "grad_norm": 0.06543820351362228, + "learning_rate": 0.00019749658126696934, + "loss": 0.4906, + "step": 241 + }, + { + "epoch": 0.6607508532423209, + "grad_norm": 0.07727054506540298, + "learning_rate": 0.00019744326305960595, + "loss": 0.4868, + "step": 242 + }, + { + "epoch": 0.663481228668942, + "grad_norm": 0.06668544560670853, + "learning_rate": 0.00019738939038000205, + "loss": 0.475, + "step": 243 + }, + { + "epoch": 0.6662116040955631, + "grad_norm": 0.07048569619655609, + "learning_rate": 0.00019733496353470433, + "loss": 0.4878, + "step": 244 + }, + { + "epoch": 0.6689419795221843, + "grad_norm": 0.07110477238893509, + "learning_rate": 0.00019727998283341274, + "loss": 0.4663, + "step": 245 + }, + { + "epoch": 0.6716723549488055, + "grad_norm": 0.07245586067438126, + "learning_rate": 0.00019722444858897878, + "loss": 0.4899, + "step": 246 + }, + { + "epoch": 0.6744027303754266, + "grad_norm": 0.07484875619411469, + "learning_rate": 0.00019716836111740378, + "loss": 0.4831, + "step": 247 + }, + { + "epoch": 0.6771331058020478, + "grad_norm": 0.07812648266553879, + "learning_rate": 0.00019711172073783696, + "loss": 0.4654, + "step": 248 + }, + { + "epoch": 0.6798634812286689, + "grad_norm": 0.060632165521383286, + "learning_rate": 0.00019705452777257377, + "loss": 0.4706, + "step": 249 + }, + { + "epoch": 0.6825938566552902, + "grad_norm": 0.07092992216348648, + "learning_rate": 0.000196996782547054, + "loss": 0.4792, + "step": 250 + }, + { + "epoch": 0.6853242320819113, + "grad_norm": 0.06629595905542374, + "learning_rate": 0.00019693848538985983, + "loss": 0.4791, + "step": 251 + }, + { + "epoch": 0.6880546075085324, + "grad_norm": 0.06915664672851562, + "learning_rate": 0.00019687963663271409, + "loss": 0.4623, + "step": 252 + }, + { + "epoch": 0.6907849829351536, + "grad_norm": 0.0694665014743805, + "learning_rate": 0.00019682023661047836, + "loss": 0.48, + "step": 253 + }, + { + "epoch": 0.6935153583617747, + "grad_norm": 0.06899196654558182, + "learning_rate": 0.00019676028566115102, + "loss": 0.4855, + "step": 254 + }, + { + "epoch": 0.6962457337883959, + "grad_norm": 0.0740811675786972, + "learning_rate": 0.00019669978412586528, + "loss": 0.4833, + "step": 255 + }, + { + "epoch": 0.6989761092150171, + "grad_norm": 0.06517481803894043, + "learning_rate": 0.00019663873234888733, + "loss": 0.4523, + "step": 256 + }, + { + "epoch": 0.7017064846416382, + "grad_norm": 0.06481153517961502, + "learning_rate": 0.0001965771306776144, + "loss": 0.4689, + "step": 257 + }, + { + "epoch": 0.7044368600682593, + "grad_norm": 0.06042364612221718, + "learning_rate": 0.00019651497946257266, + "loss": 0.4757, + "step": 258 + }, + { + "epoch": 0.7071672354948806, + "grad_norm": 0.0717868059873581, + "learning_rate": 0.00019645227905741534, + "loss": 0.4773, + "step": 259 + }, + { + "epoch": 0.7098976109215017, + "grad_norm": 0.06427443772554398, + "learning_rate": 0.00019638902981892068, + "loss": 0.4875, + "step": 260 + }, + { + "epoch": 0.7126279863481229, + "grad_norm": 0.07786547392606735, + "learning_rate": 0.00019632523210698987, + "loss": 0.4758, + "step": 261 + }, + { + "epoch": 0.715358361774744, + "grad_norm": 0.07115910202264786, + "learning_rate": 0.00019626088628464498, + "loss": 0.4651, + "step": 262 + }, + { + "epoch": 0.7180887372013652, + "grad_norm": 0.06626811623573303, + "learning_rate": 0.00019619599271802706, + "loss": 0.4873, + "step": 263 + }, + { + "epoch": 0.7208191126279864, + "grad_norm": 0.07854583859443665, + "learning_rate": 0.00019613055177639384, + "loss": 0.4945, + "step": 264 + }, + { + "epoch": 0.7235494880546075, + "grad_norm": 0.0847892239689827, + "learning_rate": 0.00019606456383211777, + "loss": 0.4671, + "step": 265 + }, + { + "epoch": 0.7262798634812286, + "grad_norm": 0.06735772639513016, + "learning_rate": 0.00019599802926068384, + "loss": 0.4767, + "step": 266 + }, + { + "epoch": 0.7290102389078499, + "grad_norm": 0.07502768933773041, + "learning_rate": 0.00019593094844068748, + "loss": 0.462, + "step": 267 + }, + { + "epoch": 0.731740614334471, + "grad_norm": 0.07276903837919235, + "learning_rate": 0.00019586332175383238, + "loss": 0.4754, + "step": 268 + }, + { + "epoch": 0.7344709897610922, + "grad_norm": 0.07755447924137115, + "learning_rate": 0.00019579514958492826, + "loss": 0.492, + "step": 269 + }, + { + "epoch": 0.7372013651877133, + "grad_norm": 0.07876396179199219, + "learning_rate": 0.0001957264323218889, + "loss": 0.4737, + "step": 270 + }, + { + "epoch": 0.7399317406143344, + "grad_norm": 0.07997962832450867, + "learning_rate": 0.0001956571703557296, + "loss": 0.4592, + "step": 271 + }, + { + "epoch": 0.7426621160409557, + "grad_norm": 0.08079583197832108, + "learning_rate": 0.00019558736408056525, + "loss": 0.473, + "step": 272 + }, + { + "epoch": 0.7453924914675768, + "grad_norm": 0.0736604854464531, + "learning_rate": 0.00019551701389360795, + "loss": 0.4741, + "step": 273 + }, + { + "epoch": 0.7481228668941979, + "grad_norm": 0.0741550549864769, + "learning_rate": 0.00019544612019516472, + "loss": 0.4611, + "step": 274 + }, + { + "epoch": 0.7508532423208191, + "grad_norm": 0.06802786141633987, + "learning_rate": 0.00019537468338863537, + "loss": 0.4621, + "step": 275 + }, + { + "epoch": 0.7535836177474403, + "grad_norm": 0.06499720364809036, + "learning_rate": 0.00019530270388050998, + "loss": 0.4676, + "step": 276 + }, + { + "epoch": 0.7563139931740614, + "grad_norm": 0.06809037923812866, + "learning_rate": 0.00019523018208036677, + "loss": 0.475, + "step": 277 + }, + { + "epoch": 0.7590443686006826, + "grad_norm": 0.06455886363983154, + "learning_rate": 0.0001951571184008698, + "loss": 0.4807, + "step": 278 + }, + { + "epoch": 0.7617747440273037, + "grad_norm": 0.06833679229021072, + "learning_rate": 0.00019508351325776642, + "loss": 0.4751, + "step": 279 + }, + { + "epoch": 0.764505119453925, + "grad_norm": 0.07593976706266403, + "learning_rate": 0.00019500936706988502, + "loss": 0.4714, + "step": 280 + }, + { + "epoch": 0.7672354948805461, + "grad_norm": 0.0687364712357521, + "learning_rate": 0.00019493468025913276, + "loss": 0.4575, + "step": 281 + }, + { + "epoch": 0.7699658703071672, + "grad_norm": 0.07183225452899933, + "learning_rate": 0.00019485945325049288, + "loss": 0.4815, + "step": 282 + }, + { + "epoch": 0.7726962457337884, + "grad_norm": 0.06775309145450592, + "learning_rate": 0.00019478368647202264, + "loss": 0.4543, + "step": 283 + }, + { + "epoch": 0.7754266211604095, + "grad_norm": 0.06261654198169708, + "learning_rate": 0.00019470738035485058, + "loss": 0.4724, + "step": 284 + }, + { + "epoch": 0.7781569965870307, + "grad_norm": 0.06674676388502121, + "learning_rate": 0.00019463053533317425, + "loss": 0.4667, + "step": 285 + }, + { + "epoch": 0.7808873720136519, + "grad_norm": 0.06266098469495773, + "learning_rate": 0.0001945531518442576, + "loss": 0.4614, + "step": 286 + }, + { + "epoch": 0.783617747440273, + "grad_norm": 0.06769178062677383, + "learning_rate": 0.0001944752303284287, + "loss": 0.4609, + "step": 287 + }, + { + "epoch": 0.7863481228668942, + "grad_norm": 0.07618339359760284, + "learning_rate": 0.00019439677122907697, + "loss": 0.4822, + "step": 288 + }, + { + "epoch": 0.7890784982935154, + "grad_norm": 0.06216439977288246, + "learning_rate": 0.00019431777499265087, + "loss": 0.4573, + "step": 289 + }, + { + "epoch": 0.7918088737201365, + "grad_norm": 0.06998062878847122, + "learning_rate": 0.00019423824206865527, + "loss": 0.4683, + "step": 290 + }, + { + "epoch": 0.7945392491467577, + "grad_norm": 0.06178448721766472, + "learning_rate": 0.00019415817290964883, + "loss": 0.4643, + "step": 291 + }, + { + "epoch": 0.7972696245733788, + "grad_norm": 0.06611185520887375, + "learning_rate": 0.00019407756797124164, + "loss": 0.4712, + "step": 292 + }, + { + "epoch": 0.8, + "grad_norm": 0.06682468205690384, + "learning_rate": 0.00019399642771209238, + "loss": 0.474, + "step": 293 + }, + { + "epoch": 0.8027303754266212, + "grad_norm": 0.0632803738117218, + "learning_rate": 0.00019391475259390584, + "loss": 0.4776, + "step": 294 + }, + { + "epoch": 0.8054607508532423, + "grad_norm": 0.06498962640762329, + "learning_rate": 0.0001938325430814302, + "loss": 0.4735, + "step": 295 + }, + { + "epoch": 0.8081911262798634, + "grad_norm": 0.06621643900871277, + "learning_rate": 0.00019374979964245463, + "loss": 0.4785, + "step": 296 + }, + { + "epoch": 0.8109215017064847, + "grad_norm": 0.05847141519188881, + "learning_rate": 0.00019366652274780628, + "loss": 0.4702, + "step": 297 + }, + { + "epoch": 0.8136518771331058, + "grad_norm": 0.06962229311466217, + "learning_rate": 0.00019358271287134784, + "loss": 0.4612, + "step": 298 + }, + { + "epoch": 0.816382252559727, + "grad_norm": 0.06132384389638901, + "learning_rate": 0.00019349837048997478, + "loss": 0.4453, + "step": 299 + }, + { + "epoch": 0.8191126279863481, + "grad_norm": 0.06574399024248123, + "learning_rate": 0.00019341349608361267, + "loss": 0.4545, + "step": 300 + }, + { + "epoch": 0.8218430034129692, + "grad_norm": 0.06561442464590073, + "learning_rate": 0.00019332809013521428, + "loss": 0.4619, + "step": 301 + }, + { + "epoch": 0.8245733788395905, + "grad_norm": 0.06309875100851059, + "learning_rate": 0.00019324215313075706, + "loss": 0.465, + "step": 302 + }, + { + "epoch": 0.8273037542662116, + "grad_norm": 0.06544878333806992, + "learning_rate": 0.00019315568555924035, + "loss": 0.4571, + "step": 303 + }, + { + "epoch": 0.8300341296928327, + "grad_norm": 0.07011238485574722, + "learning_rate": 0.0001930686879126824, + "loss": 0.4579, + "step": 304 + }, + { + "epoch": 0.8327645051194539, + "grad_norm": 0.06445574760437012, + "learning_rate": 0.0001929811606861177, + "loss": 0.4695, + "step": 305 + }, + { + "epoch": 0.8354948805460751, + "grad_norm": 0.061930734664201736, + "learning_rate": 0.00019289310437759427, + "loss": 0.4449, + "step": 306 + }, + { + "epoch": 0.8382252559726963, + "grad_norm": 0.0658838227391243, + "learning_rate": 0.00019280451948817059, + "loss": 0.4726, + "step": 307 + }, + { + "epoch": 0.8409556313993174, + "grad_norm": 0.06302706897258759, + "learning_rate": 0.00019271540652191296, + "loss": 0.447, + "step": 308 + }, + { + "epoch": 0.8436860068259385, + "grad_norm": 0.08308806270360947, + "learning_rate": 0.0001926257659858925, + "loss": 0.4605, + "step": 309 + }, + { + "epoch": 0.8464163822525598, + "grad_norm": 0.06508838385343552, + "learning_rate": 0.00019253559839018235, + "loss": 0.4778, + "step": 310 + }, + { + "epoch": 0.8491467576791809, + "grad_norm": 0.07429094612598419, + "learning_rate": 0.00019244490424785468, + "loss": 0.4659, + "step": 311 + }, + { + "epoch": 0.851877133105802, + "grad_norm": 0.07138285785913467, + "learning_rate": 0.00019235368407497788, + "loss": 0.4564, + "step": 312 + }, + { + "epoch": 0.8546075085324232, + "grad_norm": 0.07202211022377014, + "learning_rate": 0.00019226193839061347, + "loss": 0.4377, + "step": 313 + }, + { + "epoch": 0.8573378839590444, + "grad_norm": 0.0779070258140564, + "learning_rate": 0.0001921696677168133, + "loss": 0.4532, + "step": 314 + }, + { + "epoch": 0.8600682593856656, + "grad_norm": 0.07717596739530563, + "learning_rate": 0.00019207687257861655, + "loss": 0.4654, + "step": 315 + }, + { + "epoch": 0.8627986348122867, + "grad_norm": 0.0708346962928772, + "learning_rate": 0.00019198355350404667, + "loss": 0.4584, + "step": 316 + }, + { + "epoch": 0.8655290102389078, + "grad_norm": 0.0656716600060463, + "learning_rate": 0.00019188971102410837, + "loss": 0.4504, + "step": 317 + }, + { + "epoch": 0.868259385665529, + "grad_norm": 0.06869971752166748, + "learning_rate": 0.00019179534567278475, + "loss": 0.4592, + "step": 318 + }, + { + "epoch": 0.8709897610921502, + "grad_norm": 0.06358928978443146, + "learning_rate": 0.00019170045798703406, + "loss": 0.4376, + "step": 319 + }, + { + "epoch": 0.8737201365187713, + "grad_norm": 0.06602993607521057, + "learning_rate": 0.0001916050485067868, + "loss": 0.4692, + "step": 320 + }, + { + "epoch": 0.8764505119453925, + "grad_norm": 0.06115058436989784, + "learning_rate": 0.00019150911777494258, + "loss": 0.462, + "step": 321 + }, + { + "epoch": 0.8791808873720136, + "grad_norm": 0.06374403834342957, + "learning_rate": 0.00019141266633736697, + "loss": 0.4325, + "step": 322 + }, + { + "epoch": 0.8819112627986349, + "grad_norm": 0.06459895521402359, + "learning_rate": 0.0001913156947428886, + "loss": 0.4605, + "step": 323 + }, + { + "epoch": 0.884641638225256, + "grad_norm": 0.06160016357898712, + "learning_rate": 0.00019121820354329577, + "loss": 0.4604, + "step": 324 + }, + { + "epoch": 0.8873720136518771, + "grad_norm": 0.06345291435718536, + "learning_rate": 0.00019112019329333346, + "loss": 0.4565, + "step": 325 + }, + { + "epoch": 0.8901023890784983, + "grad_norm": 0.06534894555807114, + "learning_rate": 0.00019102166455070024, + "loss": 0.4619, + "step": 326 + }, + { + "epoch": 0.8928327645051195, + "grad_norm": 0.06186550110578537, + "learning_rate": 0.00019092261787604492, + "loss": 0.4477, + "step": 327 + }, + { + "epoch": 0.8955631399317406, + "grad_norm": 0.058699868619441986, + "learning_rate": 0.00019082305383296352, + "loss": 0.4484, + "step": 328 + }, + { + "epoch": 0.8982935153583618, + "grad_norm": 0.05798410624265671, + "learning_rate": 0.00019072297298799589, + "loss": 0.4605, + "step": 329 + }, + { + "epoch": 0.9010238907849829, + "grad_norm": 0.06147664040327072, + "learning_rate": 0.00019062237591062272, + "loss": 0.4489, + "step": 330 + }, + { + "epoch": 0.903754266211604, + "grad_norm": 0.06032559648156166, + "learning_rate": 0.00019052126317326207, + "loss": 0.4412, + "step": 331 + }, + { + "epoch": 0.9064846416382253, + "grad_norm": 0.06326504051685333, + "learning_rate": 0.00019041963535126625, + "loss": 0.4547, + "step": 332 + }, + { + "epoch": 0.9092150170648464, + "grad_norm": 0.06808637827634811, + "learning_rate": 0.0001903174930229185, + "loss": 0.4513, + "step": 333 + }, + { + "epoch": 0.9119453924914676, + "grad_norm": 0.06384904682636261, + "learning_rate": 0.00019021483676942973, + "loss": 0.4542, + "step": 334 + }, + { + "epoch": 0.9146757679180887, + "grad_norm": 0.07148803770542145, + "learning_rate": 0.00019011166717493517, + "loss": 0.4569, + "step": 335 + }, + { + "epoch": 0.9174061433447099, + "grad_norm": 0.06942867487668991, + "learning_rate": 0.000190007984826491, + "loss": 0.4496, + "step": 336 + }, + { + "epoch": 0.9201365187713311, + "grad_norm": 0.06153569370508194, + "learning_rate": 0.00018990379031407124, + "loss": 0.464, + "step": 337 + }, + { + "epoch": 0.9228668941979522, + "grad_norm": 0.07417679578065872, + "learning_rate": 0.00018979908423056408, + "loss": 0.4396, + "step": 338 + }, + { + "epoch": 0.9255972696245733, + "grad_norm": 0.06745341420173645, + "learning_rate": 0.0001896938671717687, + "loss": 0.4584, + "step": 339 + }, + { + "epoch": 0.9283276450511946, + "grad_norm": 0.060262780636548996, + "learning_rate": 0.00018958813973639184, + "loss": 0.4363, + "step": 340 + }, + { + "epoch": 0.9310580204778157, + "grad_norm": 0.06427337974309921, + "learning_rate": 0.0001894819025260444, + "loss": 0.4352, + "step": 341 + }, + { + "epoch": 0.9337883959044369, + "grad_norm": 0.06150776520371437, + "learning_rate": 0.00018937515614523797, + "loss": 0.4644, + "step": 342 + }, + { + "epoch": 0.936518771331058, + "grad_norm": 0.06864424049854279, + "learning_rate": 0.0001892679012013815, + "loss": 0.4608, + "step": 343 + }, + { + "epoch": 0.9392491467576792, + "grad_norm": 0.06174071133136749, + "learning_rate": 0.00018916013830477766, + "loss": 0.4402, + "step": 344 + }, + { + "epoch": 0.9419795221843004, + "grad_norm": 0.0684589147567749, + "learning_rate": 0.00018905186806861957, + "loss": 0.4569, + "step": 345 + }, + { + "epoch": 0.9447098976109215, + "grad_norm": 0.05750627443194389, + "learning_rate": 0.00018894309110898712, + "loss": 0.4522, + "step": 346 + }, + { + "epoch": 0.9474402730375426, + "grad_norm": 0.0697883740067482, + "learning_rate": 0.00018883380804484367, + "loss": 0.4594, + "step": 347 + }, + { + "epoch": 0.9501706484641638, + "grad_norm": 0.06613462418317795, + "learning_rate": 0.00018872401949803237, + "loss": 0.4459, + "step": 348 + }, + { + "epoch": 0.952901023890785, + "grad_norm": 0.06346327811479568, + "learning_rate": 0.00018861372609327263, + "loss": 0.4316, + "step": 349 + }, + { + "epoch": 0.9556313993174061, + "grad_norm": 0.06382953375577927, + "learning_rate": 0.00018850292845815672, + "loss": 0.4358, + "step": 350 + }, + { + "epoch": 0.9583617747440273, + "grad_norm": 0.07121171057224274, + "learning_rate": 0.0001883916272231459, + "loss": 0.465, + "step": 351 + }, + { + "epoch": 0.9610921501706484, + "grad_norm": 0.06311832368373871, + "learning_rate": 0.0001882798230215672, + "loss": 0.4478, + "step": 352 + }, + { + "epoch": 0.9638225255972697, + "grad_norm": 0.06858519464731216, + "learning_rate": 0.00018816751648960956, + "loss": 0.4402, + "step": 353 + }, + { + "epoch": 0.9665529010238908, + "grad_norm": 0.06063356623053551, + "learning_rate": 0.00018805470826632024, + "loss": 0.4373, + "step": 354 + }, + { + "epoch": 0.9692832764505119, + "grad_norm": 0.06550437211990356, + "learning_rate": 0.0001879413989936013, + "loss": 0.4448, + "step": 355 + }, + { + "epoch": 0.9720136518771331, + "grad_norm": 0.06248946860432625, + "learning_rate": 0.00018782758931620584, + "loss": 0.4576, + "step": 356 + }, + { + "epoch": 0.9747440273037543, + "grad_norm": 0.07067371159791946, + "learning_rate": 0.00018771327988173435, + "loss": 0.4644, + "step": 357 + }, + { + "epoch": 0.9774744027303754, + "grad_norm": 0.06225898116827011, + "learning_rate": 0.00018759847134063108, + "loss": 0.4617, + "step": 358 + }, + { + "epoch": 0.9802047781569966, + "grad_norm": 0.061437107622623444, + "learning_rate": 0.0001874831643461803, + "loss": 0.4339, + "step": 359 + }, + { + "epoch": 0.9829351535836177, + "grad_norm": 0.059149857610464096, + "learning_rate": 0.00018736735955450251, + "loss": 0.4238, + "step": 360 + }, + { + "epoch": 0.985665529010239, + "grad_norm": 0.06511219590902328, + "learning_rate": 0.0001872510576245509, + "loss": 0.4394, + "step": 361 + }, + { + "epoch": 0.9883959044368601, + "grad_norm": 0.06580841541290283, + "learning_rate": 0.00018713425921810733, + "loss": 0.4218, + "step": 362 + }, + { + "epoch": 0.9911262798634812, + "grad_norm": 0.07789267599582672, + "learning_rate": 0.00018701696499977884, + "loss": 0.4524, + "step": 363 + }, + { + "epoch": 0.9938566552901024, + "grad_norm": 0.06430528312921524, + "learning_rate": 0.0001868991756369937, + "loss": 0.4503, + "step": 364 + }, + { + "epoch": 0.9965870307167235, + "grad_norm": 0.06355779618024826, + "learning_rate": 0.00018678089179999762, + "loss": 0.4556, + "step": 365 + }, + { + "epoch": 0.9993174061433447, + "grad_norm": 0.06800378113985062, + "learning_rate": 0.00018666211416184999, + "loss": 0.44, + "step": 366 + }, + { + "epoch": 0.9993174061433447, + "eval_loss": 0.4462641775608063, + "eval_runtime": 311.1378, + "eval_samples_per_second": 8.369, + "eval_steps_per_second": 1.048, + "step": 366 + }, + { + "epoch": 1.0020477815699658, + "grad_norm": 0.14618873596191406, + "learning_rate": 0.00018654284339842013, + "loss": 0.7832, + "step": 367 + }, + { + "epoch": 1.004778156996587, + "grad_norm": 0.10670002549886703, + "learning_rate": 0.00018642308018838316, + "loss": 0.4482, + "step": 368 + }, + { + "epoch": 1.0075085324232083, + "grad_norm": 0.07775750756263733, + "learning_rate": 0.00018630282521321645, + "loss": 0.4345, + "step": 369 + }, + { + "epoch": 1.0102389078498293, + "grad_norm": 0.07130205631256104, + "learning_rate": 0.0001861820791571956, + "loss": 0.4294, + "step": 370 + }, + { + "epoch": 1.0129692832764505, + "grad_norm": 0.07318615168333054, + "learning_rate": 0.00018606084270739049, + "loss": 0.449, + "step": 371 + }, + { + "epoch": 1.0156996587030718, + "grad_norm": 0.06613319367170334, + "learning_rate": 0.0001859391165536615, + "loss": 0.4435, + "step": 372 + }, + { + "epoch": 1.0184300341296928, + "grad_norm": 0.06562095880508423, + "learning_rate": 0.0001858169013886556, + "loss": 0.4288, + "step": 373 + }, + { + "epoch": 1.021160409556314, + "grad_norm": 0.060670241713523865, + "learning_rate": 0.00018569419790780218, + "loss": 0.4029, + "step": 374 + }, + { + "epoch": 1.023890784982935, + "grad_norm": 0.06414277106523514, + "learning_rate": 0.00018557100680930937, + "loss": 0.4357, + "step": 375 + }, + { + "epoch": 1.0266211604095563, + "grad_norm": 0.06078667938709259, + "learning_rate": 0.00018544732879415986, + "loss": 0.4188, + "step": 376 + }, + { + "epoch": 1.0293515358361776, + "grad_norm": 0.06345190107822418, + "learning_rate": 0.00018532316456610704, + "loss": 0.4501, + "step": 377 + }, + { + "epoch": 1.0320819112627986, + "grad_norm": 0.06139195337891579, + "learning_rate": 0.00018519851483167097, + "loss": 0.438, + "step": 378 + }, + { + "epoch": 1.0348122866894198, + "grad_norm": 0.059995777904987335, + "learning_rate": 0.00018507338030013427, + "loss": 0.4505, + "step": 379 + }, + { + "epoch": 1.0375426621160408, + "grad_norm": 0.06199508160352707, + "learning_rate": 0.00018494776168353827, + "loss": 0.4564, + "step": 380 + }, + { + "epoch": 1.040273037542662, + "grad_norm": 0.062205228954553604, + "learning_rate": 0.00018482165969667874, + "loss": 0.4519, + "step": 381 + }, + { + "epoch": 1.0430034129692833, + "grad_norm": 0.06433286517858505, + "learning_rate": 0.00018469507505710194, + "loss": 0.4394, + "step": 382 + }, + { + "epoch": 1.0457337883959044, + "grad_norm": 0.06373082101345062, + "learning_rate": 0.00018456800848510056, + "loss": 0.4456, + "step": 383 + }, + { + "epoch": 1.0484641638225256, + "grad_norm": 0.0655735656619072, + "learning_rate": 0.00018444046070370963, + "loss": 0.4527, + "step": 384 + }, + { + "epoch": 1.0511945392491469, + "grad_norm": 0.059250976890325546, + "learning_rate": 0.00018431243243870223, + "loss": 0.4338, + "step": 385 + }, + { + "epoch": 1.0539249146757679, + "grad_norm": 0.05919628590345383, + "learning_rate": 0.00018418392441858555, + "loss": 0.4252, + "step": 386 + }, + { + "epoch": 1.0566552901023891, + "grad_norm": 0.07075149565935135, + "learning_rate": 0.0001840549373745968, + "loss": 0.4478, + "step": 387 + }, + { + "epoch": 1.0593856655290101, + "grad_norm": 0.06196924299001694, + "learning_rate": 0.0001839254720406987, + "loss": 0.4446, + "step": 388 + }, + { + "epoch": 1.0621160409556314, + "grad_norm": 0.07002051174640656, + "learning_rate": 0.00018379552915357575, + "loss": 0.4668, + "step": 389 + }, + { + "epoch": 1.0648464163822526, + "grad_norm": 0.05986930802464485, + "learning_rate": 0.00018366510945262972, + "loss": 0.4361, + "step": 390 + }, + { + "epoch": 1.0675767918088737, + "grad_norm": 0.06568475067615509, + "learning_rate": 0.00018353421367997563, + "loss": 0.4432, + "step": 391 + }, + { + "epoch": 1.070307167235495, + "grad_norm": 0.063268281519413, + "learning_rate": 0.00018340284258043732, + "loss": 0.4479, + "step": 392 + }, + { + "epoch": 1.073037542662116, + "grad_norm": 0.06184746325016022, + "learning_rate": 0.00018327099690154344, + "loss": 0.4392, + "step": 393 + }, + { + "epoch": 1.0757679180887372, + "grad_norm": 0.06682950258255005, + "learning_rate": 0.00018313867739352304, + "loss": 0.4469, + "step": 394 + }, + { + "epoch": 1.0784982935153584, + "grad_norm": 0.06049386039376259, + "learning_rate": 0.00018300588480930143, + "loss": 0.4448, + "step": 395 + }, + { + "epoch": 1.0812286689419794, + "grad_norm": 0.058452919125556946, + "learning_rate": 0.0001828726199044957, + "loss": 0.4387, + "step": 396 + }, + { + "epoch": 1.0839590443686007, + "grad_norm": 0.06608898937702179, + "learning_rate": 0.0001827388834374107, + "loss": 0.4316, + "step": 397 + }, + { + "epoch": 1.086689419795222, + "grad_norm": 0.06221776083111763, + "learning_rate": 0.0001826046761690344, + "loss": 0.4362, + "step": 398 + }, + { + "epoch": 1.089419795221843, + "grad_norm": 0.0670786052942276, + "learning_rate": 0.00018246999886303383, + "loss": 0.4394, + "step": 399 + }, + { + "epoch": 1.0921501706484642, + "grad_norm": 0.061892326921224594, + "learning_rate": 0.00018233485228575063, + "loss": 0.4565, + "step": 400 + }, + { + "epoch": 1.0948805460750852, + "grad_norm": 0.06282811611890793, + "learning_rate": 0.00018219923720619663, + "loss": 0.4421, + "step": 401 + }, + { + "epoch": 1.0976109215017065, + "grad_norm": 0.061520010232925415, + "learning_rate": 0.0001820631543960496, + "loss": 0.4346, + "step": 402 + }, + { + "epoch": 1.1003412969283277, + "grad_norm": 0.05969773232936859, + "learning_rate": 0.0001819266046296487, + "loss": 0.4472, + "step": 403 + }, + { + "epoch": 1.1030716723549487, + "grad_norm": 0.060664501041173935, + "learning_rate": 0.00018178958868399033, + "loss": 0.453, + "step": 404 + }, + { + "epoch": 1.10580204778157, + "grad_norm": 0.0612984299659729, + "learning_rate": 0.00018165210733872336, + "loss": 0.4406, + "step": 405 + }, + { + "epoch": 1.108532423208191, + "grad_norm": 0.059849295765161514, + "learning_rate": 0.000181514161376145, + "loss": 0.4423, + "step": 406 + }, + { + "epoch": 1.1112627986348123, + "grad_norm": 0.059180960059165955, + "learning_rate": 0.0001813757515811962, + "loss": 0.4401, + "step": 407 + }, + { + "epoch": 1.1139931740614335, + "grad_norm": 0.05857124924659729, + "learning_rate": 0.00018123687874145721, + "loss": 0.4159, + "step": 408 + }, + { + "epoch": 1.1167235494880545, + "grad_norm": 0.06205347552895546, + "learning_rate": 0.00018109754364714305, + "loss": 0.4318, + "step": 409 + }, + { + "epoch": 1.1194539249146758, + "grad_norm": 0.06382250785827637, + "learning_rate": 0.0001809577470910992, + "loss": 0.4416, + "step": 410 + }, + { + "epoch": 1.122184300341297, + "grad_norm": 0.05814497917890549, + "learning_rate": 0.00018081748986879679, + "loss": 0.4392, + "step": 411 + }, + { + "epoch": 1.124914675767918, + "grad_norm": 0.058424465358257294, + "learning_rate": 0.00018067677277832834, + "loss": 0.4266, + "step": 412 + }, + { + "epoch": 1.1276450511945393, + "grad_norm": 0.05630108341574669, + "learning_rate": 0.00018053559662040302, + "loss": 0.4401, + "step": 413 + }, + { + "epoch": 1.1303754266211605, + "grad_norm": 0.06453561037778854, + "learning_rate": 0.00018039396219834237, + "loss": 0.4267, + "step": 414 + }, + { + "epoch": 1.1331058020477816, + "grad_norm": 0.06126587092876434, + "learning_rate": 0.00018025187031807532, + "loss": 0.4346, + "step": 415 + }, + { + "epoch": 1.1358361774744028, + "grad_norm": 0.057017982006073, + "learning_rate": 0.00018010932178813397, + "loss": 0.4367, + "step": 416 + }, + { + "epoch": 1.1385665529010238, + "grad_norm": 0.06581621617078781, + "learning_rate": 0.00017996631741964888, + "loss": 0.4157, + "step": 417 + }, + { + "epoch": 1.141296928327645, + "grad_norm": 0.055874526500701904, + "learning_rate": 0.00017982285802634426, + "loss": 0.4341, + "step": 418 + }, + { + "epoch": 1.144027303754266, + "grad_norm": 0.059336546808481216, + "learning_rate": 0.0001796789444245337, + "loss": 0.4029, + "step": 419 + }, + { + "epoch": 1.1467576791808873, + "grad_norm": 0.06833340972661972, + "learning_rate": 0.00017953457743311523, + "loss": 0.4564, + "step": 420 + }, + { + "epoch": 1.1494880546075086, + "grad_norm": 0.061153508722782135, + "learning_rate": 0.00017938975787356673, + "loss": 0.4496, + "step": 421 + }, + { + "epoch": 1.1522184300341296, + "grad_norm": 0.0649651363492012, + "learning_rate": 0.00017924448656994133, + "loss": 0.4323, + "step": 422 + }, + { + "epoch": 1.1549488054607508, + "grad_norm": 0.0639922022819519, + "learning_rate": 0.00017909876434886273, + "loss": 0.4421, + "step": 423 + }, + { + "epoch": 1.157679180887372, + "grad_norm": 0.06662526726722717, + "learning_rate": 0.00017895259203952032, + "loss": 0.4532, + "step": 424 + }, + { + "epoch": 1.1604095563139931, + "grad_norm": 0.05699828639626503, + "learning_rate": 0.0001788059704736647, + "loss": 0.4382, + "step": 425 + }, + { + "epoch": 1.1631399317406144, + "grad_norm": 0.06322555243968964, + "learning_rate": 0.00017865890048560277, + "loss": 0.4423, + "step": 426 + }, + { + "epoch": 1.1658703071672356, + "grad_norm": 0.05652053654193878, + "learning_rate": 0.00017851138291219301, + "loss": 0.4338, + "step": 427 + }, + { + "epoch": 1.1686006825938566, + "grad_norm": 0.06619950383901596, + "learning_rate": 0.00017836341859284093, + "loss": 0.4272, + "step": 428 + }, + { + "epoch": 1.1713310580204779, + "grad_norm": 0.060171984136104584, + "learning_rate": 0.00017821500836949386, + "loss": 0.4371, + "step": 429 + }, + { + "epoch": 1.174061433447099, + "grad_norm": 0.06065813824534416, + "learning_rate": 0.0001780661530866366, + "loss": 0.4064, + "step": 430 + }, + { + "epoch": 1.1767918088737201, + "grad_norm": 0.06799128651618958, + "learning_rate": 0.00017791685359128633, + "loss": 0.43, + "step": 431 + }, + { + "epoch": 1.1795221843003414, + "grad_norm": 0.059587378054857254, + "learning_rate": 0.000177767110732988, + "loss": 0.4366, + "step": 432 + }, + { + "epoch": 1.1822525597269624, + "grad_norm": 0.06191541254520416, + "learning_rate": 0.00017761692536380928, + "loss": 0.415, + "step": 433 + }, + { + "epoch": 1.1849829351535837, + "grad_norm": 0.0611693374812603, + "learning_rate": 0.00017746629833833585, + "loss": 0.4396, + "step": 434 + }, + { + "epoch": 1.1877133105802047, + "grad_norm": 0.06228373572230339, + "learning_rate": 0.00017731523051366658, + "loss": 0.431, + "step": 435 + }, + { + "epoch": 1.190443686006826, + "grad_norm": 0.06130995601415634, + "learning_rate": 0.00017716372274940843, + "loss": 0.4538, + "step": 436 + }, + { + "epoch": 1.1931740614334472, + "grad_norm": 0.06163164600729942, + "learning_rate": 0.00017701177590767183, + "loss": 0.4251, + "step": 437 + }, + { + "epoch": 1.1959044368600682, + "grad_norm": 0.061723340302705765, + "learning_rate": 0.00017685939085306562, + "loss": 0.4274, + "step": 438 + }, + { + "epoch": 1.1986348122866894, + "grad_norm": 0.06078750640153885, + "learning_rate": 0.00017670656845269214, + "loss": 0.4432, + "step": 439 + }, + { + "epoch": 1.2013651877133107, + "grad_norm": 0.05991605296730995, + "learning_rate": 0.00017655330957614234, + "loss": 0.4167, + "step": 440 + }, + { + "epoch": 1.2040955631399317, + "grad_norm": 0.05879712477326393, + "learning_rate": 0.00017639961509549078, + "loss": 0.4232, + "step": 441 + }, + { + "epoch": 1.206825938566553, + "grad_norm": 0.060264360159635544, + "learning_rate": 0.00017624548588529072, + "loss": 0.4361, + "step": 442 + }, + { + "epoch": 1.209556313993174, + "grad_norm": 0.06511180847883224, + "learning_rate": 0.00017609092282256912, + "loss": 0.4327, + "step": 443 + }, + { + "epoch": 1.2122866894197952, + "grad_norm": 0.06026393920183182, + "learning_rate": 0.00017593592678682166, + "loss": 0.4195, + "step": 444 + }, + { + "epoch": 1.2150170648464165, + "grad_norm": 0.06378287822008133, + "learning_rate": 0.0001757804986600077, + "loss": 0.4404, + "step": 445 + }, + { + "epoch": 1.2177474402730375, + "grad_norm": 0.0656813457608223, + "learning_rate": 0.0001756246393265453, + "loss": 0.4354, + "step": 446 + }, + { + "epoch": 1.2204778156996587, + "grad_norm": 0.05804288387298584, + "learning_rate": 0.00017546834967330617, + "loss": 0.4352, + "step": 447 + }, + { + "epoch": 1.2232081911262798, + "grad_norm": 0.06775437295436859, + "learning_rate": 0.00017531163058961066, + "loss": 0.4393, + "step": 448 + }, + { + "epoch": 1.225938566552901, + "grad_norm": 0.06272158026695251, + "learning_rate": 0.00017515448296722262, + "loss": 0.4178, + "step": 449 + }, + { + "epoch": 1.2286689419795223, + "grad_norm": 0.06508231163024902, + "learning_rate": 0.00017499690770034443, + "loss": 0.4322, + "step": 450 + }, + { + "epoch": 1.2313993174061433, + "grad_norm": 0.05709952861070633, + "learning_rate": 0.00017483890568561173, + "loss": 0.4337, + "step": 451 + }, + { + "epoch": 1.2341296928327645, + "grad_norm": 0.061706554144620895, + "learning_rate": 0.00017468047782208865, + "loss": 0.4126, + "step": 452 + }, + { + "epoch": 1.2368600682593858, + "grad_norm": 0.056757740676403046, + "learning_rate": 0.00017452162501126227, + "loss": 0.4287, + "step": 453 + }, + { + "epoch": 1.2395904436860068, + "grad_norm": 0.05650217831134796, + "learning_rate": 0.00017436234815703788, + "loss": 0.4224, + "step": 454 + }, + { + "epoch": 1.242320819112628, + "grad_norm": 0.05224541947245598, + "learning_rate": 0.0001742026481657335, + "loss": 0.4166, + "step": 455 + }, + { + "epoch": 1.245051194539249, + "grad_norm": 0.06731689721345901, + "learning_rate": 0.0001740425259460751, + "loss": 0.4538, + "step": 456 + }, + { + "epoch": 1.2477815699658703, + "grad_norm": 0.060736652463674545, + "learning_rate": 0.00017388198240919102, + "loss": 0.4329, + "step": 457 + }, + { + "epoch": 1.2505119453924913, + "grad_norm": 0.05695323646068573, + "learning_rate": 0.00017372101846860707, + "loss": 0.4412, + "step": 458 + }, + { + "epoch": 1.2532423208191126, + "grad_norm": 0.056898247450590134, + "learning_rate": 0.00017355963504024123, + "loss": 0.4418, + "step": 459 + }, + { + "epoch": 1.2559726962457338, + "grad_norm": 0.059471502900123596, + "learning_rate": 0.00017339783304239843, + "loss": 0.4136, + "step": 460 + }, + { + "epoch": 1.2587030716723548, + "grad_norm": 0.05504520982503891, + "learning_rate": 0.00017323561339576543, + "loss": 0.4263, + "step": 461 + }, + { + "epoch": 1.261433447098976, + "grad_norm": 0.059035494923591614, + "learning_rate": 0.0001730729770234054, + "loss": 0.4362, + "step": 462 + }, + { + "epoch": 1.2641638225255973, + "grad_norm": 0.05722351744771004, + "learning_rate": 0.00017290992485075282, + "loss": 0.4239, + "step": 463 + }, + { + "epoch": 1.2668941979522184, + "grad_norm": 0.057449549436569214, + "learning_rate": 0.0001727464578056081, + "loss": 0.4357, + "step": 464 + }, + { + "epoch": 1.2696245733788396, + "grad_norm": 0.0636393278837204, + "learning_rate": 0.00017258257681813244, + "loss": 0.433, + "step": 465 + }, + { + "epoch": 1.2723549488054609, + "grad_norm": 0.061772268265485764, + "learning_rate": 0.0001724182828208424, + "loss": 0.4365, + "step": 466 + }, + { + "epoch": 1.2750853242320819, + "grad_norm": 0.053929511457681656, + "learning_rate": 0.0001722535767486047, + "loss": 0.4346, + "step": 467 + }, + { + "epoch": 1.2778156996587031, + "grad_norm": 0.05948130041360855, + "learning_rate": 0.00017208845953863076, + "loss": 0.4342, + "step": 468 + }, + { + "epoch": 1.2805460750853244, + "grad_norm": 0.05833544209599495, + "learning_rate": 0.0001719229321304716, + "loss": 0.4309, + "step": 469 + }, + { + "epoch": 1.2832764505119454, + "grad_norm": 0.055491410195827484, + "learning_rate": 0.00017175699546601223, + "loss": 0.4279, + "step": 470 + }, + { + "epoch": 1.2860068259385666, + "grad_norm": 0.05924072489142418, + "learning_rate": 0.00017159065048946644, + "loss": 0.432, + "step": 471 + }, + { + "epoch": 1.2887372013651877, + "grad_norm": 0.05847487971186638, + "learning_rate": 0.00017142389814737142, + "loss": 0.424, + "step": 472 + }, + { + "epoch": 1.291467576791809, + "grad_norm": 0.05650070682168007, + "learning_rate": 0.00017125673938858237, + "loss": 0.4134, + "step": 473 + }, + { + "epoch": 1.29419795221843, + "grad_norm": 0.059648044407367706, + "learning_rate": 0.00017108917516426704, + "loss": 0.4279, + "step": 474 + }, + { + "epoch": 1.2969283276450512, + "grad_norm": 0.060436248779296875, + "learning_rate": 0.00017092120642790042, + "loss": 0.4091, + "step": 475 + }, + { + "epoch": 1.2996587030716724, + "grad_norm": 0.06787759065628052, + "learning_rate": 0.00017075283413525916, + "loss": 0.4107, + "step": 476 + }, + { + "epoch": 1.3023890784982934, + "grad_norm": 0.06723356992006302, + "learning_rate": 0.00017058405924441636, + "loss": 0.4339, + "step": 477 + }, + { + "epoch": 1.3051194539249147, + "grad_norm": 0.058346495032310486, + "learning_rate": 0.00017041488271573587, + "loss": 0.441, + "step": 478 + }, + { + "epoch": 1.307849829351536, + "grad_norm": 0.059269823133945465, + "learning_rate": 0.00017024530551186702, + "loss": 0.4338, + "step": 479 + }, + { + "epoch": 1.310580204778157, + "grad_norm": 0.05570577457547188, + "learning_rate": 0.000170075328597739, + "loss": 0.4176, + "step": 480 + }, + { + "epoch": 1.3133105802047782, + "grad_norm": 0.05658780783414841, + "learning_rate": 0.00016990495294055548, + "loss": 0.4327, + "step": 481 + }, + { + "epoch": 1.3160409556313994, + "grad_norm": 0.06438103318214417, + "learning_rate": 0.00016973417950978906, + "loss": 0.4451, + "step": 482 + }, + { + "epoch": 1.3187713310580205, + "grad_norm": 0.06003286689519882, + "learning_rate": 0.00016956300927717575, + "loss": 0.4245, + "step": 483 + }, + { + "epoch": 1.3215017064846417, + "grad_norm": 0.06092451140284538, + "learning_rate": 0.0001693914432167094, + "loss": 0.4331, + "step": 484 + }, + { + "epoch": 1.3242320819112627, + "grad_norm": 0.059084732085466385, + "learning_rate": 0.00016921948230463625, + "loss": 0.4261, + "step": 485 + }, + { + "epoch": 1.326962457337884, + "grad_norm": 0.059612493962049484, + "learning_rate": 0.00016904712751944931, + "loss": 0.4356, + "step": 486 + }, + { + "epoch": 1.329692832764505, + "grad_norm": 0.05373890697956085, + "learning_rate": 0.00016887437984188286, + "loss": 0.4221, + "step": 487 + }, + { + "epoch": 1.3324232081911263, + "grad_norm": 0.06069657579064369, + "learning_rate": 0.00016870124025490673, + "loss": 0.4343, + "step": 488 + }, + { + "epoch": 1.3351535836177475, + "grad_norm": 0.058680132031440735, + "learning_rate": 0.0001685277097437208, + "loss": 0.4376, + "step": 489 + }, + { + "epoch": 1.3378839590443685, + "grad_norm": 0.052157819271087646, + "learning_rate": 0.0001683537892957495, + "loss": 0.4194, + "step": 490 + }, + { + "epoch": 1.3406143344709898, + "grad_norm": 0.05680167302489281, + "learning_rate": 0.00016817947990063598, + "loss": 0.4214, + "step": 491 + }, + { + "epoch": 1.343344709897611, + "grad_norm": 0.061938587576150894, + "learning_rate": 0.0001680047825502366, + "loss": 0.4413, + "step": 492 + }, + { + "epoch": 1.346075085324232, + "grad_norm": 0.05423510819673538, + "learning_rate": 0.00016782969823861526, + "loss": 0.4188, + "step": 493 + }, + { + "epoch": 1.3488054607508533, + "grad_norm": 0.059597909450531006, + "learning_rate": 0.0001676542279620378, + "loss": 0.4188, + "step": 494 + }, + { + "epoch": 1.3515358361774745, + "grad_norm": 0.05773560330271721, + "learning_rate": 0.00016747837271896622, + "loss": 0.4354, + "step": 495 + }, + { + "epoch": 1.3542662116040955, + "grad_norm": 0.06316240131855011, + "learning_rate": 0.00016730213351005303, + "loss": 0.4248, + "step": 496 + }, + { + "epoch": 1.3569965870307168, + "grad_norm": 0.056602396070957184, + "learning_rate": 0.00016712551133813572, + "loss": 0.4227, + "step": 497 + }, + { + "epoch": 1.3597269624573378, + "grad_norm": 0.06384044885635376, + "learning_rate": 0.0001669485072082308, + "loss": 0.4398, + "step": 498 + }, + { + "epoch": 1.362457337883959, + "grad_norm": 0.06040973588824272, + "learning_rate": 0.00016677112212752824, + "loss": 0.4168, + "step": 499 + }, + { + "epoch": 1.36518771331058, + "grad_norm": 0.05779508873820305, + "learning_rate": 0.00016659335710538564, + "loss": 0.4097, + "step": 500 + }, + { + "epoch": 1.3679180887372013, + "grad_norm": 0.060474693775177, + "learning_rate": 0.00016641521315332265, + "loss": 0.4252, + "step": 501 + }, + { + "epoch": 1.3706484641638226, + "grad_norm": 0.05790797993540764, + "learning_rate": 0.00016623669128501504, + "loss": 0.4238, + "step": 502 + }, + { + "epoch": 1.3733788395904436, + "grad_norm": 0.06164141371846199, + "learning_rate": 0.00016605779251628903, + "loss": 0.4336, + "step": 503 + }, + { + "epoch": 1.3761092150170648, + "grad_norm": 0.055059127509593964, + "learning_rate": 0.00016587851786511543, + "loss": 0.4303, + "step": 504 + }, + { + "epoch": 1.378839590443686, + "grad_norm": 0.05771743133664131, + "learning_rate": 0.00016569886835160399, + "loss": 0.4352, + "step": 505 + }, + { + "epoch": 1.3815699658703071, + "grad_norm": 0.056050512939691544, + "learning_rate": 0.0001655188449979974, + "loss": 0.4233, + "step": 506 + }, + { + "epoch": 1.3843003412969284, + "grad_norm": 0.054744672030210495, + "learning_rate": 0.00016533844882866568, + "loss": 0.415, + "step": 507 + }, + { + "epoch": 1.3870307167235496, + "grad_norm": 0.060217492282390594, + "learning_rate": 0.00016515768087010013, + "loss": 0.3959, + "step": 508 + }, + { + "epoch": 1.3897610921501706, + "grad_norm": 0.0636279284954071, + "learning_rate": 0.00016497654215090772, + "loss": 0.4341, + "step": 509 + }, + { + "epoch": 1.3924914675767919, + "grad_norm": 0.05640679970383644, + "learning_rate": 0.00016479503370180507, + "loss": 0.3917, + "step": 510 + }, + { + "epoch": 1.395221843003413, + "grad_norm": 0.05939646065235138, + "learning_rate": 0.00016461315655561263, + "loss": 0.4378, + "step": 511 + }, + { + "epoch": 1.3979522184300341, + "grad_norm": 0.05862488970160484, + "learning_rate": 0.00016443091174724885, + "loss": 0.4017, + "step": 512 + }, + { + "epoch": 1.4006825938566552, + "grad_norm": 0.060345377773046494, + "learning_rate": 0.00016424830031372425, + "loss": 0.4248, + "step": 513 + }, + { + "epoch": 1.4034129692832764, + "grad_norm": 0.06127999722957611, + "learning_rate": 0.00016406532329413546, + "loss": 0.4129, + "step": 514 + }, + { + "epoch": 1.4061433447098977, + "grad_norm": 0.0599684976041317, + "learning_rate": 0.00016388198172965942, + "loss": 0.4223, + "step": 515 + }, + { + "epoch": 1.4088737201365187, + "grad_norm": 0.056950025260448456, + "learning_rate": 0.00016369827666354745, + "loss": 0.4293, + "step": 516 + }, + { + "epoch": 1.41160409556314, + "grad_norm": 0.05798695236444473, + "learning_rate": 0.00016351420914111916, + "loss": 0.4163, + "step": 517 + }, + { + "epoch": 1.4143344709897612, + "grad_norm": 0.056971821933984756, + "learning_rate": 0.0001633297802097567, + "loss": 0.4088, + "step": 518 + }, + { + "epoch": 1.4170648464163822, + "grad_norm": 0.06520035862922668, + "learning_rate": 0.0001631449909188987, + "loss": 0.4316, + "step": 519 + }, + { + "epoch": 1.4197952218430034, + "grad_norm": 0.054386623203754425, + "learning_rate": 0.00016295984232003426, + "loss": 0.4276, + "step": 520 + }, + { + "epoch": 1.4225255972696247, + "grad_norm": 0.06270336359739304, + "learning_rate": 0.00016277433546669703, + "loss": 0.4133, + "step": 521 + }, + { + "epoch": 1.4252559726962457, + "grad_norm": 0.05896778032183647, + "learning_rate": 0.00016258847141445928, + "loss": 0.4331, + "step": 522 + }, + { + "epoch": 1.427986348122867, + "grad_norm": 0.06417705118656158, + "learning_rate": 0.00016240225122092573, + "loss": 0.4306, + "step": 523 + }, + { + "epoch": 1.430716723549488, + "grad_norm": 0.06666136533021927, + "learning_rate": 0.00016221567594572762, + "loss": 0.4369, + "step": 524 + }, + { + "epoch": 1.4334470989761092, + "grad_norm": 0.06409899890422821, + "learning_rate": 0.00016202874665051674, + "loss": 0.442, + "step": 525 + }, + { + "epoch": 1.4361774744027302, + "grad_norm": 0.06460480391979218, + "learning_rate": 0.00016184146439895928, + "loss": 0.4114, + "step": 526 + }, + { + "epoch": 1.4389078498293515, + "grad_norm": 0.06045004725456238, + "learning_rate": 0.00016165383025672981, + "loss": 0.424, + "step": 527 + }, + { + "epoch": 1.4416382252559727, + "grad_norm": 0.0617341473698616, + "learning_rate": 0.00016146584529150526, + "loss": 0.4201, + "step": 528 + }, + { + "epoch": 1.4443686006825938, + "grad_norm": 0.06265206634998322, + "learning_rate": 0.0001612775105729588, + "loss": 0.4145, + "step": 529 + }, + { + "epoch": 1.447098976109215, + "grad_norm": 0.06431074440479279, + "learning_rate": 0.00016108882717275384, + "loss": 0.397, + "step": 530 + }, + { + "epoch": 1.4498293515358363, + "grad_norm": 0.05702768266201019, + "learning_rate": 0.0001608997961645377, + "loss": 0.4024, + "step": 531 + }, + { + "epoch": 1.4525597269624573, + "grad_norm": 0.06387649476528168, + "learning_rate": 0.00016071041862393578, + "loss": 0.4369, + "step": 532 + }, + { + "epoch": 1.4552901023890785, + "grad_norm": 0.06181952729821205, + "learning_rate": 0.0001605206956285454, + "loss": 0.4391, + "step": 533 + }, + { + "epoch": 1.4580204778156998, + "grad_norm": 0.060091473162174225, + "learning_rate": 0.00016033062825792935, + "loss": 0.4207, + "step": 534 + }, + { + "epoch": 1.4607508532423208, + "grad_norm": 0.059614650905132294, + "learning_rate": 0.0001601402175936102, + "loss": 0.409, + "step": 535 + }, + { + "epoch": 1.463481228668942, + "grad_norm": 0.06142239645123482, + "learning_rate": 0.00015994946471906382, + "loss": 0.4236, + "step": 536 + }, + { + "epoch": 1.466211604095563, + "grad_norm": 0.06790998578071594, + "learning_rate": 0.0001597583707197134, + "loss": 0.4131, + "step": 537 + }, + { + "epoch": 1.4689419795221843, + "grad_norm": 0.05919467657804489, + "learning_rate": 0.00015956693668292313, + "loss": 0.418, + "step": 538 + }, + { + "epoch": 1.4716723549488053, + "grad_norm": 0.06804287433624268, + "learning_rate": 0.00015937516369799216, + "loss": 0.4216, + "step": 539 + }, + { + "epoch": 1.4744027303754266, + "grad_norm": 0.061936333775520325, + "learning_rate": 0.00015918305285614822, + "loss": 0.4239, + "step": 540 + }, + { + "epoch": 1.4771331058020478, + "grad_norm": 0.06181802973151207, + "learning_rate": 0.00015899060525054157, + "loss": 0.4136, + "step": 541 + }, + { + "epoch": 1.4798634812286688, + "grad_norm": 0.05767858028411865, + "learning_rate": 0.0001587978219762388, + "loss": 0.4178, + "step": 542 + }, + { + "epoch": 1.48259385665529, + "grad_norm": 0.06959601491689682, + "learning_rate": 0.00015860470413021642, + "loss": 0.4271, + "step": 543 + }, + { + "epoch": 1.4853242320819113, + "grad_norm": 0.05592988058924675, + "learning_rate": 0.00015841125281135473, + "loss": 0.4165, + "step": 544 + }, + { + "epoch": 1.4880546075085324, + "grad_norm": 0.06603039801120758, + "learning_rate": 0.00015821746912043165, + "loss": 0.4359, + "step": 545 + }, + { + "epoch": 1.4907849829351536, + "grad_norm": 0.05518212914466858, + "learning_rate": 0.00015802335416011625, + "loss": 0.4284, + "step": 546 + }, + { + "epoch": 1.4935153583617748, + "grad_norm": 0.062445998191833496, + "learning_rate": 0.00015782890903496264, + "loss": 0.4171, + "step": 547 + }, + { + "epoch": 1.4962457337883959, + "grad_norm": 0.05508886277675629, + "learning_rate": 0.00015763413485140365, + "loss": 0.4001, + "step": 548 + }, + { + "epoch": 1.4989761092150171, + "grad_norm": 0.0545768216252327, + "learning_rate": 0.00015743903271774455, + "loss": 0.4081, + "step": 549 + }, + { + "epoch": 1.5017064846416384, + "grad_norm": 0.058887772262096405, + "learning_rate": 0.0001572436037441566, + "loss": 0.4224, + "step": 550 + }, + { + "epoch": 1.5044368600682594, + "grad_norm": 0.05538494512438774, + "learning_rate": 0.00015704784904267097, + "loss": 0.4254, + "step": 551 + }, + { + "epoch": 1.5071672354948804, + "grad_norm": 0.05865982919931412, + "learning_rate": 0.00015685176972717223, + "loss": 0.4142, + "step": 552 + }, + { + "epoch": 1.5098976109215017, + "grad_norm": 0.05798998102545738, + "learning_rate": 0.00015665536691339207, + "loss": 0.4298, + "step": 553 + }, + { + "epoch": 1.512627986348123, + "grad_norm": 0.05779840052127838, + "learning_rate": 0.00015645864171890295, + "loss": 0.4145, + "step": 554 + }, + { + "epoch": 1.515358361774744, + "grad_norm": 0.05778159946203232, + "learning_rate": 0.00015626159526311174, + "loss": 0.4249, + "step": 555 + }, + { + "epoch": 1.5180887372013652, + "grad_norm": 0.0566212497651577, + "learning_rate": 0.00015606422866725343, + "loss": 0.4366, + "step": 556 + }, + { + "epoch": 1.5208191126279864, + "grad_norm": 0.05623873695731163, + "learning_rate": 0.00015586654305438456, + "loss": 0.4297, + "step": 557 + }, + { + "epoch": 1.5235494880546074, + "grad_norm": 0.05833446979522705, + "learning_rate": 0.00015566853954937694, + "loss": 0.4361, + "step": 558 + }, + { + "epoch": 1.5262798634812287, + "grad_norm": 0.05821897089481354, + "learning_rate": 0.00015547021927891144, + "loss": 0.4309, + "step": 559 + }, + { + "epoch": 1.52901023890785, + "grad_norm": 0.05831674486398697, + "learning_rate": 0.00015527158337147112, + "loss": 0.4228, + "step": 560 + }, + { + "epoch": 1.531740614334471, + "grad_norm": 0.05716761201620102, + "learning_rate": 0.00015507263295733528, + "loss": 0.4237, + "step": 561 + }, + { + "epoch": 1.5344709897610922, + "grad_norm": 0.061434000730514526, + "learning_rate": 0.00015487336916857278, + "loss": 0.4307, + "step": 562 + }, + { + "epoch": 1.5372013651877134, + "grad_norm": 0.055752865970134735, + "learning_rate": 0.00015467379313903557, + "loss": 0.4089, + "step": 563 + }, + { + "epoch": 1.5399317406143345, + "grad_norm": 0.05673924833536148, + "learning_rate": 0.00015447390600435238, + "loss": 0.3955, + "step": 564 + }, + { + "epoch": 1.5426621160409555, + "grad_norm": 0.05844118818640709, + "learning_rate": 0.00015427370890192224, + "loss": 0.4266, + "step": 565 + }, + { + "epoch": 1.545392491467577, + "grad_norm": 0.05962743982672691, + "learning_rate": 0.00015407320297090786, + "loss": 0.4063, + "step": 566 + }, + { + "epoch": 1.548122866894198, + "grad_norm": 0.05776818096637726, + "learning_rate": 0.00015387238935222927, + "loss": 0.4236, + "step": 567 + }, + { + "epoch": 1.550853242320819, + "grad_norm": 0.05769157037138939, + "learning_rate": 0.00015367126918855738, + "loss": 0.4183, + "step": 568 + }, + { + "epoch": 1.5535836177474402, + "grad_norm": 0.05596569553017616, + "learning_rate": 0.0001534698436243073, + "loss": 0.4074, + "step": 569 + }, + { + "epoch": 1.5563139931740615, + "grad_norm": 0.05986526980996132, + "learning_rate": 0.00015326811380563204, + "loss": 0.4166, + "step": 570 + }, + { + "epoch": 1.5590443686006825, + "grad_norm": 0.05552714318037033, + "learning_rate": 0.0001530660808804158, + "loss": 0.3986, + "step": 571 + }, + { + "epoch": 1.5617747440273038, + "grad_norm": 0.05853855237364769, + "learning_rate": 0.00015286374599826754, + "loss": 0.3964, + "step": 572 + }, + { + "epoch": 1.564505119453925, + "grad_norm": 0.06155244633555412, + "learning_rate": 0.00015266111031051442, + "loss": 0.4041, + "step": 573 + }, + { + "epoch": 1.567235494880546, + "grad_norm": 0.061913736164569855, + "learning_rate": 0.00015245817497019524, + "loss": 0.4228, + "step": 574 + }, + { + "epoch": 1.5699658703071673, + "grad_norm": 0.05519396439194679, + "learning_rate": 0.00015225494113205393, + "loss": 0.4124, + "step": 575 + }, + { + "epoch": 1.5726962457337885, + "grad_norm": 0.05629811808466911, + "learning_rate": 0.00015205140995253283, + "loss": 0.418, + "step": 576 + }, + { + "epoch": 1.5754266211604095, + "grad_norm": 0.051916785538196564, + "learning_rate": 0.00015184758258976637, + "loss": 0.4327, + "step": 577 + }, + { + "epoch": 1.5781569965870306, + "grad_norm": 0.05583992972970009, + "learning_rate": 0.00015164346020357417, + "loss": 0.417, + "step": 578 + }, + { + "epoch": 1.580887372013652, + "grad_norm": 0.05611740052700043, + "learning_rate": 0.00015143904395545466, + "loss": 0.413, + "step": 579 + }, + { + "epoch": 1.583617747440273, + "grad_norm": 0.05637525022029877, + "learning_rate": 0.0001512343350085784, + "loss": 0.4113, + "step": 580 + }, + { + "epoch": 1.586348122866894, + "grad_norm": 0.059624236077070236, + "learning_rate": 0.0001510293345277815, + "loss": 0.4321, + "step": 581 + }, + { + "epoch": 1.5890784982935153, + "grad_norm": 0.05502263084053993, + "learning_rate": 0.0001508240436795589, + "loss": 0.409, + "step": 582 + }, + { + "epoch": 1.5918088737201366, + "grad_norm": 0.05809929221868515, + "learning_rate": 0.00015061846363205784, + "loss": 0.4129, + "step": 583 + }, + { + "epoch": 1.5945392491467576, + "grad_norm": 0.05428490787744522, + "learning_rate": 0.00015041259555507108, + "loss": 0.4181, + "step": 584 + }, + { + "epoch": 1.5972696245733788, + "grad_norm": 0.05276649072766304, + "learning_rate": 0.00015020644062003046, + "loss": 0.3996, + "step": 585 + }, + { + "epoch": 1.6, + "grad_norm": 0.06145811080932617, + "learning_rate": 0.00015000000000000001, + "loss": 0.4156, + "step": 586 + }, + { + "epoch": 1.6027303754266211, + "grad_norm": 0.05626256391406059, + "learning_rate": 0.00014979327486966938, + "loss": 0.4184, + "step": 587 + }, + { + "epoch": 1.6054607508532424, + "grad_norm": 0.06118204817175865, + "learning_rate": 0.0001495862664053471, + "loss": 0.4208, + "step": 588 + }, + { + "epoch": 1.6081911262798636, + "grad_norm": 0.06345456838607788, + "learning_rate": 0.0001493789757849541, + "loss": 0.4234, + "step": 589 + }, + { + "epoch": 1.6109215017064846, + "grad_norm": 0.058717817068099976, + "learning_rate": 0.00014917140418801655, + "loss": 0.4176, + "step": 590 + }, + { + "epoch": 1.6136518771331056, + "grad_norm": 0.05213068425655365, + "learning_rate": 0.00014896355279565976, + "loss": 0.3857, + "step": 591 + }, + { + "epoch": 1.6163822525597271, + "grad_norm": 0.056677792221307755, + "learning_rate": 0.00014875542279060085, + "loss": 0.4211, + "step": 592 + }, + { + "epoch": 1.6191126279863481, + "grad_norm": 0.058997780084609985, + "learning_rate": 0.00014854701535714244, + "loss": 0.4174, + "step": 593 + }, + { + "epoch": 1.6218430034129692, + "grad_norm": 0.0554414838552475, + "learning_rate": 0.00014833833168116582, + "loss": 0.4182, + "step": 594 + }, + { + "epoch": 1.6245733788395904, + "grad_norm": 0.06074132025241852, + "learning_rate": 0.00014812937295012406, + "loss": 0.4261, + "step": 595 + }, + { + "epoch": 1.6273037542662117, + "grad_norm": 0.05850062891840935, + "learning_rate": 0.00014792014035303535, + "loss": 0.4085, + "step": 596 + }, + { + "epoch": 1.6300341296928327, + "grad_norm": 0.06121140718460083, + "learning_rate": 0.00014771063508047636, + "loss": 0.4183, + "step": 597 + }, + { + "epoch": 1.632764505119454, + "grad_norm": 0.06299193948507309, + "learning_rate": 0.00014750085832457519, + "loss": 0.426, + "step": 598 + }, + { + "epoch": 1.6354948805460752, + "grad_norm": 0.06619743257761002, + "learning_rate": 0.00014729081127900476, + "loss": 0.4129, + "step": 599 + }, + { + "epoch": 1.6382252559726962, + "grad_norm": 0.05819617956876755, + "learning_rate": 0.0001470804951389761, + "loss": 0.4129, + "step": 600 + }, + { + "epoch": 1.6409556313993174, + "grad_norm": 0.06314659863710403, + "learning_rate": 0.00014686991110123135, + "loss": 0.3967, + "step": 601 + }, + { + "epoch": 1.6436860068259387, + "grad_norm": 0.05983169004321098, + "learning_rate": 0.00014665906036403706, + "loss": 0.4161, + "step": 602 + }, + { + "epoch": 1.6464163822525597, + "grad_norm": 0.06163496896624565, + "learning_rate": 0.00014644794412717736, + "loss": 0.4103, + "step": 603 + }, + { + "epoch": 1.6491467576791807, + "grad_norm": 0.06737516075372696, + "learning_rate": 0.00014623656359194712, + "loss": 0.4215, + "step": 604 + }, + { + "epoch": 1.6518771331058022, + "grad_norm": 0.058461885899305344, + "learning_rate": 0.00014602491996114516, + "loss": 0.4168, + "step": 605 + }, + { + "epoch": 1.6546075085324232, + "grad_norm": 0.06050106883049011, + "learning_rate": 0.0001458130144390673, + "loss": 0.4184, + "step": 606 + }, + { + "epoch": 1.6573378839590442, + "grad_norm": 0.059844836592674255, + "learning_rate": 0.00014560084823149965, + "loss": 0.4181, + "step": 607 + }, + { + "epoch": 1.6600682593856655, + "grad_norm": 0.05483812466263771, + "learning_rate": 0.0001453884225457116, + "loss": 0.3996, + "step": 608 + }, + { + "epoch": 1.6627986348122867, + "grad_norm": 0.06310712546110153, + "learning_rate": 0.00014517573859044907, + "loss": 0.4266, + "step": 609 + }, + { + "epoch": 1.6655290102389078, + "grad_norm": 0.06159716099500656, + "learning_rate": 0.00014496279757592766, + "loss": 0.4248, + "step": 610 + }, + { + "epoch": 1.668259385665529, + "grad_norm": 0.058709222823381424, + "learning_rate": 0.0001447496007138255, + "loss": 0.4067, + "step": 611 + }, + { + "epoch": 1.6709897610921502, + "grad_norm": 0.05836094543337822, + "learning_rate": 0.00014453614921727668, + "loss": 0.4005, + "step": 612 + }, + { + "epoch": 1.6737201365187713, + "grad_norm": 0.05980111286044121, + "learning_rate": 0.00014432244430086423, + "loss": 0.4222, + "step": 613 + }, + { + "epoch": 1.6764505119453925, + "grad_norm": 0.05967998504638672, + "learning_rate": 0.00014410848718061312, + "loss": 0.4075, + "step": 614 + }, + { + "epoch": 1.6791808873720138, + "grad_norm": 0.05903726816177368, + "learning_rate": 0.00014389427907398342, + "loss": 0.4007, + "step": 615 + }, + { + "epoch": 1.6819112627986348, + "grad_norm": 0.05877222120761871, + "learning_rate": 0.00014367982119986342, + "loss": 0.4234, + "step": 616 + }, + { + "epoch": 1.6846416382252558, + "grad_norm": 0.0625043734908104, + "learning_rate": 0.00014346511477856259, + "loss": 0.4165, + "step": 617 + }, + { + "epoch": 1.6873720136518773, + "grad_norm": 0.05730627477169037, + "learning_rate": 0.0001432501610318047, + "loss": 0.4221, + "step": 618 + }, + { + "epoch": 1.6901023890784983, + "grad_norm": 0.05606284737586975, + "learning_rate": 0.00014303496118272084, + "loss": 0.4201, + "step": 619 + }, + { + "epoch": 1.6928327645051193, + "grad_norm": 0.056516390293836594, + "learning_rate": 0.0001428195164558425, + "loss": 0.4241, + "step": 620 + }, + { + "epoch": 1.6955631399317406, + "grad_norm": 0.0579177550971508, + "learning_rate": 0.00014260382807709457, + "loss": 0.4147, + "step": 621 + }, + { + "epoch": 1.6982935153583618, + "grad_norm": 0.05802591145038605, + "learning_rate": 0.0001423878972737883, + "loss": 0.409, + "step": 622 + }, + { + "epoch": 1.7010238907849828, + "grad_norm": 0.05921417847275734, + "learning_rate": 0.0001421717252746145, + "loss": 0.4126, + "step": 623 + }, + { + "epoch": 1.703754266211604, + "grad_norm": 0.0596776120364666, + "learning_rate": 0.00014195531330963635, + "loss": 0.405, + "step": 624 + }, + { + "epoch": 1.7064846416382253, + "grad_norm": 0.057035986334085464, + "learning_rate": 0.0001417386626102825, + "loss": 0.4208, + "step": 625 + }, + { + "epoch": 1.7092150170648464, + "grad_norm": 0.05868854373693466, + "learning_rate": 0.00014152177440934012, + "loss": 0.4186, + "step": 626 + }, + { + "epoch": 1.7119453924914676, + "grad_norm": 0.058524154126644135, + "learning_rate": 0.0001413046499409477, + "loss": 0.4072, + "step": 627 + }, + { + "epoch": 1.7146757679180888, + "grad_norm": 0.05203258991241455, + "learning_rate": 0.0001410872904405882, + "loss": 0.3929, + "step": 628 + }, + { + "epoch": 1.7174061433447099, + "grad_norm": 0.059925347566604614, + "learning_rate": 0.00014086969714508196, + "loss": 0.4211, + "step": 629 + }, + { + "epoch": 1.7201365187713311, + "grad_norm": 0.0577407106757164, + "learning_rate": 0.00014065187129257964, + "loss": 0.4128, + "step": 630 + }, + { + "epoch": 1.7228668941979524, + "grad_norm": 0.06548412144184113, + "learning_rate": 0.00014043381412255526, + "loss": 0.4117, + "step": 631 + }, + { + "epoch": 1.7255972696245734, + "grad_norm": 0.060420285910367966, + "learning_rate": 0.00014021552687579902, + "loss": 0.4176, + "step": 632 + }, + { + "epoch": 1.7283276450511944, + "grad_norm": 0.05787500739097595, + "learning_rate": 0.00013999701079441028, + "loss": 0.4173, + "step": 633 + }, + { + "epoch": 1.7310580204778157, + "grad_norm": 0.10321489721536636, + "learning_rate": 0.00013977826712179058, + "loss": 0.4098, + "step": 634 + }, + { + "epoch": 1.733788395904437, + "grad_norm": 0.05935697257518768, + "learning_rate": 0.00013955929710263653, + "loss": 0.433, + "step": 635 + }, + { + "epoch": 1.736518771331058, + "grad_norm": 0.05731033533811569, + "learning_rate": 0.00013934010198293257, + "loss": 0.4117, + "step": 636 + }, + { + "epoch": 1.7392491467576792, + "grad_norm": 0.05932068079710007, + "learning_rate": 0.00013912068300994413, + "loss": 0.4, + "step": 637 + }, + { + "epoch": 1.7419795221843004, + "grad_norm": 0.06352514028549194, + "learning_rate": 0.0001389010414322104, + "loss": 0.4135, + "step": 638 + }, + { + "epoch": 1.7447098976109214, + "grad_norm": 0.0548391118645668, + "learning_rate": 0.0001386811784995371, + "loss": 0.3998, + "step": 639 + }, + { + "epoch": 1.7474402730375427, + "grad_norm": 0.05962222442030907, + "learning_rate": 0.00013846109546298971, + "loss": 0.3982, + "step": 640 + }, + { + "epoch": 1.750170648464164, + "grad_norm": 0.056578923016786575, + "learning_rate": 0.00013824079357488598, + "loss": 0.4187, + "step": 641 + }, + { + "epoch": 1.752901023890785, + "grad_norm": 0.05794934183359146, + "learning_rate": 0.0001380202740887891, + "loss": 0.406, + "step": 642 + }, + { + "epoch": 1.7556313993174062, + "grad_norm": 0.056768182665109634, + "learning_rate": 0.00013779953825950034, + "loss": 0.4129, + "step": 643 + }, + { + "epoch": 1.7583617747440274, + "grad_norm": 0.06082385033369064, + "learning_rate": 0.00013757858734305203, + "loss": 0.4226, + "step": 644 + }, + { + "epoch": 1.7610921501706485, + "grad_norm": 0.059198446571826935, + "learning_rate": 0.0001373574225967004, + "loss": 0.405, + "step": 645 + }, + { + "epoch": 1.7638225255972695, + "grad_norm": 0.06012206897139549, + "learning_rate": 0.00013713604527891844, + "loss": 0.4192, + "step": 646 + }, + { + "epoch": 1.7665529010238907, + "grad_norm": 0.06151711568236351, + "learning_rate": 0.00013691445664938866, + "loss": 0.4206, + "step": 647 + }, + { + "epoch": 1.769283276450512, + "grad_norm": 0.06284491717815399, + "learning_rate": 0.00013669265796899607, + "loss": 0.4118, + "step": 648 + }, + { + "epoch": 1.772013651877133, + "grad_norm": 0.06001686304807663, + "learning_rate": 0.00013647065049982078, + "loss": 0.4293, + "step": 649 + }, + { + "epoch": 1.7747440273037542, + "grad_norm": 0.05952538549900055, + "learning_rate": 0.0001362484355051311, + "loss": 0.4114, + "step": 650 + }, + { + "epoch": 1.7774744027303755, + "grad_norm": 0.057195715606212616, + "learning_rate": 0.00013602601424937604, + "loss": 0.4104, + "step": 651 + }, + { + "epoch": 1.7802047781569965, + "grad_norm": 0.05979065224528313, + "learning_rate": 0.00013580338799817844, + "loss": 0.4321, + "step": 652 + }, + { + "epoch": 1.7829351535836178, + "grad_norm": 0.06188386306166649, + "learning_rate": 0.00013558055801832748, + "loss": 0.4044, + "step": 653 + }, + { + "epoch": 1.785665529010239, + "grad_norm": 0.060921113938093185, + "learning_rate": 0.0001353575255777717, + "loss": 0.422, + "step": 654 + }, + { + "epoch": 1.78839590443686, + "grad_norm": 0.0592602975666523, + "learning_rate": 0.0001351342919456116, + "loss": 0.3936, + "step": 655 + }, + { + "epoch": 1.7911262798634813, + "grad_norm": 0.06046243757009506, + "learning_rate": 0.0001349108583920925, + "loss": 0.4251, + "step": 656 + }, + { + "epoch": 1.7938566552901025, + "grad_norm": 0.05771365761756897, + "learning_rate": 0.00013468722618859743, + "loss": 0.4073, + "step": 657 + }, + { + "epoch": 1.7965870307167235, + "grad_norm": 0.05681789293885231, + "learning_rate": 0.0001344633966076396, + "loss": 0.4074, + "step": 658 + }, + { + "epoch": 1.7993174061433446, + "grad_norm": 0.05813178792595863, + "learning_rate": 0.00013423937092285555, + "loss": 0.3896, + "step": 659 + }, + { + "epoch": 1.802047781569966, + "grad_norm": 0.05757216364145279, + "learning_rate": 0.00013401515040899746, + "loss": 0.4178, + "step": 660 + }, + { + "epoch": 1.804778156996587, + "grad_norm": 0.057594846934080124, + "learning_rate": 0.00013379073634192632, + "loss": 0.3785, + "step": 661 + }, + { + "epoch": 1.807508532423208, + "grad_norm": 0.06386829912662506, + "learning_rate": 0.00013356612999860436, + "loss": 0.4017, + "step": 662 + }, + { + "epoch": 1.8102389078498293, + "grad_norm": 0.059352222830057144, + "learning_rate": 0.000133341332657088, + "loss": 0.4053, + "step": 663 + }, + { + "epoch": 1.8129692832764506, + "grad_norm": 0.058490559458732605, + "learning_rate": 0.00013311634559652036, + "loss": 0.4036, + "step": 664 + }, + { + "epoch": 1.8156996587030716, + "grad_norm": 0.0580880232155323, + "learning_rate": 0.00013289117009712418, + "loss": 0.4075, + "step": 665 + }, + { + "epoch": 1.8184300341296928, + "grad_norm": 0.054440416395664215, + "learning_rate": 0.00013266580744019445, + "loss": 0.4139, + "step": 666 + }, + { + "epoch": 1.821160409556314, + "grad_norm": 0.058102305978536606, + "learning_rate": 0.00013244025890809112, + "loss": 0.4051, + "step": 667 + }, + { + "epoch": 1.823890784982935, + "grad_norm": 0.06036128103733063, + "learning_rate": 0.00013221452578423176, + "loss": 0.4091, + "step": 668 + }, + { + "epoch": 1.8266211604095564, + "grad_norm": 0.061323538422584534, + "learning_rate": 0.00013198860935308444, + "loss": 0.4273, + "step": 669 + }, + { + "epoch": 1.8293515358361776, + "grad_norm": 0.06144220754504204, + "learning_rate": 0.00013176251090016007, + "loss": 0.4228, + "step": 670 + }, + { + "epoch": 1.8320819112627986, + "grad_norm": 0.05480247363448143, + "learning_rate": 0.0001315362317120055, + "loss": 0.4078, + "step": 671 + }, + { + "epoch": 1.8348122866894196, + "grad_norm": 0.0559588298201561, + "learning_rate": 0.00013130977307619594, + "loss": 0.4015, + "step": 672 + }, + { + "epoch": 1.8375426621160411, + "grad_norm": 0.0562249980866909, + "learning_rate": 0.0001310831362813276, + "loss": 0.4216, + "step": 673 + }, + { + "epoch": 1.8402730375426621, + "grad_norm": 0.05529346689581871, + "learning_rate": 0.00013085632261701063, + "loss": 0.3991, + "step": 674 + }, + { + "epoch": 1.8430034129692832, + "grad_norm": 0.055582497268915176, + "learning_rate": 0.00013062933337386142, + "loss": 0.3956, + "step": 675 + }, + { + "epoch": 1.8457337883959044, + "grad_norm": 0.057054124772548676, + "learning_rate": 0.00013040216984349555, + "loss": 0.398, + "step": 676 + }, + { + "epoch": 1.8484641638225257, + "grad_norm": 0.057355768978595734, + "learning_rate": 0.00013017483331852035, + "loss": 0.4059, + "step": 677 + }, + { + "epoch": 1.8511945392491467, + "grad_norm": 0.056889165192842484, + "learning_rate": 0.00012994732509252744, + "loss": 0.3806, + "step": 678 + }, + { + "epoch": 1.853924914675768, + "grad_norm": 0.057586781680583954, + "learning_rate": 0.00012971964646008542, + "loss": 0.4104, + "step": 679 + }, + { + "epoch": 1.8566552901023892, + "grad_norm": 0.059306979179382324, + "learning_rate": 0.00012949179871673278, + "loss": 0.4033, + "step": 680 + }, + { + "epoch": 1.8593856655290102, + "grad_norm": 0.057881347835063934, + "learning_rate": 0.00012926378315896998, + "loss": 0.4135, + "step": 681 + }, + { + "epoch": 1.8621160409556314, + "grad_norm": 0.06169261038303375, + "learning_rate": 0.00012903560108425258, + "loss": 0.412, + "step": 682 + }, + { + "epoch": 1.8648464163822527, + "grad_norm": 0.05441267788410187, + "learning_rate": 0.00012880725379098352, + "loss": 0.3986, + "step": 683 + }, + { + "epoch": 1.8675767918088737, + "grad_norm": 0.061068952083587646, + "learning_rate": 0.00012857874257850605, + "loss": 0.418, + "step": 684 + }, + { + "epoch": 1.8703071672354947, + "grad_norm": 0.058384671807289124, + "learning_rate": 0.00012835006874709594, + "loss": 0.4074, + "step": 685 + }, + { + "epoch": 1.8730375426621162, + "grad_norm": 0.0570659376680851, + "learning_rate": 0.00012812123359795446, + "loss": 0.4149, + "step": 686 + }, + { + "epoch": 1.8757679180887372, + "grad_norm": 0.05798759683966637, + "learning_rate": 0.00012789223843320073, + "loss": 0.4022, + "step": 687 + }, + { + "epoch": 1.8784982935153582, + "grad_norm": 0.059756677597761154, + "learning_rate": 0.0001276630845558644, + "loss": 0.4152, + "step": 688 + }, + { + "epoch": 1.8812286689419795, + "grad_norm": 0.05982014164328575, + "learning_rate": 0.00012743377326987826, + "loss": 0.4127, + "step": 689 + }, + { + "epoch": 1.8839590443686007, + "grad_norm": 0.05929556116461754, + "learning_rate": 0.00012720430588007077, + "loss": 0.405, + "step": 690 + }, + { + "epoch": 1.8866894197952218, + "grad_norm": 0.05722184479236603, + "learning_rate": 0.00012697468369215863, + "loss": 0.3978, + "step": 691 + }, + { + "epoch": 1.889419795221843, + "grad_norm": 0.05866376683115959, + "learning_rate": 0.00012674490801273938, + "loss": 0.417, + "step": 692 + }, + { + "epoch": 1.8921501706484642, + "grad_norm": 0.055445022881031036, + "learning_rate": 0.00012651498014928402, + "loss": 0.4161, + "step": 693 + }, + { + "epoch": 1.8948805460750853, + "grad_norm": 0.06086587905883789, + "learning_rate": 0.00012628490141012937, + "loss": 0.402, + "step": 694 + }, + { + "epoch": 1.8976109215017065, + "grad_norm": 0.06076718121767044, + "learning_rate": 0.000126054673104471, + "loss": 0.414, + "step": 695 + }, + { + "epoch": 1.9003412969283278, + "grad_norm": 0.055698879063129425, + "learning_rate": 0.00012582429654235523, + "loss": 0.3926, + "step": 696 + }, + { + "epoch": 1.9030716723549488, + "grad_norm": 0.056595612317323685, + "learning_rate": 0.00012559377303467226, + "loss": 0.4135, + "step": 697 + }, + { + "epoch": 1.9058020477815698, + "grad_norm": 0.05591044947504997, + "learning_rate": 0.00012536310389314832, + "loss": 0.4074, + "step": 698 + }, + { + "epoch": 1.9085324232081913, + "grad_norm": 0.06135864555835724, + "learning_rate": 0.0001251322904303383, + "loss": 0.4203, + "step": 699 + }, + { + "epoch": 1.9112627986348123, + "grad_norm": 0.058106984943151474, + "learning_rate": 0.00012490133395961844, + "loss": 0.4046, + "step": 700 + }, + { + "epoch": 1.9139931740614333, + "grad_norm": 0.059473518282175064, + "learning_rate": 0.00012467023579517856, + "loss": 0.4027, + "step": 701 + }, + { + "epoch": 1.9167235494880546, + "grad_norm": 0.057781342417001724, + "learning_rate": 0.00012443899725201482, + "loss": 0.4163, + "step": 702 + }, + { + "epoch": 1.9194539249146758, + "grad_norm": 0.0613093338906765, + "learning_rate": 0.00012420761964592223, + "loss": 0.4127, + "step": 703 + }, + { + "epoch": 1.9221843003412968, + "grad_norm": 0.05781256780028343, + "learning_rate": 0.000123976104293487, + "loss": 0.398, + "step": 704 + }, + { + "epoch": 1.924914675767918, + "grad_norm": 0.057743050158023834, + "learning_rate": 0.00012374445251207914, + "loss": 0.3969, + "step": 705 + }, + { + "epoch": 1.9276450511945393, + "grad_norm": 0.0608978345990181, + "learning_rate": 0.00012351266561984507, + "loss": 0.4037, + "step": 706 + }, + { + "epoch": 1.9303754266211604, + "grad_norm": 0.05937394127249718, + "learning_rate": 0.00012328074493569993, + "loss": 0.3964, + "step": 707 + }, + { + "epoch": 1.9331058020477816, + "grad_norm": 0.06584876775741577, + "learning_rate": 0.0001230486917793202, + "loss": 0.4186, + "step": 708 + }, + { + "epoch": 1.9358361774744028, + "grad_norm": 0.06222471594810486, + "learning_rate": 0.00012281650747113612, + "loss": 0.4178, + "step": 709 + }, + { + "epoch": 1.9385665529010239, + "grad_norm": 0.05962240695953369, + "learning_rate": 0.0001225841933323242, + "loss": 0.3898, + "step": 710 + }, + { + "epoch": 1.9412969283276449, + "grad_norm": 0.06118809059262276, + "learning_rate": 0.00012235175068479984, + "loss": 0.3926, + "step": 711 + }, + { + "epoch": 1.9440273037542664, + "grad_norm": 0.05581739544868469, + "learning_rate": 0.00012211918085120954, + "loss": 0.3907, + "step": 712 + }, + { + "epoch": 1.9467576791808874, + "grad_norm": 0.05898397043347359, + "learning_rate": 0.00012188648515492355, + "loss": 0.3979, + "step": 713 + }, + { + "epoch": 1.9494880546075084, + "grad_norm": 0.06158998981118202, + "learning_rate": 0.00012165366492002832, + "loss": 0.4138, + "step": 714 + }, + { + "epoch": 1.9522184300341296, + "grad_norm": 0.06278332322835922, + "learning_rate": 0.00012142072147131898, + "loss": 0.4141, + "step": 715 + }, + { + "epoch": 1.954948805460751, + "grad_norm": 0.06232950836420059, + "learning_rate": 0.00012118765613429173, + "loss": 0.4058, + "step": 716 + }, + { + "epoch": 1.957679180887372, + "grad_norm": 0.05726422742009163, + "learning_rate": 0.0001209544702351363, + "loss": 0.4021, + "step": 717 + }, + { + "epoch": 1.9604095563139932, + "grad_norm": 0.05597952753305435, + "learning_rate": 0.00012072116510072858, + "loss": 0.3965, + "step": 718 + }, + { + "epoch": 1.9631399317406144, + "grad_norm": 0.0619698166847229, + "learning_rate": 0.00012048774205862279, + "loss": 0.4112, + "step": 719 + }, + { + "epoch": 1.9658703071672354, + "grad_norm": 0.05994318053126335, + "learning_rate": 0.0001202542024370441, + "loss": 0.4186, + "step": 720 + }, + { + "epoch": 1.9686006825938567, + "grad_norm": 0.06278800964355469, + "learning_rate": 0.00012002054756488115, + "loss": 0.4122, + "step": 721 + }, + { + "epoch": 1.971331058020478, + "grad_norm": 0.06267794966697693, + "learning_rate": 0.00011978677877167822, + "loss": 0.4057, + "step": 722 + }, + { + "epoch": 1.974061433447099, + "grad_norm": 0.06913238018751144, + "learning_rate": 0.00011955289738762796, + "loss": 0.4069, + "step": 723 + }, + { + "epoch": 1.9767918088737202, + "grad_norm": 0.06418196856975555, + "learning_rate": 0.00011931890474356358, + "loss": 0.4078, + "step": 724 + }, + { + "epoch": 1.9795221843003414, + "grad_norm": 0.06403093785047531, + "learning_rate": 0.00011908480217095141, + "loss": 0.4062, + "step": 725 + }, + { + "epoch": 1.9822525597269625, + "grad_norm": 0.0585256926715374, + "learning_rate": 0.00011885059100188341, + "loss": 0.409, + "step": 726 + }, + { + "epoch": 1.9849829351535835, + "grad_norm": 0.06400654464960098, + "learning_rate": 0.00011861627256906929, + "loss": 0.4113, + "step": 727 + }, + { + "epoch": 1.9877133105802047, + "grad_norm": 0.06193806603550911, + "learning_rate": 0.00011838184820582923, + "loss": 0.4194, + "step": 728 + }, + { + "epoch": 1.990443686006826, + "grad_norm": 0.05865743011236191, + "learning_rate": 0.00011814731924608616, + "loss": 0.4002, + "step": 729 + }, + { + "epoch": 1.993174061433447, + "grad_norm": 0.05942784622311592, + "learning_rate": 0.00011791268702435816, + "loss": 0.4047, + "step": 730 + }, + { + "epoch": 1.9959044368600682, + "grad_norm": 0.056138355284929276, + "learning_rate": 0.0001176779528757509, + "loss": 0.4084, + "step": 731 + }, + { + "epoch": 1.9986348122866895, + "grad_norm": 0.058754485100507736, + "learning_rate": 0.00011744311813595006, + "loss": 0.3986, + "step": 732 + }, + { + "epoch": 1.9986348122866895, + "eval_loss": 0.40529951453208923, + "eval_runtime": 310.303, + "eval_samples_per_second": 8.392, + "eval_steps_per_second": 1.051, + "step": 732 + }, + { + "epoch": 2.0013651877133105, + "grad_norm": 0.12337245792150497, + "learning_rate": 0.00011720818414121368, + "loss": 0.6736, + "step": 733 + }, + { + "epoch": 2.0040955631399315, + "grad_norm": 0.0677185207605362, + "learning_rate": 0.00011697315222836458, + "loss": 0.3943, + "step": 734 + }, + { + "epoch": 2.006825938566553, + "grad_norm": 0.06678740680217743, + "learning_rate": 0.0001167380237347828, + "loss": 0.3947, + "step": 735 + }, + { + "epoch": 2.009556313993174, + "grad_norm": 0.06545857340097427, + "learning_rate": 0.00011650279999839787, + "loss": 0.3893, + "step": 736 + }, + { + "epoch": 2.012286689419795, + "grad_norm": 0.07168494164943695, + "learning_rate": 0.00011626748235768128, + "loss": 0.3969, + "step": 737 + }, + { + "epoch": 2.0150170648464165, + "grad_norm": 0.06647184491157532, + "learning_rate": 0.00011603207215163894, + "loss": 0.3705, + "step": 738 + }, + { + "epoch": 2.0177474402730375, + "grad_norm": 0.06479815393686295, + "learning_rate": 0.0001157965707198034, + "loss": 0.4064, + "step": 739 + }, + { + "epoch": 2.0204778156996586, + "grad_norm": 0.06511078029870987, + "learning_rate": 0.00011556097940222628, + "loss": 0.408, + "step": 740 + }, + { + "epoch": 2.02320819112628, + "grad_norm": 0.06628479063510895, + "learning_rate": 0.00011532529953947075, + "loss": 0.4041, + "step": 741 + }, + { + "epoch": 2.025938566552901, + "grad_norm": 0.062073782086372375, + "learning_rate": 0.00011508953247260379, + "loss": 0.3935, + "step": 742 + }, + { + "epoch": 2.028668941979522, + "grad_norm": 0.0709017738699913, + "learning_rate": 0.00011485367954318856, + "loss": 0.3895, + "step": 743 + }, + { + "epoch": 2.0313993174061435, + "grad_norm": 0.06909438967704773, + "learning_rate": 0.0001146177420932768, + "loss": 0.398, + "step": 744 + }, + { + "epoch": 2.0341296928327646, + "grad_norm": 0.06830695271492004, + "learning_rate": 0.00011438172146540123, + "loss": 0.3915, + "step": 745 + }, + { + "epoch": 2.0368600682593856, + "grad_norm": 0.06455153971910477, + "learning_rate": 0.00011414561900256784, + "loss": 0.396, + "step": 746 + }, + { + "epoch": 2.0395904436860066, + "grad_norm": 0.06828158348798752, + "learning_rate": 0.00011390943604824826, + "loss": 0.4, + "step": 747 + }, + { + "epoch": 2.042320819112628, + "grad_norm": 0.06488997489213943, + "learning_rate": 0.00011367317394637218, + "loss": 0.3899, + "step": 748 + }, + { + "epoch": 2.045051194539249, + "grad_norm": 0.0722300335764885, + "learning_rate": 0.00011343683404131964, + "loss": 0.4103, + "step": 749 + }, + { + "epoch": 2.04778156996587, + "grad_norm": 0.06264421343803406, + "learning_rate": 0.00011320041767791336, + "loss": 0.3909, + "step": 750 + }, + { + "epoch": 2.0505119453924916, + "grad_norm": 0.06223292276263237, + "learning_rate": 0.00011296392620141114, + "loss": 0.4038, + "step": 751 + }, + { + "epoch": 2.0532423208191126, + "grad_norm": 0.06223985552787781, + "learning_rate": 0.00011272736095749823, + "loss": 0.3851, + "step": 752 + }, + { + "epoch": 2.0559726962457336, + "grad_norm": 0.06464383751153946, + "learning_rate": 0.00011249072329227959, + "loss": 0.4085, + "step": 753 + }, + { + "epoch": 2.058703071672355, + "grad_norm": 0.06551406532526016, + "learning_rate": 0.0001122540145522723, + "loss": 0.3954, + "step": 754 + }, + { + "epoch": 2.061433447098976, + "grad_norm": 0.0651601254940033, + "learning_rate": 0.00011201723608439778, + "loss": 0.3903, + "step": 755 + }, + { + "epoch": 2.064163822525597, + "grad_norm": 0.061696816235780716, + "learning_rate": 0.0001117803892359744, + "loss": 0.3909, + "step": 756 + }, + { + "epoch": 2.0668941979522186, + "grad_norm": 0.06516902893781662, + "learning_rate": 0.00011154347535470947, + "loss": 0.398, + "step": 757 + }, + { + "epoch": 2.0696245733788396, + "grad_norm": 0.06373114883899689, + "learning_rate": 0.00011130649578869173, + "loss": 0.4048, + "step": 758 + }, + { + "epoch": 2.0723549488054607, + "grad_norm": 0.06553075462579727, + "learning_rate": 0.00011106945188638378, + "loss": 0.3939, + "step": 759 + }, + { + "epoch": 2.0750853242320817, + "grad_norm": 0.0630226582288742, + "learning_rate": 0.00011083234499661426, + "loss": 0.3978, + "step": 760 + }, + { + "epoch": 2.077815699658703, + "grad_norm": 0.059685662388801575, + "learning_rate": 0.00011059517646857023, + "loss": 0.3995, + "step": 761 + }, + { + "epoch": 2.080546075085324, + "grad_norm": 0.06554658710956573, + "learning_rate": 0.00011035794765178941, + "loss": 0.392, + "step": 762 + }, + { + "epoch": 2.083276450511945, + "grad_norm": 0.06697095930576324, + "learning_rate": 0.0001101206598961527, + "loss": 0.3807, + "step": 763 + }, + { + "epoch": 2.0860068259385667, + "grad_norm": 0.06333647668361664, + "learning_rate": 0.00010988331455187628, + "loss": 0.3803, + "step": 764 + }, + { + "epoch": 2.0887372013651877, + "grad_norm": 0.06475751847028732, + "learning_rate": 0.00010964591296950406, + "loss": 0.3872, + "step": 765 + }, + { + "epoch": 2.0914675767918087, + "grad_norm": 0.06261032074689865, + "learning_rate": 0.00010940845649989994, + "loss": 0.3971, + "step": 766 + }, + { + "epoch": 2.09419795221843, + "grad_norm": 0.06801852583885193, + "learning_rate": 0.00010917094649424018, + "loss": 0.4013, + "step": 767 + }, + { + "epoch": 2.096928327645051, + "grad_norm": 0.06227778270840645, + "learning_rate": 0.00010893338430400562, + "loss": 0.3919, + "step": 768 + }, + { + "epoch": 2.0996587030716722, + "grad_norm": 0.06618170440196991, + "learning_rate": 0.00010869577128097404, + "loss": 0.3961, + "step": 769 + }, + { + "epoch": 2.1023890784982937, + "grad_norm": 0.061390649527311325, + "learning_rate": 0.00010845810877721252, + "loss": 0.3823, + "step": 770 + }, + { + "epoch": 2.1051194539249147, + "grad_norm": 0.058039017021656036, + "learning_rate": 0.00010822039814506964, + "loss": 0.3834, + "step": 771 + }, + { + "epoch": 2.1078498293515358, + "grad_norm": 0.061444416642189026, + "learning_rate": 0.00010798264073716791, + "loss": 0.3905, + "step": 772 + }, + { + "epoch": 2.1105802047781568, + "grad_norm": 0.05958331748843193, + "learning_rate": 0.00010774483790639591, + "loss": 0.376, + "step": 773 + }, + { + "epoch": 2.1133105802047782, + "grad_norm": 0.06220965459942818, + "learning_rate": 0.00010750699100590076, + "loss": 0.3722, + "step": 774 + }, + { + "epoch": 2.1160409556313993, + "grad_norm": 0.06390852481126785, + "learning_rate": 0.00010726910138908032, + "loss": 0.3707, + "step": 775 + }, + { + "epoch": 2.1187713310580203, + "grad_norm": 0.06177375093102455, + "learning_rate": 0.00010703117040957553, + "loss": 0.3904, + "step": 776 + }, + { + "epoch": 2.1215017064846418, + "grad_norm": 0.06506139785051346, + "learning_rate": 0.00010679319942126264, + "loss": 0.399, + "step": 777 + }, + { + "epoch": 2.124232081911263, + "grad_norm": 0.0721697136759758, + "learning_rate": 0.00010655518977824566, + "loss": 0.4076, + "step": 778 + }, + { + "epoch": 2.126962457337884, + "grad_norm": 0.06590563803911209, + "learning_rate": 0.00010631714283484842, + "loss": 0.3999, + "step": 779 + }, + { + "epoch": 2.1296928327645053, + "grad_norm": 0.0715552568435669, + "learning_rate": 0.0001060790599456071, + "loss": 0.3885, + "step": 780 + }, + { + "epoch": 2.1324232081911263, + "grad_norm": 0.06654267013072968, + "learning_rate": 0.00010584094246526237, + "loss": 0.3991, + "step": 781 + }, + { + "epoch": 2.1351535836177473, + "grad_norm": 0.06633856892585754, + "learning_rate": 0.00010560279174875179, + "loss": 0.4085, + "step": 782 + }, + { + "epoch": 2.137883959044369, + "grad_norm": 0.06462158262729645, + "learning_rate": 0.0001053646091512019, + "loss": 0.3848, + "step": 783 + }, + { + "epoch": 2.14061433447099, + "grad_norm": 0.061437733471393585, + "learning_rate": 0.00010512639602792088, + "loss": 0.3846, + "step": 784 + }, + { + "epoch": 2.143344709897611, + "grad_norm": 0.06375306099653244, + "learning_rate": 0.00010488815373439036, + "loss": 0.3989, + "step": 785 + }, + { + "epoch": 2.146075085324232, + "grad_norm": 0.06337495893239975, + "learning_rate": 0.00010464988362625812, + "loss": 0.3967, + "step": 786 + }, + { + "epoch": 2.1488054607508533, + "grad_norm": 0.07057306170463562, + "learning_rate": 0.00010441158705933016, + "loss": 0.4033, + "step": 787 + }, + { + "epoch": 2.1515358361774743, + "grad_norm": 0.06043674796819687, + "learning_rate": 0.00010417326538956305, + "loss": 0.3868, + "step": 788 + }, + { + "epoch": 2.1542662116040954, + "grad_norm": 0.06690146774053574, + "learning_rate": 0.00010393491997305613, + "loss": 0.3869, + "step": 789 + }, + { + "epoch": 2.156996587030717, + "grad_norm": 0.06832878291606903, + "learning_rate": 0.00010369655216604397, + "loss": 0.4094, + "step": 790 + }, + { + "epoch": 2.159726962457338, + "grad_norm": 0.06499913334846497, + "learning_rate": 0.0001034581633248885, + "loss": 0.3954, + "step": 791 + }, + { + "epoch": 2.162457337883959, + "grad_norm": 0.06597902625799179, + "learning_rate": 0.00010321975480607129, + "loss": 0.3965, + "step": 792 + }, + { + "epoch": 2.1651877133105804, + "grad_norm": 0.06236860156059265, + "learning_rate": 0.00010298132796618596, + "loss": 0.3925, + "step": 793 + }, + { + "epoch": 2.1679180887372014, + "grad_norm": 0.06529796868562698, + "learning_rate": 0.00010274288416193034, + "loss": 0.3724, + "step": 794 + }, + { + "epoch": 2.1706484641638224, + "grad_norm": 0.06897170096635818, + "learning_rate": 0.0001025044247500988, + "loss": 0.3862, + "step": 795 + }, + { + "epoch": 2.173378839590444, + "grad_norm": 0.06522727757692337, + "learning_rate": 0.00010226595108757451, + "loss": 0.3991, + "step": 796 + }, + { + "epoch": 2.176109215017065, + "grad_norm": 0.0633029192686081, + "learning_rate": 0.00010202746453132172, + "loss": 0.3797, + "step": 797 + }, + { + "epoch": 2.178839590443686, + "grad_norm": 0.06274284422397614, + "learning_rate": 0.00010178896643837809, + "loss": 0.4011, + "step": 798 + }, + { + "epoch": 2.181569965870307, + "grad_norm": 0.0627584382891655, + "learning_rate": 0.00010155045816584691, + "loss": 0.3905, + "step": 799 + }, + { + "epoch": 2.1843003412969284, + "grad_norm": 0.06605307757854462, + "learning_rate": 0.00010131194107088935, + "loss": 0.3937, + "step": 800 + }, + { + "epoch": 2.1870307167235494, + "grad_norm": 0.0742136538028717, + "learning_rate": 0.00010107341651071684, + "loss": 0.4042, + "step": 801 + }, + { + "epoch": 2.1897610921501705, + "grad_norm": 0.0774683728814125, + "learning_rate": 0.00010083488584258326, + "loss": 0.3971, + "step": 802 + }, + { + "epoch": 2.192491467576792, + "grad_norm": 0.06698737293481827, + "learning_rate": 0.00010059635042377725, + "loss": 0.4075, + "step": 803 + }, + { + "epoch": 2.195221843003413, + "grad_norm": 0.0646221786737442, + "learning_rate": 0.00010035781161161446, + "loss": 0.3831, + "step": 804 + }, + { + "epoch": 2.197952218430034, + "grad_norm": 0.06556056439876556, + "learning_rate": 0.0001001192707634299, + "loss": 0.3796, + "step": 805 + }, + { + "epoch": 2.2006825938566554, + "grad_norm": 0.061351120471954346, + "learning_rate": 9.988072923657012e-05, + "loss": 0.3657, + "step": 806 + }, + { + "epoch": 2.2034129692832765, + "grad_norm": 0.06275127828121185, + "learning_rate": 9.964218838838554e-05, + "loss": 0.3689, + "step": 807 + }, + { + "epoch": 2.2061433447098975, + "grad_norm": 0.07032433152198792, + "learning_rate": 9.940364957622276e-05, + "loss": 0.3963, + "step": 808 + }, + { + "epoch": 2.208873720136519, + "grad_norm": 0.07062618434429169, + "learning_rate": 9.916511415741676e-05, + "loss": 0.3691, + "step": 809 + }, + { + "epoch": 2.21160409556314, + "grad_norm": 0.06543855369091034, + "learning_rate": 9.892658348928316e-05, + "loss": 0.4048, + "step": 810 + }, + { + "epoch": 2.214334470989761, + "grad_norm": 0.06486646085977554, + "learning_rate": 9.868805892911067e-05, + "loss": 0.4024, + "step": 811 + }, + { + "epoch": 2.217064846416382, + "grad_norm": 0.06630420684814453, + "learning_rate": 9.84495418341531e-05, + "loss": 0.3809, + "step": 812 + }, + { + "epoch": 2.2197952218430035, + "grad_norm": 0.06499867141246796, + "learning_rate": 9.821103356162189e-05, + "loss": 0.399, + "step": 813 + }, + { + "epoch": 2.2225255972696245, + "grad_norm": 0.06988663226366043, + "learning_rate": 9.797253546867831e-05, + "loss": 0.3685, + "step": 814 + }, + { + "epoch": 2.2252559726962455, + "grad_norm": 0.06742274761199951, + "learning_rate": 9.773404891242551e-05, + "loss": 0.3753, + "step": 815 + }, + { + "epoch": 2.227986348122867, + "grad_norm": 0.06449928879737854, + "learning_rate": 9.749557524990121e-05, + "loss": 0.3958, + "step": 816 + }, + { + "epoch": 2.230716723549488, + "grad_norm": 0.06267012655735016, + "learning_rate": 9.72571158380697e-05, + "loss": 0.3969, + "step": 817 + }, + { + "epoch": 2.233447098976109, + "grad_norm": 0.06422371417284012, + "learning_rate": 9.701867203381405e-05, + "loss": 0.4004, + "step": 818 + }, + { + "epoch": 2.2361774744027305, + "grad_norm": 0.06123776733875275, + "learning_rate": 9.678024519392871e-05, + "loss": 0.3843, + "step": 819 + }, + { + "epoch": 2.2389078498293515, + "grad_norm": 0.06269904226064682, + "learning_rate": 9.654183667511154e-05, + "loss": 0.4116, + "step": 820 + }, + { + "epoch": 2.2416382252559726, + "grad_norm": 0.06304040551185608, + "learning_rate": 9.630344783395604e-05, + "loss": 0.394, + "step": 821 + }, + { + "epoch": 2.244368600682594, + "grad_norm": 0.06638960540294647, + "learning_rate": 9.606508002694386e-05, + "loss": 0.404, + "step": 822 + }, + { + "epoch": 2.247098976109215, + "grad_norm": 0.06530074775218964, + "learning_rate": 9.5826734610437e-05, + "loss": 0.3945, + "step": 823 + }, + { + "epoch": 2.249829351535836, + "grad_norm": 0.06665409356355667, + "learning_rate": 9.558841294066985e-05, + "loss": 0.4024, + "step": 824 + }, + { + "epoch": 2.252559726962457, + "grad_norm": 0.06718889623880386, + "learning_rate": 9.535011637374189e-05, + "loss": 0.383, + "step": 825 + }, + { + "epoch": 2.2552901023890786, + "grad_norm": 0.06695706397294998, + "learning_rate": 9.511184626560968e-05, + "loss": 0.3897, + "step": 826 + }, + { + "epoch": 2.2580204778156996, + "grad_norm": 0.061516858637332916, + "learning_rate": 9.487360397207916e-05, + "loss": 0.3792, + "step": 827 + }, + { + "epoch": 2.260750853242321, + "grad_norm": 0.06686560809612274, + "learning_rate": 9.463539084879809e-05, + "loss": 0.3872, + "step": 828 + }, + { + "epoch": 2.263481228668942, + "grad_norm": 0.06617554277181625, + "learning_rate": 9.439720825124827e-05, + "loss": 0.3982, + "step": 829 + }, + { + "epoch": 2.266211604095563, + "grad_norm": 0.06416121870279312, + "learning_rate": 9.415905753473765e-05, + "loss": 0.39, + "step": 830 + }, + { + "epoch": 2.268941979522184, + "grad_norm": 0.06259041279554367, + "learning_rate": 9.392094005439291e-05, + "loss": 0.3851, + "step": 831 + }, + { + "epoch": 2.2716723549488056, + "grad_norm": 0.06047592684626579, + "learning_rate": 9.368285716515162e-05, + "loss": 0.3709, + "step": 832 + }, + { + "epoch": 2.2744027303754266, + "grad_norm": 0.0637134537100792, + "learning_rate": 9.344481022175436e-05, + "loss": 0.3711, + "step": 833 + }, + { + "epoch": 2.2771331058020476, + "grad_norm": 0.06489844620227814, + "learning_rate": 9.320680057873735e-05, + "loss": 0.3717, + "step": 834 + }, + { + "epoch": 2.279863481228669, + "grad_norm": 0.06694884598255157, + "learning_rate": 9.29688295904245e-05, + "loss": 0.385, + "step": 835 + }, + { + "epoch": 2.28259385665529, + "grad_norm": 0.07286783307790756, + "learning_rate": 9.273089861091969e-05, + "loss": 0.3809, + "step": 836 + }, + { + "epoch": 2.285324232081911, + "grad_norm": 0.06461062282323837, + "learning_rate": 9.249300899409924e-05, + "loss": 0.3961, + "step": 837 + }, + { + "epoch": 2.288054607508532, + "grad_norm": 0.06771940737962723, + "learning_rate": 9.225516209360413e-05, + "loss": 0.3948, + "step": 838 + }, + { + "epoch": 2.2907849829351536, + "grad_norm": 0.06662525236606598, + "learning_rate": 9.201735926283213e-05, + "loss": 0.3835, + "step": 839 + }, + { + "epoch": 2.2935153583617747, + "grad_norm": 0.06741426885128021, + "learning_rate": 9.177960185493036e-05, + "loss": 0.4025, + "step": 840 + }, + { + "epoch": 2.296245733788396, + "grad_norm": 0.062717005610466, + "learning_rate": 9.154189122278754e-05, + "loss": 0.3812, + "step": 841 + }, + { + "epoch": 2.298976109215017, + "grad_norm": 0.06642817705869675, + "learning_rate": 9.1304228719026e-05, + "loss": 0.3783, + "step": 842 + }, + { + "epoch": 2.301706484641638, + "grad_norm": 0.06607065349817276, + "learning_rate": 9.106661569599442e-05, + "loss": 0.3935, + "step": 843 + }, + { + "epoch": 2.304436860068259, + "grad_norm": 0.0658462718129158, + "learning_rate": 9.082905350575986e-05, + "loss": 0.3865, + "step": 844 + }, + { + "epoch": 2.3071672354948807, + "grad_norm": 0.0657251700758934, + "learning_rate": 9.059154350010008e-05, + "loss": 0.3955, + "step": 845 + }, + { + "epoch": 2.3098976109215017, + "grad_norm": 0.0725063607096672, + "learning_rate": 9.035408703049596e-05, + "loss": 0.3836, + "step": 846 + }, + { + "epoch": 2.3126279863481227, + "grad_norm": 0.06905627250671387, + "learning_rate": 9.011668544812377e-05, + "loss": 0.3948, + "step": 847 + }, + { + "epoch": 2.315358361774744, + "grad_norm": 0.05961495637893677, + "learning_rate": 8.987934010384733e-05, + "loss": 0.3543, + "step": 848 + }, + { + "epoch": 2.318088737201365, + "grad_norm": 0.06361044198274612, + "learning_rate": 8.96420523482106e-05, + "loss": 0.3954, + "step": 849 + }, + { + "epoch": 2.3208191126279862, + "grad_norm": 0.06655801832675934, + "learning_rate": 8.940482353142983e-05, + "loss": 0.3895, + "step": 850 + }, + { + "epoch": 2.3235494880546073, + "grad_norm": 0.06384111195802689, + "learning_rate": 8.916765500338575e-05, + "loss": 0.4027, + "step": 851 + }, + { + "epoch": 2.3262798634812287, + "grad_norm": 0.06309723109006882, + "learning_rate": 8.893054811361624e-05, + "loss": 0.3768, + "step": 852 + }, + { + "epoch": 2.3290102389078498, + "grad_norm": 0.06191105768084526, + "learning_rate": 8.869350421130831e-05, + "loss": 0.3896, + "step": 853 + }, + { + "epoch": 2.331740614334471, + "grad_norm": 0.06459913402795792, + "learning_rate": 8.845652464529057e-05, + "loss": 0.3891, + "step": 854 + }, + { + "epoch": 2.3344709897610922, + "grad_norm": 0.06782979518175125, + "learning_rate": 8.821961076402563e-05, + "loss": 0.3947, + "step": 855 + }, + { + "epoch": 2.3372013651877133, + "grad_norm": 0.06412837654352188, + "learning_rate": 8.79827639156022e-05, + "loss": 0.3875, + "step": 856 + }, + { + "epoch": 2.3399317406143343, + "grad_norm": 0.06624305993318558, + "learning_rate": 8.774598544772774e-05, + "loss": 0.3846, + "step": 857 + }, + { + "epoch": 2.3426621160409558, + "grad_norm": 0.06252939254045486, + "learning_rate": 8.750927670772044e-05, + "loss": 0.3925, + "step": 858 + }, + { + "epoch": 2.345392491467577, + "grad_norm": 0.06812991946935654, + "learning_rate": 8.727263904250178e-05, + "loss": 0.3889, + "step": 859 + }, + { + "epoch": 2.348122866894198, + "grad_norm": 0.06583413481712341, + "learning_rate": 8.703607379858889e-05, + "loss": 0.371, + "step": 860 + }, + { + "epoch": 2.3508532423208193, + "grad_norm": 0.07355549931526184, + "learning_rate": 8.679958232208668e-05, + "loss": 0.3821, + "step": 861 + }, + { + "epoch": 2.3535836177474403, + "grad_norm": 0.06977757811546326, + "learning_rate": 8.656316595868037e-05, + "loss": 0.3866, + "step": 862 + }, + { + "epoch": 2.3563139931740613, + "grad_norm": 0.06983613967895508, + "learning_rate": 8.632682605362784e-05, + "loss": 0.3973, + "step": 863 + }, + { + "epoch": 2.359044368600683, + "grad_norm": 0.06457202136516571, + "learning_rate": 8.609056395175175e-05, + "loss": 0.3794, + "step": 864 + }, + { + "epoch": 2.361774744027304, + "grad_norm": 0.06292392313480377, + "learning_rate": 8.585438099743217e-05, + "loss": 0.3923, + "step": 865 + }, + { + "epoch": 2.364505119453925, + "grad_norm": 0.06382179260253906, + "learning_rate": 8.56182785345988e-05, + "loss": 0.3757, + "step": 866 + }, + { + "epoch": 2.3672354948805463, + "grad_norm": 0.0666775107383728, + "learning_rate": 8.538225790672322e-05, + "loss": 0.3761, + "step": 867 + }, + { + "epoch": 2.3699658703071673, + "grad_norm": 0.06406944990158081, + "learning_rate": 8.514632045681145e-05, + "loss": 0.3798, + "step": 868 + }, + { + "epoch": 2.3726962457337883, + "grad_norm": 0.06742346286773682, + "learning_rate": 8.491046752739624e-05, + "loss": 0.3922, + "step": 869 + }, + { + "epoch": 2.3754266211604094, + "grad_norm": 0.06776360422372818, + "learning_rate": 8.467470046052927e-05, + "loss": 0.399, + "step": 870 + }, + { + "epoch": 2.378156996587031, + "grad_norm": 0.07043983042240143, + "learning_rate": 8.443902059777373e-05, + "loss": 0.3854, + "step": 871 + }, + { + "epoch": 2.380887372013652, + "grad_norm": 0.07512562721967697, + "learning_rate": 8.420342928019666e-05, + "loss": 0.403, + "step": 872 + }, + { + "epoch": 2.383617747440273, + "grad_norm": 0.06623658537864685, + "learning_rate": 8.396792784836108e-05, + "loss": 0.3929, + "step": 873 + }, + { + "epoch": 2.3863481228668944, + "grad_norm": 0.06477659940719604, + "learning_rate": 8.373251764231872e-05, + "loss": 0.3873, + "step": 874 + }, + { + "epoch": 2.3890784982935154, + "grad_norm": 0.06675104796886444, + "learning_rate": 8.349720000160218e-05, + "loss": 0.382, + "step": 875 + }, + { + "epoch": 2.3918088737201364, + "grad_norm": 0.06558026373386383, + "learning_rate": 8.326197626521723e-05, + "loss": 0.379, + "step": 876 + }, + { + "epoch": 2.394539249146758, + "grad_norm": 0.06641051918268204, + "learning_rate": 8.30268477716354e-05, + "loss": 0.3976, + "step": 877 + }, + { + "epoch": 2.397269624573379, + "grad_norm": 0.061502814292907715, + "learning_rate": 8.279181585878635e-05, + "loss": 0.3762, + "step": 878 + }, + { + "epoch": 2.4, + "grad_norm": 0.06521192193031311, + "learning_rate": 8.255688186404996e-05, + "loss": 0.3882, + "step": 879 + }, + { + "epoch": 2.4027303754266214, + "grad_norm": 0.0659627914428711, + "learning_rate": 8.232204712424911e-05, + "loss": 0.3835, + "step": 880 + }, + { + "epoch": 2.4054607508532424, + "grad_norm": 0.06876777112483978, + "learning_rate": 8.208731297564189e-05, + "loss": 0.3751, + "step": 881 + }, + { + "epoch": 2.4081911262798634, + "grad_norm": 0.066194549202919, + "learning_rate": 8.185268075391388e-05, + "loss": 0.376, + "step": 882 + }, + { + "epoch": 2.4109215017064844, + "grad_norm": 0.0667635127902031, + "learning_rate": 8.161815179417078e-05, + "loss": 0.384, + "step": 883 + }, + { + "epoch": 2.413651877133106, + "grad_norm": 0.0693088099360466, + "learning_rate": 8.138372743093076e-05, + "loss": 0.3817, + "step": 884 + }, + { + "epoch": 2.416382252559727, + "grad_norm": 0.07052286714315414, + "learning_rate": 8.114940899811662e-05, + "loss": 0.3792, + "step": 885 + }, + { + "epoch": 2.419112627986348, + "grad_norm": 0.06873782724142075, + "learning_rate": 8.091519782904857e-05, + "loss": 0.3971, + "step": 886 + }, + { + "epoch": 2.4218430034129694, + "grad_norm": 0.06847493350505829, + "learning_rate": 8.068109525643647e-05, + "loss": 0.3848, + "step": 887 + }, + { + "epoch": 2.4245733788395905, + "grad_norm": 0.062080979347229004, + "learning_rate": 8.044710261237207e-05, + "loss": 0.3853, + "step": 888 + }, + { + "epoch": 2.4273037542662115, + "grad_norm": 0.06559818983078003, + "learning_rate": 8.021322122832178e-05, + "loss": 0.3677, + "step": 889 + }, + { + "epoch": 2.430034129692833, + "grad_norm": 0.06463391333818436, + "learning_rate": 7.99794524351189e-05, + "loss": 0.372, + "step": 890 + }, + { + "epoch": 2.432764505119454, + "grad_norm": 0.06920842081308365, + "learning_rate": 7.974579756295591e-05, + "loss": 0.3831, + "step": 891 + }, + { + "epoch": 2.435494880546075, + "grad_norm": 0.06424865126609802, + "learning_rate": 7.951225794137724e-05, + "loss": 0.3748, + "step": 892 + }, + { + "epoch": 2.4382252559726965, + "grad_norm": 0.06715277582406998, + "learning_rate": 7.927883489927147e-05, + "loss": 0.3915, + "step": 893 + }, + { + "epoch": 2.4409556313993175, + "grad_norm": 0.06705847382545471, + "learning_rate": 7.904552976486372e-05, + "loss": 0.3885, + "step": 894 + }, + { + "epoch": 2.4436860068259385, + "grad_norm": 0.06581594794988632, + "learning_rate": 7.88123438657083e-05, + "loss": 0.3856, + "step": 895 + }, + { + "epoch": 2.4464163822525595, + "grad_norm": 0.06164994090795517, + "learning_rate": 7.857927852868107e-05, + "loss": 0.3748, + "step": 896 + }, + { + "epoch": 2.449146757679181, + "grad_norm": 0.06487210094928741, + "learning_rate": 7.83463350799717e-05, + "loss": 0.3671, + "step": 897 + }, + { + "epoch": 2.451877133105802, + "grad_norm": 0.07010706514120102, + "learning_rate": 7.811351484507647e-05, + "loss": 0.3788, + "step": 898 + }, + { + "epoch": 2.454607508532423, + "grad_norm": 0.0727321207523346, + "learning_rate": 7.788081914879051e-05, + "loss": 0.3875, + "step": 899 + }, + { + "epoch": 2.4573378839590445, + "grad_norm": 0.07144790142774582, + "learning_rate": 7.764824931520018e-05, + "loss": 0.385, + "step": 900 + }, + { + "epoch": 2.4600682593856655, + "grad_norm": 0.06632732599973679, + "learning_rate": 7.741580666767583e-05, + "loss": 0.3876, + "step": 901 + }, + { + "epoch": 2.4627986348122866, + "grad_norm": 0.06642162799835205, + "learning_rate": 7.718349252886395e-05, + "loss": 0.3767, + "step": 902 + }, + { + "epoch": 2.465529010238908, + "grad_norm": 0.07363419234752655, + "learning_rate": 7.695130822067984e-05, + "loss": 0.3987, + "step": 903 + }, + { + "epoch": 2.468259385665529, + "grad_norm": 0.06822016835212708, + "learning_rate": 7.67192550643001e-05, + "loss": 0.3878, + "step": 904 + }, + { + "epoch": 2.47098976109215, + "grad_norm": 0.07104900479316711, + "learning_rate": 7.648733438015493e-05, + "loss": 0.369, + "step": 905 + }, + { + "epoch": 2.4737201365187715, + "grad_norm": 0.07460657507181168, + "learning_rate": 7.625554748792085e-05, + "loss": 0.3804, + "step": 906 + }, + { + "epoch": 2.4764505119453926, + "grad_norm": 0.0632706731557846, + "learning_rate": 7.602389570651303e-05, + "loss": 0.3768, + "step": 907 + }, + { + "epoch": 2.4791808873720136, + "grad_norm": 0.07213195413351059, + "learning_rate": 7.579238035407776e-05, + "loss": 0.3942, + "step": 908 + }, + { + "epoch": 2.4819112627986346, + "grad_norm": 0.06379958242177963, + "learning_rate": 7.556100274798519e-05, + "loss": 0.3619, + "step": 909 + }, + { + "epoch": 2.484641638225256, + "grad_norm": 0.07019772380590439, + "learning_rate": 7.532976420482146e-05, + "loss": 0.3684, + "step": 910 + }, + { + "epoch": 2.487372013651877, + "grad_norm": 0.06975077837705612, + "learning_rate": 7.509866604038157e-05, + "loss": 0.3908, + "step": 911 + }, + { + "epoch": 2.490102389078498, + "grad_norm": 0.07252513617277145, + "learning_rate": 7.486770956966171e-05, + "loss": 0.381, + "step": 912 + }, + { + "epoch": 2.4928327645051196, + "grad_norm": 0.06543651968240738, + "learning_rate": 7.463689610685171e-05, + "loss": 0.3783, + "step": 913 + }, + { + "epoch": 2.4955631399317406, + "grad_norm": 0.06449062377214432, + "learning_rate": 7.440622696532775e-05, + "loss": 0.3793, + "step": 914 + }, + { + "epoch": 2.4982935153583616, + "grad_norm": 0.07260416448116302, + "learning_rate": 7.417570345764481e-05, + "loss": 0.3559, + "step": 915 + }, + { + "epoch": 2.5010238907849827, + "grad_norm": 0.0706956535577774, + "learning_rate": 7.394532689552905e-05, + "loss": 0.3908, + "step": 916 + }, + { + "epoch": 2.503754266211604, + "grad_norm": 0.07324767857789993, + "learning_rate": 7.371509858987061e-05, + "loss": 0.3877, + "step": 917 + }, + { + "epoch": 2.506484641638225, + "grad_norm": 0.07116331160068512, + "learning_rate": 7.348501985071603e-05, + "loss": 0.3918, + "step": 918 + }, + { + "epoch": 2.5092150170648466, + "grad_norm": 0.06699459999799728, + "learning_rate": 7.325509198726064e-05, + "loss": 0.3903, + "step": 919 + }, + { + "epoch": 2.5119453924914676, + "grad_norm": 0.06897846609354019, + "learning_rate": 7.302531630784137e-05, + "loss": 0.3785, + "step": 920 + }, + { + "epoch": 2.5146757679180887, + "grad_norm": 0.06994078308343887, + "learning_rate": 7.279569411992926e-05, + "loss": 0.4009, + "step": 921 + }, + { + "epoch": 2.5174061433447097, + "grad_norm": 0.0688600167632103, + "learning_rate": 7.256622673012175e-05, + "loss": 0.3768, + "step": 922 + }, + { + "epoch": 2.520136518771331, + "grad_norm": 0.07253801077604294, + "learning_rate": 7.233691544413558e-05, + "loss": 0.3816, + "step": 923 + }, + { + "epoch": 2.522866894197952, + "grad_norm": 0.06901174038648605, + "learning_rate": 7.210776156679931e-05, + "loss": 0.3815, + "step": 924 + }, + { + "epoch": 2.5255972696245736, + "grad_norm": 0.06941503286361694, + "learning_rate": 7.187876640204556e-05, + "loss": 0.3871, + "step": 925 + }, + { + "epoch": 2.5283276450511947, + "grad_norm": 0.06918507814407349, + "learning_rate": 7.164993125290407e-05, + "loss": 0.3842, + "step": 926 + }, + { + "epoch": 2.5310580204778157, + "grad_norm": 0.06809090077877045, + "learning_rate": 7.1421257421494e-05, + "loss": 0.3894, + "step": 927 + }, + { + "epoch": 2.5337883959044367, + "grad_norm": 0.0676964670419693, + "learning_rate": 7.119274620901649e-05, + "loss": 0.3659, + "step": 928 + }, + { + "epoch": 2.536518771331058, + "grad_norm": 0.06432293355464935, + "learning_rate": 7.096439891574745e-05, + "loss": 0.3793, + "step": 929 + }, + { + "epoch": 2.539249146757679, + "grad_norm": 0.06571602076292038, + "learning_rate": 7.073621684103007e-05, + "loss": 0.3725, + "step": 930 + }, + { + "epoch": 2.5419795221843002, + "grad_norm": 0.06849601119756699, + "learning_rate": 7.050820128326724e-05, + "loss": 0.3911, + "step": 931 + }, + { + "epoch": 2.5447098976109217, + "grad_norm": 0.06598517298698425, + "learning_rate": 7.028035353991456e-05, + "loss": 0.3824, + "step": 932 + }, + { + "epoch": 2.5474402730375427, + "grad_norm": 0.06540702283382416, + "learning_rate": 7.005267490747263e-05, + "loss": 0.3529, + "step": 933 + }, + { + "epoch": 2.5501706484641637, + "grad_norm": 0.07165670394897461, + "learning_rate": 6.982516668147967e-05, + "loss": 0.3716, + "step": 934 + }, + { + "epoch": 2.5529010238907848, + "grad_norm": 0.07307778298854828, + "learning_rate": 6.959783015650446e-05, + "loss": 0.389, + "step": 935 + }, + { + "epoch": 2.5556313993174062, + "grad_norm": 0.07411336153745651, + "learning_rate": 6.937066662613863e-05, + "loss": 0.3576, + "step": 936 + }, + { + "epoch": 2.5583617747440273, + "grad_norm": 0.07051097601652145, + "learning_rate": 6.914367738298941e-05, + "loss": 0.3861, + "step": 937 + }, + { + "epoch": 2.5610921501706487, + "grad_norm": 0.07393593341112137, + "learning_rate": 6.891686371867239e-05, + "loss": 0.3836, + "step": 938 + }, + { + "epoch": 2.5638225255972698, + "grad_norm": 0.07074485719203949, + "learning_rate": 6.869022692380411e-05, + "loss": 0.39, + "step": 939 + }, + { + "epoch": 2.5665529010238908, + "grad_norm": 0.07122661918401718, + "learning_rate": 6.846376828799451e-05, + "loss": 0.3622, + "step": 940 + }, + { + "epoch": 2.569283276450512, + "grad_norm": 0.07139509916305542, + "learning_rate": 6.823748909983994e-05, + "loss": 0.3663, + "step": 941 + }, + { + "epoch": 2.5720136518771333, + "grad_norm": 0.07215254753828049, + "learning_rate": 6.801139064691562e-05, + "loss": 0.3958, + "step": 942 + }, + { + "epoch": 2.5747440273037543, + "grad_norm": 0.06885367631912231, + "learning_rate": 6.778547421576825e-05, + "loss": 0.3682, + "step": 943 + }, + { + "epoch": 2.5774744027303753, + "grad_norm": 0.06890679150819778, + "learning_rate": 6.75597410919089e-05, + "loss": 0.3747, + "step": 944 + }, + { + "epoch": 2.580204778156997, + "grad_norm": 0.06944692134857178, + "learning_rate": 6.733419255980559e-05, + "loss": 0.3696, + "step": 945 + }, + { + "epoch": 2.582935153583618, + "grad_norm": 0.07257425785064697, + "learning_rate": 6.710882990287585e-05, + "loss": 0.3801, + "step": 946 + }, + { + "epoch": 2.585665529010239, + "grad_norm": 0.07554417103528976, + "learning_rate": 6.688365440347965e-05, + "loss": 0.3863, + "step": 947 + }, + { + "epoch": 2.58839590443686, + "grad_norm": 0.06943865120410919, + "learning_rate": 6.665866734291205e-05, + "loss": 0.383, + "step": 948 + }, + { + "epoch": 2.5911262798634813, + "grad_norm": 0.06850133091211319, + "learning_rate": 6.643387000139565e-05, + "loss": 0.3813, + "step": 949 + }, + { + "epoch": 2.5938566552901023, + "grad_norm": 0.06953789293766022, + "learning_rate": 6.620926365807372e-05, + "loss": 0.385, + "step": 950 + }, + { + "epoch": 2.596587030716724, + "grad_norm": 0.06954308599233627, + "learning_rate": 6.598484959100257e-05, + "loss": 0.3745, + "step": 951 + }, + { + "epoch": 2.599317406143345, + "grad_norm": 0.06686001271009445, + "learning_rate": 6.576062907714448e-05, + "loss": 0.3511, + "step": 952 + }, + { + "epoch": 2.602047781569966, + "grad_norm": 0.0746379867196083, + "learning_rate": 6.553660339236041e-05, + "loss": 0.3818, + "step": 953 + }, + { + "epoch": 2.604778156996587, + "grad_norm": 0.073628731071949, + "learning_rate": 6.53127738114026e-05, + "loss": 0.3825, + "step": 954 + }, + { + "epoch": 2.6075085324232083, + "grad_norm": 0.07217229157686234, + "learning_rate": 6.508914160790752e-05, + "loss": 0.3864, + "step": 955 + }, + { + "epoch": 2.6102389078498294, + "grad_norm": 0.06748787313699722, + "learning_rate": 6.486570805438843e-05, + "loss": 0.3853, + "step": 956 + }, + { + "epoch": 2.6129692832764504, + "grad_norm": 0.0762786790728569, + "learning_rate": 6.46424744222283e-05, + "loss": 0.385, + "step": 957 + }, + { + "epoch": 2.615699658703072, + "grad_norm": 0.07065171003341675, + "learning_rate": 6.441944198167253e-05, + "loss": 0.3924, + "step": 958 + }, + { + "epoch": 2.618430034129693, + "grad_norm": 0.07208285480737686, + "learning_rate": 6.419661200182158e-05, + "loss": 0.3943, + "step": 959 + }, + { + "epoch": 2.621160409556314, + "grad_norm": 0.07120097428560257, + "learning_rate": 6.397398575062396e-05, + "loss": 0.3799, + "step": 960 + }, + { + "epoch": 2.623890784982935, + "grad_norm": 0.06891025602817535, + "learning_rate": 6.375156449486895e-05, + "loss": 0.3693, + "step": 961 + }, + { + "epoch": 2.6266211604095564, + "grad_norm": 0.07594846189022064, + "learning_rate": 6.352934950017921e-05, + "loss": 0.3708, + "step": 962 + }, + { + "epoch": 2.6293515358361774, + "grad_norm": 0.06709565222263336, + "learning_rate": 6.330734203100394e-05, + "loss": 0.3732, + "step": 963 + }, + { + "epoch": 2.632081911262799, + "grad_norm": 0.07154542207717896, + "learning_rate": 6.308554335061135e-05, + "loss": 0.3883, + "step": 964 + }, + { + "epoch": 2.63481228668942, + "grad_norm": 0.06966784596443176, + "learning_rate": 6.286395472108158e-05, + "loss": 0.3842, + "step": 965 + }, + { + "epoch": 2.637542662116041, + "grad_norm": 0.07383087277412415, + "learning_rate": 6.26425774032996e-05, + "loss": 0.3855, + "step": 966 + }, + { + "epoch": 2.640273037542662, + "grad_norm": 0.06765507906675339, + "learning_rate": 6.2421412656948e-05, + "loss": 0.3828, + "step": 967 + }, + { + "epoch": 2.6430034129692834, + "grad_norm": 0.06942715495824814, + "learning_rate": 6.220046174049968e-05, + "loss": 0.3579, + "step": 968 + }, + { + "epoch": 2.6457337883959045, + "grad_norm": 0.07347889989614487, + "learning_rate": 6.19797259112109e-05, + "loss": 0.3648, + "step": 969 + }, + { + "epoch": 2.6484641638225255, + "grad_norm": 0.0671333596110344, + "learning_rate": 6.175920642511404e-05, + "loss": 0.3794, + "step": 970 + }, + { + "epoch": 2.651194539249147, + "grad_norm": 0.06726156920194626, + "learning_rate": 6.153890453701031e-05, + "loss": 0.3673, + "step": 971 + }, + { + "epoch": 2.653924914675768, + "grad_norm": 0.06900251656770706, + "learning_rate": 6.131882150046291e-05, + "loss": 0.3818, + "step": 972 + }, + { + "epoch": 2.656655290102389, + "grad_norm": 0.07286951690912247, + "learning_rate": 6.109895856778967e-05, + "loss": 0.3765, + "step": 973 + }, + { + "epoch": 2.65938566552901, + "grad_norm": 0.07597585022449493, + "learning_rate": 6.087931699005588e-05, + "loss": 0.3897, + "step": 974 + }, + { + "epoch": 2.6621160409556315, + "grad_norm": 0.0680689737200737, + "learning_rate": 6.065989801706744e-05, + "loss": 0.3727, + "step": 975 + }, + { + "epoch": 2.6648464163822525, + "grad_norm": 0.0685984194278717, + "learning_rate": 6.044070289736352e-05, + "loss": 0.3726, + "step": 976 + }, + { + "epoch": 2.667576791808874, + "grad_norm": 0.07317140698432922, + "learning_rate": 6.0221732878209425e-05, + "loss": 0.3825, + "step": 977 + }, + { + "epoch": 2.670307167235495, + "grad_norm": 0.07575836777687073, + "learning_rate": 6.0002989205589734e-05, + "loss": 0.3895, + "step": 978 + }, + { + "epoch": 2.673037542662116, + "grad_norm": 0.07128394395112991, + "learning_rate": 5.978447312420103e-05, + "loss": 0.3716, + "step": 979 + }, + { + "epoch": 2.675767918088737, + "grad_norm": 0.0750584676861763, + "learning_rate": 5.9566185877444755e-05, + "loss": 0.3799, + "step": 980 + }, + { + "epoch": 2.6784982935153585, + "grad_norm": 0.07472096383571625, + "learning_rate": 5.934812870742036e-05, + "loss": 0.3968, + "step": 981 + }, + { + "epoch": 2.6812286689419795, + "grad_norm": 0.07020480930805206, + "learning_rate": 5.913030285491808e-05, + "loss": 0.3686, + "step": 982 + }, + { + "epoch": 2.6839590443686006, + "grad_norm": 0.06642840802669525, + "learning_rate": 5.891270955941184e-05, + "loss": 0.3775, + "step": 983 + }, + { + "epoch": 2.686689419795222, + "grad_norm": 0.07389365881681442, + "learning_rate": 5.869535005905232e-05, + "loss": 0.3698, + "step": 984 + }, + { + "epoch": 2.689419795221843, + "grad_norm": 0.07101259380578995, + "learning_rate": 5.847822559065992e-05, + "loss": 0.3801, + "step": 985 + }, + { + "epoch": 2.692150170648464, + "grad_norm": 0.07233007252216339, + "learning_rate": 5.8261337389717506e-05, + "loss": 0.3828, + "step": 986 + }, + { + "epoch": 2.694880546075085, + "grad_norm": 0.07280164211988449, + "learning_rate": 5.804468669036369e-05, + "loss": 0.3775, + "step": 987 + }, + { + "epoch": 2.6976109215017066, + "grad_norm": 0.07394791394472122, + "learning_rate": 5.7828274725385544e-05, + "loss": 0.3617, + "step": 988 + }, + { + "epoch": 2.7003412969283276, + "grad_norm": 0.0712592750787735, + "learning_rate": 5.761210272621175e-05, + "loss": 0.375, + "step": 989 + }, + { + "epoch": 2.703071672354949, + "grad_norm": 0.07412825524806976, + "learning_rate": 5.739617192290545e-05, + "loss": 0.3899, + "step": 990 + }, + { + "epoch": 2.70580204778157, + "grad_norm": 0.06890816986560822, + "learning_rate": 5.7180483544157546e-05, + "loss": 0.3801, + "step": 991 + }, + { + "epoch": 2.708532423208191, + "grad_norm": 0.0753878653049469, + "learning_rate": 5.696503881727917e-05, + "loss": 0.3764, + "step": 992 + }, + { + "epoch": 2.711262798634812, + "grad_norm": 0.07268603146076202, + "learning_rate": 5.6749838968195326e-05, + "loss": 0.3823, + "step": 993 + }, + { + "epoch": 2.7139931740614336, + "grad_norm": 0.07173381745815277, + "learning_rate": 5.653488522143744e-05, + "loss": 0.3812, + "step": 994 + }, + { + "epoch": 2.7167235494880546, + "grad_norm": 0.07240818440914154, + "learning_rate": 5.6320178800136626e-05, + "loss": 0.3726, + "step": 995 + }, + { + "epoch": 2.7194539249146756, + "grad_norm": 0.07028204202651978, + "learning_rate": 5.610572092601659e-05, + "loss": 0.3802, + "step": 996 + }, + { + "epoch": 2.722184300341297, + "grad_norm": 0.07140224426984787, + "learning_rate": 5.589151281938695e-05, + "loss": 0.3672, + "step": 997 + }, + { + "epoch": 2.724914675767918, + "grad_norm": 0.07478421926498413, + "learning_rate": 5.56775556991358e-05, + "loss": 0.3821, + "step": 998 + }, + { + "epoch": 2.727645051194539, + "grad_norm": 0.07046322524547577, + "learning_rate": 5.5463850782723346e-05, + "loss": 0.3876, + "step": 999 + }, + { + "epoch": 2.73037542662116, + "grad_norm": 0.07505001872777939, + "learning_rate": 5.5250399286174546e-05, + "loss": 0.3903, + "step": 1000 + }, + { + "epoch": 2.7331058020477816, + "grad_norm": 0.07295075058937073, + "learning_rate": 5.50372024240724e-05, + "loss": 0.3771, + "step": 1001 + }, + { + "epoch": 2.7358361774744027, + "grad_norm": 0.07162769138813019, + "learning_rate": 5.48242614095509e-05, + "loss": 0.3861, + "step": 1002 + }, + { + "epoch": 2.738566552901024, + "grad_norm": 0.06978384405374527, + "learning_rate": 5.461157745428841e-05, + "loss": 0.3842, + "step": 1003 + }, + { + "epoch": 2.741296928327645, + "grad_norm": 0.0743769034743309, + "learning_rate": 5.439915176850037e-05, + "loss": 0.363, + "step": 1004 + }, + { + "epoch": 2.744027303754266, + "grad_norm": 0.07271228730678558, + "learning_rate": 5.418698556093271e-05, + "loss": 0.3813, + "step": 1005 + }, + { + "epoch": 2.746757679180887, + "grad_norm": 0.07161740958690643, + "learning_rate": 5.397508003885483e-05, + "loss": 0.3873, + "step": 1006 + }, + { + "epoch": 2.7494880546075087, + "grad_norm": 0.07506071776151657, + "learning_rate": 5.3763436408052904e-05, + "loss": 0.394, + "step": 1007 + }, + { + "epoch": 2.7522184300341297, + "grad_norm": 0.06938126683235168, + "learning_rate": 5.3552055872822636e-05, + "loss": 0.3677, + "step": 1008 + }, + { + "epoch": 2.7549488054607507, + "grad_norm": 0.07216595858335495, + "learning_rate": 5.334093963596294e-05, + "loss": 0.3794, + "step": 1009 + }, + { + "epoch": 2.757679180887372, + "grad_norm": 0.07905741035938263, + "learning_rate": 5.313008889876865e-05, + "loss": 0.3772, + "step": 1010 + }, + { + "epoch": 2.760409556313993, + "grad_norm": 0.07164405286312103, + "learning_rate": 5.2919504861023903e-05, + "loss": 0.3774, + "step": 1011 + }, + { + "epoch": 2.7631399317406142, + "grad_norm": 0.07869692891836166, + "learning_rate": 5.270918872099522e-05, + "loss": 0.3872, + "step": 1012 + }, + { + "epoch": 2.7658703071672353, + "grad_norm": 0.07745044678449631, + "learning_rate": 5.249914167542486e-05, + "loss": 0.3786, + "step": 1013 + }, + { + "epoch": 2.7686006825938567, + "grad_norm": 0.06824437528848648, + "learning_rate": 5.228936491952363e-05, + "loss": 0.3546, + "step": 1014 + }, + { + "epoch": 2.7713310580204777, + "grad_norm": 0.07147728651762009, + "learning_rate": 5.207985964696462e-05, + "loss": 0.3603, + "step": 1015 + }, + { + "epoch": 2.774061433447099, + "grad_norm": 0.06916683167219162, + "learning_rate": 5.1870627049875954e-05, + "loss": 0.3509, + "step": 1016 + }, + { + "epoch": 2.7767918088737202, + "grad_norm": 0.06777457147836685, + "learning_rate": 5.16616683188342e-05, + "loss": 0.377, + "step": 1017 + }, + { + "epoch": 2.7795221843003413, + "grad_norm": 0.06878098845481873, + "learning_rate": 5.145298464285757e-05, + "loss": 0.3787, + "step": 1018 + }, + { + "epoch": 2.7822525597269623, + "grad_norm": 0.07464558631181717, + "learning_rate": 5.12445772093992e-05, + "loss": 0.368, + "step": 1019 + }, + { + "epoch": 2.7849829351535837, + "grad_norm": 0.07435291260480881, + "learning_rate": 5.103644720434027e-05, + "loss": 0.3805, + "step": 1020 + }, + { + "epoch": 2.7877133105802048, + "grad_norm": 0.07324662059545517, + "learning_rate": 5.082859581198344e-05, + "loss": 0.3744, + "step": 1021 + }, + { + "epoch": 2.790443686006826, + "grad_norm": 0.0732622966170311, + "learning_rate": 5.062102421504593e-05, + "loss": 0.3733, + "step": 1022 + }, + { + "epoch": 2.7931740614334473, + "grad_norm": 0.07599971443414688, + "learning_rate": 5.041373359465289e-05, + "loss": 0.3582, + "step": 1023 + }, + { + "epoch": 2.7959044368600683, + "grad_norm": 0.07411503791809082, + "learning_rate": 5.020672513033066e-05, + "loss": 0.3707, + "step": 1024 + }, + { + "epoch": 2.7986348122866893, + "grad_norm": 0.07376236468553543, + "learning_rate": 5.000000000000002e-05, + "loss": 0.3854, + "step": 1025 + }, + { + "epoch": 2.8013651877133103, + "grad_norm": 0.07350826263427734, + "learning_rate": 4.9793559379969566e-05, + "loss": 0.3692, + "step": 1026 + }, + { + "epoch": 2.804095563139932, + "grad_norm": 0.07208722829818726, + "learning_rate": 4.958740444492892e-05, + "loss": 0.3694, + "step": 1027 + }, + { + "epoch": 2.806825938566553, + "grad_norm": 0.07334735989570618, + "learning_rate": 4.9381536367942195e-05, + "loss": 0.367, + "step": 1028 + }, + { + "epoch": 2.8095563139931743, + "grad_norm": 0.0769776999950409, + "learning_rate": 4.917595632044113e-05, + "loss": 0.3646, + "step": 1029 + }, + { + "epoch": 2.8122866894197953, + "grad_norm": 0.07780209183692932, + "learning_rate": 4.8970665472218537e-05, + "loss": 0.3918, + "step": 1030 + }, + { + "epoch": 2.8150170648464163, + "grad_norm": 0.07993920892477036, + "learning_rate": 4.8765664991421634e-05, + "loss": 0.3703, + "step": 1031 + }, + { + "epoch": 2.8177474402730374, + "grad_norm": 0.07131262123584747, + "learning_rate": 4.856095604454539e-05, + "loss": 0.3482, + "step": 1032 + }, + { + "epoch": 2.820477815699659, + "grad_norm": 0.0709707960486412, + "learning_rate": 4.835653979642585e-05, + "loss": 0.3742, + "step": 1033 + }, + { + "epoch": 2.82320819112628, + "grad_norm": 0.07897651195526123, + "learning_rate": 4.815241741023367e-05, + "loss": 0.3647, + "step": 1034 + }, + { + "epoch": 2.825938566552901, + "grad_norm": 0.06946137547492981, + "learning_rate": 4.7948590047467153e-05, + "loss": 0.3688, + "step": 1035 + }, + { + "epoch": 2.8286689419795223, + "grad_norm": 0.07583218812942505, + "learning_rate": 4.774505886794609e-05, + "loss": 0.3841, + "step": 1036 + }, + { + "epoch": 2.8313993174061434, + "grad_norm": 0.07795160263776779, + "learning_rate": 4.754182502980477e-05, + "loss": 0.3697, + "step": 1037 + }, + { + "epoch": 2.8341296928327644, + "grad_norm": 0.07436595857143402, + "learning_rate": 4.7338889689485624e-05, + "loss": 0.3754, + "step": 1038 + }, + { + "epoch": 2.8368600682593854, + "grad_norm": 0.08026924729347229, + "learning_rate": 4.713625400173247e-05, + "loss": 0.3819, + "step": 1039 + }, + { + "epoch": 2.839590443686007, + "grad_norm": 0.07545596361160278, + "learning_rate": 4.693391911958426e-05, + "loss": 0.3669, + "step": 1040 + }, + { + "epoch": 2.842320819112628, + "grad_norm": 0.0736890509724617, + "learning_rate": 4.673188619436798e-05, + "loss": 0.3822, + "step": 1041 + }, + { + "epoch": 2.8450511945392494, + "grad_norm": 0.07859516888856888, + "learning_rate": 4.6530156375692726e-05, + "loss": 0.3766, + "step": 1042 + }, + { + "epoch": 2.8477815699658704, + "grad_norm": 0.07504123449325562, + "learning_rate": 4.632873081144267e-05, + "loss": 0.3593, + "step": 1043 + }, + { + "epoch": 2.8505119453924914, + "grad_norm": 0.07197951525449753, + "learning_rate": 4.6127610647770767e-05, + "loss": 0.3794, + "step": 1044 + }, + { + "epoch": 2.8532423208191124, + "grad_norm": 0.07802017033100128, + "learning_rate": 4.592679702909216e-05, + "loss": 0.3662, + "step": 1045 + }, + { + "epoch": 2.855972696245734, + "grad_norm": 0.07301507890224457, + "learning_rate": 4.572629109807782e-05, + "loss": 0.3677, + "step": 1046 + }, + { + "epoch": 2.858703071672355, + "grad_norm": 0.07427814602851868, + "learning_rate": 4.552609399564762e-05, + "loss": 0.3539, + "step": 1047 + }, + { + "epoch": 2.861433447098976, + "grad_norm": 0.07539915293455124, + "learning_rate": 4.532620686096446e-05, + "loss": 0.3702, + "step": 1048 + }, + { + "epoch": 2.8641638225255974, + "grad_norm": 0.07405383884906769, + "learning_rate": 4.5126630831427264e-05, + "loss": 0.3735, + "step": 1049 + }, + { + "epoch": 2.8668941979522184, + "grad_norm": 0.07310394942760468, + "learning_rate": 4.492736704266475e-05, + "loss": 0.3739, + "step": 1050 + }, + { + "epoch": 2.8696245733788395, + "grad_norm": 0.07626067847013474, + "learning_rate": 4.472841662852888e-05, + "loss": 0.3678, + "step": 1051 + }, + { + "epoch": 2.8723549488054605, + "grad_norm": 0.07700545340776443, + "learning_rate": 4.452978072108859e-05, + "loss": 0.3825, + "step": 1052 + }, + { + "epoch": 2.875085324232082, + "grad_norm": 0.070135198533535, + "learning_rate": 4.4331460450623064e-05, + "loss": 0.373, + "step": 1053 + }, + { + "epoch": 2.877815699658703, + "grad_norm": 0.07328429073095322, + "learning_rate": 4.413345694561549e-05, + "loss": 0.3691, + "step": 1054 + }, + { + "epoch": 2.8805460750853245, + "grad_norm": 0.0815819501876831, + "learning_rate": 4.393577133274658e-05, + "loss": 0.3548, + "step": 1055 + }, + { + "epoch": 2.8832764505119455, + "grad_norm": 0.08154749870300293, + "learning_rate": 4.373840473688829e-05, + "loss": 0.381, + "step": 1056 + }, + { + "epoch": 2.8860068259385665, + "grad_norm": 0.07183761149644852, + "learning_rate": 4.354135828109707e-05, + "loss": 0.3609, + "step": 1057 + }, + { + "epoch": 2.8887372013651875, + "grad_norm": 0.07250452786684036, + "learning_rate": 4.3344633086607955e-05, + "loss": 0.3725, + "step": 1058 + }, + { + "epoch": 2.891467576791809, + "grad_norm": 0.07028324902057648, + "learning_rate": 4.3148230272827784e-05, + "loss": 0.34, + "step": 1059 + }, + { + "epoch": 2.89419795221843, + "grad_norm": 0.07634340226650238, + "learning_rate": 4.295215095732904e-05, + "loss": 0.3749, + "step": 1060 + }, + { + "epoch": 2.896928327645051, + "grad_norm": 0.07495231181383133, + "learning_rate": 4.275639625584338e-05, + "loss": 0.3769, + "step": 1061 + }, + { + "epoch": 2.8996587030716725, + "grad_norm": 0.0772211030125618, + "learning_rate": 4.256096728225548e-05, + "loss": 0.3678, + "step": 1062 + }, + { + "epoch": 2.9023890784982935, + "grad_norm": 0.08062466979026794, + "learning_rate": 4.236586514859633e-05, + "loss": 0.3616, + "step": 1063 + }, + { + "epoch": 2.9051194539249146, + "grad_norm": 0.08190742880105972, + "learning_rate": 4.217109096503736e-05, + "loss": 0.3818, + "step": 1064 + }, + { + "epoch": 2.9078498293515356, + "grad_norm": 0.07490533590316772, + "learning_rate": 4.197664583988376e-05, + "loss": 0.373, + "step": 1065 + }, + { + "epoch": 2.910580204778157, + "grad_norm": 0.07723496109247208, + "learning_rate": 4.1782530879568374e-05, + "loss": 0.3831, + "step": 1066 + }, + { + "epoch": 2.913310580204778, + "grad_norm": 0.07193005830049515, + "learning_rate": 4.1588747188645275e-05, + "loss": 0.3855, + "step": 1067 + }, + { + "epoch": 2.9160409556313995, + "grad_norm": 0.07526146620512009, + "learning_rate": 4.1395295869783615e-05, + "loss": 0.3737, + "step": 1068 + }, + { + "epoch": 2.9187713310580206, + "grad_norm": 0.07713403552770615, + "learning_rate": 4.1202178023761195e-05, + "loss": 0.37, + "step": 1069 + }, + { + "epoch": 2.9215017064846416, + "grad_norm": 0.07559187710285187, + "learning_rate": 4.100939474945843e-05, + "loss": 0.3644, + "step": 1070 + }, + { + "epoch": 2.9242320819112626, + "grad_norm": 0.07406629621982574, + "learning_rate": 4.0816947143851816e-05, + "loss": 0.3647, + "step": 1071 + }, + { + "epoch": 2.926962457337884, + "grad_norm": 0.07426485419273376, + "learning_rate": 4.0624836302007886e-05, + "loss": 0.3722, + "step": 1072 + }, + { + "epoch": 2.929692832764505, + "grad_norm": 0.07818352431058884, + "learning_rate": 4.0433063317076893e-05, + "loss": 0.3656, + "step": 1073 + }, + { + "epoch": 2.932423208191126, + "grad_norm": 0.07689237594604492, + "learning_rate": 4.024162928028663e-05, + "loss": 0.3612, + "step": 1074 + }, + { + "epoch": 2.9351535836177476, + "grad_norm": 0.0766364261507988, + "learning_rate": 4.0050535280936205e-05, + "loss": 0.3612, + "step": 1075 + }, + { + "epoch": 2.9378839590443686, + "grad_norm": 0.07136540114879608, + "learning_rate": 3.985978240638981e-05, + "loss": 0.3627, + "step": 1076 + }, + { + "epoch": 2.9406143344709896, + "grad_norm": 0.07516707479953766, + "learning_rate": 3.966937174207066e-05, + "loss": 0.3837, + "step": 1077 + }, + { + "epoch": 2.9433447098976107, + "grad_norm": 0.07119621336460114, + "learning_rate": 3.947930437145464e-05, + "loss": 0.3563, + "step": 1078 + }, + { + "epoch": 2.946075085324232, + "grad_norm": 0.07871522009372711, + "learning_rate": 3.928958137606421e-05, + "loss": 0.3715, + "step": 1079 + }, + { + "epoch": 2.948805460750853, + "grad_norm": 0.07658346742391586, + "learning_rate": 3.910020383546233e-05, + "loss": 0.3711, + "step": 1080 + }, + { + "epoch": 2.9515358361774746, + "grad_norm": 0.07438597828149796, + "learning_rate": 3.8911172827246215e-05, + "loss": 0.3703, + "step": 1081 + }, + { + "epoch": 2.9542662116040956, + "grad_norm": 0.07197795808315277, + "learning_rate": 3.8722489427041185e-05, + "loss": 0.3621, + "step": 1082 + }, + { + "epoch": 2.9569965870307167, + "grad_norm": 0.07549617439508438, + "learning_rate": 3.853415470849479e-05, + "loss": 0.3736, + "step": 1083 + }, + { + "epoch": 2.9597269624573377, + "grad_norm": 0.0718371719121933, + "learning_rate": 3.834616974327021e-05, + "loss": 0.3755, + "step": 1084 + }, + { + "epoch": 2.962457337883959, + "grad_norm": 0.07231539487838745, + "learning_rate": 3.815853560104075e-05, + "loss": 0.3786, + "step": 1085 + }, + { + "epoch": 2.96518771331058, + "grad_norm": 0.07186778634786606, + "learning_rate": 3.7971253349483285e-05, + "loss": 0.3679, + "step": 1086 + }, + { + "epoch": 2.967918088737201, + "grad_norm": 0.07404022663831711, + "learning_rate": 3.7784324054272405e-05, + "loss": 0.3638, + "step": 1087 + }, + { + "epoch": 2.9706484641638227, + "grad_norm": 0.07804732024669647, + "learning_rate": 3.759774877907428e-05, + "loss": 0.3728, + "step": 1088 + }, + { + "epoch": 2.9733788395904437, + "grad_norm": 0.07797016203403473, + "learning_rate": 3.741152858554077e-05, + "loss": 0.3704, + "step": 1089 + }, + { + "epoch": 2.9761092150170647, + "grad_norm": 0.07321101427078247, + "learning_rate": 3.722566453330298e-05, + "loss": 0.3517, + "step": 1090 + }, + { + "epoch": 2.9788395904436857, + "grad_norm": 0.08448322117328644, + "learning_rate": 3.7040157679965796e-05, + "loss": 0.3875, + "step": 1091 + }, + { + "epoch": 2.981569965870307, + "grad_norm": 0.07892005890607834, + "learning_rate": 3.6855009081101355e-05, + "loss": 0.3686, + "step": 1092 + }, + { + "epoch": 2.9843003412969282, + "grad_norm": 0.07619816064834595, + "learning_rate": 3.6670219790243344e-05, + "loss": 0.356, + "step": 1093 + }, + { + "epoch": 2.9870307167235497, + "grad_norm": 0.07540540397167206, + "learning_rate": 3.648579085888085e-05, + "loss": 0.3564, + "step": 1094 + }, + { + "epoch": 2.9897610921501707, + "grad_norm": 0.07253746688365936, + "learning_rate": 3.630172333645261e-05, + "loss": 0.3771, + "step": 1095 + }, + { + "epoch": 2.9924914675767917, + "grad_norm": 0.07068438827991486, + "learning_rate": 3.611801827034059e-05, + "loss": 0.3645, + "step": 1096 + }, + { + "epoch": 2.9952218430034128, + "grad_norm": 0.07300886511802673, + "learning_rate": 3.593467670586457e-05, + "loss": 0.3619, + "step": 1097 + }, + { + "epoch": 2.9979522184300342, + "grad_norm": 0.071599081158638, + "learning_rate": 3.5751699686275786e-05, + "loss": 0.369, + "step": 1098 + }, + { + "epoch": 2.9979522184300342, + "eval_loss": 0.37550708651542664, + "eval_runtime": 309.4712, + "eval_samples_per_second": 8.414, + "eval_steps_per_second": 1.053, + "step": 1098 + }, + { + "epoch": 3.0006825938566553, + "grad_norm": 0.17725853621959686, + "learning_rate": 3.556908825275117e-05, + "loss": 0.6092, + "step": 1099 + }, + { + "epoch": 3.0034129692832763, + "grad_norm": 0.07963860780000687, + "learning_rate": 3.538684344438736e-05, + "loss": 0.3625, + "step": 1100 + }, + { + "epoch": 3.0061433447098977, + "grad_norm": 0.08206669241189957, + "learning_rate": 3.520496629819494e-05, + "loss": 0.3611, + "step": 1101 + }, + { + "epoch": 3.0088737201365188, + "grad_norm": 0.08496807515621185, + "learning_rate": 3.502345784909229e-05, + "loss": 0.356, + "step": 1102 + }, + { + "epoch": 3.01160409556314, + "grad_norm": 0.09216141700744629, + "learning_rate": 3.484231912989989e-05, + "loss": 0.3537, + "step": 1103 + }, + { + "epoch": 3.0143344709897613, + "grad_norm": 0.08650084584951401, + "learning_rate": 3.466155117133433e-05, + "loss": 0.3413, + "step": 1104 + }, + { + "epoch": 3.0170648464163823, + "grad_norm": 0.10048259794712067, + "learning_rate": 3.448115500200263e-05, + "loss": 0.3624, + "step": 1105 + }, + { + "epoch": 3.0197952218430033, + "grad_norm": 0.0802978053689003, + "learning_rate": 3.430113164839601e-05, + "loss": 0.3554, + "step": 1106 + }, + { + "epoch": 3.0225255972696248, + "grad_norm": 0.08992195874452591, + "learning_rate": 3.4121482134884575e-05, + "loss": 0.3524, + "step": 1107 + }, + { + "epoch": 3.025255972696246, + "grad_norm": 0.08212791383266449, + "learning_rate": 3.3942207483710986e-05, + "loss": 0.3526, + "step": 1108 + }, + { + "epoch": 3.027986348122867, + "grad_norm": 0.08635729551315308, + "learning_rate": 3.3763308714984974e-05, + "loss": 0.3521, + "step": 1109 + }, + { + "epoch": 3.030716723549488, + "grad_norm": 0.08558769524097443, + "learning_rate": 3.358478684667734e-05, + "loss": 0.3627, + "step": 1110 + }, + { + "epoch": 3.0334470989761093, + "grad_norm": 0.08466281741857529, + "learning_rate": 3.3406642894614394e-05, + "loss": 0.3528, + "step": 1111 + }, + { + "epoch": 3.0361774744027303, + "grad_norm": 0.08737502992153168, + "learning_rate": 3.3228877872471786e-05, + "loss": 0.361, + "step": 1112 + }, + { + "epoch": 3.0389078498293514, + "grad_norm": 0.09203612804412842, + "learning_rate": 3.305149279176921e-05, + "loss": 0.3478, + "step": 1113 + }, + { + "epoch": 3.041638225255973, + "grad_norm": 0.0948566272854805, + "learning_rate": 3.287448866186428e-05, + "loss": 0.35, + "step": 1114 + }, + { + "epoch": 3.044368600682594, + "grad_norm": 0.09073270857334137, + "learning_rate": 3.269786648994697e-05, + "loss": 0.3655, + "step": 1115 + }, + { + "epoch": 3.047098976109215, + "grad_norm": 0.09162113070487976, + "learning_rate": 3.252162728103382e-05, + "loss": 0.3542, + "step": 1116 + }, + { + "epoch": 3.0498293515358363, + "grad_norm": 0.08284641057252884, + "learning_rate": 3.234577203796223e-05, + "loss": 0.3564, + "step": 1117 + }, + { + "epoch": 3.0525597269624574, + "grad_norm": 0.09153011441230774, + "learning_rate": 3.217030176138474e-05, + "loss": 0.3699, + "step": 1118 + }, + { + "epoch": 3.0552901023890784, + "grad_norm": 0.09014246612787247, + "learning_rate": 3.199521744976342e-05, + "loss": 0.3458, + "step": 1119 + }, + { + "epoch": 3.0580204778157, + "grad_norm": 0.08532671630382538, + "learning_rate": 3.182052009936404e-05, + "loss": 0.3503, + "step": 1120 + }, + { + "epoch": 3.060750853242321, + "grad_norm": 0.08901910483837128, + "learning_rate": 3.164621070425051e-05, + "loss": 0.3702, + "step": 1121 + }, + { + "epoch": 3.063481228668942, + "grad_norm": 0.09040674567222595, + "learning_rate": 3.147229025627922e-05, + "loss": 0.3512, + "step": 1122 + }, + { + "epoch": 3.066211604095563, + "grad_norm": 0.09059562534093857, + "learning_rate": 3.129875974509332e-05, + "loss": 0.3599, + "step": 1123 + }, + { + "epoch": 3.0689419795221844, + "grad_norm": 0.09486035257577896, + "learning_rate": 3.1125620158117186e-05, + "loss": 0.3645, + "step": 1124 + }, + { + "epoch": 3.0716723549488054, + "grad_norm": 0.08574552834033966, + "learning_rate": 3.095287248055069e-05, + "loss": 0.352, + "step": 1125 + }, + { + "epoch": 3.0744027303754264, + "grad_norm": 0.08876755833625793, + "learning_rate": 3.078051769536378e-05, + "loss": 0.3524, + "step": 1126 + }, + { + "epoch": 3.077133105802048, + "grad_norm": 0.08718768507242203, + "learning_rate": 3.060855678329063e-05, + "loss": 0.3646, + "step": 1127 + }, + { + "epoch": 3.079863481228669, + "grad_norm": 0.08956962823867798, + "learning_rate": 3.043699072282429e-05, + "loss": 0.3508, + "step": 1128 + }, + { + "epoch": 3.08259385665529, + "grad_norm": 0.08665765076875687, + "learning_rate": 3.0265820490210973e-05, + "loss": 0.3615, + "step": 1129 + }, + { + "epoch": 3.0853242320819114, + "grad_norm": 0.08656659722328186, + "learning_rate": 3.0095047059444546e-05, + "loss": 0.3578, + "step": 1130 + }, + { + "epoch": 3.0880546075085324, + "grad_norm": 0.08812808245420456, + "learning_rate": 2.9924671402261018e-05, + "loss": 0.3543, + "step": 1131 + }, + { + "epoch": 3.0907849829351535, + "grad_norm": 0.08373061567544937, + "learning_rate": 2.9754694488133038e-05, + "loss": 0.3518, + "step": 1132 + }, + { + "epoch": 3.093515358361775, + "grad_norm": 0.09521368145942688, + "learning_rate": 2.958511728426414e-05, + "loss": 0.3686, + "step": 1133 + }, + { + "epoch": 3.096245733788396, + "grad_norm": 0.08841294795274734, + "learning_rate": 2.941594075558366e-05, + "loss": 0.3526, + "step": 1134 + }, + { + "epoch": 3.098976109215017, + "grad_norm": 0.09111680090427399, + "learning_rate": 2.9247165864740856e-05, + "loss": 0.3515, + "step": 1135 + }, + { + "epoch": 3.101706484641638, + "grad_norm": 0.0909881740808487, + "learning_rate": 2.9078793572099616e-05, + "loss": 0.3656, + "step": 1136 + }, + { + "epoch": 3.1044368600682595, + "grad_norm": 0.08638916909694672, + "learning_rate": 2.8910824835732952e-05, + "loss": 0.3584, + "step": 1137 + }, + { + "epoch": 3.1071672354948805, + "grad_norm": 0.09203464537858963, + "learning_rate": 2.8743260611417665e-05, + "loss": 0.351, + "step": 1138 + }, + { + "epoch": 3.1098976109215015, + "grad_norm": 0.0900203287601471, + "learning_rate": 2.857610185262859e-05, + "loss": 0.3491, + "step": 1139 + }, + { + "epoch": 3.112627986348123, + "grad_norm": 0.08010142296552658, + "learning_rate": 2.8409349510533578e-05, + "loss": 0.3563, + "step": 1140 + }, + { + "epoch": 3.115358361774744, + "grad_norm": 0.0808601900935173, + "learning_rate": 2.8243004533987793e-05, + "loss": 0.3173, + "step": 1141 + }, + { + "epoch": 3.118088737201365, + "grad_norm": 0.08397600799798965, + "learning_rate": 2.8077067869528417e-05, + "loss": 0.3394, + "step": 1142 + }, + { + "epoch": 3.1208191126279865, + "grad_norm": 0.08854986727237701, + "learning_rate": 2.7911540461369222e-05, + "loss": 0.3498, + "step": 1143 + }, + { + "epoch": 3.1235494880546075, + "grad_norm": 0.09329045563936234, + "learning_rate": 2.774642325139535e-05, + "loss": 0.3702, + "step": 1144 + }, + { + "epoch": 3.1262798634812285, + "grad_norm": 0.08807191997766495, + "learning_rate": 2.7581717179157606e-05, + "loss": 0.3492, + "step": 1145 + }, + { + "epoch": 3.12901023890785, + "grad_norm": 0.08996576815843582, + "learning_rate": 2.7417423181867585e-05, + "loss": 0.3536, + "step": 1146 + }, + { + "epoch": 3.131740614334471, + "grad_norm": 0.08638175576925278, + "learning_rate": 2.72535421943919e-05, + "loss": 0.3428, + "step": 1147 + }, + { + "epoch": 3.134470989761092, + "grad_norm": 0.08896217495203018, + "learning_rate": 2.7090075149247217e-05, + "loss": 0.3653, + "step": 1148 + }, + { + "epoch": 3.137201365187713, + "grad_norm": 0.08900240063667297, + "learning_rate": 2.6927022976594607e-05, + "loss": 0.3572, + "step": 1149 + }, + { + "epoch": 3.1399317406143346, + "grad_norm": 0.09145187586545944, + "learning_rate": 2.676438660423457e-05, + "loss": 0.3565, + "step": 1150 + }, + { + "epoch": 3.1426621160409556, + "grad_norm": 0.09132881462574005, + "learning_rate": 2.660216695760157e-05, + "loss": 0.3617, + "step": 1151 + }, + { + "epoch": 3.1453924914675766, + "grad_norm": 0.08654224127531052, + "learning_rate": 2.6440364959758813e-05, + "loss": 0.3212, + "step": 1152 + }, + { + "epoch": 3.148122866894198, + "grad_norm": 0.09070401638746262, + "learning_rate": 2.6278981531392945e-05, + "loss": 0.3443, + "step": 1153 + }, + { + "epoch": 3.150853242320819, + "grad_norm": 0.08169679343700409, + "learning_rate": 2.6118017590809017e-05, + "loss": 0.3642, + "step": 1154 + }, + { + "epoch": 3.15358361774744, + "grad_norm": 0.0845232605934143, + "learning_rate": 2.595747405392491e-05, + "loss": 0.3444, + "step": 1155 + }, + { + "epoch": 3.1563139931740616, + "grad_norm": 0.0819699615240097, + "learning_rate": 2.579735183426649e-05, + "loss": 0.355, + "step": 1156 + }, + { + "epoch": 3.1590443686006826, + "grad_norm": 0.0867064967751503, + "learning_rate": 2.5637651842962164e-05, + "loss": 0.3552, + "step": 1157 + }, + { + "epoch": 3.1617747440273036, + "grad_norm": 0.08929461985826492, + "learning_rate": 2.5478374988737753e-05, + "loss": 0.3538, + "step": 1158 + }, + { + "epoch": 3.164505119453925, + "grad_norm": 0.08831099420785904, + "learning_rate": 2.531952217791136e-05, + "loss": 0.333, + "step": 1159 + }, + { + "epoch": 3.167235494880546, + "grad_norm": 0.09108342975378036, + "learning_rate": 2.5161094314388278e-05, + "loss": 0.3475, + "step": 1160 + }, + { + "epoch": 3.169965870307167, + "grad_norm": 0.09032892435789108, + "learning_rate": 2.5003092299655584e-05, + "loss": 0.3564, + "step": 1161 + }, + { + "epoch": 3.172696245733788, + "grad_norm": 0.08385740965604782, + "learning_rate": 2.4845517032777364e-05, + "loss": 0.3453, + "step": 1162 + }, + { + "epoch": 3.1754266211604096, + "grad_norm": 0.09305927157402039, + "learning_rate": 2.4688369410389334e-05, + "loss": 0.3575, + "step": 1163 + }, + { + "epoch": 3.1781569965870307, + "grad_norm": 0.08773191273212433, + "learning_rate": 2.4531650326693822e-05, + "loss": 0.3367, + "step": 1164 + }, + { + "epoch": 3.1808873720136517, + "grad_norm": 0.08649349212646484, + "learning_rate": 2.4375360673454718e-05, + "loss": 0.3462, + "step": 1165 + }, + { + "epoch": 3.183617747440273, + "grad_norm": 0.09376425296068192, + "learning_rate": 2.4219501339992334e-05, + "loss": 0.3613, + "step": 1166 + }, + { + "epoch": 3.186348122866894, + "grad_norm": 0.08337985724210739, + "learning_rate": 2.406407321317835e-05, + "loss": 0.3458, + "step": 1167 + }, + { + "epoch": 3.189078498293515, + "grad_norm": 0.08914073556661606, + "learning_rate": 2.3909077177430893e-05, + "loss": 0.3422, + "step": 1168 + }, + { + "epoch": 3.1918088737201367, + "grad_norm": 0.08736269921064377, + "learning_rate": 2.3754514114709304e-05, + "loss": 0.3505, + "step": 1169 + }, + { + "epoch": 3.1945392491467577, + "grad_norm": 0.09058474004268646, + "learning_rate": 2.3600384904509254e-05, + "loss": 0.3535, + "step": 1170 + }, + { + "epoch": 3.1972696245733787, + "grad_norm": 0.09851355850696564, + "learning_rate": 2.3446690423857685e-05, + "loss": 0.3663, + "step": 1171 + }, + { + "epoch": 3.2, + "grad_norm": 0.09518344700336456, + "learning_rate": 2.3293431547307887e-05, + "loss": 0.3449, + "step": 1172 + }, + { + "epoch": 3.202730375426621, + "grad_norm": 0.08695267885923386, + "learning_rate": 2.31406091469344e-05, + "loss": 0.3501, + "step": 1173 + }, + { + "epoch": 3.2054607508532422, + "grad_norm": 0.09185747057199478, + "learning_rate": 2.298822409232817e-05, + "loss": 0.3453, + "step": 1174 + }, + { + "epoch": 3.2081911262798632, + "grad_norm": 0.09628286957740784, + "learning_rate": 2.2836277250591574e-05, + "loss": 0.3467, + "step": 1175 + }, + { + "epoch": 3.2109215017064847, + "grad_norm": 0.08919479697942734, + "learning_rate": 2.2684769486333445e-05, + "loss": 0.3635, + "step": 1176 + }, + { + "epoch": 3.2136518771331057, + "grad_norm": 0.09248843789100647, + "learning_rate": 2.2533701661664154e-05, + "loss": 0.3666, + "step": 1177 + }, + { + "epoch": 3.2163822525597268, + "grad_norm": 0.08832020312547684, + "learning_rate": 2.2383074636190748e-05, + "loss": 0.343, + "step": 1178 + }, + { + "epoch": 3.2191126279863482, + "grad_norm": 0.09390208125114441, + "learning_rate": 2.2232889267012038e-05, + "loss": 0.3617, + "step": 1179 + }, + { + "epoch": 3.2218430034129693, + "grad_norm": 0.0916062742471695, + "learning_rate": 2.2083146408713673e-05, + "loss": 0.3558, + "step": 1180 + }, + { + "epoch": 3.2245733788395903, + "grad_norm": 0.08844900876283646, + "learning_rate": 2.1933846913363466e-05, + "loss": 0.3475, + "step": 1181 + }, + { + "epoch": 3.2273037542662117, + "grad_norm": 0.0881471261382103, + "learning_rate": 2.178499163050617e-05, + "loss": 0.3366, + "step": 1182 + }, + { + "epoch": 3.2300341296928328, + "grad_norm": 0.09539782255887985, + "learning_rate": 2.1636581407159105e-05, + "loss": 0.3461, + "step": 1183 + }, + { + "epoch": 3.232764505119454, + "grad_norm": 0.0891459733247757, + "learning_rate": 2.1488617087806982e-05, + "loss": 0.3406, + "step": 1184 + }, + { + "epoch": 3.2354948805460753, + "grad_norm": 0.08765516430139542, + "learning_rate": 2.1341099514397266e-05, + "loss": 0.3397, + "step": 1185 + }, + { + "epoch": 3.2382252559726963, + "grad_norm": 0.08997386693954468, + "learning_rate": 2.1194029526335303e-05, + "loss": 0.3548, + "step": 1186 + }, + { + "epoch": 3.2409556313993173, + "grad_norm": 0.08536975085735321, + "learning_rate": 2.1047407960479702e-05, + "loss": 0.3305, + "step": 1187 + }, + { + "epoch": 3.2436860068259388, + "grad_norm": 0.08575280010700226, + "learning_rate": 2.0901235651137284e-05, + "loss": 0.3476, + "step": 1188 + }, + { + "epoch": 3.24641638225256, + "grad_norm": 0.08761252462863922, + "learning_rate": 2.0755513430058672e-05, + "loss": 0.3359, + "step": 1189 + }, + { + "epoch": 3.249146757679181, + "grad_norm": 0.0947328731417656, + "learning_rate": 2.0610242126433297e-05, + "loss": 0.3487, + "step": 1190 + }, + { + "epoch": 3.2518771331058023, + "grad_norm": 0.09171999245882034, + "learning_rate": 2.0465422566884805e-05, + "loss": 0.342, + "step": 1191 + }, + { + "epoch": 3.2546075085324233, + "grad_norm": 0.0883520245552063, + "learning_rate": 2.0321055575466284e-05, + "loss": 0.3303, + "step": 1192 + }, + { + "epoch": 3.2573378839590443, + "grad_norm": 0.09119928628206253, + "learning_rate": 2.0177141973655766e-05, + "loss": 0.3467, + "step": 1193 + }, + { + "epoch": 3.2600682593856654, + "grad_norm": 0.0883185863494873, + "learning_rate": 2.0033682580351144e-05, + "loss": 0.3503, + "step": 1194 + }, + { + "epoch": 3.262798634812287, + "grad_norm": 0.08986258506774902, + "learning_rate": 1.9890678211866033e-05, + "loss": 0.3518, + "step": 1195 + }, + { + "epoch": 3.265529010238908, + "grad_norm": 0.08108729869127274, + "learning_rate": 1.9748129681924675e-05, + "loss": 0.3538, + "step": 1196 + }, + { + "epoch": 3.268259385665529, + "grad_norm": 0.08759156614542007, + "learning_rate": 1.9606037801657673e-05, + "loss": 0.3394, + "step": 1197 + }, + { + "epoch": 3.2709897610921503, + "grad_norm": 0.08808710426092148, + "learning_rate": 1.9464403379596963e-05, + "loss": 0.3468, + "step": 1198 + }, + { + "epoch": 3.2737201365187714, + "grad_norm": 0.0865558609366417, + "learning_rate": 1.932322722167168e-05, + "loss": 0.3679, + "step": 1199 + }, + { + "epoch": 3.2764505119453924, + "grad_norm": 0.08987162262201309, + "learning_rate": 1.9182510131203224e-05, + "loss": 0.3455, + "step": 1200 + }, + { + "epoch": 3.2791808873720134, + "grad_norm": 0.08997821807861328, + "learning_rate": 1.9042252908900814e-05, + "loss": 0.3518, + "step": 1201 + }, + { + "epoch": 3.281911262798635, + "grad_norm": 0.08895470201969147, + "learning_rate": 1.8902456352856925e-05, + "loss": 0.344, + "step": 1202 + }, + { + "epoch": 3.284641638225256, + "grad_norm": 0.09291904419660568, + "learning_rate": 1.8763121258542815e-05, + "loss": 0.3475, + "step": 1203 + }, + { + "epoch": 3.2873720136518774, + "grad_norm": 0.0931735560297966, + "learning_rate": 1.86242484188038e-05, + "loss": 0.3265, + "step": 1204 + }, + { + "epoch": 3.2901023890784984, + "grad_norm": 0.08695405721664429, + "learning_rate": 1.848583862385501e-05, + "loss": 0.339, + "step": 1205 + }, + { + "epoch": 3.2928327645051194, + "grad_norm": 0.09283328056335449, + "learning_rate": 1.8347892661276656e-05, + "loss": 0.3541, + "step": 1206 + }, + { + "epoch": 3.2955631399317404, + "grad_norm": 0.0925126001238823, + "learning_rate": 1.82104113160097e-05, + "loss": 0.3579, + "step": 1207 + }, + { + "epoch": 3.298293515358362, + "grad_norm": 0.0894036740064621, + "learning_rate": 1.8073395370351287e-05, + "loss": 0.3404, + "step": 1208 + }, + { + "epoch": 3.301023890784983, + "grad_norm": 0.09419641643762589, + "learning_rate": 1.7936845603950447e-05, + "loss": 0.3601, + "step": 1209 + }, + { + "epoch": 3.303754266211604, + "grad_norm": 0.08752860873937607, + "learning_rate": 1.780076279380337e-05, + "loss": 0.3296, + "step": 1210 + }, + { + "epoch": 3.3064846416382254, + "grad_norm": 0.0924847200512886, + "learning_rate": 1.7665147714249376e-05, + "loss": 0.3438, + "step": 1211 + }, + { + "epoch": 3.3092150170648464, + "grad_norm": 0.08971387892961502, + "learning_rate": 1.753000113696617e-05, + "loss": 0.3495, + "step": 1212 + }, + { + "epoch": 3.3119453924914675, + "grad_norm": 0.08530307561159134, + "learning_rate": 1.7395323830965605e-05, + "loss": 0.332, + "step": 1213 + }, + { + "epoch": 3.3146757679180885, + "grad_norm": 0.08622708916664124, + "learning_rate": 1.726111656258932e-05, + "loss": 0.3389, + "step": 1214 + }, + { + "epoch": 3.31740614334471, + "grad_norm": 0.0962296724319458, + "learning_rate": 1.7127380095504296e-05, + "loss": 0.3562, + "step": 1215 + }, + { + "epoch": 3.320136518771331, + "grad_norm": 0.09346350282430649, + "learning_rate": 1.699411519069858e-05, + "loss": 0.3623, + "step": 1216 + }, + { + "epoch": 3.3228668941979524, + "grad_norm": 0.08937069773674011, + "learning_rate": 1.686132260647696e-05, + "loss": 0.3435, + "step": 1217 + }, + { + "epoch": 3.3255972696245735, + "grad_norm": 0.08801838010549545, + "learning_rate": 1.6729003098456576e-05, + "loss": 0.3396, + "step": 1218 + }, + { + "epoch": 3.3283276450511945, + "grad_norm": 0.0906905010342598, + "learning_rate": 1.6597157419562703e-05, + "loss": 0.3597, + "step": 1219 + }, + { + "epoch": 3.3310580204778155, + "grad_norm": 0.08793555200099945, + "learning_rate": 1.646578632002439e-05, + "loss": 0.355, + "step": 1220 + }, + { + "epoch": 3.333788395904437, + "grad_norm": 0.088701032102108, + "learning_rate": 1.6334890547370286e-05, + "loss": 0.3381, + "step": 1221 + }, + { + "epoch": 3.336518771331058, + "grad_norm": 0.09444410353899002, + "learning_rate": 1.6204470846424268e-05, + "loss": 0.3537, + "step": 1222 + }, + { + "epoch": 3.339249146757679, + "grad_norm": 0.09532656520605087, + "learning_rate": 1.607452795930131e-05, + "loss": 0.3425, + "step": 1223 + }, + { + "epoch": 3.3419795221843005, + "grad_norm": 0.09084934741258621, + "learning_rate": 1.594506262540324e-05, + "loss": 0.3413, + "step": 1224 + }, + { + "epoch": 3.3447098976109215, + "grad_norm": 0.09107686579227448, + "learning_rate": 1.5816075581414458e-05, + "loss": 0.3491, + "step": 1225 + }, + { + "epoch": 3.3474402730375425, + "grad_norm": 0.08773531764745712, + "learning_rate": 1.56875675612978e-05, + "loss": 0.3389, + "step": 1226 + }, + { + "epoch": 3.350170648464164, + "grad_norm": 0.09733074903488159, + "learning_rate": 1.5559539296290403e-05, + "loss": 0.3499, + "step": 1227 + }, + { + "epoch": 3.352901023890785, + "grad_norm": 0.08683840930461884, + "learning_rate": 1.5431991514899446e-05, + "loss": 0.3452, + "step": 1228 + }, + { + "epoch": 3.355631399317406, + "grad_norm": 0.0929400622844696, + "learning_rate": 1.5304924942898068e-05, + "loss": 0.3427, + "step": 1229 + }, + { + "epoch": 3.3583617747440275, + "grad_norm": 0.09083928912878036, + "learning_rate": 1.5178340303321314e-05, + "loss": 0.3552, + "step": 1230 + }, + { + "epoch": 3.3610921501706486, + "grad_norm": 0.08824457973241806, + "learning_rate": 1.5052238316461753e-05, + "loss": 0.3417, + "step": 1231 + }, + { + "epoch": 3.3638225255972696, + "grad_norm": 0.08632628619670868, + "learning_rate": 1.492661969986574e-05, + "loss": 0.3305, + "step": 1232 + }, + { + "epoch": 3.3665529010238906, + "grad_norm": 0.08909586071968079, + "learning_rate": 1.4801485168329066e-05, + "loss": 0.331, + "step": 1233 + }, + { + "epoch": 3.369283276450512, + "grad_norm": 0.09096753597259521, + "learning_rate": 1.4676835433892989e-05, + "loss": 0.346, + "step": 1234 + }, + { + "epoch": 3.372013651877133, + "grad_norm": 0.08751077950000763, + "learning_rate": 1.4552671205840163e-05, + "loss": 0.3346, + "step": 1235 + }, + { + "epoch": 3.374744027303754, + "grad_norm": 0.09124130010604858, + "learning_rate": 1.4428993190690677e-05, + "loss": 0.3459, + "step": 1236 + }, + { + "epoch": 3.3774744027303756, + "grad_norm": 0.08726565539836884, + "learning_rate": 1.4305802092197829e-05, + "loss": 0.336, + "step": 1237 + }, + { + "epoch": 3.3802047781569966, + "grad_norm": 0.09269004315137863, + "learning_rate": 1.4183098611344415e-05, + "loss": 0.3484, + "step": 1238 + }, + { + "epoch": 3.3829351535836176, + "grad_norm": 0.08907980471849442, + "learning_rate": 1.4060883446338502e-05, + "loss": 0.3574, + "step": 1239 + }, + { + "epoch": 3.385665529010239, + "grad_norm": 0.08752036094665527, + "learning_rate": 1.393915729260955e-05, + "loss": 0.3477, + "step": 1240 + }, + { + "epoch": 3.38839590443686, + "grad_norm": 0.09183843433856964, + "learning_rate": 1.3817920842804433e-05, + "loss": 0.3426, + "step": 1241 + }, + { + "epoch": 3.391126279863481, + "grad_norm": 0.09646756947040558, + "learning_rate": 1.3697174786783584e-05, + "loss": 0.3706, + "step": 1242 + }, + { + "epoch": 3.3938566552901026, + "grad_norm": 0.08840009570121765, + "learning_rate": 1.3576919811616862e-05, + "loss": 0.3541, + "step": 1243 + }, + { + "epoch": 3.3965870307167236, + "grad_norm": 0.09218363463878632, + "learning_rate": 1.345715660157989e-05, + "loss": 0.3407, + "step": 1244 + }, + { + "epoch": 3.3993174061433447, + "grad_norm": 0.09676238149404526, + "learning_rate": 1.3337885838149988e-05, + "loss": 0.3335, + "step": 1245 + }, + { + "epoch": 3.4020477815699657, + "grad_norm": 0.09339506179094315, + "learning_rate": 1.3219108200002418e-05, + "loss": 0.3457, + "step": 1246 + }, + { + "epoch": 3.404778156996587, + "grad_norm": 0.09434132277965546, + "learning_rate": 1.3100824363006326e-05, + "loss": 0.3457, + "step": 1247 + }, + { + "epoch": 3.407508532423208, + "grad_norm": 0.09233402460813522, + "learning_rate": 1.2983035000221177e-05, + "loss": 0.3465, + "step": 1248 + }, + { + "epoch": 3.410238907849829, + "grad_norm": 0.0978422462940216, + "learning_rate": 1.2865740781892699e-05, + "loss": 0.3338, + "step": 1249 + }, + { + "epoch": 3.4129692832764507, + "grad_norm": 0.09470426291227341, + "learning_rate": 1.2748942375449135e-05, + "loss": 0.3534, + "step": 1250 + }, + { + "epoch": 3.4156996587030717, + "grad_norm": 0.09681256860494614, + "learning_rate": 1.263264044549748e-05, + "loss": 0.349, + "step": 1251 + }, + { + "epoch": 3.4184300341296927, + "grad_norm": 0.09461713582277298, + "learning_rate": 1.2516835653819725e-05, + "loss": 0.3357, + "step": 1252 + }, + { + "epoch": 3.421160409556314, + "grad_norm": 0.08808429539203644, + "learning_rate": 1.2401528659368911e-05, + "loss": 0.3433, + "step": 1253 + }, + { + "epoch": 3.423890784982935, + "grad_norm": 0.09494482725858688, + "learning_rate": 1.2286720118265659e-05, + "loss": 0.3355, + "step": 1254 + }, + { + "epoch": 3.426621160409556, + "grad_norm": 0.10032296925783157, + "learning_rate": 1.2172410683794177e-05, + "loss": 0.337, + "step": 1255 + }, + { + "epoch": 3.4293515358361777, + "grad_norm": 0.09501981735229492, + "learning_rate": 1.2058601006398718e-05, + "loss": 0.3507, + "step": 1256 + }, + { + "epoch": 3.4320819112627987, + "grad_norm": 0.09376144409179688, + "learning_rate": 1.1945291733679764e-05, + "loss": 0.342, + "step": 1257 + }, + { + "epoch": 3.4348122866894197, + "grad_norm": 0.09684683382511139, + "learning_rate": 1.1832483510390469e-05, + "loss": 0.3536, + "step": 1258 + }, + { + "epoch": 3.4375426621160408, + "grad_norm": 0.09126045554876328, + "learning_rate": 1.1720176978432795e-05, + "loss": 0.356, + "step": 1259 + }, + { + "epoch": 3.4402730375426622, + "grad_norm": 0.08760211616754532, + "learning_rate": 1.1608372776854103e-05, + "loss": 0.3394, + "step": 1260 + }, + { + "epoch": 3.4430034129692833, + "grad_norm": 0.09649397432804108, + "learning_rate": 1.1497071541843306e-05, + "loss": 0.3491, + "step": 1261 + }, + { + "epoch": 3.4457337883959043, + "grad_norm": 0.09457088261842728, + "learning_rate": 1.1386273906727363e-05, + "loss": 0.3452, + "step": 1262 + }, + { + "epoch": 3.4484641638225257, + "grad_norm": 0.09157135337591171, + "learning_rate": 1.1275980501967642e-05, + "loss": 0.3397, + "step": 1263 + }, + { + "epoch": 3.4511945392491468, + "grad_norm": 0.09295860677957535, + "learning_rate": 1.1166191955156346e-05, + "loss": 0.351, + "step": 1264 + }, + { + "epoch": 3.453924914675768, + "grad_norm": 0.09845615178346634, + "learning_rate": 1.1056908891012884e-05, + "loss": 0.3441, + "step": 1265 + }, + { + "epoch": 3.4566552901023893, + "grad_norm": 0.08836519718170166, + "learning_rate": 1.0948131931380457e-05, + "loss": 0.3287, + "step": 1266 + }, + { + "epoch": 3.4593856655290103, + "grad_norm": 0.09238238632678986, + "learning_rate": 1.0839861695222354e-05, + "loss": 0.3313, + "step": 1267 + }, + { + "epoch": 3.4621160409556313, + "grad_norm": 0.09255190193653107, + "learning_rate": 1.0732098798618517e-05, + "loss": 0.3397, + "step": 1268 + }, + { + "epoch": 3.4648464163822528, + "grad_norm": 0.09583252668380737, + "learning_rate": 1.0624843854762034e-05, + "loss": 0.3524, + "step": 1269 + }, + { + "epoch": 3.467576791808874, + "grad_norm": 0.09115204960107803, + "learning_rate": 1.0518097473955624e-05, + "loss": 0.3527, + "step": 1270 + }, + { + "epoch": 3.470307167235495, + "grad_norm": 0.0932367816567421, + "learning_rate": 1.0411860263608186e-05, + "loss": 0.3539, + "step": 1271 + }, + { + "epoch": 3.473037542662116, + "grad_norm": 0.07985576242208481, + "learning_rate": 1.0306132828231318e-05, + "loss": 0.3459, + "step": 1272 + }, + { + "epoch": 3.4757679180887373, + "grad_norm": 0.09572295844554901, + "learning_rate": 1.0200915769435937e-05, + "loss": 0.3528, + "step": 1273 + }, + { + "epoch": 3.4784982935153583, + "grad_norm": 0.08902528136968613, + "learning_rate": 1.009620968592876e-05, + "loss": 0.3382, + "step": 1274 + }, + { + "epoch": 3.4812286689419794, + "grad_norm": 0.09365283697843552, + "learning_rate": 9.992015173508995e-06, + "loss": 0.3376, + "step": 1275 + }, + { + "epoch": 3.483959044368601, + "grad_norm": 0.08596877008676529, + "learning_rate": 9.88833282506486e-06, + "loss": 0.3272, + "step": 1276 + }, + { + "epoch": 3.486689419795222, + "grad_norm": 0.08941985666751862, + "learning_rate": 9.785163230570282e-06, + "loss": 0.3414, + "step": 1277 + }, + { + "epoch": 3.489419795221843, + "grad_norm": 0.09159501641988754, + "learning_rate": 9.682506977081496e-06, + "loss": 0.3445, + "step": 1278 + }, + { + "epoch": 3.4921501706484643, + "grad_norm": 0.09235716611146927, + "learning_rate": 9.580364648733775e-06, + "loss": 0.3472, + "step": 1279 + }, + { + "epoch": 3.4948805460750854, + "grad_norm": 0.08872348070144653, + "learning_rate": 9.478736826737944e-06, + "loss": 0.3325, + "step": 1280 + }, + { + "epoch": 3.4976109215017064, + "grad_norm": 0.09470780938863754, + "learning_rate": 9.37762408937729e-06, + "loss": 0.3429, + "step": 1281 + }, + { + "epoch": 3.500341296928328, + "grad_norm": 0.09355208277702332, + "learning_rate": 9.277027012004125e-06, + "loss": 0.3336, + "step": 1282 + }, + { + "epoch": 3.503071672354949, + "grad_norm": 0.0909925252199173, + "learning_rate": 9.176946167036516e-06, + "loss": 0.3277, + "step": 1283 + }, + { + "epoch": 3.50580204778157, + "grad_norm": 0.09389078617095947, + "learning_rate": 9.07738212395508e-06, + "loss": 0.3515, + "step": 1284 + }, + { + "epoch": 3.508532423208191, + "grad_norm": 0.09370888024568558, + "learning_rate": 8.978335449299791e-06, + "loss": 0.3527, + "step": 1285 + }, + { + "epoch": 3.5112627986348124, + "grad_norm": 0.09172456711530685, + "learning_rate": 8.87980670666655e-06, + "loss": 0.3368, + "step": 1286 + }, + { + "epoch": 3.5139931740614334, + "grad_norm": 0.09319642931222916, + "learning_rate": 8.781796456704262e-06, + "loss": 0.3411, + "step": 1287 + }, + { + "epoch": 3.516723549488055, + "grad_norm": 0.09805614501237869, + "learning_rate": 8.684305257111425e-06, + "loss": 0.356, + "step": 1288 + }, + { + "epoch": 3.519453924914676, + "grad_norm": 0.09786661714315414, + "learning_rate": 8.587333662633035e-06, + "loss": 0.3391, + "step": 1289 + }, + { + "epoch": 3.522184300341297, + "grad_norm": 0.09727593511343002, + "learning_rate": 8.490882225057428e-06, + "loss": 0.3388, + "step": 1290 + }, + { + "epoch": 3.524914675767918, + "grad_norm": 0.09065548330545425, + "learning_rate": 8.39495149321322e-06, + "loss": 0.3349, + "step": 1291 + }, + { + "epoch": 3.527645051194539, + "grad_norm": 0.09764271974563599, + "learning_rate": 8.299542012965944e-06, + "loss": 0.3534, + "step": 1292 + }, + { + "epoch": 3.5303754266211604, + "grad_norm": 0.08825667947530746, + "learning_rate": 8.204654327215267e-06, + "loss": 0.3337, + "step": 1293 + }, + { + "epoch": 3.5331058020477815, + "grad_norm": 0.09345337748527527, + "learning_rate": 8.110288975891634e-06, + "loss": 0.3532, + "step": 1294 + }, + { + "epoch": 3.535836177474403, + "grad_norm": 0.0950547531247139, + "learning_rate": 8.016446495953367e-06, + "loss": 0.3378, + "step": 1295 + }, + { + "epoch": 3.538566552901024, + "grad_norm": 0.09043276309967041, + "learning_rate": 7.923127421383458e-06, + "loss": 0.3401, + "step": 1296 + }, + { + "epoch": 3.541296928327645, + "grad_norm": 0.09156982600688934, + "learning_rate": 7.830332283186714e-06, + "loss": 0.3466, + "step": 1297 + }, + { + "epoch": 3.544027303754266, + "grad_norm": 0.09018293023109436, + "learning_rate": 7.73806160938656e-06, + "loss": 0.3482, + "step": 1298 + }, + { + "epoch": 3.5467576791808875, + "grad_norm": 0.09336891025304794, + "learning_rate": 7.646315925022152e-06, + "loss": 0.341, + "step": 1299 + }, + { + "epoch": 3.5494880546075085, + "grad_norm": 0.0902356430888176, + "learning_rate": 7.555095752145313e-06, + "loss": 0.3404, + "step": 1300 + }, + { + "epoch": 3.55221843003413, + "grad_norm": 0.09115597605705261, + "learning_rate": 7.4644016098176615e-06, + "loss": 0.3452, + "step": 1301 + }, + { + "epoch": 3.554948805460751, + "grad_norm": 0.09417373687028885, + "learning_rate": 7.374234014107484e-06, + "loss": 0.3419, + "step": 1302 + }, + { + "epoch": 3.557679180887372, + "grad_norm": 0.09541752934455872, + "learning_rate": 7.284593478087043e-06, + "loss": 0.3525, + "step": 1303 + }, + { + "epoch": 3.560409556313993, + "grad_norm": 0.09170124679803848, + "learning_rate": 7.195480511829411e-06, + "loss": 0.3442, + "step": 1304 + }, + { + "epoch": 3.5631399317406145, + "grad_norm": 0.09171769767999649, + "learning_rate": 7.106895622405752e-06, + "loss": 0.3545, + "step": 1305 + }, + { + "epoch": 3.5658703071672355, + "grad_norm": 0.09372735768556595, + "learning_rate": 7.018839313882286e-06, + "loss": 0.3533, + "step": 1306 + }, + { + "epoch": 3.5686006825938565, + "grad_norm": 0.09729722142219543, + "learning_rate": 6.931312087317632e-06, + "loss": 0.355, + "step": 1307 + }, + { + "epoch": 3.571331058020478, + "grad_norm": 0.09178127348423004, + "learning_rate": 6.844314440759647e-06, + "loss": 0.3247, + "step": 1308 + }, + { + "epoch": 3.574061433447099, + "grad_norm": 0.08965813368558884, + "learning_rate": 6.7578468692429345e-06, + "loss": 0.3419, + "step": 1309 + }, + { + "epoch": 3.57679180887372, + "grad_norm": 0.09664511680603027, + "learning_rate": 6.6719098647857525e-06, + "loss": 0.3286, + "step": 1310 + }, + { + "epoch": 3.579522184300341, + "grad_norm": 0.09894757717847824, + "learning_rate": 6.586503916387366e-06, + "loss": 0.3249, + "step": 1311 + }, + { + "epoch": 3.5822525597269625, + "grad_norm": 0.09205014258623123, + "learning_rate": 6.501629510025231e-06, + "loss": 0.3395, + "step": 1312 + }, + { + "epoch": 3.5849829351535836, + "grad_norm": 0.08916641771793365, + "learning_rate": 6.417287128652172e-06, + "loss": 0.3465, + "step": 1313 + }, + { + "epoch": 3.587713310580205, + "grad_norm": 0.09294863045215607, + "learning_rate": 6.333477252193731e-06, + "loss": 0.3326, + "step": 1314 + }, + { + "epoch": 3.590443686006826, + "grad_norm": 0.09478142857551575, + "learning_rate": 6.250200357545377e-06, + "loss": 0.3335, + "step": 1315 + }, + { + "epoch": 3.593174061433447, + "grad_norm": 0.09231521189212799, + "learning_rate": 6.167456918569792e-06, + "loss": 0.334, + "step": 1316 + }, + { + "epoch": 3.595904436860068, + "grad_norm": 0.09620673954486847, + "learning_rate": 6.085247406094197e-06, + "loss": 0.3449, + "step": 1317 + }, + { + "epoch": 3.5986348122866896, + "grad_norm": 0.09421432018280029, + "learning_rate": 6.003572287907633e-06, + "loss": 0.3464, + "step": 1318 + }, + { + "epoch": 3.6013651877133106, + "grad_norm": 0.09824170917272568, + "learning_rate": 5.922432028758362e-06, + "loss": 0.3332, + "step": 1319 + }, + { + "epoch": 3.6040955631399316, + "grad_norm": 0.08727800846099854, + "learning_rate": 5.841827090351171e-06, + "loss": 0.3383, + "step": 1320 + }, + { + "epoch": 3.606825938566553, + "grad_norm": 0.09223764389753342, + "learning_rate": 5.761757931344758e-06, + "loss": 0.3328, + "step": 1321 + }, + { + "epoch": 3.609556313993174, + "grad_norm": 0.09691391885280609, + "learning_rate": 5.68222500734914e-06, + "loss": 0.3489, + "step": 1322 + }, + { + "epoch": 3.612286689419795, + "grad_norm": 0.09362566471099854, + "learning_rate": 5.603228770923041e-06, + "loss": 0.3519, + "step": 1323 + }, + { + "epoch": 3.615017064846416, + "grad_norm": 0.09172289818525314, + "learning_rate": 5.524769671571317e-06, + "loss": 0.3409, + "step": 1324 + }, + { + "epoch": 3.6177474402730376, + "grad_norm": 0.09562698006629944, + "learning_rate": 5.446848155742401e-06, + "loss": 0.352, + "step": 1325 + }, + { + "epoch": 3.6204778156996587, + "grad_norm": 0.09124190360307693, + "learning_rate": 5.3694646668257855e-06, + "loss": 0.3337, + "step": 1326 + }, + { + "epoch": 3.62320819112628, + "grad_norm": 0.09086339920759201, + "learning_rate": 5.292619645149433e-06, + "loss": 0.3389, + "step": 1327 + }, + { + "epoch": 3.625938566552901, + "grad_norm": 0.09403908997774124, + "learning_rate": 5.2163135279773904e-06, + "loss": 0.3444, + "step": 1328 + }, + { + "epoch": 3.628668941979522, + "grad_norm": 0.08822885900735855, + "learning_rate": 5.140546749507136e-06, + "loss": 0.3429, + "step": 1329 + }, + { + "epoch": 3.631399317406143, + "grad_norm": 0.0947570726275444, + "learning_rate": 5.06531974086728e-06, + "loss": 0.3413, + "step": 1330 + }, + { + "epoch": 3.6341296928327647, + "grad_norm": 0.09210789203643799, + "learning_rate": 4.9906329301149914e-06, + "loss": 0.3506, + "step": 1331 + }, + { + "epoch": 3.6368600682593857, + "grad_norm": 0.09383577108383179, + "learning_rate": 4.916486742233606e-06, + "loss": 0.3456, + "step": 1332 + }, + { + "epoch": 3.6395904436860067, + "grad_norm": 0.09422284364700317, + "learning_rate": 4.8428815991302005e-06, + "loss": 0.3451, + "step": 1333 + }, + { + "epoch": 3.642320819112628, + "grad_norm": 0.09540600329637527, + "learning_rate": 4.769817919633235e-06, + "loss": 0.3465, + "step": 1334 + }, + { + "epoch": 3.645051194539249, + "grad_norm": 0.09411127120256424, + "learning_rate": 4.697296119490047e-06, + "loss": 0.334, + "step": 1335 + }, + { + "epoch": 3.64778156996587, + "grad_norm": 0.0870002806186676, + "learning_rate": 4.625316611364661e-06, + "loss": 0.3517, + "step": 1336 + }, + { + "epoch": 3.6505119453924912, + "grad_norm": 0.09915148466825485, + "learning_rate": 4.553879804835282e-06, + "loss": 0.3512, + "step": 1337 + }, + { + "epoch": 3.6532423208191127, + "grad_norm": 0.09070401638746262, + "learning_rate": 4.482986106392073e-06, + "loss": 0.3266, + "step": 1338 + }, + { + "epoch": 3.6559726962457337, + "grad_norm": 0.09290861338376999, + "learning_rate": 4.412635919434749e-06, + "loss": 0.3416, + "step": 1339 + }, + { + "epoch": 3.658703071672355, + "grad_norm": 0.09041845798492432, + "learning_rate": 4.342829644270429e-06, + "loss": 0.3369, + "step": 1340 + }, + { + "epoch": 3.6614334470989762, + "grad_norm": 0.09348506480455399, + "learning_rate": 4.273567678111123e-06, + "loss": 0.326, + "step": 1341 + }, + { + "epoch": 3.6641638225255972, + "grad_norm": 0.09402939677238464, + "learning_rate": 4.204850415071748e-06, + "loss": 0.3491, + "step": 1342 + }, + { + "epoch": 3.6668941979522183, + "grad_norm": 0.08721035718917847, + "learning_rate": 4.136678246167636e-06, + "loss": 0.3419, + "step": 1343 + }, + { + "epoch": 3.6696245733788397, + "grad_norm": 0.09258776158094406, + "learning_rate": 4.069051559312531e-06, + "loss": 0.3418, + "step": 1344 + }, + { + "epoch": 3.6723549488054608, + "grad_norm": 0.09165962785482407, + "learning_rate": 4.001970739316163e-06, + "loss": 0.3495, + "step": 1345 + }, + { + "epoch": 3.675085324232082, + "grad_norm": 0.09198355674743652, + "learning_rate": 3.935436167882234e-06, + "loss": 0.3312, + "step": 1346 + }, + { + "epoch": 3.6778156996587033, + "grad_norm": 0.09146588295698166, + "learning_rate": 3.869448223606165e-06, + "loss": 0.3395, + "step": 1347 + }, + { + "epoch": 3.6805460750853243, + "grad_norm": 0.09097348153591156, + "learning_rate": 3.8040072819729545e-06, + "loss": 0.342, + "step": 1348 + }, + { + "epoch": 3.6832764505119453, + "grad_norm": 0.09369444102048874, + "learning_rate": 3.7391137153550137e-06, + "loss": 0.3526, + "step": 1349 + }, + { + "epoch": 3.6860068259385663, + "grad_norm": 0.09838250279426575, + "learning_rate": 3.6747678930101558e-06, + "loss": 0.3468, + "step": 1350 + }, + { + "epoch": 3.688737201365188, + "grad_norm": 0.0884234607219696, + "learning_rate": 3.6109701810793208e-06, + "loss": 0.3473, + "step": 1351 + }, + { + "epoch": 3.691467576791809, + "grad_norm": 0.0916651114821434, + "learning_rate": 3.5477209425846538e-06, + "loss": 0.3391, + "step": 1352 + }, + { + "epoch": 3.6941979522184303, + "grad_norm": 0.0922260656952858, + "learning_rate": 3.4850205374273416e-06, + "loss": 0.3461, + "step": 1353 + }, + { + "epoch": 3.6969283276450513, + "grad_norm": 0.09219305962324142, + "learning_rate": 3.4228693223856136e-06, + "loss": 0.3531, + "step": 1354 + }, + { + "epoch": 3.6996587030716723, + "grad_norm": 0.09641235321760178, + "learning_rate": 3.361267651112676e-06, + "loss": 0.3547, + "step": 1355 + }, + { + "epoch": 3.7023890784982934, + "grad_norm": 0.09496744722127914, + "learning_rate": 3.30021587413476e-06, + "loss": 0.3368, + "step": 1356 + }, + { + "epoch": 3.705119453924915, + "grad_norm": 0.09569863229990005, + "learning_rate": 3.2397143388489983e-06, + "loss": 0.3467, + "step": 1357 + }, + { + "epoch": 3.707849829351536, + "grad_norm": 0.09257599711418152, + "learning_rate": 3.1797633895216394e-06, + "loss": 0.3427, + "step": 1358 + }, + { + "epoch": 3.710580204778157, + "grad_norm": 0.09710348397493362, + "learning_rate": 3.120363367285917e-06, + "loss": 0.3549, + "step": 1359 + }, + { + "epoch": 3.7133105802047783, + "grad_norm": 0.09273111820220947, + "learning_rate": 3.0615146101401925e-06, + "loss": 0.3469, + "step": 1360 + }, + { + "epoch": 3.7160409556313994, + "grad_norm": 0.09577952325344086, + "learning_rate": 3.0032174529460165e-06, + "loss": 0.3525, + "step": 1361 + }, + { + "epoch": 3.7187713310580204, + "grad_norm": 0.09053657948970795, + "learning_rate": 2.945472227426227e-06, + "loss": 0.3399, + "step": 1362 + }, + { + "epoch": 3.7215017064846414, + "grad_norm": 0.09281080961227417, + "learning_rate": 2.8882792621630406e-06, + "loss": 0.3466, + "step": 1363 + }, + { + "epoch": 3.724232081911263, + "grad_norm": 0.09164872020483017, + "learning_rate": 2.8316388825962324e-06, + "loss": 0.354, + "step": 1364 + }, + { + "epoch": 3.726962457337884, + "grad_norm": 0.09354937821626663, + "learning_rate": 2.7755514110212264e-06, + "loss": 0.3506, + "step": 1365 + }, + { + "epoch": 3.7296928327645054, + "grad_norm": 0.09331977367401123, + "learning_rate": 2.7200171665872742e-06, + "loss": 0.3477, + "step": 1366 + }, + { + "epoch": 3.7324232081911264, + "grad_norm": 0.09488935023546219, + "learning_rate": 2.6650364652956894e-06, + "loss": 0.3512, + "step": 1367 + }, + { + "epoch": 3.7351535836177474, + "grad_norm": 0.09157928824424744, + "learning_rate": 2.6106096199979614e-06, + "loss": 0.3504, + "step": 1368 + }, + { + "epoch": 3.7378839590443684, + "grad_norm": 0.09506452083587646, + "learning_rate": 2.5567369403940776e-06, + "loss": 0.336, + "step": 1369 + }, + { + "epoch": 3.74061433447099, + "grad_norm": 0.09797690063714981, + "learning_rate": 2.50341873303066e-06, + "loss": 0.3453, + "step": 1370 + }, + { + "epoch": 3.743344709897611, + "grad_norm": 0.09498438984155655, + "learning_rate": 2.4506553012993093e-06, + "loss": 0.3538, + "step": 1371 + }, + { + "epoch": 3.746075085324232, + "grad_norm": 0.09096547216176987, + "learning_rate": 2.398446945434818e-06, + "loss": 0.349, + "step": 1372 + }, + { + "epoch": 3.7488054607508534, + "grad_norm": 0.09576946496963501, + "learning_rate": 2.346793962513483e-06, + "loss": 0.3537, + "step": 1373 + }, + { + "epoch": 3.7515358361774744, + "grad_norm": 0.08939557522535324, + "learning_rate": 2.2956966464514175e-06, + "loss": 0.3214, + "step": 1374 + }, + { + "epoch": 3.7542662116040955, + "grad_norm": 0.09316585212945938, + "learning_rate": 2.245155288002876e-06, + "loss": 0.3486, + "step": 1375 + }, + { + "epoch": 3.7569965870307165, + "grad_norm": 0.09257607907056808, + "learning_rate": 2.1951701747585982e-06, + "loss": 0.3503, + "step": 1376 + }, + { + "epoch": 3.759726962457338, + "grad_norm": 0.09286779165267944, + "learning_rate": 2.1457415911442013e-06, + "loss": 0.3597, + "step": 1377 + }, + { + "epoch": 3.762457337883959, + "grad_norm": 0.10040642321109772, + "learning_rate": 2.0968698184184565e-06, + "loss": 0.3453, + "step": 1378 + }, + { + "epoch": 3.7651877133105804, + "grad_norm": 0.09531185030937195, + "learning_rate": 2.04855513467187e-06, + "loss": 0.3341, + "step": 1379 + }, + { + "epoch": 3.7679180887372015, + "grad_norm": 0.09448336809873581, + "learning_rate": 2.000797814824906e-06, + "loss": 0.3213, + "step": 1380 + }, + { + "epoch": 3.7706484641638225, + "grad_norm": 0.09589549899101257, + "learning_rate": 1.9535981306265884e-06, + "loss": 0.3595, + "step": 1381 + }, + { + "epoch": 3.7733788395904435, + "grad_norm": 0.10044374316930771, + "learning_rate": 1.9069563506527998e-06, + "loss": 0.3391, + "step": 1382 + }, + { + "epoch": 3.776109215017065, + "grad_norm": 0.09576553106307983, + "learning_rate": 1.8608727403049309e-06, + "loss": 0.3349, + "step": 1383 + }, + { + "epoch": 3.778839590443686, + "grad_norm": 0.0959954783320427, + "learning_rate": 1.8153475618081673e-06, + "loss": 0.3487, + "step": 1384 + }, + { + "epoch": 3.781569965870307, + "grad_norm": 0.09060141444206238, + "learning_rate": 1.7703810742101813e-06, + "loss": 0.347, + "step": 1385 + }, + { + "epoch": 3.7843003412969285, + "grad_norm": 0.09788581728935242, + "learning_rate": 1.7259735333795545e-06, + "loss": 0.3438, + "step": 1386 + }, + { + "epoch": 3.7870307167235495, + "grad_norm": 0.09645051509141922, + "learning_rate": 1.6821251920043246e-06, + "loss": 0.3306, + "step": 1387 + }, + { + "epoch": 3.7897610921501705, + "grad_norm": 0.09661401808261871, + "learning_rate": 1.6388362995905848e-06, + "loss": 0.3455, + "step": 1388 + }, + { + "epoch": 3.7924914675767916, + "grad_norm": 0.09419504553079605, + "learning_rate": 1.5961071024610752e-06, + "loss": 0.3529, + "step": 1389 + }, + { + "epoch": 3.795221843003413, + "grad_norm": 0.09770802408456802, + "learning_rate": 1.5539378437536944e-06, + "loss": 0.3549, + "step": 1390 + }, + { + "epoch": 3.797952218430034, + "grad_norm": 0.08888811618089676, + "learning_rate": 1.5123287634202454e-06, + "loss": 0.3359, + "step": 1391 + }, + { + "epoch": 3.8006825938566555, + "grad_norm": 0.0973770022392273, + "learning_rate": 1.4712800982249474e-06, + "loss": 0.3538, + "step": 1392 + }, + { + "epoch": 3.8034129692832765, + "grad_norm": 0.09432043880224228, + "learning_rate": 1.430792081743182e-06, + "loss": 0.3369, + "step": 1393 + }, + { + "epoch": 3.8061433447098976, + "grad_norm": 0.09583873301744461, + "learning_rate": 1.3908649443600707e-06, + "loss": 0.3513, + "step": 1394 + }, + { + "epoch": 3.8088737201365186, + "grad_norm": 0.0925702303647995, + "learning_rate": 1.351498913269289e-06, + "loss": 0.3445, + "step": 1395 + }, + { + "epoch": 3.81160409556314, + "grad_norm": 0.09331769496202469, + "learning_rate": 1.3126942124716213e-06, + "loss": 0.3495, + "step": 1396 + }, + { + "epoch": 3.814334470989761, + "grad_norm": 0.0974922925233841, + "learning_rate": 1.2744510627738516e-06, + "loss": 0.3511, + "step": 1397 + }, + { + "epoch": 3.817064846416382, + "grad_norm": 0.10009145736694336, + "learning_rate": 1.2367696817873419e-06, + "loss": 0.3371, + "step": 1398 + }, + { + "epoch": 3.8197952218430036, + "grad_norm": 0.09346017241477966, + "learning_rate": 1.1996502839269453e-06, + "loss": 0.3422, + "step": 1399 + }, + { + "epoch": 3.8225255972696246, + "grad_norm": 0.09399791061878204, + "learning_rate": 1.1630930804096495e-06, + "loss": 0.3543, + "step": 1400 + }, + { + "epoch": 3.8252559726962456, + "grad_norm": 0.09346922487020493, + "learning_rate": 1.127098279253491e-06, + "loss": 0.3455, + "step": 1401 + }, + { + "epoch": 3.8279863481228666, + "grad_norm": 0.09039437770843506, + "learning_rate": 1.0916660852763216e-06, + "loss": 0.3543, + "step": 1402 + }, + { + "epoch": 3.830716723549488, + "grad_norm": 0.08980242908000946, + "learning_rate": 1.0567967000945866e-06, + "loss": 0.3308, + "step": 1403 + }, + { + "epoch": 3.833447098976109, + "grad_norm": 0.09566069394350052, + "learning_rate": 1.0224903221222938e-06, + "loss": 0.3385, + "step": 1404 + }, + { + "epoch": 3.8361774744027306, + "grad_norm": 0.08965308964252472, + "learning_rate": 9.88747146569813e-07, + "loss": 0.3386, + "step": 1405 + }, + { + "epoch": 3.8389078498293516, + "grad_norm": 0.09409939497709274, + "learning_rate": 9.555673654427332e-07, + "loss": 0.3552, + "step": 1406 + }, + { + "epoch": 3.8416382252559726, + "grad_norm": 0.0988423153758049, + "learning_rate": 9.229511675408642e-07, + "loss": 0.3423, + "step": 1407 + }, + { + "epoch": 3.8443686006825937, + "grad_norm": 0.09073875099420547, + "learning_rate": 8.90898738457091e-07, + "loss": 0.329, + "step": 1408 + }, + { + "epoch": 3.847098976109215, + "grad_norm": 0.09452011436223984, + "learning_rate": 8.59410260576321e-07, + "loss": 0.3373, + "step": 1409 + }, + { + "epoch": 3.849829351535836, + "grad_norm": 0.09350565820932388, + "learning_rate": 8.28485913074506e-07, + "loss": 0.3508, + "step": 1410 + }, + { + "epoch": 3.852559726962457, + "grad_norm": 0.09692378342151642, + "learning_rate": 7.981258719175322e-07, + "loss": 0.3445, + "step": 1411 + }, + { + "epoch": 3.8552901023890787, + "grad_norm": 0.09220253676176071, + "learning_rate": 7.683303098602989e-07, + "loss": 0.3365, + "step": 1412 + }, + { + "epoch": 3.8580204778156997, + "grad_norm": 0.0962350070476532, + "learning_rate": 7.39099396445686e-07, + "loss": 0.3354, + "step": 1413 + }, + { + "epoch": 3.8607508532423207, + "grad_norm": 0.09249398857355118, + "learning_rate": 7.104332980036211e-07, + "loss": 0.3333, + "step": 1414 + }, + { + "epoch": 3.8634812286689417, + "grad_norm": 0.09596824645996094, + "learning_rate": 6.823321776501024e-07, + "loss": 0.3377, + "step": 1415 + }, + { + "epoch": 3.866211604095563, + "grad_norm": 0.08568532764911652, + "learning_rate": 6.547961952863002e-07, + "loss": 0.3233, + "step": 1416 + }, + { + "epoch": 3.868941979522184, + "grad_norm": 0.09179449081420898, + "learning_rate": 6.278255075976125e-07, + "loss": 0.3253, + "step": 1417 + }, + { + "epoch": 3.8716723549488057, + "grad_norm": 0.09398891776800156, + "learning_rate": 6.014202680528324e-07, + "loss": 0.3376, + "step": 1418 + }, + { + "epoch": 3.8744027303754267, + "grad_norm": 0.09561796486377716, + "learning_rate": 5.755806269031827e-07, + "loss": 0.3459, + "step": 1419 + }, + { + "epoch": 3.8771331058020477, + "grad_norm": 0.0944526270031929, + "learning_rate": 5.503067311815713e-07, + "loss": 0.352, + "step": 1420 + }, + { + "epoch": 3.8798634812286688, + "grad_norm": 0.09351572394371033, + "learning_rate": 5.255987247016591e-07, + "loss": 0.3478, + "step": 1421 + }, + { + "epoch": 3.88259385665529, + "grad_norm": 0.09779185056686401, + "learning_rate": 5.014567480570831e-07, + "loss": 0.3333, + "step": 1422 + }, + { + "epoch": 3.8853242320819112, + "grad_norm": 0.09466547518968582, + "learning_rate": 4.778809386206895e-07, + "loss": 0.3515, + "step": 1423 + }, + { + "epoch": 3.8880546075085323, + "grad_norm": 0.09396017342805862, + "learning_rate": 4.548714305436685e-07, + "loss": 0.3351, + "step": 1424 + }, + { + "epoch": 3.8907849829351537, + "grad_norm": 0.096969835460186, + "learning_rate": 4.324283547548658e-07, + "loss": 0.3437, + "step": 1425 + }, + { + "epoch": 3.8935153583617748, + "grad_norm": 0.08573538810014725, + "learning_rate": 4.1055183896001606e-07, + "loss": 0.338, + "step": 1426 + }, + { + "epoch": 3.896245733788396, + "grad_norm": 0.09765475988388062, + "learning_rate": 3.892420076409886e-07, + "loss": 0.3417, + "step": 1427 + }, + { + "epoch": 3.898976109215017, + "grad_norm": 0.09676413238048553, + "learning_rate": 3.68498982055121e-07, + "loss": 0.355, + "step": 1428 + }, + { + "epoch": 3.9017064846416383, + "grad_norm": 0.09585923701524734, + "learning_rate": 3.483228802344973e-07, + "loss": 0.3347, + "step": 1429 + }, + { + "epoch": 3.9044368600682593, + "grad_norm": 0.0931759849190712, + "learning_rate": 3.2871381698529324e-07, + "loss": 0.3402, + "step": 1430 + }, + { + "epoch": 3.9071672354948808, + "grad_norm": 0.09280584007501602, + "learning_rate": 3.0967190388712097e-07, + "loss": 0.3406, + "step": 1431 + }, + { + "epoch": 3.909897610921502, + "grad_norm": 0.09850934892892838, + "learning_rate": 2.9119724929239645e-07, + "loss": 0.359, + "step": 1432 + }, + { + "epoch": 3.912627986348123, + "grad_norm": 0.09692083299160004, + "learning_rate": 2.7328995832568426e-07, + "loss": 0.3445, + "step": 1433 + }, + { + "epoch": 3.915358361774744, + "grad_norm": 0.09314105659723282, + "learning_rate": 2.5595013288318703e-07, + "loss": 0.321, + "step": 1434 + }, + { + "epoch": 3.9180887372013653, + "grad_norm": 0.09564158320426941, + "learning_rate": 2.391778716320792e-07, + "loss": 0.3433, + "step": 1435 + }, + { + "epoch": 3.9208191126279863, + "grad_norm": 0.09259151667356491, + "learning_rate": 2.2297327000996293e-07, + "loss": 0.3495, + "step": 1436 + }, + { + "epoch": 3.9235494880546073, + "grad_norm": 0.0886520966887474, + "learning_rate": 2.0733642022437994e-07, + "loss": 0.3276, + "step": 1437 + }, + { + "epoch": 3.926279863481229, + "grad_norm": 0.09547660499811172, + "learning_rate": 1.922674112522227e-07, + "loss": 0.3418, + "step": 1438 + }, + { + "epoch": 3.92901023890785, + "grad_norm": 0.0911092683672905, + "learning_rate": 1.7776632883924615e-07, + "loss": 0.3469, + "step": 1439 + }, + { + "epoch": 3.931740614334471, + "grad_norm": 0.09483670443296432, + "learning_rate": 1.638332554996125e-07, + "loss": 0.3399, + "step": 1440 + }, + { + "epoch": 3.934470989761092, + "grad_norm": 0.09258666634559631, + "learning_rate": 1.5046827051536928e-07, + "loss": 0.3325, + "step": 1441 + }, + { + "epoch": 3.9372013651877134, + "grad_norm": 0.09664256125688553, + "learning_rate": 1.3767144993602766e-07, + "loss": 0.3398, + "step": 1442 + }, + { + "epoch": 3.9399317406143344, + "grad_norm": 0.09532354772090912, + "learning_rate": 1.254428665781515e-07, + "loss": 0.3394, + "step": 1443 + }, + { + "epoch": 3.942662116040956, + "grad_norm": 0.09413325786590576, + "learning_rate": 1.1378259002488013e-07, + "loss": 0.3202, + "step": 1444 + }, + { + "epoch": 3.945392491467577, + "grad_norm": 0.09612133353948593, + "learning_rate": 1.0269068662560611e-07, + "loss": 0.345, + "step": 1445 + }, + { + "epoch": 3.948122866894198, + "grad_norm": 0.08994828909635544, + "learning_rate": 9.216721949553142e-08, + "loss": 0.3518, + "step": 1446 + }, + { + "epoch": 3.950853242320819, + "grad_norm": 0.08921853452920914, + "learning_rate": 8.221224851535647e-08, + "loss": 0.3405, + "step": 1447 + }, + { + "epoch": 3.9535836177474404, + "grad_norm": 0.08948387205600739, + "learning_rate": 7.282583033091372e-08, + "loss": 0.3483, + "step": 1448 + }, + { + "epoch": 3.9563139931740614, + "grad_norm": 0.09696392714977264, + "learning_rate": 6.400801835286796e-08, + "loss": 0.3475, + "step": 1449 + }, + { + "epoch": 3.9590443686006824, + "grad_norm": 0.09356880187988281, + "learning_rate": 5.57588627563721e-08, + "loss": 0.3567, + "step": 1450 + }, + { + "epoch": 3.961774744027304, + "grad_norm": 0.09498196840286255, + "learning_rate": 4.807841048082296e-08, + "loss": 0.3458, + "step": 1451 + }, + { + "epoch": 3.964505119453925, + "grad_norm": 0.08671274036169052, + "learning_rate": 4.096670522959478e-08, + "loss": 0.3237, + "step": 1452 + }, + { + "epoch": 3.967235494880546, + "grad_norm": 0.09593278914690018, + "learning_rate": 3.442378746972841e-08, + "loss": 0.3581, + "step": 1453 + }, + { + "epoch": 3.969965870307167, + "grad_norm": 0.09628158062696457, + "learning_rate": 2.844969443178691e-08, + "loss": 0.3397, + "step": 1454 + }, + { + "epoch": 3.9726962457337884, + "grad_norm": 0.0965261310338974, + "learning_rate": 2.304446010958916e-08, + "loss": 0.3487, + "step": 1455 + }, + { + "epoch": 3.9754266211604095, + "grad_norm": 0.09529692679643631, + "learning_rate": 1.8208115260032187e-08, + "loss": 0.3568, + "step": 1456 + }, + { + "epoch": 3.978156996587031, + "grad_norm": 0.08997691422700882, + "learning_rate": 1.3940687402924646e-08, + "loss": 0.3425, + "step": 1457 + }, + { + "epoch": 3.980887372013652, + "grad_norm": 0.09637629240751266, + "learning_rate": 1.0242200820786974e-08, + "loss": 0.3525, + "step": 1458 + }, + { + "epoch": 3.983617747440273, + "grad_norm": 0.09401457756757736, + "learning_rate": 7.112676558784781e-09, + "loss": 0.3384, + "step": 1459 + }, + { + "epoch": 3.986348122866894, + "grad_norm": 0.09133262187242508, + "learning_rate": 4.552132424562317e-09, + "loss": 0.3489, + "step": 1460 + }, + { + "epoch": 3.9890784982935155, + "grad_norm": 0.0924975723028183, + "learning_rate": 2.5605829881203414e-09, + "loss": 0.3249, + "step": 1461 + }, + { + "epoch": 3.9918088737201365, + "grad_norm": 0.09170513600111008, + "learning_rate": 1.1380395818050282e-09, + "loss": 0.3397, + "step": 1462 + }, + { + "epoch": 3.994539249146758, + "grad_norm": 0.08755572885274887, + "learning_rate": 2.8451030018583623e-10, + "loss": 0.3396, + "step": 1463 + }, + { + "epoch": 3.997269624573379, + "grad_norm": 0.09281551837921143, + "learning_rate": 0.0, + "loss": 0.3504, + "step": 1464 + }, + { + "epoch": 3.997269624573379, + "eval_loss": 0.36407509446144104, + "eval_runtime": 308.8895, + "eval_samples_per_second": 8.43, + "eval_steps_per_second": 1.055, + "step": 1464 + } + ], + "logging_steps": 1, + "max_steps": 1464, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.073159862761554e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}