{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4344629729245113, "eval_steps": 1000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.134367942810059, "learning_rate": 9e-08, "loss": 1.2511, "step": 10 }, { "epoch": 0.01, "grad_norm": 18.544782638549805, "learning_rate": 1.8e-07, "loss": 1.4001, "step": 20 }, { "epoch": 0.01, "grad_norm": 6.087899684906006, "learning_rate": 2.8e-07, "loss": 1.1501, "step": 30 }, { "epoch": 0.01, "grad_norm": 6.356815814971924, "learning_rate": 3.7999999999999996e-07, "loss": 1.2225, "step": 40 }, { "epoch": 0.02, "grad_norm": 3.9461255073547363, "learning_rate": 4.8e-07, "loss": 1.2241, "step": 50 }, { "epoch": 0.02, "grad_norm": 4.9660773277282715, "learning_rate": 5.8e-07, "loss": 1.2118, "step": 60 }, { "epoch": 0.03, "grad_norm": 6.57827615737915, "learning_rate": 6.800000000000001e-07, "loss": 1.0416, "step": 70 }, { "epoch": 0.03, "grad_norm": 5.070159435272217, "learning_rate": 7.799999999999999e-07, "loss": 1.1224, "step": 80 }, { "epoch": 0.03, "grad_norm": 5.661293029785156, "learning_rate": 8.799999999999999e-07, "loss": 1.0603, "step": 90 }, { "epoch": 0.04, "grad_norm": 4.765249252319336, "learning_rate": 9.8e-07, "loss": 1.0449, "step": 100 }, { "epoch": 0.04, "grad_norm": 4.521543502807617, "learning_rate": 9.991919191919192e-07, "loss": 0.9193, "step": 110 }, { "epoch": 0.04, "grad_norm": 4.420640468597412, "learning_rate": 9.98181818181818e-07, "loss": 0.876, "step": 120 }, { "epoch": 0.05, "grad_norm": 4.311656951904297, "learning_rate": 9.97171717171717e-07, "loss": 0.8012, "step": 130 }, { "epoch": 0.05, "grad_norm": 5.07025146484375, "learning_rate": 9.961616161616162e-07, "loss": 0.826, "step": 140 }, { "epoch": 0.05, "grad_norm": 4.859681129455566, "learning_rate": 9.951515151515151e-07, "loss": 0.8442, "step": 150 }, { "epoch": 0.06, "grad_norm": 4.192190170288086, "learning_rate": 9.94141414141414e-07, "loss": 0.708, "step": 160 }, { "epoch": 0.06, "grad_norm": 5.852901935577393, "learning_rate": 9.93131313131313e-07, "loss": 0.8201, "step": 170 }, { "epoch": 0.06, "grad_norm": 5.076972484588623, "learning_rate": 9.92121212121212e-07, "loss": 0.7973, "step": 180 }, { "epoch": 0.07, "grad_norm": 4.36331844329834, "learning_rate": 9.911111111111111e-07, "loss": 0.753, "step": 190 }, { "epoch": 0.07, "grad_norm": 5.03492546081543, "learning_rate": 9.9010101010101e-07, "loss": 0.7909, "step": 200 }, { "epoch": 0.08, "grad_norm": 4.043544769287109, "learning_rate": 9.89090909090909e-07, "loss": 0.7118, "step": 210 }, { "epoch": 0.08, "grad_norm": 2.2020692825317383, "learning_rate": 9.880808080808082e-07, "loss": 0.6516, "step": 220 }, { "epoch": 0.08, "grad_norm": 4.851771354675293, "learning_rate": 9.870707070707071e-07, "loss": 0.7275, "step": 230 }, { "epoch": 0.09, "grad_norm": 3.2986488342285156, "learning_rate": 9.86060606060606e-07, "loss": 0.6484, "step": 240 }, { "epoch": 0.09, "grad_norm": 5.700368404388428, "learning_rate": 9.85050505050505e-07, "loss": 0.7258, "step": 250 }, { "epoch": 0.09, "grad_norm": 6.46458625793457, "learning_rate": 9.84040404040404e-07, "loss": 0.6416, "step": 260 }, { "epoch": 0.1, "grad_norm": 4.012514114379883, "learning_rate": 9.830303030303029e-07, "loss": 0.5861, "step": 270 }, { "epoch": 0.1, "grad_norm": 3.328000068664551, "learning_rate": 9.82020202020202e-07, "loss": 0.6239, "step": 280 }, { "epoch": 0.1, "grad_norm": 3.837636947631836, "learning_rate": 9.81010101010101e-07, "loss": 0.6718, "step": 290 }, { "epoch": 0.11, "grad_norm": 4.840264797210693, "learning_rate": 9.8e-07, "loss": 0.6623, "step": 300 }, { "epoch": 0.11, "grad_norm": 6.741188049316406, "learning_rate": 9.789898989898989e-07, "loss": 0.6066, "step": 310 }, { "epoch": 0.11, "grad_norm": 5.170821666717529, "learning_rate": 9.779797979797978e-07, "loss": 0.6634, "step": 320 }, { "epoch": 0.12, "grad_norm": 8.481480598449707, "learning_rate": 9.76969696969697e-07, "loss": 0.6474, "step": 330 }, { "epoch": 0.12, "grad_norm": 3.698042631149292, "learning_rate": 9.75959595959596e-07, "loss": 0.6594, "step": 340 }, { "epoch": 0.13, "grad_norm": 4.990598201751709, "learning_rate": 9.749494949494949e-07, "loss": 0.6588, "step": 350 }, { "epoch": 0.13, "grad_norm": 3.2625105381011963, "learning_rate": 9.73939393939394e-07, "loss": 0.6532, "step": 360 }, { "epoch": 0.13, "grad_norm": 6.604375839233398, "learning_rate": 9.72929292929293e-07, "loss": 0.7057, "step": 370 }, { "epoch": 0.14, "grad_norm": 5.352667808532715, "learning_rate": 9.71919191919192e-07, "loss": 0.6273, "step": 380 }, { "epoch": 0.14, "grad_norm": 3.8649349212646484, "learning_rate": 9.709090909090909e-07, "loss": 0.6366, "step": 390 }, { "epoch": 0.14, "grad_norm": 5.2739667892456055, "learning_rate": 9.698989898989898e-07, "loss": 0.6433, "step": 400 }, { "epoch": 0.15, "grad_norm": 4.892832279205322, "learning_rate": 9.68888888888889e-07, "loss": 0.6504, "step": 410 }, { "epoch": 0.15, "grad_norm": 5.36677360534668, "learning_rate": 9.67878787878788e-07, "loss": 0.6661, "step": 420 }, { "epoch": 0.15, "grad_norm": 2.6905229091644287, "learning_rate": 9.668686868686868e-07, "loss": 0.6334, "step": 430 }, { "epoch": 0.16, "grad_norm": 4.99962854385376, "learning_rate": 9.658585858585858e-07, "loss": 0.6028, "step": 440 }, { "epoch": 0.16, "grad_norm": 5.522456169128418, "learning_rate": 9.648484848484847e-07, "loss": 0.6903, "step": 450 }, { "epoch": 0.16, "grad_norm": 7.456121921539307, "learning_rate": 9.638383838383839e-07, "loss": 0.6486, "step": 460 }, { "epoch": 0.17, "grad_norm": 5.989662170410156, "learning_rate": 9.628282828282828e-07, "loss": 0.7085, "step": 470 }, { "epoch": 0.17, "grad_norm": 3.5824246406555176, "learning_rate": 9.618181818181818e-07, "loss": 0.6652, "step": 480 }, { "epoch": 0.18, "grad_norm": 3.074733018875122, "learning_rate": 9.608080808080807e-07, "loss": 0.5733, "step": 490 }, { "epoch": 0.18, "grad_norm": 3.964071750640869, "learning_rate": 9.597979797979797e-07, "loss": 0.6319, "step": 500 }, { "epoch": 0.18, "grad_norm": 3.8981716632843018, "learning_rate": 9.587878787878786e-07, "loss": 0.5937, "step": 510 }, { "epoch": 0.19, "grad_norm": 7.471535682678223, "learning_rate": 9.577777777777778e-07, "loss": 0.6564, "step": 520 }, { "epoch": 0.19, "grad_norm": 5.468303680419922, "learning_rate": 9.567676767676767e-07, "loss": 0.6339, "step": 530 }, { "epoch": 0.19, "grad_norm": 3.9055886268615723, "learning_rate": 9.557575757575759e-07, "loss": 0.7103, "step": 540 }, { "epoch": 0.2, "grad_norm": 4.115898132324219, "learning_rate": 9.547474747474748e-07, "loss": 0.6928, "step": 550 }, { "epoch": 0.2, "grad_norm": 4.248528003692627, "learning_rate": 9.537373737373737e-07, "loss": 0.6084, "step": 560 }, { "epoch": 0.2, "grad_norm": 3.727003335952759, "learning_rate": 9.527272727272727e-07, "loss": 0.6554, "step": 570 }, { "epoch": 0.21, "grad_norm": 4.747541904449463, "learning_rate": 9.517171717171717e-07, "loss": 0.5876, "step": 580 }, { "epoch": 0.21, "grad_norm": 3.913226842880249, "learning_rate": 9.507070707070707e-07, "loss": 0.5888, "step": 590 }, { "epoch": 0.22, "grad_norm": 4.400748252868652, "learning_rate": 9.496969696969696e-07, "loss": 0.6196, "step": 600 }, { "epoch": 0.22, "grad_norm": 3.053006649017334, "learning_rate": 9.486868686868687e-07, "loss": 0.6356, "step": 610 }, { "epoch": 0.22, "grad_norm": 4.207491874694824, "learning_rate": 9.476767676767676e-07, "loss": 0.6161, "step": 620 }, { "epoch": 0.23, "grad_norm": 6.389359951019287, "learning_rate": 9.466666666666666e-07, "loss": 0.6172, "step": 630 }, { "epoch": 0.23, "grad_norm": 4.967292308807373, "learning_rate": 9.456565656565656e-07, "loss": 0.6173, "step": 640 }, { "epoch": 0.23, "grad_norm": 3.855407953262329, "learning_rate": 9.446464646464646e-07, "loss": 0.585, "step": 650 }, { "epoch": 0.24, "grad_norm": 3.580738067626953, "learning_rate": 9.436363636363636e-07, "loss": 0.6166, "step": 660 }, { "epoch": 0.24, "grad_norm": 3.6277294158935547, "learning_rate": 9.426262626262626e-07, "loss": 0.5951, "step": 670 }, { "epoch": 0.24, "grad_norm": 3.7501285076141357, "learning_rate": 9.416161616161615e-07, "loss": 0.5807, "step": 680 }, { "epoch": 0.25, "grad_norm": 4.80528450012207, "learning_rate": 9.406060606060605e-07, "loss": 0.6053, "step": 690 }, { "epoch": 0.25, "grad_norm": 5.07316780090332, "learning_rate": 9.395959595959596e-07, "loss": 0.626, "step": 700 }, { "epoch": 0.25, "grad_norm": 5.338922023773193, "learning_rate": 9.385858585858585e-07, "loss": 0.5356, "step": 710 }, { "epoch": 0.26, "grad_norm": 3.264220714569092, "learning_rate": 9.375757575757576e-07, "loss": 0.5929, "step": 720 }, { "epoch": 0.26, "grad_norm": 3.4565188884735107, "learning_rate": 9.365656565656565e-07, "loss": 0.6379, "step": 730 }, { "epoch": 0.27, "grad_norm": 4.202028751373291, "learning_rate": 9.355555555555556e-07, "loss": 0.6143, "step": 740 }, { "epoch": 0.27, "grad_norm": 5.128079891204834, "learning_rate": 9.345454545454545e-07, "loss": 0.5531, "step": 750 }, { "epoch": 0.27, "grad_norm": 4.205199718475342, "learning_rate": 9.335353535353535e-07, "loss": 0.5842, "step": 760 }, { "epoch": 0.28, "grad_norm": 5.925571918487549, "learning_rate": 9.325252525252525e-07, "loss": 0.6169, "step": 770 }, { "epoch": 0.28, "grad_norm": 4.256628036499023, "learning_rate": 9.315151515151515e-07, "loss": 0.5899, "step": 780 }, { "epoch": 0.28, "grad_norm": 3.2796261310577393, "learning_rate": 9.305050505050504e-07, "loss": 0.6654, "step": 790 }, { "epoch": 0.29, "grad_norm": 3.6715095043182373, "learning_rate": 9.294949494949495e-07, "loss": 0.6272, "step": 800 }, { "epoch": 0.29, "grad_norm": 5.043895721435547, "learning_rate": 9.284848484848484e-07, "loss": 0.6019, "step": 810 }, { "epoch": 0.29, "grad_norm": 4.5200300216674805, "learning_rate": 9.274747474747475e-07, "loss": 0.6992, "step": 820 }, { "epoch": 0.3, "grad_norm": 5.557192325592041, "learning_rate": 9.264646464646464e-07, "loss": 0.6657, "step": 830 }, { "epoch": 0.3, "grad_norm": 3.2051031589508057, "learning_rate": 9.254545454545453e-07, "loss": 0.6262, "step": 840 }, { "epoch": 0.3, "grad_norm": 3.465202808380127, "learning_rate": 9.244444444444444e-07, "loss": 0.6603, "step": 850 }, { "epoch": 0.31, "grad_norm": 5.039762496948242, "learning_rate": 9.234343434343433e-07, "loss": 0.6276, "step": 860 }, { "epoch": 0.31, "grad_norm": 5.317465305328369, "learning_rate": 9.224242424242423e-07, "loss": 0.521, "step": 870 }, { "epoch": 0.32, "grad_norm": 6.985171318054199, "learning_rate": 9.214141414141414e-07, "loss": 0.6768, "step": 880 }, { "epoch": 0.32, "grad_norm": 3.9759483337402344, "learning_rate": 9.204040404040404e-07, "loss": 0.5982, "step": 890 }, { "epoch": 0.32, "grad_norm": 4.207100868225098, "learning_rate": 9.193939393939394e-07, "loss": 0.6117, "step": 900 }, { "epoch": 0.33, "grad_norm": 4.389718055725098, "learning_rate": 9.183838383838384e-07, "loss": 0.5767, "step": 910 }, { "epoch": 0.33, "grad_norm": 3.659482717514038, "learning_rate": 9.173737373737373e-07, "loss": 0.5957, "step": 920 }, { "epoch": 0.33, "grad_norm": 6.660622596740723, "learning_rate": 9.163636363636364e-07, "loss": 0.6403, "step": 930 }, { "epoch": 0.34, "grad_norm": 6.03493070602417, "learning_rate": 9.153535353535353e-07, "loss": 0.5857, "step": 940 }, { "epoch": 0.34, "grad_norm": 2.699523687362671, "learning_rate": 9.143434343434343e-07, "loss": 0.5913, "step": 950 }, { "epoch": 0.34, "grad_norm": 3.8991754055023193, "learning_rate": 9.133333333333333e-07, "loss": 0.6342, "step": 960 }, { "epoch": 0.35, "grad_norm": 4.422178268432617, "learning_rate": 9.123232323232323e-07, "loss": 0.6286, "step": 970 }, { "epoch": 0.35, "grad_norm": 4.381803512573242, "learning_rate": 9.113131313131313e-07, "loss": 0.6899, "step": 980 }, { "epoch": 0.36, "grad_norm": 4.75442361831665, "learning_rate": 9.103030303030302e-07, "loss": 0.5462, "step": 990 }, { "epoch": 0.36, "grad_norm": 3.6305062770843506, "learning_rate": 9.092929292929292e-07, "loss": 0.666, "step": 1000 }, { "epoch": 0.36, "eval_loss": 0.772658109664917, "eval_runtime": 400.6004, "eval_samples_per_second": 2.496, "eval_steps_per_second": 2.496, "step": 1000 }, { "epoch": 0.36, "grad_norm": 4.022682189941406, "learning_rate": 9.082828282828282e-07, "loss": 0.5796, "step": 1010 }, { "epoch": 0.37, "grad_norm": 5.148658752441406, "learning_rate": 9.072727272727272e-07, "loss": 0.5738, "step": 1020 }, { "epoch": 0.37, "grad_norm": 3.78167462348938, "learning_rate": 9.062626262626261e-07, "loss": 0.6572, "step": 1030 }, { "epoch": 0.37, "grad_norm": 4.6793212890625, "learning_rate": 9.052525252525252e-07, "loss": 0.6466, "step": 1040 }, { "epoch": 0.38, "grad_norm": 4.607004165649414, "learning_rate": 9.042424242424242e-07, "loss": 0.5599, "step": 1050 }, { "epoch": 0.38, "grad_norm": 5.730196952819824, "learning_rate": 9.032323232323233e-07, "loss": 0.6559, "step": 1060 }, { "epoch": 0.38, "grad_norm": 4.408863544464111, "learning_rate": 9.022222222222222e-07, "loss": 0.641, "step": 1070 }, { "epoch": 0.39, "grad_norm": 3.0303945541381836, "learning_rate": 9.012121212121212e-07, "loss": 0.5455, "step": 1080 }, { "epoch": 0.39, "grad_norm": 5.44600248336792, "learning_rate": 9.002020202020202e-07, "loss": 0.5672, "step": 1090 }, { "epoch": 0.39, "grad_norm": 4.877390384674072, "learning_rate": 8.991919191919192e-07, "loss": 0.5626, "step": 1100 }, { "epoch": 0.4, "grad_norm": 4.949882507324219, "learning_rate": 8.981818181818181e-07, "loss": 0.6092, "step": 1110 }, { "epoch": 0.4, "grad_norm": 4.810749530792236, "learning_rate": 8.971717171717172e-07, "loss": 0.6486, "step": 1120 }, { "epoch": 0.41, "grad_norm": 4.6897053718566895, "learning_rate": 8.961616161616161e-07, "loss": 0.5854, "step": 1130 }, { "epoch": 0.41, "grad_norm": 3.393486261367798, "learning_rate": 8.951515151515151e-07, "loss": 0.6071, "step": 1140 }, { "epoch": 0.41, "grad_norm": 3.1970651149749756, "learning_rate": 8.941414141414141e-07, "loss": 0.601, "step": 1150 }, { "epoch": 0.42, "grad_norm": 4.972078800201416, "learning_rate": 8.93131313131313e-07, "loss": 0.6291, "step": 1160 }, { "epoch": 0.42, "grad_norm": 5.2576003074646, "learning_rate": 8.921212121212121e-07, "loss": 0.5856, "step": 1170 }, { "epoch": 0.42, "grad_norm": 3.552098035812378, "learning_rate": 8.91111111111111e-07, "loss": 0.5712, "step": 1180 }, { "epoch": 0.43, "grad_norm": 4.2522382736206055, "learning_rate": 8.9010101010101e-07, "loss": 0.6128, "step": 1190 }, { "epoch": 0.43, "grad_norm": 3.3408310413360596, "learning_rate": 8.89090909090909e-07, "loss": 0.6636, "step": 1200 }, { "epoch": 0.43, "grad_norm": 3.3888745307922363, "learning_rate": 8.88080808080808e-07, "loss": 0.5906, "step": 1210 }, { "epoch": 0.44, "grad_norm": 4.374114990234375, "learning_rate": 8.870707070707071e-07, "loss": 0.5863, "step": 1220 }, { "epoch": 0.44, "grad_norm": 3.318901538848877, "learning_rate": 8.860606060606061e-07, "loss": 0.6232, "step": 1230 }, { "epoch": 0.44, "grad_norm": 3.4461617469787598, "learning_rate": 8.85050505050505e-07, "loss": 0.6712, "step": 1240 }, { "epoch": 0.45, "grad_norm": 3.6037635803222656, "learning_rate": 8.840404040404041e-07, "loss": 0.5787, "step": 1250 }, { "epoch": 0.45, "grad_norm": 4.3236002922058105, "learning_rate": 8.83030303030303e-07, "loss": 0.6723, "step": 1260 }, { "epoch": 0.46, "grad_norm": 3.199084997177124, "learning_rate": 8.820202020202019e-07, "loss": 0.5484, "step": 1270 }, { "epoch": 0.46, "grad_norm": 3.675905704498291, "learning_rate": 8.81010101010101e-07, "loss": 0.6713, "step": 1280 }, { "epoch": 0.46, "grad_norm": 4.667660713195801, "learning_rate": 8.799999999999999e-07, "loss": 0.6161, "step": 1290 }, { "epoch": 0.47, "grad_norm": 5.268735408782959, "learning_rate": 8.78989898989899e-07, "loss": 0.6282, "step": 1300 }, { "epoch": 0.47, "grad_norm": 3.6777164936065674, "learning_rate": 8.779797979797979e-07, "loss": 0.662, "step": 1310 }, { "epoch": 0.47, "grad_norm": 5.912172317504883, "learning_rate": 8.769696969696969e-07, "loss": 0.5951, "step": 1320 }, { "epoch": 0.48, "grad_norm": 4.206521034240723, "learning_rate": 8.759595959595959e-07, "loss": 0.704, "step": 1330 }, { "epoch": 0.48, "grad_norm": 4.009490013122559, "learning_rate": 8.749494949494949e-07, "loss": 0.652, "step": 1340 }, { "epoch": 0.48, "grad_norm": 3.8058555126190186, "learning_rate": 8.739393939393938e-07, "loss": 0.5743, "step": 1350 }, { "epoch": 0.49, "grad_norm": 8.637187957763672, "learning_rate": 8.729292929292929e-07, "loss": 0.6198, "step": 1360 }, { "epoch": 0.49, "grad_norm": 5.059712886810303, "learning_rate": 8.719191919191918e-07, "loss": 0.6171, "step": 1370 }, { "epoch": 0.49, "grad_norm": 5.403115272521973, "learning_rate": 8.709090909090909e-07, "loss": 0.5956, "step": 1380 }, { "epoch": 0.5, "grad_norm": 4.084959983825684, "learning_rate": 8.698989898989899e-07, "loss": 0.6023, "step": 1390 }, { "epoch": 0.5, "grad_norm": 4.863948345184326, "learning_rate": 8.688888888888889e-07, "loss": 0.6295, "step": 1400 }, { "epoch": 0.51, "grad_norm": 5.363975524902344, "learning_rate": 8.678787878787879e-07, "loss": 0.5854, "step": 1410 }, { "epoch": 0.51, "grad_norm": 4.450655460357666, "learning_rate": 8.668686868686868e-07, "loss": 0.6579, "step": 1420 }, { "epoch": 0.51, "grad_norm": 5.538711071014404, "learning_rate": 8.658585858585859e-07, "loss": 0.726, "step": 1430 }, { "epoch": 0.52, "grad_norm": 3.3089439868927, "learning_rate": 8.648484848484848e-07, "loss": 0.6435, "step": 1440 }, { "epoch": 0.52, "grad_norm": 3.613719940185547, "learning_rate": 8.638383838383838e-07, "loss": 0.6695, "step": 1450 }, { "epoch": 0.52, "grad_norm": 2.9638564586639404, "learning_rate": 8.628282828282828e-07, "loss": 0.5516, "step": 1460 }, { "epoch": 0.53, "grad_norm": 3.8651678562164307, "learning_rate": 8.618181818181818e-07, "loss": 0.5883, "step": 1470 }, { "epoch": 0.53, "grad_norm": 8.56436824798584, "learning_rate": 8.608080808080807e-07, "loss": 0.6476, "step": 1480 }, { "epoch": 0.53, "grad_norm": 6.782663345336914, "learning_rate": 8.597979797979798e-07, "loss": 0.6369, "step": 1490 }, { "epoch": 0.54, "grad_norm": 3.181833267211914, "learning_rate": 8.587878787878787e-07, "loss": 0.5885, "step": 1500 }, { "epoch": 0.54, "grad_norm": 5.068531513214111, "learning_rate": 8.577777777777777e-07, "loss": 0.6241, "step": 1510 }, { "epoch": 0.55, "grad_norm": 5.7847900390625, "learning_rate": 8.567676767676767e-07, "loss": 0.623, "step": 1520 }, { "epoch": 0.55, "grad_norm": 3.325660228729248, "learning_rate": 8.557575757575757e-07, "loss": 0.574, "step": 1530 }, { "epoch": 0.55, "grad_norm": 3.452211618423462, "learning_rate": 8.547474747474747e-07, "loss": 0.6359, "step": 1540 }, { "epoch": 0.56, "grad_norm": 5.663365364074707, "learning_rate": 8.537373737373736e-07, "loss": 0.6132, "step": 1550 }, { "epoch": 0.56, "grad_norm": 3.970423460006714, "learning_rate": 8.527272727272727e-07, "loss": 0.5644, "step": 1560 }, { "epoch": 0.56, "grad_norm": 4.623054027557373, "learning_rate": 8.517171717171717e-07, "loss": 0.5753, "step": 1570 }, { "epoch": 0.57, "grad_norm": 4.318519592285156, "learning_rate": 8.507070707070707e-07, "loss": 0.6335, "step": 1580 }, { "epoch": 0.57, "grad_norm": 3.2903385162353516, "learning_rate": 8.496969696969697e-07, "loss": 0.5899, "step": 1590 }, { "epoch": 0.57, "grad_norm": 2.847003221511841, "learning_rate": 8.486868686868687e-07, "loss": 0.5834, "step": 1600 }, { "epoch": 0.58, "grad_norm": 5.138712406158447, "learning_rate": 8.476767676767676e-07, "loss": 0.5485, "step": 1610 }, { "epoch": 0.58, "grad_norm": 5.145106792449951, "learning_rate": 8.466666666666667e-07, "loss": 0.6854, "step": 1620 }, { "epoch": 0.58, "grad_norm": 5.585366725921631, "learning_rate": 8.456565656565656e-07, "loss": 0.5595, "step": 1630 }, { "epoch": 0.59, "grad_norm": 3.857548952102661, "learning_rate": 8.446464646464646e-07, "loss": 0.5856, "step": 1640 }, { "epoch": 0.59, "grad_norm": 3.835942029953003, "learning_rate": 8.436363636363636e-07, "loss": 0.5589, "step": 1650 }, { "epoch": 0.6, "grad_norm": 4.407024383544922, "learning_rate": 8.426262626262626e-07, "loss": 0.5358, "step": 1660 }, { "epoch": 0.6, "grad_norm": 4.5201826095581055, "learning_rate": 8.416161616161616e-07, "loss": 0.6021, "step": 1670 }, { "epoch": 0.6, "grad_norm": 4.167496681213379, "learning_rate": 8.406060606060606e-07, "loss": 0.6007, "step": 1680 }, { "epoch": 0.61, "grad_norm": 5.0080695152282715, "learning_rate": 8.395959595959595e-07, "loss": 0.6298, "step": 1690 }, { "epoch": 0.61, "grad_norm": 4.7244977951049805, "learning_rate": 8.385858585858585e-07, "loss": 0.5713, "step": 1700 }, { "epoch": 0.61, "grad_norm": 5.417051315307617, "learning_rate": 8.375757575757575e-07, "loss": 0.6565, "step": 1710 }, { "epoch": 0.62, "grad_norm": 6.2620391845703125, "learning_rate": 8.365656565656564e-07, "loss": 0.6977, "step": 1720 }, { "epoch": 0.62, "grad_norm": 5.629709720611572, "learning_rate": 8.355555555555556e-07, "loss": 0.6281, "step": 1730 }, { "epoch": 0.62, "grad_norm": 4.182885646820068, "learning_rate": 8.345454545454545e-07, "loss": 0.6431, "step": 1740 }, { "epoch": 0.63, "grad_norm": 3.080979824066162, "learning_rate": 8.335353535353536e-07, "loss": 0.5938, "step": 1750 }, { "epoch": 0.63, "grad_norm": 2.8959693908691406, "learning_rate": 8.325252525252525e-07, "loss": 0.6677, "step": 1760 }, { "epoch": 0.63, "grad_norm": 2.9930922985076904, "learning_rate": 8.315151515151515e-07, "loss": 0.5425, "step": 1770 }, { "epoch": 0.64, "grad_norm": 11.59288501739502, "learning_rate": 8.305050505050505e-07, "loss": 0.5816, "step": 1780 }, { "epoch": 0.64, "grad_norm": 6.010501384735107, "learning_rate": 8.294949494949495e-07, "loss": 0.5944, "step": 1790 }, { "epoch": 0.65, "grad_norm": 3.7763831615448, "learning_rate": 8.284848484848484e-07, "loss": 0.6465, "step": 1800 }, { "epoch": 0.65, "grad_norm": 4.849707126617432, "learning_rate": 8.274747474747475e-07, "loss": 0.5679, "step": 1810 }, { "epoch": 0.65, "grad_norm": 3.6395959854125977, "learning_rate": 8.264646464646464e-07, "loss": 0.6026, "step": 1820 }, { "epoch": 0.66, "grad_norm": 4.717153549194336, "learning_rate": 8.254545454545455e-07, "loss": 0.5637, "step": 1830 }, { "epoch": 0.66, "grad_norm": 3.236311674118042, "learning_rate": 8.244444444444444e-07, "loss": 0.5955, "step": 1840 }, { "epoch": 0.66, "grad_norm": 4.533290386199951, "learning_rate": 8.234343434343433e-07, "loss": 0.5681, "step": 1850 }, { "epoch": 0.67, "grad_norm": 3.0948798656463623, "learning_rate": 8.224242424242424e-07, "loss": 0.6272, "step": 1860 }, { "epoch": 0.67, "grad_norm": 3.719708204269409, "learning_rate": 8.214141414141413e-07, "loss": 0.6793, "step": 1870 }, { "epoch": 0.67, "grad_norm": 3.228376865386963, "learning_rate": 8.204040404040403e-07, "loss": 0.558, "step": 1880 }, { "epoch": 0.68, "grad_norm": 3.4296934604644775, "learning_rate": 8.193939393939393e-07, "loss": 0.5717, "step": 1890 }, { "epoch": 0.68, "grad_norm": 5.708250999450684, "learning_rate": 8.184848484848484e-07, "loss": 0.6381, "step": 1900 }, { "epoch": 0.68, "grad_norm": 3.8306288719177246, "learning_rate": 8.174747474747474e-07, "loss": 0.6083, "step": 1910 }, { "epoch": 0.69, "grad_norm": 4.5576252937316895, "learning_rate": 8.164646464646464e-07, "loss": 0.6676, "step": 1920 }, { "epoch": 0.69, "grad_norm": 3.2234649658203125, "learning_rate": 8.154545454545454e-07, "loss": 0.6215, "step": 1930 }, { "epoch": 0.7, "grad_norm": 3.4826815128326416, "learning_rate": 8.144444444444444e-07, "loss": 0.6827, "step": 1940 }, { "epoch": 0.7, "grad_norm": 3.9034299850463867, "learning_rate": 8.134343434343433e-07, "loss": 0.596, "step": 1950 }, { "epoch": 0.7, "grad_norm": 2.7787563800811768, "learning_rate": 8.124242424242424e-07, "loss": 0.6234, "step": 1960 }, { "epoch": 0.71, "grad_norm": 4.442075252532959, "learning_rate": 8.114141414141413e-07, "loss": 0.6244, "step": 1970 }, { "epoch": 0.71, "grad_norm": 3.8511252403259277, "learning_rate": 8.104040404040403e-07, "loss": 0.5721, "step": 1980 }, { "epoch": 0.71, "grad_norm": 3.925318717956543, "learning_rate": 8.093939393939394e-07, "loss": 0.6547, "step": 1990 }, { "epoch": 0.72, "grad_norm": 4.262260437011719, "learning_rate": 8.083838383838384e-07, "loss": 0.6155, "step": 2000 }, { "epoch": 0.72, "eval_loss": 0.7234830260276794, "eval_runtime": 401.6534, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "step": 2000 }, { "epoch": 0.72, "grad_norm": 4.890273571014404, "learning_rate": 8.073737373737374e-07, "loss": 0.6392, "step": 2010 }, { "epoch": 0.72, "grad_norm": 4.918805122375488, "learning_rate": 8.063636363636364e-07, "loss": 0.5549, "step": 2020 }, { "epoch": 0.73, "grad_norm": 3.605008840560913, "learning_rate": 8.053535353535353e-07, "loss": 0.5152, "step": 2030 }, { "epoch": 0.73, "grad_norm": 4.606666088104248, "learning_rate": 8.043434343434344e-07, "loss": 0.4798, "step": 2040 }, { "epoch": 0.74, "grad_norm": 4.115630626678467, "learning_rate": 8.033333333333333e-07, "loss": 0.5856, "step": 2050 }, { "epoch": 0.74, "grad_norm": 4.37373685836792, "learning_rate": 8.023232323232322e-07, "loss": 0.6467, "step": 2060 }, { "epoch": 0.74, "grad_norm": 3.3623104095458984, "learning_rate": 8.013131313131313e-07, "loss": 0.6591, "step": 2070 }, { "epoch": 0.75, "grad_norm": 3.1440579891204834, "learning_rate": 8.003030303030302e-07, "loss": 0.6003, "step": 2080 }, { "epoch": 0.75, "grad_norm": 3.391533136367798, "learning_rate": 7.992929292929293e-07, "loss": 0.5872, "step": 2090 }, { "epoch": 0.75, "grad_norm": 4.677767276763916, "learning_rate": 7.982828282828282e-07, "loss": 0.5803, "step": 2100 }, { "epoch": 0.76, "grad_norm": 3.759061336517334, "learning_rate": 7.972727272727272e-07, "loss": 0.6155, "step": 2110 }, { "epoch": 0.76, "grad_norm": 3.419386863708496, "learning_rate": 7.962626262626262e-07, "loss": 0.6303, "step": 2120 }, { "epoch": 0.76, "grad_norm": 3.4935250282287598, "learning_rate": 7.952525252525252e-07, "loss": 0.6348, "step": 2130 }, { "epoch": 0.77, "grad_norm": 4.793978214263916, "learning_rate": 7.942424242424241e-07, "loss": 0.5004, "step": 2140 }, { "epoch": 0.77, "grad_norm": 5.078155994415283, "learning_rate": 7.932323232323232e-07, "loss": 0.5583, "step": 2150 }, { "epoch": 0.77, "grad_norm": 3.8855032920837402, "learning_rate": 7.922222222222222e-07, "loss": 0.626, "step": 2160 }, { "epoch": 0.78, "grad_norm": 4.782063007354736, "learning_rate": 7.912121212121213e-07, "loss": 0.4847, "step": 2170 }, { "epoch": 0.78, "grad_norm": 4.760573387145996, "learning_rate": 7.902020202020202e-07, "loss": 0.5923, "step": 2180 }, { "epoch": 0.79, "grad_norm": 5.196963310241699, "learning_rate": 7.891919191919192e-07, "loss": 0.5216, "step": 2190 }, { "epoch": 0.79, "grad_norm": 5.447847843170166, "learning_rate": 7.881818181818182e-07, "loss": 0.6144, "step": 2200 }, { "epoch": 0.79, "grad_norm": 5.5991129875183105, "learning_rate": 7.871717171717171e-07, "loss": 0.5247, "step": 2210 }, { "epoch": 0.8, "grad_norm": 4.1100921630859375, "learning_rate": 7.861616161616161e-07, "loss": 0.6581, "step": 2220 }, { "epoch": 0.8, "grad_norm": 2.998598098754883, "learning_rate": 7.851515151515151e-07, "loss": 0.5111, "step": 2230 }, { "epoch": 0.8, "grad_norm": 3.1241579055786133, "learning_rate": 7.841414141414141e-07, "loss": 0.626, "step": 2240 }, { "epoch": 0.81, "grad_norm": 3.804655075073242, "learning_rate": 7.831313131313131e-07, "loss": 0.596, "step": 2250 }, { "epoch": 0.81, "grad_norm": 3.636692523956299, "learning_rate": 7.821212121212121e-07, "loss": 0.598, "step": 2260 }, { "epoch": 0.81, "grad_norm": 5.374478816986084, "learning_rate": 7.81111111111111e-07, "loss": 0.5155, "step": 2270 }, { "epoch": 0.82, "grad_norm": 4.611242294311523, "learning_rate": 7.801010101010101e-07, "loss": 0.5482, "step": 2280 }, { "epoch": 0.82, "grad_norm": 6.077275276184082, "learning_rate": 7.79090909090909e-07, "loss": 0.5642, "step": 2290 }, { "epoch": 0.82, "grad_norm": 4.110134601593018, "learning_rate": 7.78080808080808e-07, "loss": 0.6482, "step": 2300 }, { "epoch": 0.83, "grad_norm": 3.603376865386963, "learning_rate": 7.77070707070707e-07, "loss": 0.5755, "step": 2310 }, { "epoch": 0.83, "grad_norm": 4.059467315673828, "learning_rate": 7.76060606060606e-07, "loss": 0.5205, "step": 2320 }, { "epoch": 0.84, "grad_norm": 4.656579971313477, "learning_rate": 7.750505050505051e-07, "loss": 0.5975, "step": 2330 }, { "epoch": 0.84, "grad_norm": 4.863138675689697, "learning_rate": 7.74040404040404e-07, "loss": 0.5375, "step": 2340 }, { "epoch": 0.84, "grad_norm": 4.3313517570495605, "learning_rate": 7.73030303030303e-07, "loss": 0.5795, "step": 2350 }, { "epoch": 0.85, "grad_norm": 3.870875120162964, "learning_rate": 7.72020202020202e-07, "loss": 0.5249, "step": 2360 }, { "epoch": 0.85, "grad_norm": 3.720283031463623, "learning_rate": 7.71010101010101e-07, "loss": 0.6733, "step": 2370 }, { "epoch": 0.85, "grad_norm": 3.779296398162842, "learning_rate": 7.699999999999999e-07, "loss": 0.588, "step": 2380 }, { "epoch": 0.86, "grad_norm": 4.588533401489258, "learning_rate": 7.68989898989899e-07, "loss": 0.6044, "step": 2390 }, { "epoch": 0.86, "grad_norm": 5.907507419586182, "learning_rate": 7.679797979797979e-07, "loss": 0.5715, "step": 2400 }, { "epoch": 0.86, "grad_norm": 5.705809116363525, "learning_rate": 7.66969696969697e-07, "loss": 0.5486, "step": 2410 }, { "epoch": 0.87, "grad_norm": 3.8104190826416016, "learning_rate": 7.659595959595959e-07, "loss": 0.602, "step": 2420 }, { "epoch": 0.87, "grad_norm": 5.82910680770874, "learning_rate": 7.649494949494949e-07, "loss": 0.5111, "step": 2430 }, { "epoch": 0.88, "grad_norm": 5.837489604949951, "learning_rate": 7.639393939393939e-07, "loss": 0.6145, "step": 2440 }, { "epoch": 0.88, "grad_norm": 3.5505547523498535, "learning_rate": 7.629292929292929e-07, "loss": 0.5834, "step": 2450 }, { "epoch": 0.88, "grad_norm": 4.314844131469727, "learning_rate": 7.619191919191918e-07, "loss": 0.6152, "step": 2460 }, { "epoch": 0.89, "grad_norm": 4.205423831939697, "learning_rate": 7.609090909090909e-07, "loss": 0.6285, "step": 2470 }, { "epoch": 0.89, "grad_norm": 5.292863368988037, "learning_rate": 7.598989898989898e-07, "loss": 0.6531, "step": 2480 }, { "epoch": 0.89, "grad_norm": 5.13388204574585, "learning_rate": 7.588888888888888e-07, "loss": 0.628, "step": 2490 }, { "epoch": 0.9, "grad_norm": 2.864743232727051, "learning_rate": 7.578787878787879e-07, "loss": 0.5403, "step": 2500 }, { "epoch": 0.9, "grad_norm": 4.284875869750977, "learning_rate": 7.568686868686868e-07, "loss": 0.6097, "step": 2510 }, { "epoch": 0.9, "grad_norm": 3.3840548992156982, "learning_rate": 7.558585858585859e-07, "loss": 0.5759, "step": 2520 }, { "epoch": 0.91, "grad_norm": 4.531360626220703, "learning_rate": 7.548484848484848e-07, "loss": 0.5581, "step": 2530 }, { "epoch": 0.91, "grad_norm": 5.861146926879883, "learning_rate": 7.538383838383838e-07, "loss": 0.6329, "step": 2540 }, { "epoch": 0.91, "grad_norm": 5.117300987243652, "learning_rate": 7.528282828282828e-07, "loss": 0.6071, "step": 2550 }, { "epoch": 0.92, "grad_norm": 3.9243366718292236, "learning_rate": 7.518181818181818e-07, "loss": 0.5476, "step": 2560 }, { "epoch": 0.92, "grad_norm": 6.215851783752441, "learning_rate": 7.508080808080808e-07, "loss": 0.5731, "step": 2570 }, { "epoch": 0.93, "grad_norm": 6.189659118652344, "learning_rate": 7.497979797979798e-07, "loss": 0.592, "step": 2580 }, { "epoch": 0.93, "grad_norm": 2.877923011779785, "learning_rate": 7.487878787878787e-07, "loss": 0.5435, "step": 2590 }, { "epoch": 0.93, "grad_norm": 5.963223934173584, "learning_rate": 7.477777777777778e-07, "loss": 0.6104, "step": 2600 }, { "epoch": 0.94, "grad_norm": 4.374766826629639, "learning_rate": 7.467676767676767e-07, "loss": 0.6022, "step": 2610 }, { "epoch": 0.94, "grad_norm": 4.16854190826416, "learning_rate": 7.457575757575756e-07, "loss": 0.4931, "step": 2620 }, { "epoch": 0.94, "grad_norm": 7.225739479064941, "learning_rate": 7.447474747474747e-07, "loss": 0.5659, "step": 2630 }, { "epoch": 0.95, "grad_norm": 4.522436141967773, "learning_rate": 7.437373737373736e-07, "loss": 0.5449, "step": 2640 }, { "epoch": 0.95, "grad_norm": 4.935425758361816, "learning_rate": 7.427272727272727e-07, "loss": 0.6093, "step": 2650 }, { "epoch": 0.95, "grad_norm": 3.947935104370117, "learning_rate": 7.417171717171716e-07, "loss": 0.5976, "step": 2660 }, { "epoch": 0.96, "grad_norm": 5.328210830688477, "learning_rate": 7.407070707070707e-07, "loss": 0.6283, "step": 2670 }, { "epoch": 0.96, "grad_norm": 3.0664663314819336, "learning_rate": 7.396969696969697e-07, "loss": 0.5853, "step": 2680 }, { "epoch": 0.96, "grad_norm": 5.055959701538086, "learning_rate": 7.386868686868687e-07, "loss": 0.5767, "step": 2690 }, { "epoch": 0.97, "grad_norm": 5.318157196044922, "learning_rate": 7.376767676767676e-07, "loss": 0.5319, "step": 2700 }, { "epoch": 0.97, "grad_norm": 3.5820364952087402, "learning_rate": 7.366666666666667e-07, "loss": 0.5506, "step": 2710 }, { "epoch": 0.98, "grad_norm": 4.367839813232422, "learning_rate": 7.356565656565656e-07, "loss": 0.5524, "step": 2720 }, { "epoch": 0.98, "grad_norm": 4.305497169494629, "learning_rate": 7.346464646464647e-07, "loss": 0.5639, "step": 2730 }, { "epoch": 0.98, "grad_norm": 6.55611515045166, "learning_rate": 7.336363636363636e-07, "loss": 0.6346, "step": 2740 }, { "epoch": 0.99, "grad_norm": 9.09347152709961, "learning_rate": 7.326262626262626e-07, "loss": 0.5574, "step": 2750 }, { "epoch": 0.99, "grad_norm": 3.9730687141418457, "learning_rate": 7.316161616161616e-07, "loss": 0.5908, "step": 2760 }, { "epoch": 0.99, "grad_norm": 16.01730728149414, "learning_rate": 7.306060606060605e-07, "loss": 0.5936, "step": 2770 }, { "epoch": 1.0, "grad_norm": 5.077668190002441, "learning_rate": 7.295959595959595e-07, "loss": 0.5556, "step": 2780 }, { "epoch": 1.0, "grad_norm": 4.859917640686035, "learning_rate": 7.285858585858585e-07, "loss": 0.5452, "step": 2790 }, { "epoch": 1.0, "grad_norm": 5.687443733215332, "learning_rate": 7.275757575757575e-07, "loss": 0.6583, "step": 2800 }, { "epoch": 1.01, "grad_norm": 4.369338035583496, "learning_rate": 7.265656565656565e-07, "loss": 0.5623, "step": 2810 }, { "epoch": 1.01, "grad_norm": 4.035587310791016, "learning_rate": 7.255555555555555e-07, "loss": 0.571, "step": 2820 }, { "epoch": 1.01, "grad_norm": 4.2013139724731445, "learning_rate": 7.245454545454544e-07, "loss": 0.4768, "step": 2830 }, { "epoch": 1.02, "grad_norm": 3.272221565246582, "learning_rate": 7.235353535353536e-07, "loss": 0.581, "step": 2840 }, { "epoch": 1.02, "grad_norm": 3.97727370262146, "learning_rate": 7.225252525252525e-07, "loss": 0.5654, "step": 2850 }, { "epoch": 1.03, "grad_norm": 4.043779373168945, "learning_rate": 7.215151515151516e-07, "loss": 0.5971, "step": 2860 }, { "epoch": 1.03, "grad_norm": 4.015261173248291, "learning_rate": 7.205050505050505e-07, "loss": 0.5921, "step": 2870 }, { "epoch": 1.03, "grad_norm": 4.540653705596924, "learning_rate": 7.194949494949495e-07, "loss": 0.6103, "step": 2880 }, { "epoch": 1.04, "grad_norm": 4.258879661560059, "learning_rate": 7.184848484848485e-07, "loss": 0.5519, "step": 2890 }, { "epoch": 1.04, "grad_norm": 7.0088629722595215, "learning_rate": 7.174747474747475e-07, "loss": 0.6054, "step": 2900 }, { "epoch": 1.04, "grad_norm": 3.9803833961486816, "learning_rate": 7.164646464646464e-07, "loss": 0.5557, "step": 2910 }, { "epoch": 1.05, "grad_norm": 3.6390326023101807, "learning_rate": 7.154545454545454e-07, "loss": 0.5406, "step": 2920 }, { "epoch": 1.05, "grad_norm": 3.998908042907715, "learning_rate": 7.144444444444444e-07, "loss": 0.604, "step": 2930 }, { "epoch": 1.05, "grad_norm": 4.304749488830566, "learning_rate": 7.134343434343433e-07, "loss": 0.5859, "step": 2940 }, { "epoch": 1.06, "grad_norm": 4.742584228515625, "learning_rate": 7.124242424242424e-07, "loss": 0.5184, "step": 2950 }, { "epoch": 1.06, "grad_norm": 5.163829326629639, "learning_rate": 7.114141414141413e-07, "loss": 0.5669, "step": 2960 }, { "epoch": 1.07, "grad_norm": 5.633606910705566, "learning_rate": 7.104040404040404e-07, "loss": 0.5967, "step": 2970 }, { "epoch": 1.07, "grad_norm": 6.37916898727417, "learning_rate": 7.093939393939393e-07, "loss": 0.5971, "step": 2980 }, { "epoch": 1.07, "grad_norm": 2.9268598556518555, "learning_rate": 7.083838383838383e-07, "loss": 0.5342, "step": 2990 }, { "epoch": 1.08, "grad_norm": 4.331085205078125, "learning_rate": 7.073737373737373e-07, "loss": 0.5779, "step": 3000 }, { "epoch": 1.08, "eval_loss": 0.7162447571754456, "eval_runtime": 401.0241, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 3000 }, { "epoch": 1.08, "grad_norm": 2.9604339599609375, "learning_rate": 7.063636363636364e-07, "loss": 0.5723, "step": 3010 }, { "epoch": 1.08, "grad_norm": 4.6410393714904785, "learning_rate": 7.053535353535354e-07, "loss": 0.5986, "step": 3020 }, { "epoch": 1.09, "grad_norm": 4.530472278594971, "learning_rate": 7.043434343434344e-07, "loss": 0.705, "step": 3030 }, { "epoch": 1.09, "grad_norm": 4.028800964355469, "learning_rate": 7.033333333333333e-07, "loss": 0.5135, "step": 3040 }, { "epoch": 1.09, "grad_norm": 2.882619619369507, "learning_rate": 7.023232323232324e-07, "loss": 0.5373, "step": 3050 }, { "epoch": 1.1, "grad_norm": 5.280877590179443, "learning_rate": 7.013131313131313e-07, "loss": 0.5457, "step": 3060 }, { "epoch": 1.1, "grad_norm": 3.983971357345581, "learning_rate": 7.003030303030302e-07, "loss": 0.5678, "step": 3070 }, { "epoch": 1.1, "grad_norm": 3.4524683952331543, "learning_rate": 6.992929292929293e-07, "loss": 0.5794, "step": 3080 }, { "epoch": 1.11, "grad_norm": 4.039379119873047, "learning_rate": 6.982828282828282e-07, "loss": 0.4778, "step": 3090 }, { "epoch": 1.11, "grad_norm": 4.454843521118164, "learning_rate": 6.972727272727273e-07, "loss": 0.5312, "step": 3100 }, { "epoch": 1.12, "grad_norm": 4.90205192565918, "learning_rate": 6.962626262626262e-07, "loss": 0.5675, "step": 3110 }, { "epoch": 1.12, "grad_norm": 3.076164484024048, "learning_rate": 6.952525252525252e-07, "loss": 0.5949, "step": 3120 }, { "epoch": 1.12, "grad_norm": 3.707550287246704, "learning_rate": 6.942424242424242e-07, "loss": 0.6053, "step": 3130 }, { "epoch": 1.13, "grad_norm": 4.36667537689209, "learning_rate": 6.932323232323232e-07, "loss": 0.638, "step": 3140 }, { "epoch": 1.13, "grad_norm": 4.635988235473633, "learning_rate": 6.922222222222221e-07, "loss": 0.5698, "step": 3150 }, { "epoch": 1.13, "grad_norm": 3.769178628921509, "learning_rate": 6.912121212121212e-07, "loss": 0.5563, "step": 3160 }, { "epoch": 1.14, "grad_norm": 3.096804618835449, "learning_rate": 6.902020202020201e-07, "loss": 0.5268, "step": 3170 }, { "epoch": 1.14, "grad_norm": 5.9978461265563965, "learning_rate": 6.891919191919193e-07, "loss": 0.4878, "step": 3180 }, { "epoch": 1.14, "grad_norm": 4.963356971740723, "learning_rate": 6.881818181818182e-07, "loss": 0.5904, "step": 3190 }, { "epoch": 1.15, "grad_norm": 4.526729583740234, "learning_rate": 6.871717171717171e-07, "loss": 0.5957, "step": 3200 }, { "epoch": 1.15, "grad_norm": 3.4197323322296143, "learning_rate": 6.861616161616162e-07, "loss": 0.5522, "step": 3210 }, { "epoch": 1.15, "grad_norm": 3.989546060562134, "learning_rate": 6.851515151515151e-07, "loss": 0.6122, "step": 3220 }, { "epoch": 1.16, "grad_norm": 5.137784004211426, "learning_rate": 6.841414141414141e-07, "loss": 0.58, "step": 3230 }, { "epoch": 1.16, "grad_norm": 4.193089485168457, "learning_rate": 6.831313131313131e-07, "loss": 0.4918, "step": 3240 }, { "epoch": 1.17, "grad_norm": 4.1724162101745605, "learning_rate": 6.821212121212121e-07, "loss": 0.6044, "step": 3250 }, { "epoch": 1.17, "grad_norm": 4.983777046203613, "learning_rate": 6.811111111111111e-07, "loss": 0.533, "step": 3260 }, { "epoch": 1.17, "grad_norm": 7.2310333251953125, "learning_rate": 6.801010101010101e-07, "loss": 0.5843, "step": 3270 }, { "epoch": 1.18, "grad_norm": 4.620791912078857, "learning_rate": 6.79090909090909e-07, "loss": 0.5242, "step": 3280 }, { "epoch": 1.18, "grad_norm": 4.3235087394714355, "learning_rate": 6.780808080808081e-07, "loss": 0.5836, "step": 3290 }, { "epoch": 1.18, "grad_norm": 4.413332462310791, "learning_rate": 6.77070707070707e-07, "loss": 0.6532, "step": 3300 }, { "epoch": 1.19, "grad_norm": 4.286377429962158, "learning_rate": 6.76060606060606e-07, "loss": 0.5527, "step": 3310 }, { "epoch": 1.19, "grad_norm": 6.150529861450195, "learning_rate": 6.75050505050505e-07, "loss": 0.5745, "step": 3320 }, { "epoch": 1.19, "grad_norm": 4.088225841522217, "learning_rate": 6.74040404040404e-07, "loss": 0.5882, "step": 3330 }, { "epoch": 1.2, "grad_norm": 4.4364824295043945, "learning_rate": 6.73030303030303e-07, "loss": 0.513, "step": 3340 }, { "epoch": 1.2, "grad_norm": 3.8595519065856934, "learning_rate": 6.72020202020202e-07, "loss": 0.5621, "step": 3350 }, { "epoch": 1.2, "grad_norm": 6.183053016662598, "learning_rate": 6.71010101010101e-07, "loss": 0.5318, "step": 3360 }, { "epoch": 1.21, "grad_norm": 4.677096366882324, "learning_rate": 6.7e-07, "loss": 0.4965, "step": 3370 }, { "epoch": 1.21, "grad_norm": 5.507148742675781, "learning_rate": 6.68989898989899e-07, "loss": 0.569, "step": 3380 }, { "epoch": 1.22, "grad_norm": 3.813816547393799, "learning_rate": 6.679797979797979e-07, "loss": 0.5326, "step": 3390 }, { "epoch": 1.22, "grad_norm": 5.973514080047607, "learning_rate": 6.66969696969697e-07, "loss": 0.6733, "step": 3400 }, { "epoch": 1.22, "grad_norm": 5.510775089263916, "learning_rate": 6.659595959595959e-07, "loss": 0.5978, "step": 3410 }, { "epoch": 1.23, "grad_norm": 3.8347678184509277, "learning_rate": 6.64949494949495e-07, "loss": 0.5141, "step": 3420 }, { "epoch": 1.23, "grad_norm": 4.097416877746582, "learning_rate": 6.639393939393939e-07, "loss": 0.5503, "step": 3430 }, { "epoch": 1.23, "grad_norm": 3.52165150642395, "learning_rate": 6.629292929292929e-07, "loss": 0.5506, "step": 3440 }, { "epoch": 1.24, "grad_norm": 4.4569573402404785, "learning_rate": 6.619191919191919e-07, "loss": 0.6121, "step": 3450 }, { "epoch": 1.24, "grad_norm": 4.094751834869385, "learning_rate": 6.609090909090909e-07, "loss": 0.5562, "step": 3460 }, { "epoch": 1.24, "grad_norm": 4.277998447418213, "learning_rate": 6.598989898989898e-07, "loss": 0.5432, "step": 3470 }, { "epoch": 1.25, "grad_norm": 5.9544148445129395, "learning_rate": 6.588888888888889e-07, "loss": 0.5891, "step": 3480 }, { "epoch": 1.25, "grad_norm": 5.688138008117676, "learning_rate": 6.578787878787878e-07, "loss": 0.6172, "step": 3490 }, { "epoch": 1.26, "grad_norm": 5.771819591522217, "learning_rate": 6.568686868686868e-07, "loss": 0.5362, "step": 3500 }, { "epoch": 1.26, "grad_norm": 4.87513542175293, "learning_rate": 6.558585858585858e-07, "loss": 0.5608, "step": 3510 }, { "epoch": 1.26, "grad_norm": 3.709472179412842, "learning_rate": 6.548484848484848e-07, "loss": 0.5361, "step": 3520 }, { "epoch": 1.27, "grad_norm": 4.465284824371338, "learning_rate": 6.538383838383839e-07, "loss": 0.5435, "step": 3530 }, { "epoch": 1.27, "grad_norm": 4.30505895614624, "learning_rate": 6.528282828282828e-07, "loss": 0.5714, "step": 3540 }, { "epoch": 1.27, "grad_norm": 4.445021152496338, "learning_rate": 6.518181818181818e-07, "loss": 0.5773, "step": 3550 }, { "epoch": 1.28, "grad_norm": 4.173781394958496, "learning_rate": 6.508080808080808e-07, "loss": 0.6223, "step": 3560 }, { "epoch": 1.28, "grad_norm": 5.676968574523926, "learning_rate": 6.497979797979798e-07, "loss": 0.5957, "step": 3570 }, { "epoch": 1.28, "grad_norm": 3.447176694869995, "learning_rate": 6.487878787878788e-07, "loss": 0.5896, "step": 3580 }, { "epoch": 1.29, "grad_norm": 4.1883111000061035, "learning_rate": 6.477777777777778e-07, "loss": 0.5602, "step": 3590 }, { "epoch": 1.29, "grad_norm": 4.16115140914917, "learning_rate": 6.467676767676767e-07, "loss": 0.5752, "step": 3600 }, { "epoch": 1.29, "grad_norm": 4.55715274810791, "learning_rate": 6.457575757575758e-07, "loss": 0.5344, "step": 3610 }, { "epoch": 1.3, "grad_norm": 5.83474588394165, "learning_rate": 6.447474747474747e-07, "loss": 0.5867, "step": 3620 }, { "epoch": 1.3, "grad_norm": 2.700615644454956, "learning_rate": 6.437373737373736e-07, "loss": 0.5395, "step": 3630 }, { "epoch": 1.31, "grad_norm": 4.691040992736816, "learning_rate": 6.428282828282828e-07, "loss": 0.5509, "step": 3640 }, { "epoch": 1.31, "grad_norm": 4.966342449188232, "learning_rate": 6.418181818181818e-07, "loss": 0.6047, "step": 3650 }, { "epoch": 1.31, "grad_norm": 3.6059834957122803, "learning_rate": 6.408080808080808e-07, "loss": 0.5338, "step": 3660 }, { "epoch": 1.32, "grad_norm": 3.471660852432251, "learning_rate": 6.397979797979798e-07, "loss": 0.5532, "step": 3670 }, { "epoch": 1.32, "grad_norm": 5.410092353820801, "learning_rate": 6.387878787878788e-07, "loss": 0.5739, "step": 3680 }, { "epoch": 1.32, "grad_norm": 4.829250335693359, "learning_rate": 6.377777777777778e-07, "loss": 0.6064, "step": 3690 }, { "epoch": 1.33, "grad_norm": 5.880673408508301, "learning_rate": 6.367676767676767e-07, "loss": 0.575, "step": 3700 }, { "epoch": 1.33, "grad_norm": 3.647641658782959, "learning_rate": 6.357575757575757e-07, "loss": 0.594, "step": 3710 }, { "epoch": 1.33, "grad_norm": 4.267665386199951, "learning_rate": 6.347474747474747e-07, "loss": 0.574, "step": 3720 }, { "epoch": 1.34, "grad_norm": 3.5303924083709717, "learning_rate": 6.337373737373736e-07, "loss": 0.496, "step": 3730 }, { "epoch": 1.34, "grad_norm": 3.1564202308654785, "learning_rate": 6.327272727272727e-07, "loss": 0.5196, "step": 3740 }, { "epoch": 1.34, "grad_norm": 4.458827495574951, "learning_rate": 6.317171717171716e-07, "loss": 0.5616, "step": 3750 }, { "epoch": 1.35, "grad_norm": 4.057519435882568, "learning_rate": 6.307070707070707e-07, "loss": 0.5293, "step": 3760 }, { "epoch": 1.35, "grad_norm": 5.7592363357543945, "learning_rate": 6.296969696969696e-07, "loss": 0.5433, "step": 3770 }, { "epoch": 1.36, "grad_norm": 5.579089641571045, "learning_rate": 6.286868686868687e-07, "loss": 0.546, "step": 3780 }, { "epoch": 1.36, "grad_norm": 3.8911049365997314, "learning_rate": 6.276767676767677e-07, "loss": 0.5991, "step": 3790 }, { "epoch": 1.36, "grad_norm": 6.364475727081299, "learning_rate": 6.266666666666667e-07, "loss": 0.5908, "step": 3800 }, { "epoch": 1.37, "grad_norm": 5.0971760749816895, "learning_rate": 6.256565656565656e-07, "loss": 0.5915, "step": 3810 }, { "epoch": 1.37, "grad_norm": 3.897958755493164, "learning_rate": 6.246464646464647e-07, "loss": 0.5859, "step": 3820 }, { "epoch": 1.37, "grad_norm": 6.308644771575928, "learning_rate": 6.236363636363636e-07, "loss": 0.5991, "step": 3830 }, { "epoch": 1.38, "grad_norm": 3.0512614250183105, "learning_rate": 6.226262626262627e-07, "loss": 0.5742, "step": 3840 }, { "epoch": 1.38, "grad_norm": 4.052495956420898, "learning_rate": 6.216161616161616e-07, "loss": 0.54, "step": 3850 }, { "epoch": 1.38, "grad_norm": 4.723029613494873, "learning_rate": 6.206060606060605e-07, "loss": 0.5524, "step": 3860 }, { "epoch": 1.39, "grad_norm": 4.705147743225098, "learning_rate": 6.195959595959596e-07, "loss": 0.5924, "step": 3870 }, { "epoch": 1.39, "grad_norm": 4.098758697509766, "learning_rate": 6.185858585858585e-07, "loss": 0.5482, "step": 3880 }, { "epoch": 1.4, "grad_norm": 2.9597623348236084, "learning_rate": 6.175757575757575e-07, "loss": 0.5521, "step": 3890 }, { "epoch": 1.4, "grad_norm": 3.7308359146118164, "learning_rate": 6.165656565656565e-07, "loss": 0.5645, "step": 3900 }, { "epoch": 1.4, "grad_norm": 4.260051250457764, "learning_rate": 6.155555555555555e-07, "loss": 0.5471, "step": 3910 }, { "epoch": 1.41, "grad_norm": 6.539883136749268, "learning_rate": 6.145454545454545e-07, "loss": 0.5439, "step": 3920 }, { "epoch": 1.41, "grad_norm": 6.235405921936035, "learning_rate": 6.135353535353535e-07, "loss": 0.5435, "step": 3930 }, { "epoch": 1.41, "grad_norm": 3.3652477264404297, "learning_rate": 6.125252525252524e-07, "loss": 0.5544, "step": 3940 }, { "epoch": 1.42, "grad_norm": 3.795522928237915, "learning_rate": 6.115151515151516e-07, "loss": 0.5794, "step": 3950 }, { "epoch": 1.42, "grad_norm": 5.496920108795166, "learning_rate": 6.105050505050505e-07, "loss": 0.5133, "step": 3960 }, { "epoch": 1.42, "grad_norm": 5.946590423583984, "learning_rate": 6.094949494949495e-07, "loss": 0.6391, "step": 3970 }, { "epoch": 1.43, "grad_norm": 3.943232774734497, "learning_rate": 6.084848484848485e-07, "loss": 0.5574, "step": 3980 }, { "epoch": 1.43, "grad_norm": 4.132944107055664, "learning_rate": 6.074747474747474e-07, "loss": 0.5014, "step": 3990 }, { "epoch": 1.43, "grad_norm": 4.284780979156494, "learning_rate": 6.064646464646465e-07, "loss": 0.5907, "step": 4000 }, { "epoch": 1.43, "eval_loss": 0.698131263256073, "eval_runtime": 401.1999, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 4000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2000, "total_flos": 3.77128394686464e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }