{ "best_metric": 0.007954709231853485, "best_model_checkpoint": "autotrain-tlvon-zmjgo/checkpoint-3960", "epoch": 10.0, "eval_steps": 500, "global_step": 3960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047979797979797977, "grad_norm": 69.1661148071289, "learning_rate": 7.575757575757576e-07, "loss": 9.6544, "step": 19 }, { "epoch": 0.09595959595959595, "grad_norm": 59.700984954833984, "learning_rate": 1.6666666666666667e-06, "loss": 8.2669, "step": 38 }, { "epoch": 0.14393939393939395, "grad_norm": 22.565589904785156, "learning_rate": 2.6262626262626263e-06, "loss": 6.1057, "step": 57 }, { "epoch": 0.1919191919191919, "grad_norm": 12.696467399597168, "learning_rate": 3.5858585858585863e-06, "loss": 4.9529, "step": 76 }, { "epoch": 0.2398989898989899, "grad_norm": 11.562800407409668, "learning_rate": 4.5454545454545455e-06, "loss": 4.3771, "step": 95 }, { "epoch": 0.2878787878787879, "grad_norm": 7.8888840675354, "learning_rate": 5.5050505050505056e-06, "loss": 4.027, "step": 114 }, { "epoch": 0.33585858585858586, "grad_norm": 11.868915557861328, "learning_rate": 6.464646464646465e-06, "loss": 3.3608, "step": 133 }, { "epoch": 0.3838383838383838, "grad_norm": 6.647916793823242, "learning_rate": 7.424242424242425e-06, "loss": 3.3761, "step": 152 }, { "epoch": 0.4318181818181818, "grad_norm": 5.503500938415527, "learning_rate": 8.383838383838384e-06, "loss": 3.3234, "step": 171 }, { "epoch": 0.4797979797979798, "grad_norm": 7.909439563751221, "learning_rate": 9.343434343434344e-06, "loss": 2.9671, "step": 190 }, { "epoch": 0.5277777777777778, "grad_norm": 6.679018020629883, "learning_rate": 1.0303030303030304e-05, "loss": 2.9742, "step": 209 }, { "epoch": 0.5757575757575758, "grad_norm": 7.1399664878845215, "learning_rate": 1.1262626262626264e-05, "loss": 2.8917, "step": 228 }, { "epoch": 0.6237373737373737, "grad_norm": 6.230110168457031, "learning_rate": 1.2222222222222222e-05, "loss": 2.9786, "step": 247 }, { "epoch": 0.6717171717171717, "grad_norm": 5.975085735321045, "learning_rate": 1.318181818181818e-05, "loss": 2.5503, "step": 266 }, { "epoch": 0.7196969696969697, "grad_norm": 6.412594318389893, "learning_rate": 1.4141414141414141e-05, "loss": 2.6222, "step": 285 }, { "epoch": 0.7676767676767676, "grad_norm": 5.266932487487793, "learning_rate": 1.5101010101010103e-05, "loss": 2.4747, "step": 304 }, { "epoch": 0.8156565656565656, "grad_norm": 6.42549991607666, "learning_rate": 1.606060606060606e-05, "loss": 2.4522, "step": 323 }, { "epoch": 0.8636363636363636, "grad_norm": 5.953466892242432, "learning_rate": 1.7020202020202023e-05, "loss": 2.4671, "step": 342 }, { "epoch": 0.9116161616161617, "grad_norm": 5.912783145904541, "learning_rate": 1.797979797979798e-05, "loss": 2.1904, "step": 361 }, { "epoch": 0.9595959595959596, "grad_norm": 6.379250526428223, "learning_rate": 1.893939393939394e-05, "loss": 2.2119, "step": 380 }, { "epoch": 1.0, "eval_gen_len": 433.9545, "eval_loss": 1.9540973901748657, "eval_rouge1": 61.8945, "eval_rouge2": 45.0914, "eval_rougeL": 55.0215, "eval_rougeLsum": 60.5887, "eval_runtime": 309.9067, "eval_samples_per_second": 0.639, "eval_steps_per_second": 0.161, "step": 396 }, { "epoch": 1.0075757575757576, "grad_norm": 5.456070899963379, "learning_rate": 1.98989898989899e-05, "loss": 2.1641, "step": 399 }, { "epoch": 1.0555555555555556, "grad_norm": 6.323323726654053, "learning_rate": 2.085858585858586e-05, "loss": 1.8417, "step": 418 }, { "epoch": 1.1035353535353536, "grad_norm": 5.177259922027588, "learning_rate": 2.1818181818181818e-05, "loss": 2.1193, "step": 437 }, { "epoch": 1.1515151515151516, "grad_norm": 4.9402689933776855, "learning_rate": 2.277777777777778e-05, "loss": 2.0342, "step": 456 }, { "epoch": 1.1994949494949494, "grad_norm": 5.041110515594482, "learning_rate": 2.3737373737373738e-05, "loss": 1.9017, "step": 475 }, { "epoch": 1.2474747474747474, "grad_norm": 5.8411946296691895, "learning_rate": 2.46969696969697e-05, "loss": 1.9286, "step": 494 }, { "epoch": 1.2954545454545454, "grad_norm": 4.762038707733154, "learning_rate": 2.5656565656565658e-05, "loss": 1.9661, "step": 513 }, { "epoch": 1.3434343434343434, "grad_norm": 5.631948947906494, "learning_rate": 2.6616161616161616e-05, "loss": 1.6384, "step": 532 }, { "epoch": 1.3914141414141414, "grad_norm": 8.481356620788574, "learning_rate": 2.7575757575757578e-05, "loss": 1.628, "step": 551 }, { "epoch": 1.4393939393939394, "grad_norm": 5.706149101257324, "learning_rate": 2.8535353535353536e-05, "loss": 1.7007, "step": 570 }, { "epoch": 1.4873737373737375, "grad_norm": 5.414116382598877, "learning_rate": 2.9494949494949498e-05, "loss": 1.5938, "step": 589 }, { "epoch": 1.5353535353535355, "grad_norm": 4.6547441482543945, "learning_rate": 3.0454545454545456e-05, "loss": 1.6564, "step": 608 }, { "epoch": 1.5833333333333335, "grad_norm": 5.299274444580078, "learning_rate": 3.141414141414142e-05, "loss": 1.5271, "step": 627 }, { "epoch": 1.6313131313131313, "grad_norm": 5.234986305236816, "learning_rate": 3.237373737373737e-05, "loss": 1.4951, "step": 646 }, { "epoch": 1.6792929292929293, "grad_norm": 4.347626209259033, "learning_rate": 3.3333333333333335e-05, "loss": 1.2796, "step": 665 }, { "epoch": 1.7272727272727273, "grad_norm": 4.5906596183776855, "learning_rate": 3.429292929292929e-05, "loss": 1.3838, "step": 684 }, { "epoch": 1.7752525252525253, "grad_norm": 5.531446933746338, "learning_rate": 3.525252525252525e-05, "loss": 1.5017, "step": 703 }, { "epoch": 1.823232323232323, "grad_norm": 2.8456931114196777, "learning_rate": 3.621212121212122e-05, "loss": 1.3426, "step": 722 }, { "epoch": 1.871212121212121, "grad_norm": 4.428695201873779, "learning_rate": 3.7171717171717175e-05, "loss": 1.4338, "step": 741 }, { "epoch": 1.9191919191919191, "grad_norm": 5.436211585998535, "learning_rate": 3.8131313131313133e-05, "loss": 1.4004, "step": 760 }, { "epoch": 1.9671717171717171, "grad_norm": 5.241793632507324, "learning_rate": 3.909090909090909e-05, "loss": 1.2884, "step": 779 }, { "epoch": 2.0, "eval_gen_len": 452.298, "eval_loss": 1.0236088037490845, "eval_rouge1": 73.1042, "eval_rouge2": 58.8087, "eval_rougeL": 67.5013, "eval_rougeLsum": 71.9232, "eval_runtime": 307.9407, "eval_samples_per_second": 0.643, "eval_steps_per_second": 0.162, "step": 792 }, { "epoch": 2.015151515151515, "grad_norm": 6.048191547393799, "learning_rate": 4.005050505050506e-05, "loss": 1.1874, "step": 798 }, { "epoch": 2.063131313131313, "grad_norm": 4.456089019775391, "learning_rate": 4.101010101010101e-05, "loss": 1.2008, "step": 817 }, { "epoch": 2.111111111111111, "grad_norm": 4.898771286010742, "learning_rate": 4.196969696969697e-05, "loss": 1.1192, "step": 836 }, { "epoch": 2.159090909090909, "grad_norm": 4.564539909362793, "learning_rate": 4.292929292929293e-05, "loss": 1.0235, "step": 855 }, { "epoch": 2.207070707070707, "grad_norm": 3.8853230476379395, "learning_rate": 4.388888888888889e-05, "loss": 1.0205, "step": 874 }, { "epoch": 2.255050505050505, "grad_norm": 5.523693084716797, "learning_rate": 4.484848484848485e-05, "loss": 1.1685, "step": 893 }, { "epoch": 2.303030303030303, "grad_norm": 4.635316848754883, "learning_rate": 4.5808080808080814e-05, "loss": 0.9885, "step": 912 }, { "epoch": 2.351010101010101, "grad_norm": 4.56376314163208, "learning_rate": 4.676767676767677e-05, "loss": 0.9086, "step": 931 }, { "epoch": 2.398989898989899, "grad_norm": 4.306213855743408, "learning_rate": 4.772727272727273e-05, "loss": 0.9958, "step": 950 }, { "epoch": 2.446969696969697, "grad_norm": 3.2135376930236816, "learning_rate": 4.868686868686869e-05, "loss": 0.8709, "step": 969 }, { "epoch": 2.494949494949495, "grad_norm": 6.685573101043701, "learning_rate": 4.964646464646465e-05, "loss": 0.931, "step": 988 }, { "epoch": 2.542929292929293, "grad_norm": 3.9406039714813232, "learning_rate": 4.993265993265993e-05, "loss": 0.8775, "step": 1007 }, { "epoch": 2.590909090909091, "grad_norm": 5.393746852874756, "learning_rate": 4.9826038159371496e-05, "loss": 0.8857, "step": 1026 }, { "epoch": 2.638888888888889, "grad_norm": 5.132722854614258, "learning_rate": 4.971941638608306e-05, "loss": 0.8471, "step": 1045 }, { "epoch": 2.686868686868687, "grad_norm": 4.663482189178467, "learning_rate": 4.9612794612794616e-05, "loss": 0.7213, "step": 1064 }, { "epoch": 2.734848484848485, "grad_norm": 4.11750602722168, "learning_rate": 4.950617283950618e-05, "loss": 0.8061, "step": 1083 }, { "epoch": 2.782828282828283, "grad_norm": 4.041347026824951, "learning_rate": 4.9399551066217736e-05, "loss": 0.7509, "step": 1102 }, { "epoch": 2.830808080808081, "grad_norm": 4.211297988891602, "learning_rate": 4.92929292929293e-05, "loss": 0.6744, "step": 1121 }, { "epoch": 2.878787878787879, "grad_norm": 3.244640827178955, "learning_rate": 4.9186307519640855e-05, "loss": 0.7003, "step": 1140 }, { "epoch": 2.926767676767677, "grad_norm": 5.315489768981934, "learning_rate": 4.907968574635242e-05, "loss": 0.7851, "step": 1159 }, { "epoch": 2.974747474747475, "grad_norm": 4.3310651779174805, "learning_rate": 4.8973063973063975e-05, "loss": 0.6508, "step": 1178 }, { "epoch": 3.0, "eval_gen_len": 446.7525, "eval_loss": 0.4764357805252075, "eval_rouge1": 81.6804, "eval_rouge2": 72.1296, "eval_rougeL": 77.6939, "eval_rougeLsum": 80.7121, "eval_runtime": 314.0103, "eval_samples_per_second": 0.631, "eval_steps_per_second": 0.159, "step": 1188 }, { "epoch": 3.022727272727273, "grad_norm": 4.33705997467041, "learning_rate": 4.886644219977554e-05, "loss": 0.6099, "step": 1197 }, { "epoch": 3.0707070707070705, "grad_norm": 3.1699767112731934, "learning_rate": 4.8759820426487094e-05, "loss": 0.5929, "step": 1216 }, { "epoch": 3.1186868686868685, "grad_norm": 4.443921089172363, "learning_rate": 4.865319865319866e-05, "loss": 0.5826, "step": 1235 }, { "epoch": 3.1666666666666665, "grad_norm": 2.6721291542053223, "learning_rate": 4.8546576879910214e-05, "loss": 0.5011, "step": 1254 }, { "epoch": 3.2146464646464645, "grad_norm": 3.757880687713623, "learning_rate": 4.843995510662178e-05, "loss": 0.5661, "step": 1273 }, { "epoch": 3.2626262626262625, "grad_norm": 3.813979387283325, "learning_rate": 4.8333333333333334e-05, "loss": 0.5751, "step": 1292 }, { "epoch": 3.3106060606060606, "grad_norm": 3.1598410606384277, "learning_rate": 4.82267115600449e-05, "loss": 0.4737, "step": 1311 }, { "epoch": 3.3585858585858586, "grad_norm": 3.785410165786743, "learning_rate": 4.812008978675645e-05, "loss": 0.517, "step": 1330 }, { "epoch": 3.4065656565656566, "grad_norm": 3.080495834350586, "learning_rate": 4.8013468013468016e-05, "loss": 0.5221, "step": 1349 }, { "epoch": 3.4545454545454546, "grad_norm": 3.909271001815796, "learning_rate": 4.790684624017957e-05, "loss": 0.4863, "step": 1368 }, { "epoch": 3.5025252525252526, "grad_norm": 4.788086891174316, "learning_rate": 4.7800224466891136e-05, "loss": 0.4692, "step": 1387 }, { "epoch": 3.5505050505050506, "grad_norm": 3.4247238636016846, "learning_rate": 4.769360269360269e-05, "loss": 0.4108, "step": 1406 }, { "epoch": 3.5984848484848486, "grad_norm": 4.0106587409973145, "learning_rate": 4.7586980920314256e-05, "loss": 0.4493, "step": 1425 }, { "epoch": 3.6464646464646466, "grad_norm": 3.134174108505249, "learning_rate": 4.748035914702582e-05, "loss": 0.4723, "step": 1444 }, { "epoch": 3.6944444444444446, "grad_norm": 3.621345043182373, "learning_rate": 4.7373737373737375e-05, "loss": 0.4259, "step": 1463 }, { "epoch": 3.742424242424242, "grad_norm": 3.506635904312134, "learning_rate": 4.726711560044894e-05, "loss": 0.4028, "step": 1482 }, { "epoch": 3.7904040404040407, "grad_norm": 3.2687313556671143, "learning_rate": 4.7160493827160495e-05, "loss": 0.3411, "step": 1501 }, { "epoch": 3.8383838383838382, "grad_norm": 2.7574453353881836, "learning_rate": 4.705387205387206e-05, "loss": 0.3987, "step": 1520 }, { "epoch": 3.8863636363636362, "grad_norm": 3.152305841445923, "learning_rate": 4.6947250280583614e-05, "loss": 0.3134, "step": 1539 }, { "epoch": 3.9343434343434343, "grad_norm": 3.0011281967163086, "learning_rate": 4.684062850729518e-05, "loss": 0.4049, "step": 1558 }, { "epoch": 3.9823232323232323, "grad_norm": 3.843477487564087, "learning_rate": 4.6734006734006734e-05, "loss": 0.3601, "step": 1577 }, { "epoch": 4.0, "eval_gen_len": 450.1111, "eval_loss": 0.200236514210701, "eval_rouge1": 88.8764, "eval_rouge2": 82.5347, "eval_rougeL": 86.0051, "eval_rougeLsum": 88.1546, "eval_runtime": 311.9983, "eval_samples_per_second": 0.635, "eval_steps_per_second": 0.16, "step": 1584 }, { "epoch": 4.03030303030303, "grad_norm": 3.5295767784118652, "learning_rate": 4.66273849607183e-05, "loss": 0.2928, "step": 1596 }, { "epoch": 4.078282828282829, "grad_norm": 2.5708324909210205, "learning_rate": 4.6520763187429854e-05, "loss": 0.3198, "step": 1615 }, { "epoch": 4.126262626262626, "grad_norm": 3.164304256439209, "learning_rate": 4.641414141414142e-05, "loss": 0.2895, "step": 1634 }, { "epoch": 4.174242424242424, "grad_norm": 2.5397143363952637, "learning_rate": 4.630751964085297e-05, "loss": 0.277, "step": 1653 }, { "epoch": 4.222222222222222, "grad_norm": 2.639850616455078, "learning_rate": 4.6200897867564537e-05, "loss": 0.2605, "step": 1672 }, { "epoch": 4.27020202020202, "grad_norm": 2.7750766277313232, "learning_rate": 4.609427609427609e-05, "loss": 0.2702, "step": 1691 }, { "epoch": 4.318181818181818, "grad_norm": 3.7487545013427734, "learning_rate": 4.5987654320987656e-05, "loss": 0.2904, "step": 1710 }, { "epoch": 4.366161616161616, "grad_norm": 2.997239351272583, "learning_rate": 4.588103254769921e-05, "loss": 0.2802, "step": 1729 }, { "epoch": 4.414141414141414, "grad_norm": 3.394998788833618, "learning_rate": 4.5774410774410776e-05, "loss": 0.2538, "step": 1748 }, { "epoch": 4.462121212121212, "grad_norm": 2.767073392868042, "learning_rate": 4.566778900112233e-05, "loss": 0.2537, "step": 1767 }, { "epoch": 4.51010101010101, "grad_norm": 2.4357552528381348, "learning_rate": 4.5561167227833895e-05, "loss": 0.2404, "step": 1786 }, { "epoch": 4.558080808080808, "grad_norm": 3.32743763923645, "learning_rate": 4.545454545454546e-05, "loss": 0.2273, "step": 1805 }, { "epoch": 4.606060606060606, "grad_norm": 4.112804889678955, "learning_rate": 4.5347923681257015e-05, "loss": 0.2373, "step": 1824 }, { "epoch": 4.654040404040404, "grad_norm": 2.256756067276001, "learning_rate": 4.524130190796858e-05, "loss": 0.2387, "step": 1843 }, { "epoch": 4.702020202020202, "grad_norm": 2.8960530757904053, "learning_rate": 4.5134680134680135e-05, "loss": 0.2507, "step": 1862 }, { "epoch": 4.75, "grad_norm": 2.621000289916992, "learning_rate": 4.50280583613917e-05, "loss": 0.1899, "step": 1881 }, { "epoch": 4.797979797979798, "grad_norm": 2.6264095306396484, "learning_rate": 4.4921436588103254e-05, "loss": 0.1921, "step": 1900 }, { "epoch": 4.845959595959596, "grad_norm": 2.770009994506836, "learning_rate": 4.481481481481482e-05, "loss": 0.1819, "step": 1919 }, { "epoch": 4.893939393939394, "grad_norm": 2.9698147773742676, "learning_rate": 4.4708193041526374e-05, "loss": 0.1877, "step": 1938 }, { "epoch": 4.941919191919192, "grad_norm": 2.6372764110565186, "learning_rate": 4.460157126823794e-05, "loss": 0.1848, "step": 1957 }, { "epoch": 4.98989898989899, "grad_norm": 3.164541006088257, "learning_rate": 4.4494949494949493e-05, "loss": 0.1663, "step": 1976 }, { "epoch": 5.0, "eval_gen_len": 456.2828, "eval_loss": 0.07814756035804749, "eval_rouge1": 94.3866, "eval_rouge2": 91.5001, "eval_rougeL": 93.1131, "eval_rougeLsum": 94.0657, "eval_runtime": 313.1525, "eval_samples_per_second": 0.632, "eval_steps_per_second": 0.16, "step": 1980 }, { "epoch": 5.037878787878788, "grad_norm": 1.722895622253418, "learning_rate": 4.438832772166106e-05, "loss": 0.1759, "step": 1995 }, { "epoch": 5.085858585858586, "grad_norm": 5.62981653213501, "learning_rate": 4.428170594837261e-05, "loss": 0.1636, "step": 2014 }, { "epoch": 5.133838383838384, "grad_norm": 2.8217477798461914, "learning_rate": 4.4175084175084176e-05, "loss": 0.1606, "step": 2033 }, { "epoch": 5.181818181818182, "grad_norm": 2.2173731327056885, "learning_rate": 4.406846240179573e-05, "loss": 0.1439, "step": 2052 }, { "epoch": 5.22979797979798, "grad_norm": 2.4998719692230225, "learning_rate": 4.3961840628507296e-05, "loss": 0.1621, "step": 2071 }, { "epoch": 5.277777777777778, "grad_norm": 2.240722894668579, "learning_rate": 4.385521885521885e-05, "loss": 0.1453, "step": 2090 }, { "epoch": 5.325757575757576, "grad_norm": 2.2540104389190674, "learning_rate": 4.3748597081930415e-05, "loss": 0.1277, "step": 2109 }, { "epoch": 5.373737373737374, "grad_norm": 2.4170944690704346, "learning_rate": 4.364197530864197e-05, "loss": 0.1447, "step": 2128 }, { "epoch": 5.421717171717171, "grad_norm": 2.999178409576416, "learning_rate": 4.3535353535353535e-05, "loss": 0.1436, "step": 2147 }, { "epoch": 5.46969696969697, "grad_norm": 1.2877140045166016, "learning_rate": 4.34287317620651e-05, "loss": 0.1567, "step": 2166 }, { "epoch": 5.517676767676767, "grad_norm": 2.6238033771514893, "learning_rate": 4.332210998877666e-05, "loss": 0.1196, "step": 2185 }, { "epoch": 5.565656565656566, "grad_norm": 3.2489261627197266, "learning_rate": 4.321548821548822e-05, "loss": 0.1403, "step": 2204 }, { "epoch": 5.613636363636363, "grad_norm": 2.750288963317871, "learning_rate": 4.310886644219978e-05, "loss": 0.1422, "step": 2223 }, { "epoch": 5.661616161616162, "grad_norm": 2.125354290008545, "learning_rate": 4.300224466891134e-05, "loss": 0.1209, "step": 2242 }, { "epoch": 5.709595959595959, "grad_norm": 2.956669330596924, "learning_rate": 4.28956228956229e-05, "loss": 0.1261, "step": 2261 }, { "epoch": 5.757575757575758, "grad_norm": 2.4984519481658936, "learning_rate": 4.278900112233446e-05, "loss": 0.1688, "step": 2280 }, { "epoch": 5.805555555555555, "grad_norm": 1.8321527242660522, "learning_rate": 4.268237934904602e-05, "loss": 0.1242, "step": 2299 }, { "epoch": 5.853535353535354, "grad_norm": 2.03973650932312, "learning_rate": 4.257575757575758e-05, "loss": 0.1083, "step": 2318 }, { "epoch": 5.901515151515151, "grad_norm": 3.7561116218566895, "learning_rate": 4.246913580246914e-05, "loss": 0.1553, "step": 2337 }, { "epoch": 5.94949494949495, "grad_norm": 4.251192092895508, "learning_rate": 4.23625140291807e-05, "loss": 0.1083, "step": 2356 }, { "epoch": 5.997474747474747, "grad_norm": 1.9990973472595215, "learning_rate": 4.225589225589226e-05, "loss": 0.1058, "step": 2375 }, { "epoch": 6.0, "eval_gen_len": 457.202, "eval_loss": 0.04152845963835716, "eval_rouge1": 96.8862, "eval_rouge2": 95.3951, "eval_rougeL": 96.2693, "eval_rougeLsum": 96.7116, "eval_runtime": 311.7737, "eval_samples_per_second": 0.635, "eval_steps_per_second": 0.16, "step": 2376 }, { "epoch": 6.045454545454546, "grad_norm": 0.8384280204772949, "learning_rate": 4.214927048260382e-05, "loss": 0.108, "step": 2394 }, { "epoch": 6.093434343434343, "grad_norm": 2.411668539047241, "learning_rate": 4.204264870931538e-05, "loss": 0.1004, "step": 2413 }, { "epoch": 6.141414141414141, "grad_norm": 1.1796258687973022, "learning_rate": 4.193602693602694e-05, "loss": 0.1045, "step": 2432 }, { "epoch": 6.1893939393939394, "grad_norm": 1.7287437915802002, "learning_rate": 4.18294051627385e-05, "loss": 0.0994, "step": 2451 }, { "epoch": 6.237373737373737, "grad_norm": 1.7795087099075317, "learning_rate": 4.172278338945006e-05, "loss": 0.0972, "step": 2470 }, { "epoch": 6.2853535353535355, "grad_norm": 1.167188048362732, "learning_rate": 4.161616161616162e-05, "loss": 0.0983, "step": 2489 }, { "epoch": 6.333333333333333, "grad_norm": 2.2238261699676514, "learning_rate": 4.150953984287318e-05, "loss": 0.0924, "step": 2508 }, { "epoch": 6.3813131313131315, "grad_norm": 2.0129451751708984, "learning_rate": 4.140291806958474e-05, "loss": 0.1008, "step": 2527 }, { "epoch": 6.429292929292929, "grad_norm": 2.159029006958008, "learning_rate": 4.12962962962963e-05, "loss": 0.0844, "step": 2546 }, { "epoch": 6.4772727272727275, "grad_norm": 1.6442041397094727, "learning_rate": 4.118967452300786e-05, "loss": 0.089, "step": 2565 }, { "epoch": 6.525252525252525, "grad_norm": 1.7517181634902954, "learning_rate": 4.108305274971942e-05, "loss": 0.0901, "step": 2584 }, { "epoch": 6.5732323232323235, "grad_norm": 1.2451421022415161, "learning_rate": 4.097643097643098e-05, "loss": 0.0792, "step": 2603 }, { "epoch": 6.621212121212121, "grad_norm": 1.7869890928268433, "learning_rate": 4.086980920314254e-05, "loss": 0.0915, "step": 2622 }, { "epoch": 6.66919191919192, "grad_norm": 2.2728748321533203, "learning_rate": 4.07631874298541e-05, "loss": 0.0844, "step": 2641 }, { "epoch": 6.717171717171717, "grad_norm": 1.853063702583313, "learning_rate": 4.065656565656566e-05, "loss": 0.0857, "step": 2660 }, { "epoch": 6.765151515151516, "grad_norm": 1.8357278108596802, "learning_rate": 4.0549943883277216e-05, "loss": 0.0785, "step": 2679 }, { "epoch": 6.813131313131313, "grad_norm": 2.4003820419311523, "learning_rate": 4.044332210998878e-05, "loss": 0.0928, "step": 2698 }, { "epoch": 6.861111111111111, "grad_norm": 3.2185213565826416, "learning_rate": 4.0336700336700336e-05, "loss": 0.0846, "step": 2717 }, { "epoch": 6.909090909090909, "grad_norm": 2.1494338512420654, "learning_rate": 4.02300785634119e-05, "loss": 0.0779, "step": 2736 }, { "epoch": 6.957070707070707, "grad_norm": 1.4656015634536743, "learning_rate": 4.012345679012346e-05, "loss": 0.0701, "step": 2755 }, { "epoch": 7.0, "eval_gen_len": 457.0758, "eval_loss": 0.023281894624233246, "eval_rouge1": 98.2551, "eval_rouge2": 97.419, "eval_rougeL": 97.9827, "eval_rougeLsum": 98.1479, "eval_runtime": 308.2542, "eval_samples_per_second": 0.642, "eval_steps_per_second": 0.162, "step": 2772 }, { "epoch": 7.005050505050505, "grad_norm": 2.010523796081543, "learning_rate": 4.001683501683502e-05, "loss": 0.0752, "step": 2774 }, { "epoch": 7.053030303030303, "grad_norm": 2.3323521614074707, "learning_rate": 3.991021324354658e-05, "loss": 0.0637, "step": 2793 }, { "epoch": 7.101010101010101, "grad_norm": 1.684549331665039, "learning_rate": 3.980359147025814e-05, "loss": 0.0711, "step": 2812 }, { "epoch": 7.148989898989899, "grad_norm": 1.6758151054382324, "learning_rate": 3.96969696969697e-05, "loss": 0.0709, "step": 2831 }, { "epoch": 7.196969696969697, "grad_norm": 1.9380394220352173, "learning_rate": 3.959034792368126e-05, "loss": 0.0665, "step": 2850 }, { "epoch": 7.244949494949495, "grad_norm": 1.988451361656189, "learning_rate": 3.948372615039282e-05, "loss": 0.0693, "step": 2869 }, { "epoch": 7.292929292929293, "grad_norm": 1.2491166591644287, "learning_rate": 3.937710437710438e-05, "loss": 0.0721, "step": 2888 }, { "epoch": 7.340909090909091, "grad_norm": 1.4535962343215942, "learning_rate": 3.927048260381594e-05, "loss": 0.0659, "step": 2907 }, { "epoch": 7.388888888888889, "grad_norm": 1.9495213031768799, "learning_rate": 3.91638608305275e-05, "loss": 0.085, "step": 2926 }, { "epoch": 7.436868686868687, "grad_norm": 2.000051259994507, "learning_rate": 3.905723905723906e-05, "loss": 0.0694, "step": 2945 }, { "epoch": 7.484848484848484, "grad_norm": 2.803647518157959, "learning_rate": 3.895061728395062e-05, "loss": 0.0713, "step": 2964 }, { "epoch": 7.532828282828283, "grad_norm": 1.7719061374664307, "learning_rate": 3.884399551066218e-05, "loss": 0.0626, "step": 2983 }, { "epoch": 7.58080808080808, "grad_norm": 2.1927216053009033, "learning_rate": 3.8737373737373737e-05, "loss": 0.0668, "step": 3002 }, { "epoch": 7.628787878787879, "grad_norm": 1.1310573816299438, "learning_rate": 3.86307519640853e-05, "loss": 0.0521, "step": 3021 }, { "epoch": 7.6767676767676765, "grad_norm": 1.4547724723815918, "learning_rate": 3.8524130190796856e-05, "loss": 0.0556, "step": 3040 }, { "epoch": 7.724747474747475, "grad_norm": 2.3777520656585693, "learning_rate": 3.841750841750842e-05, "loss": 0.053, "step": 3059 }, { "epoch": 7.7727272727272725, "grad_norm": 2.8660731315612793, "learning_rate": 3.8310886644219976e-05, "loss": 0.0662, "step": 3078 }, { "epoch": 7.820707070707071, "grad_norm": 1.2403335571289062, "learning_rate": 3.820426487093154e-05, "loss": 0.0513, "step": 3097 }, { "epoch": 7.8686868686868685, "grad_norm": 1.2479897737503052, "learning_rate": 3.8097643097643095e-05, "loss": 0.0529, "step": 3116 }, { "epoch": 7.916666666666667, "grad_norm": 0.8943136930465698, "learning_rate": 3.799102132435466e-05, "loss": 0.0493, "step": 3135 }, { "epoch": 7.9646464646464645, "grad_norm": 2.092480421066284, "learning_rate": 3.788439955106622e-05, "loss": 0.0613, "step": 3154 }, { "epoch": 8.0, "eval_gen_len": 456.2626, "eval_loss": 0.014666617847979069, "eval_rouge1": 98.8857, "eval_rouge2": 98.3456, "eval_rougeL": 98.7777, "eval_rougeLsum": 98.8775, "eval_runtime": 309.5854, "eval_samples_per_second": 0.64, "eval_steps_per_second": 0.162, "step": 3168 }, { "epoch": 8.012626262626263, "grad_norm": 0.7548523545265198, "learning_rate": 3.777777777777778e-05, "loss": 0.0606, "step": 3173 }, { "epoch": 8.06060606060606, "grad_norm": 1.4151593446731567, "learning_rate": 3.767115600448934e-05, "loss": 0.047, "step": 3192 }, { "epoch": 8.108585858585858, "grad_norm": 1.26515531539917, "learning_rate": 3.75645342312009e-05, "loss": 0.0535, "step": 3211 }, { "epoch": 8.156565656565657, "grad_norm": 1.318965196609497, "learning_rate": 3.745791245791246e-05, "loss": 0.0496, "step": 3230 }, { "epoch": 8.204545454545455, "grad_norm": 1.5114891529083252, "learning_rate": 3.735129068462402e-05, "loss": 0.0434, "step": 3249 }, { "epoch": 8.252525252525253, "grad_norm": 1.121440052986145, "learning_rate": 3.724466891133558e-05, "loss": 0.0447, "step": 3268 }, { "epoch": 8.30050505050505, "grad_norm": 0.9955199360847473, "learning_rate": 3.713804713804714e-05, "loss": 0.0478, "step": 3287 }, { "epoch": 8.348484848484848, "grad_norm": 2.078119993209839, "learning_rate": 3.70314253647587e-05, "loss": 0.0494, "step": 3306 }, { "epoch": 8.396464646464647, "grad_norm": 1.3387614488601685, "learning_rate": 3.692480359147026e-05, "loss": 0.0402, "step": 3325 }, { "epoch": 8.444444444444445, "grad_norm": 2.1103882789611816, "learning_rate": 3.681818181818182e-05, "loss": 0.0457, "step": 3344 }, { "epoch": 8.492424242424242, "grad_norm": 1.5628422498703003, "learning_rate": 3.6711560044893376e-05, "loss": 0.0462, "step": 3363 }, { "epoch": 8.54040404040404, "grad_norm": 1.7333801984786987, "learning_rate": 3.660493827160494e-05, "loss": 0.0466, "step": 3382 }, { "epoch": 8.58838383838384, "grad_norm": 1.3456194400787354, "learning_rate": 3.6498316498316496e-05, "loss": 0.0402, "step": 3401 }, { "epoch": 8.636363636363637, "grad_norm": 1.2034332752227783, "learning_rate": 3.639169472502806e-05, "loss": 0.0459, "step": 3420 }, { "epoch": 8.684343434343434, "grad_norm": 0.502601683139801, "learning_rate": 3.6285072951739616e-05, "loss": 0.0465, "step": 3439 }, { "epoch": 8.732323232323232, "grad_norm": 1.651060938835144, "learning_rate": 3.617845117845118e-05, "loss": 0.0401, "step": 3458 }, { "epoch": 8.780303030303031, "grad_norm": 1.1892279386520386, "learning_rate": 3.6071829405162735e-05, "loss": 0.0357, "step": 3477 }, { "epoch": 8.828282828282829, "grad_norm": 1.0248866081237793, "learning_rate": 3.59652076318743e-05, "loss": 0.0368, "step": 3496 }, { "epoch": 8.876262626262626, "grad_norm": 1.235990285873413, "learning_rate": 3.5858585858585855e-05, "loss": 0.0354, "step": 3515 }, { "epoch": 8.924242424242424, "grad_norm": 1.156433343887329, "learning_rate": 3.575196408529742e-05, "loss": 0.0366, "step": 3534 }, { "epoch": 8.972222222222221, "grad_norm": 1.7279999256134033, "learning_rate": 3.564534231200898e-05, "loss": 0.042, "step": 3553 }, { "epoch": 9.0, "eval_gen_len": 458.904, "eval_loss": 0.010045090690255165, "eval_rouge1": 99.2356, "eval_rouge2": 98.9142, "eval_rougeL": 99.1626, "eval_rougeLsum": 99.2057, "eval_runtime": 310.3859, "eval_samples_per_second": 0.638, "eval_steps_per_second": 0.161, "step": 3564 }, { "epoch": 9.02020202020202, "grad_norm": 1.298857569694519, "learning_rate": 3.553872053872054e-05, "loss": 0.0423, "step": 3572 }, { "epoch": 9.068181818181818, "grad_norm": 1.124570608139038, "learning_rate": 3.54320987654321e-05, "loss": 0.0309, "step": 3591 }, { "epoch": 9.116161616161616, "grad_norm": 1.5465493202209473, "learning_rate": 3.532547699214366e-05, "loss": 0.0365, "step": 3610 }, { "epoch": 9.164141414141413, "grad_norm": 1.208004117012024, "learning_rate": 3.521885521885522e-05, "loss": 0.035, "step": 3629 }, { "epoch": 9.212121212121213, "grad_norm": 1.7865431308746338, "learning_rate": 3.511223344556678e-05, "loss": 0.0376, "step": 3648 }, { "epoch": 9.26010101010101, "grad_norm": 0.8254820108413696, "learning_rate": 3.500561167227834e-05, "loss": 0.0327, "step": 3667 }, { "epoch": 9.308080808080808, "grad_norm": 1.0625301599502563, "learning_rate": 3.48989898989899e-05, "loss": 0.0363, "step": 3686 }, { "epoch": 9.356060606060606, "grad_norm": 1.4023059606552124, "learning_rate": 3.4792368125701466e-05, "loss": 0.0386, "step": 3705 }, { "epoch": 9.404040404040405, "grad_norm": 1.9438424110412598, "learning_rate": 3.468574635241302e-05, "loss": 0.0345, "step": 3724 }, { "epoch": 9.452020202020202, "grad_norm": 0.7543076872825623, "learning_rate": 3.4579124579124586e-05, "loss": 0.0369, "step": 3743 }, { "epoch": 9.5, "grad_norm": 1.4393041133880615, "learning_rate": 3.447250280583614e-05, "loss": 0.0327, "step": 3762 }, { "epoch": 9.547979797979798, "grad_norm": 0.8074661493301392, "learning_rate": 3.4365881032547706e-05, "loss": 0.0317, "step": 3781 }, { "epoch": 9.595959595959595, "grad_norm": 1.5561342239379883, "learning_rate": 3.425925925925926e-05, "loss": 0.037, "step": 3800 }, { "epoch": 9.643939393939394, "grad_norm": 2.0807337760925293, "learning_rate": 3.4152637485970825e-05, "loss": 0.0365, "step": 3819 }, { "epoch": 9.691919191919192, "grad_norm": 0.9361720681190491, "learning_rate": 3.404601571268238e-05, "loss": 0.0289, "step": 3838 }, { "epoch": 9.73989898989899, "grad_norm": 0.8103030323982239, "learning_rate": 3.3939393939393945e-05, "loss": 0.0294, "step": 3857 }, { "epoch": 9.787878787878787, "grad_norm": 1.4826786518096924, "learning_rate": 3.38327721661055e-05, "loss": 0.0274, "step": 3876 }, { "epoch": 9.835858585858587, "grad_norm": 1.5168986320495605, "learning_rate": 3.3726150392817064e-05, "loss": 0.0323, "step": 3895 }, { "epoch": 9.883838383838384, "grad_norm": 1.7513259649276733, "learning_rate": 3.361952861952862e-05, "loss": 0.0312, "step": 3914 }, { "epoch": 9.931818181818182, "grad_norm": 1.0549064874649048, "learning_rate": 3.3512906846240184e-05, "loss": 0.0294, "step": 3933 }, { "epoch": 9.97979797979798, "grad_norm": 1.206652045249939, "learning_rate": 3.340628507295174e-05, "loss": 0.0295, "step": 3952 }, { "epoch": 10.0, "eval_gen_len": 458.5909, "eval_loss": 0.007954709231853485, "eval_rouge1": 99.4349, "eval_rouge2": 99.1573, "eval_rougeL": 99.4066, "eval_rougeLsum": 99.4213, "eval_runtime": 308.7546, "eval_samples_per_second": 0.641, "eval_steps_per_second": 0.162, "step": 3960 } ], "logging_steps": 19, "max_steps": 9900, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "total_flos": 1017429232582656.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }