diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,4356 +1,2928 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 14.962593516209477, + "epoch": 9.975062344139651, "eval_steps": 500, - "global_step": 3000, + "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004987531172069825, - "grad_norm": 3.65625, - "learning_rate": 6.666666666666667e-07, - "loss": 2.7726, + "grad_norm": 4.53125, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7759, "step": 1 }, { "epoch": 0.02493765586034913, - "grad_norm": 1.828125, - "learning_rate": 3.3333333333333333e-06, - "loss": 2.8032, + "grad_norm": 8.125, + "learning_rate": 5e-06, + "loss": 2.8609, "step": 5 }, { "epoch": 0.04987531172069826, - "grad_norm": 1.625, - "learning_rate": 6.666666666666667e-06, - "loss": 2.8059, + "grad_norm": 3.984375, + "learning_rate": 1e-05, + "loss": 2.7873, "step": 10 }, { "epoch": 0.07481296758104738, - "grad_norm": 2.0625, - "learning_rate": 1e-05, - "loss": 2.8169, + "grad_norm": 9.625, + "learning_rate": 1.5e-05, + "loss": 2.7334, "step": 15 }, { "epoch": 0.09975062344139651, - "grad_norm": 1.8203125, - "learning_rate": 1.3333333333333333e-05, - "loss": 2.7666, + "grad_norm": 2.859375, + "learning_rate": 2e-05, + "loss": 2.5897, "step": 20 }, { "epoch": 0.12468827930174564, - "grad_norm": 1.6875, - "learning_rate": 1.6666666666666667e-05, - "loss": 2.7631, + "grad_norm": 2.0, + "learning_rate": 2.5e-05, + "loss": 2.423, "step": 25 }, { "epoch": 0.14962593516209477, - "grad_norm": 5.46875, - "learning_rate": 2e-05, - "loss": 2.6985, + "grad_norm": 5.6875, + "learning_rate": 3e-05, + "loss": 2.2591, "step": 30 }, { "epoch": 0.1745635910224439, - "grad_norm": 1.4296875, - "learning_rate": 2.3333333333333336e-05, - "loss": 2.6688, + "grad_norm": 2.34375, + "learning_rate": 3.5e-05, + "loss": 2.1123, "step": 35 }, { "epoch": 0.19950124688279303, - "grad_norm": 2.15625, - "learning_rate": 2.6666666666666667e-05, - "loss": 2.558, + "grad_norm": 4.03125, + "learning_rate": 4e-05, + "loss": 1.9826, "step": 40 }, { "epoch": 0.22443890274314215, - "grad_norm": 3.84375, - "learning_rate": 3e-05, - "loss": 2.4229, + "grad_norm": 3.015625, + "learning_rate": 4.5e-05, + "loss": 1.8715, "step": 45 }, { "epoch": 0.24937655860349128, - "grad_norm": 1.9375, - "learning_rate": 3.3333333333333335e-05, - "loss": 2.3437, + "grad_norm": 1.625, + "learning_rate": 5e-05, + "loss": 1.7412, "step": 50 }, { "epoch": 0.2743142144638404, - "grad_norm": 1.4140625, - "learning_rate": 3.6666666666666666e-05, - "loss": 2.237, + "grad_norm": 0.98046875, + "learning_rate": 5.500000000000001e-05, + "loss": 1.6251, "step": 55 }, { "epoch": 0.29925187032418954, - "grad_norm": 1.34375, - "learning_rate": 4e-05, - "loss": 2.1707, + "grad_norm": 3.3125, + "learning_rate": 6e-05, + "loss": 1.5376, "step": 60 }, { "epoch": 0.32418952618453867, - "grad_norm": 1.5625, - "learning_rate": 4.3333333333333334e-05, - "loss": 2.0941, + "grad_norm": 0.57421875, + "learning_rate": 6.500000000000001e-05, + "loss": 1.444, "step": 65 }, { "epoch": 0.3491271820448878, - "grad_norm": 13.125, - "learning_rate": 4.666666666666667e-05, - "loss": 1.9843, + "grad_norm": 0.7890625, + "learning_rate": 7e-05, + "loss": 1.3852, "step": 70 }, { "epoch": 0.3740648379052369, - "grad_norm": 0.86328125, - "learning_rate": 5e-05, - "loss": 1.8983, + "grad_norm": 0.4765625, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3384, "step": 75 }, { "epoch": 0.39900249376558605, - "grad_norm": 2.359375, - "learning_rate": 5.333333333333333e-05, - "loss": 1.8418, + "grad_norm": 0.46484375, + "learning_rate": 8e-05, + "loss": 1.2946, "step": 80 }, { "epoch": 0.4239401496259352, - "grad_norm": 1.09375, - "learning_rate": 5.666666666666667e-05, - "loss": 1.7522, + "grad_norm": 0.61328125, + "learning_rate": 8.5e-05, + "loss": 1.2742, "step": 85 }, { "epoch": 0.4488778054862843, - "grad_norm": 1.9375, - "learning_rate": 6e-05, - "loss": 1.7016, + "grad_norm": 0.431640625, + "learning_rate": 9e-05, + "loss": 1.2416, "step": 90 }, { "epoch": 0.47381546134663344, - "grad_norm": 0.67578125, - "learning_rate": 6.333333333333333e-05, - "loss": 1.6093, + "grad_norm": 0.345703125, + "learning_rate": 9.5e-05, + "loss": 1.2248, "step": 95 }, { "epoch": 0.49875311720698257, - "grad_norm": 0.66796875, - "learning_rate": 6.666666666666667e-05, - "loss": 1.5596, + "grad_norm": 0.55078125, + "learning_rate": 0.0001, + "loss": 1.1963, "step": 100 }, { "epoch": 0.5236907730673317, - "grad_norm": 0.53515625, - "learning_rate": 7e-05, - "loss": 1.5197, + "grad_norm": 0.48046875, + "learning_rate": 0.000105, + "loss": 1.1878, "step": 105 }, { "epoch": 0.5486284289276808, - "grad_norm": 0.486328125, - "learning_rate": 7.333333333333333e-05, - "loss": 1.4761, + "grad_norm": 0.5625, + "learning_rate": 0.00011000000000000002, + "loss": 1.1837, "step": 110 }, { "epoch": 0.57356608478803, - "grad_norm": 0.53125, - "learning_rate": 7.666666666666667e-05, - "loss": 1.4268, + "grad_norm": 0.51953125, + "learning_rate": 0.00011499999999999999, + "loss": 1.1665, "step": 115 }, { "epoch": 0.5985037406483791, - "grad_norm": 0.400390625, - "learning_rate": 8e-05, - "loss": 1.3845, + "grad_norm": 1.2109375, + "learning_rate": 0.00012, + "loss": 1.1605, "step": 120 }, { "epoch": 0.6234413965087282, - "grad_norm": 0.55859375, - "learning_rate": 8.333333333333334e-05, - "loss": 1.3622, + "grad_norm": 0.490234375, + "learning_rate": 0.000125, + "loss": 1.1303, "step": 125 }, { "epoch": 0.6483790523690773, - "grad_norm": 0.421875, - "learning_rate": 8.666666666666667e-05, - "loss": 1.3215, + "grad_norm": 0.62890625, + "learning_rate": 0.00013000000000000002, + "loss": 1.1453, "step": 130 }, { "epoch": 0.6733167082294265, - "grad_norm": 0.64453125, - "learning_rate": 9e-05, - "loss": 1.2959, + "grad_norm": 0.53515625, + "learning_rate": 0.00013500000000000003, + "loss": 1.138, "step": 135 }, { "epoch": 0.6982543640897756, - "grad_norm": 0.470703125, - "learning_rate": 9.333333333333334e-05, - "loss": 1.2876, + "grad_norm": 0.431640625, + "learning_rate": 0.00014, + "loss": 1.1157, "step": 140 }, { "epoch": 0.7231920199501247, - "grad_norm": 0.396484375, - "learning_rate": 9.666666666666667e-05, - "loss": 1.28, + "grad_norm": 0.98046875, + "learning_rate": 0.000145, + "loss": 1.1255, "step": 145 }, { "epoch": 0.7481296758104738, - "grad_norm": 0.447265625, - "learning_rate": 0.0001, - "loss": 1.2628, + "grad_norm": 0.59375, + "learning_rate": 0.00015000000000000001, + "loss": 1.1147, "step": 150 }, { "epoch": 0.773067331670823, - "grad_norm": 0.83203125, - "learning_rate": 0.00010333333333333334, - "loss": 1.2393, + "grad_norm": 1.2109375, + "learning_rate": 0.000155, + "loss": 1.1138, "step": 155 }, { "epoch": 0.7980049875311721, - "grad_norm": 0.375, - "learning_rate": 0.00010666666666666667, - "loss": 1.2335, + "grad_norm": 0.9921875, + "learning_rate": 0.00016, + "loss": 1.102, "step": 160 }, { "epoch": 0.8229426433915212, - "grad_norm": 0.48828125, - "learning_rate": 0.00011000000000000002, - "loss": 1.2125, + "grad_norm": 0.66015625, + "learning_rate": 0.000165, + "loss": 1.0893, "step": 165 }, { "epoch": 0.8478802992518704, - "grad_norm": 0.69140625, - "learning_rate": 0.00011333333333333334, - "loss": 1.2182, + "grad_norm": 1.0859375, + "learning_rate": 0.00017, + "loss": 1.0974, "step": 170 }, { "epoch": 0.8728179551122195, - "grad_norm": 0.42578125, - "learning_rate": 0.00011666666666666668, - "loss": 1.1958, + "grad_norm": 0.56640625, + "learning_rate": 0.000175, + "loss": 1.0929, "step": 175 }, { "epoch": 0.8977556109725686, - "grad_norm": 0.8359375, - "learning_rate": 0.00012, - "loss": 1.2011, + "grad_norm": 0.439453125, + "learning_rate": 0.00018, + "loss": 1.0945, "step": 180 }, { "epoch": 0.9226932668329177, - "grad_norm": 0.50390625, - "learning_rate": 0.00012333333333333334, - "loss": 1.1778, + "grad_norm": 0.515625, + "learning_rate": 0.00018500000000000002, + "loss": 1.086, "step": 185 }, { "epoch": 0.9476309226932669, - "grad_norm": 0.384765625, - "learning_rate": 0.00012666666666666666, - "loss": 1.1628, + "grad_norm": 0.3828125, + "learning_rate": 0.00019, + "loss": 1.0736, "step": 190 }, { "epoch": 0.972568578553616, - "grad_norm": 0.83203125, - "learning_rate": 0.00013000000000000002, - "loss": 1.1709, + "grad_norm": 0.494140625, + "learning_rate": 0.000195, + "loss": 1.0721, "step": 195 }, { "epoch": 0.9975062344139651, - "grad_norm": 0.7734375, - "learning_rate": 0.00013333333333333334, - "loss": 1.163, + "grad_norm": 0.734375, + "learning_rate": 0.0002, + "loss": 1.0668, "step": 200 }, { "epoch": 0.9975062344139651, - "eval_loss": 2.5127053260803223, - "eval_runtime": 0.5084, - "eval_samples_per_second": 19.67, - "eval_steps_per_second": 1.967, + "eval_loss": 2.471259355545044, + "eval_runtime": 0.9509, + "eval_samples_per_second": 10.517, + "eval_steps_per_second": 1.052, "step": 200 }, { "epoch": 1.0224438902743143, - "grad_norm": 0.5390625, - "learning_rate": 0.00013666666666666666, - "loss": 1.1675, + "grad_norm": 0.921875, + "learning_rate": 0.00019999619230641713, + "loss": 1.041, "step": 205 }, { "epoch": 1.0473815461346634, - "grad_norm": 0.43359375, - "learning_rate": 0.00014, - "loss": 1.1397, + "grad_norm": 0.4296875, + "learning_rate": 0.00019998476951563915, + "loss": 1.0476, "step": 210 }, { "epoch": 1.0723192019950125, - "grad_norm": 0.392578125, - "learning_rate": 0.00014333333333333334, - "loss": 1.143, + "grad_norm": 0.57421875, + "learning_rate": 0.00019996573249755572, + "loss": 1.0466, "step": 215 }, { "epoch": 1.0972568578553616, - "grad_norm": 0.443359375, - "learning_rate": 0.00014666666666666666, - "loss": 1.1466, + "grad_norm": 0.4765625, + "learning_rate": 0.0001999390827019096, + "loss": 1.0377, "step": 220 }, { "epoch": 1.1221945137157108, - "grad_norm": 0.5234375, - "learning_rate": 0.00015000000000000001, - "loss": 1.1303, + "grad_norm": 0.6015625, + "learning_rate": 0.0001999048221581858, + "loss": 1.04, "step": 225 }, { "epoch": 1.14713216957606, - "grad_norm": 0.83984375, - "learning_rate": 0.00015333333333333334, - "loss": 1.1195, + "grad_norm": 0.6953125, + "learning_rate": 0.0001998629534754574, + "loss": 1.0279, "step": 230 }, { "epoch": 1.172069825436409, - "grad_norm": 0.546875, - "learning_rate": 0.00015666666666666666, - "loss": 1.1156, + "grad_norm": 0.5859375, + "learning_rate": 0.0001998134798421867, + "loss": 1.0384, "step": 235 }, { "epoch": 1.1970074812967582, - "grad_norm": 0.458984375, - "learning_rate": 0.00016, - "loss": 1.1262, + "grad_norm": 0.46875, + "learning_rate": 0.00019975640502598244, + "loss": 1.0298, "step": 240 }, { "epoch": 1.2219451371571073, - "grad_norm": 0.484375, - "learning_rate": 0.00016333333333333334, - "loss": 1.1202, + "grad_norm": 0.4375, + "learning_rate": 0.0001996917333733128, + "loss": 1.0369, "step": 245 }, { "epoch": 1.2468827930174564, - "grad_norm": 0.4921875, - "learning_rate": 0.0001666666666666667, - "loss": 1.1114, + "grad_norm": 0.462890625, + "learning_rate": 0.00019961946980917456, + "loss": 1.0218, "step": 250 }, { "epoch": 1.2718204488778055, - "grad_norm": 0.55078125, - "learning_rate": 0.00017, - "loss": 1.1186, + "grad_norm": 0.494140625, + "learning_rate": 0.00019953961983671788, + "loss": 1.0195, "step": 255 }, { "epoch": 1.2967581047381547, - "grad_norm": 0.546875, - "learning_rate": 0.00017333333333333334, - "loss": 1.1142, + "grad_norm": 0.427734375, + "learning_rate": 0.00019945218953682734, + "loss": 1.0136, "step": 260 }, { "epoch": 1.3216957605985038, - "grad_norm": 0.546875, - "learning_rate": 0.00017666666666666666, - "loss": 1.1062, + "grad_norm": 0.5703125, + "learning_rate": 0.00019935718556765876, + "loss": 1.0284, "step": 265 }, { "epoch": 1.346633416458853, - "grad_norm": 0.466796875, - "learning_rate": 0.00018, - "loss": 1.0976, + "grad_norm": 0.40234375, + "learning_rate": 0.00019925461516413223, + "loss": 1.0308, "step": 270 }, { "epoch": 1.371571072319202, - "grad_norm": 0.56640625, - "learning_rate": 0.00018333333333333334, - "loss": 1.1058, + "grad_norm": 0.49609375, + "learning_rate": 0.00019914448613738106, + "loss": 1.0243, "step": 275 }, { "epoch": 1.3965087281795512, - "grad_norm": 0.5625, - "learning_rate": 0.0001866666666666667, - "loss": 1.0984, + "grad_norm": 0.71484375, + "learning_rate": 0.00019902680687415705, + "loss": 1.0262, "step": 280 }, { "epoch": 1.4214463840399003, - "grad_norm": 0.578125, - "learning_rate": 0.00019, - "loss": 1.0941, + "grad_norm": 0.490234375, + "learning_rate": 0.0001989015863361917, + "loss": 1.0164, "step": 285 }, { "epoch": 1.4463840399002494, - "grad_norm": 0.609375, - "learning_rate": 0.00019333333333333333, - "loss": 1.0903, + "grad_norm": 0.640625, + "learning_rate": 0.00019876883405951377, + "loss": 1.0182, "step": 290 }, { "epoch": 1.4713216957605986, - "grad_norm": 0.875, - "learning_rate": 0.00019666666666666666, - "loss": 1.0919, + "grad_norm": 0.396484375, + "learning_rate": 0.00019862856015372317, + "loss": 1.0073, "step": 295 }, { "epoch": 1.4962593516209477, - "grad_norm": 0.54296875, - "learning_rate": 0.0002, - "loss": 1.0744, + "grad_norm": 0.4921875, + "learning_rate": 0.00019848077530122083, + "loss": 1.0199, "step": 300 }, { "epoch": 1.5211970074812968, - "grad_norm": 0.6953125, - "learning_rate": 0.00019999830768577443, - "loss": 1.0881, + "grad_norm": 0.4453125, + "learning_rate": 0.0001983254907563955, + "loss": 0.9976, "step": 305 }, { "epoch": 1.546134663341646, - "grad_norm": 0.58984375, - "learning_rate": 0.00019999323080037624, - "loss": 1.0791, + "grad_norm": 0.60546875, + "learning_rate": 0.00019816271834476642, + "loss": 0.9939, "step": 310 }, { "epoch": 1.571072319201995, - "grad_norm": 0.53125, - "learning_rate": 0.00019998476951563915, - "loss": 1.0633, + "grad_norm": 0.421875, + "learning_rate": 0.00019799247046208297, + "loss": 1.0139, "step": 315 }, { "epoch": 1.5960099750623442, - "grad_norm": 0.76953125, - "learning_rate": 0.00019997292411794618, - "loss": 1.0775, + "grad_norm": 0.42578125, + "learning_rate": 0.00019781476007338058, + "loss": 1.0017, "step": 320 }, { "epoch": 1.6209476309226933, - "grad_norm": 0.7578125, - "learning_rate": 0.0001999576950082201, - "loss": 1.0701, + "grad_norm": 0.47265625, + "learning_rate": 0.00019762960071199333, + "loss": 0.9987, "step": 325 }, { "epoch": 1.6458852867830425, - "grad_norm": 0.52734375, - "learning_rate": 0.0001999390827019096, - "loss": 1.0794, + "grad_norm": 0.41796875, + "learning_rate": 0.00019743700647852354, + "loss": 0.9893, "step": 330 }, { "epoch": 1.6708229426433916, - "grad_norm": 0.60546875, - "learning_rate": 0.00019991708782897213, - "loss": 1.0662, + "grad_norm": 0.443359375, + "learning_rate": 0.00019723699203976766, + "loss": 0.9944, "step": 335 }, { "epoch": 1.6957605985037407, - "grad_norm": 0.6015625, - "learning_rate": 0.0001998917111338525, - "loss": 1.0781, + "grad_norm": 0.353515625, + "learning_rate": 0.00019702957262759965, + "loss": 0.9848, "step": 340 }, { "epoch": 1.7206982543640899, - "grad_norm": 0.58984375, - "learning_rate": 0.0001998629534754574, - "loss": 1.0677, + "grad_norm": 0.546875, + "learning_rate": 0.0001968147640378108, + "loss": 0.9907, "step": 345 }, { "epoch": 1.745635910224439, - "grad_norm": 0.52734375, - "learning_rate": 0.00019983081582712685, - "loss": 1.072, + "grad_norm": 0.46484375, + "learning_rate": 0.00019659258262890683, + "loss": 0.9837, "step": 350 }, { "epoch": 1.770573566084788, - "grad_norm": 0.48828125, - "learning_rate": 0.00019979529927660074, - "loss": 1.0668, + "grad_norm": 0.48046875, + "learning_rate": 0.0001963630453208623, + "loss": 0.9871, "step": 355 }, { "epoch": 1.7955112219451372, - "grad_norm": 0.498046875, - "learning_rate": 0.00019975640502598244, - "loss": 1.0688, + "grad_norm": 0.71875, + "learning_rate": 0.0001961261695938319, + "loss": 0.9928, "step": 360 }, { "epoch": 1.8204488778054864, "grad_norm": 0.671875, - "learning_rate": 0.00019971413439169775, - "loss": 1.0561, + "learning_rate": 0.0001958819734868193, + "loss": 0.988, "step": 365 }, { "epoch": 1.8453865336658355, - "grad_norm": 0.66015625, - "learning_rate": 0.00019966848880445062, - "loss": 1.0585, + "grad_norm": 0.5, + "learning_rate": 0.00019563047559630357, + "loss": 0.9768, "step": 370 }, { "epoch": 1.8703241895261846, - "grad_norm": 0.5859375, - "learning_rate": 0.00019961946980917456, - "loss": 1.0642, + "grad_norm": 0.6875, + "learning_rate": 0.0001953716950748227, + "loss": 0.9947, "step": 375 }, { "epoch": 1.8952618453865338, - "grad_norm": 0.8671875, - "learning_rate": 0.00019956707906498044, - "loss": 1.0538, + "grad_norm": 0.6484375, + "learning_rate": 0.00019510565162951537, + "loss": 0.9951, "step": 380 }, { "epoch": 1.9201995012468829, - "grad_norm": 0.8671875, - "learning_rate": 0.00019951131834510032, - "loss": 1.0605, + "grad_norm": 0.5546875, + "learning_rate": 0.00019483236552061994, + "loss": 0.9709, "step": 385 }, { "epoch": 1.945137157107232, - "grad_norm": 0.5703125, - "learning_rate": 0.00019945218953682734, - "loss": 1.0631, + "grad_norm": 0.36328125, + "learning_rate": 0.0001945518575599317, + "loss": 0.9837, "step": 390 }, { "epoch": 1.9700748129675811, - "grad_norm": 0.48046875, - "learning_rate": 0.000199389694641452, - "loss": 1.0573, + "grad_norm": 0.38671875, + "learning_rate": 0.00019426414910921787, + "loss": 0.9895, "step": 395 }, { "epoch": 1.9950124688279303, - "grad_norm": 0.65234375, - "learning_rate": 0.00019932383577419432, - "loss": 1.0647, + "grad_norm": 0.421875, + "learning_rate": 0.00019396926207859084, + "loss": 0.9846, "step": 400 }, { "epoch": 2.0, - "eval_loss": 2.4642767906188965, - "eval_runtime": 0.528, - "eval_samples_per_second": 18.94, - "eval_steps_per_second": 1.894, + "eval_loss": 2.465005397796631, + "eval_runtime": 0.5313, + "eval_samples_per_second": 18.822, + "eval_steps_per_second": 1.882, "step": 401 }, { "epoch": 2.0199501246882794, - "grad_norm": 0.85546875, - "learning_rate": 0.00019925461516413223, - "loss": 1.0445, + "grad_norm": 0.79296875, + "learning_rate": 0.00019366721892483978, + "loss": 0.9501, "step": 405 }, { "epoch": 2.0448877805486285, - "grad_norm": 0.609375, - "learning_rate": 0.00019918203515412617, - "loss": 1.0286, + "grad_norm": 0.671875, + "learning_rate": 0.00019335804264972018, + "loss": 0.9451, "step": 410 }, { "epoch": 2.0698254364089776, - "grad_norm": 0.5234375, - "learning_rate": 0.00019910609820073986, - "loss": 1.0285, + "grad_norm": 0.59765625, + "learning_rate": 0.00019304175679820247, + "loss": 0.9343, "step": 415 }, { "epoch": 2.0947630922693268, - "grad_norm": 0.51171875, - "learning_rate": 0.00019902680687415705, - "loss": 1.0393, + "grad_norm": 0.412109375, + "learning_rate": 0.00019271838545667876, + "loss": 0.9283, "step": 420 }, { "epoch": 2.119700748129676, - "grad_norm": 0.4765625, - "learning_rate": 0.00019894416385809444, - "loss": 1.0295, + "grad_norm": 0.416015625, + "learning_rate": 0.0001923879532511287, + "loss": 0.9398, "step": 425 }, { "epoch": 2.144638403990025, - "grad_norm": 0.58203125, - "learning_rate": 0.00019885817194971117, - "loss": 1.0329, + "grad_norm": 0.337890625, + "learning_rate": 0.00019205048534524406, + "loss": 0.9454, "step": 430 }, { "epoch": 2.169576059850374, - "grad_norm": 0.79296875, - "learning_rate": 0.00019876883405951377, - "loss": 1.0332, + "grad_norm": 0.494140625, + "learning_rate": 0.0001917060074385124, + "loss": 0.9341, "step": 435 }, { "epoch": 2.1945137157107233, - "grad_norm": 0.60546875, - "learning_rate": 0.00019867615321125795, - "loss": 1.0357, + "grad_norm": 0.38671875, + "learning_rate": 0.0001913545457642601, + "loss": 0.9405, "step": 440 }, { "epoch": 2.2194513715710724, - "grad_norm": 0.53515625, - "learning_rate": 0.00019858013254184597, - "loss": 1.0275, + "grad_norm": 0.4296875, + "learning_rate": 0.00019099612708765434, + "loss": 0.9424, "step": 445 }, { "epoch": 2.2443890274314215, - "grad_norm": 0.6015625, - "learning_rate": 0.00019848077530122083, - "loss": 1.0239, + "grad_norm": 0.451171875, + "learning_rate": 0.000190630778703665, + "loss": 0.9329, "step": 450 }, { "epoch": 2.2693266832917707, - "grad_norm": 0.6015625, - "learning_rate": 0.0001983780848522559, - "loss": 1.0292, + "grad_norm": 0.41796875, + "learning_rate": 0.00019025852843498607, + "loss": 0.9377, "step": 455 }, { "epoch": 2.29426433915212, - "grad_norm": 0.58984375, - "learning_rate": 0.00019827206467064133, - "loss": 1.0222, + "grad_norm": 0.5, + "learning_rate": 0.0001898794046299167, + "loss": 0.9409, "step": 460 }, { "epoch": 2.319201995012469, - "grad_norm": 0.51953125, - "learning_rate": 0.00019816271834476642, - "loss": 1.0256, + "grad_norm": 0.46484375, + "learning_rate": 0.00018949343616020252, + "loss": 0.9473, "step": 465 }, { "epoch": 2.344139650872818, - "grad_norm": 0.64453125, - "learning_rate": 0.00019805004957559793, - "loss": 1.0329, + "grad_norm": 0.38671875, + "learning_rate": 0.0001891006524188368, + "loss": 0.9384, "step": 470 }, { "epoch": 2.369077306733167, - "grad_norm": 0.6484375, - "learning_rate": 0.00019793406217655517, - "loss": 1.0253, + "grad_norm": 0.40625, + "learning_rate": 0.00018870108331782217, + "loss": 0.9471, "step": 475 }, { "epoch": 2.3940149625935163, - "grad_norm": 0.67578125, - "learning_rate": 0.00019781476007338058, - "loss": 1.0264, + "grad_norm": 0.392578125, + "learning_rate": 0.00018829475928589271, + "loss": 0.9393, "step": 480 }, { "epoch": 2.4189526184538654, - "grad_norm": 0.64453125, - "learning_rate": 0.00019769214730400712, - "loss": 1.0241, + "grad_norm": 0.3671875, + "learning_rate": 0.00018788171126619653, + "loss": 0.931, "step": 485 }, { "epoch": 2.4438902743142146, - "grad_norm": 0.890625, - "learning_rate": 0.00019756622801842143, - "loss": 1.0198, + "grad_norm": 0.408203125, + "learning_rate": 0.00018746197071393958, + "loss": 0.9309, "step": 490 }, { "epoch": 2.4688279301745637, - "grad_norm": 0.51953125, - "learning_rate": 0.00019743700647852354, - "loss": 1.0253, + "grad_norm": 0.361328125, + "learning_rate": 0.00018703556959398998, + "loss": 0.9375, "step": 495 }, { "epoch": 2.493765586034913, - "grad_norm": 0.515625, - "learning_rate": 0.00019730448705798239, - "loss": 1.0231, + "grad_norm": 0.3984375, + "learning_rate": 0.00018660254037844388, + "loss": 0.9384, "step": 500 }, { "epoch": 2.518703241895262, - "grad_norm": 0.4921875, - "learning_rate": 0.00019716867424208806, - "loss": 1.033, + "grad_norm": 0.37109375, + "learning_rate": 0.00018616291604415258, + "loss": 0.9365, "step": 505 }, { "epoch": 2.543640897755611, - "grad_norm": 0.5078125, - "learning_rate": 0.00019702957262759965, - "loss": 1.0258, + "grad_norm": 0.34765625, + "learning_rate": 0.00018571673007021123, + "loss": 0.9449, "step": 510 }, { "epoch": 2.56857855361596, - "grad_norm": 0.59765625, - "learning_rate": 0.00019688718692259006, - "loss": 1.003, + "grad_norm": 0.36328125, + "learning_rate": 0.00018526401643540922, + "loss": 0.9422, "step": 515 }, { "epoch": 2.5935162094763093, - "grad_norm": 0.60546875, - "learning_rate": 0.00019674152194628638, - "loss": 1.0126, + "grad_norm": 0.458984375, + "learning_rate": 0.0001848048096156426, + "loss": 0.9313, "step": 520 }, { "epoch": 2.6184538653366585, - "grad_norm": 0.58984375, - "learning_rate": 0.00019659258262890683, - "loss": 1.012, + "grad_norm": 0.431640625, + "learning_rate": 0.0001843391445812886, + "loss": 0.9279, "step": 525 }, { "epoch": 2.6433915211970076, - "grad_norm": 0.498046875, - "learning_rate": 0.0001964403740114939, - "loss": 1.0175, + "grad_norm": 0.3984375, + "learning_rate": 0.00018386705679454242, + "loss": 0.9393, "step": 530 }, { "epoch": 2.6683291770573567, - "grad_norm": 0.53125, - "learning_rate": 0.00019628490124574377, - "loss": 1.0117, + "grad_norm": 0.34765625, + "learning_rate": 0.00018338858220671682, + "loss": 0.942, "step": 535 }, { "epoch": 2.693266832917706, - "grad_norm": 0.5703125, - "learning_rate": 0.0001961261695938319, - "loss": 1.0187, + "grad_norm": 0.39453125, + "learning_rate": 0.00018290375725550417, + "loss": 0.9412, "step": 540 }, { "epoch": 2.718204488778055, - "grad_norm": 0.97265625, - "learning_rate": 0.00019596418442823494, - "loss": 1.0184, + "grad_norm": 0.345703125, + "learning_rate": 0.00018241261886220154, + "loss": 0.9455, "step": 545 }, { "epoch": 2.743142144638404, - "grad_norm": 0.671875, - "learning_rate": 0.0001957989512315489, - "loss": 1.0146, + "grad_norm": 0.47265625, + "learning_rate": 0.0001819152044288992, + "loss": 0.9385, "step": 550 }, { "epoch": 2.7680798004987532, - "grad_norm": 0.490234375, - "learning_rate": 0.00019563047559630357, - "loss": 1.0141, + "grad_norm": 0.392578125, + "learning_rate": 0.00018141155183563193, + "loss": 0.9438, "step": 555 }, { "epoch": 2.7930174563591024, - "grad_norm": 0.48046875, - "learning_rate": 0.0001954587632247732, - "loss": 1.0209, + "grad_norm": 0.3984375, + "learning_rate": 0.00018090169943749476, + "loss": 0.937, "step": 560 }, { "epoch": 2.8179551122194515, - "grad_norm": 0.51171875, - "learning_rate": 0.00019528381992878362, - "loss": 1.0165, + "grad_norm": 0.38671875, + "learning_rate": 0.00018038568606172173, + "loss": 0.9257, "step": 565 }, { "epoch": 2.8428927680798006, - "grad_norm": 0.470703125, - "learning_rate": 0.00019510565162951537, - "loss": 1.0161, + "grad_norm": 0.3828125, + "learning_rate": 0.00017986355100472928, + "loss": 0.9311, "step": 570 }, { "epoch": 2.8678304239401498, - "grad_norm": 0.90234375, - "learning_rate": 0.0001949242643573034, - "loss": 1.0158, + "grad_norm": 0.419921875, + "learning_rate": 0.00017933533402912354, + "loss": 0.9327, "step": 575 }, { "epoch": 2.892768079800499, - "grad_norm": 0.59765625, - "learning_rate": 0.00019473966425143292, - "loss": 1.0127, + "grad_norm": 0.400390625, + "learning_rate": 0.00017880107536067218, + "loss": 0.9202, "step": 580 }, { "epoch": 2.917705735660848, - "grad_norm": 0.65625, - "learning_rate": 0.0001945518575599317, - "loss": 1.0037, + "grad_norm": 0.455078125, + "learning_rate": 0.0001782608156852414, + "loss": 0.9277, "step": 585 }, { "epoch": 2.942643391521197, - "grad_norm": 0.5234375, - "learning_rate": 0.00019436085063935835, - "loss": 1.0068, + "grad_norm": 0.462890625, + "learning_rate": 0.0001777145961456971, + "loss": 0.9359, "step": 590 }, { "epoch": 2.9675810473815463, - "grad_norm": 0.86328125, - "learning_rate": 0.00019416664995458756, - "loss": 1.0061, + "grad_norm": 0.41796875, + "learning_rate": 0.00017716245833877201, + "loss": 0.9207, "step": 595 }, { "epoch": 2.9925187032418954, - "grad_norm": 0.46484375, - "learning_rate": 0.00019396926207859084, - "loss": 1.0051, + "grad_norm": 0.55078125, + "learning_rate": 0.0001766044443118978, + "loss": 0.941, "step": 600 }, { "epoch": 2.997506234413965, - "eval_loss": 2.4609696865081787, - "eval_runtime": 0.6338, - "eval_samples_per_second": 15.777, - "eval_steps_per_second": 1.578, + "eval_loss": 2.487478733062744, + "eval_runtime": 0.9468, + "eval_samples_per_second": 10.562, + "eval_steps_per_second": 1.056, "step": 601 }, { "epoch": 3.0174563591022445, - "grad_norm": 0.578125, - "learning_rate": 0.00019376869369221452, - "loss": 0.9992, + "grad_norm": 0.453125, + "learning_rate": 0.0001760405965600031, + "loss": 0.9043, "step": 605 }, { "epoch": 3.0423940149625937, - "grad_norm": 0.7109375, - "learning_rate": 0.00019356495158395315, - "loss": 0.9824, + "grad_norm": 0.400390625, + "learning_rate": 0.00017547095802227723, + "loss": 0.8768, "step": 610 }, { "epoch": 3.067331670822943, - "grad_norm": 0.75390625, - "learning_rate": 0.00019335804264972018, - "loss": 0.9911, + "grad_norm": 0.404296875, + "learning_rate": 0.00017489557207890023, + "loss": 0.8886, "step": 615 }, { "epoch": 3.092269326683292, - "grad_norm": 0.796875, - "learning_rate": 0.00019314797389261424, - "loss": 0.9921, + "grad_norm": 0.365234375, + "learning_rate": 0.00017431448254773944, + "loss": 0.8857, "step": 620 }, { "epoch": 3.117206982543641, - "grad_norm": 0.5859375, - "learning_rate": 0.00019293475242268223, - "loss": 0.9925, + "grad_norm": 0.416015625, + "learning_rate": 0.0001737277336810124, + "loss": 0.8933, "step": 625 }, { "epoch": 3.14214463840399, - "grad_norm": 0.55078125, - "learning_rate": 0.00019271838545667876, - "loss": 0.9767, + "grad_norm": 0.35546875, + "learning_rate": 0.00017313537016191706, + "loss": 0.8898, "step": 630 }, { "epoch": 3.1670822942643393, - "grad_norm": 0.66015625, - "learning_rate": 0.0001924988803178216, - "loss": 0.9821, + "grad_norm": 0.451171875, + "learning_rate": 0.00017253743710122875, + "loss": 0.8916, "step": 635 }, { "epoch": 3.1920199501246884, - "grad_norm": 0.6015625, - "learning_rate": 0.00019227624443554425, - "loss": 0.9839, + "grad_norm": 0.56640625, + "learning_rate": 0.0001719339800338651, + "loss": 0.882, "step": 640 }, { "epoch": 3.2169576059850375, - "grad_norm": 0.51953125, - "learning_rate": 0.00019205048534524406, - "loss": 0.9904, + "grad_norm": 0.470703125, + "learning_rate": 0.00017132504491541818, + "loss": 0.8975, "step": 645 }, { "epoch": 3.2418952618453867, - "grad_norm": 0.5703125, - "learning_rate": 0.00019182161068802741, - "loss": 0.987, + "grad_norm": 0.427734375, + "learning_rate": 0.00017071067811865476, + "loss": 0.8952, "step": 650 }, { "epoch": 3.266832917705736, - "grad_norm": 0.66015625, - "learning_rate": 0.00019158962821045112, - "loss": 0.9912, + "grad_norm": 0.384765625, + "learning_rate": 0.0001700909264299851, + "loss": 0.8788, "step": 655 }, { "epoch": 3.291770573566085, - "grad_norm": 0.58984375, - "learning_rate": 0.0001913545457642601, - "loss": 0.9848, + "grad_norm": 0.396484375, + "learning_rate": 0.00016946583704589973, + "loss": 0.8939, "step": 660 }, { "epoch": 3.316708229426434, - "grad_norm": 0.63671875, - "learning_rate": 0.0001911163713061217, - "loss": 0.9872, + "grad_norm": 0.439453125, + "learning_rate": 0.0001688354575693754, + "loss": 0.8954, "step": 665 }, { "epoch": 3.341645885286783, - "grad_norm": 0.7265625, - "learning_rate": 0.00019087511289735644, - "loss": 0.989, + "grad_norm": 0.38671875, + "learning_rate": 0.00016819983600624986, + "loss": 0.8839, "step": 670 }, { "epoch": 3.3665835411471323, - "grad_norm": 0.5625, - "learning_rate": 0.000190630778703665, - "loss": 0.9968, + "grad_norm": 0.392578125, + "learning_rate": 0.00016755902076156604, + "loss": 0.896, "step": 675 }, { "epoch": 3.3915211970074814, - "grad_norm": 0.5859375, - "learning_rate": 0.00019038337699485208, - "loss": 0.9839, + "grad_norm": 0.337890625, + "learning_rate": 0.00016691306063588583, + "loss": 0.8921, "step": 680 }, { "epoch": 3.4164588528678306, - "grad_norm": 0.58984375, - "learning_rate": 0.00019013291614454621, - "loss": 0.9919, + "grad_norm": 0.41015625, + "learning_rate": 0.00016626200482157378, + "loss": 0.8848, "step": 685 }, { "epoch": 3.4413965087281797, - "grad_norm": 0.498046875, - "learning_rate": 0.0001898794046299167, - "loss": 0.9905, + "grad_norm": 0.35546875, + "learning_rate": 0.00016560590289905073, + "loss": 0.8909, "step": 690 }, { "epoch": 3.466334164588529, - "grad_norm": 0.56640625, - "learning_rate": 0.00018962285103138636, - "loss": 0.9932, + "grad_norm": 0.3671875, + "learning_rate": 0.00016494480483301836, + "loss": 0.8981, "step": 695 }, { "epoch": 3.491271820448878, - "grad_norm": 0.51953125, - "learning_rate": 0.00018936326403234125, - "loss": 0.982, + "grad_norm": 0.341796875, + "learning_rate": 0.00016427876096865394, + "loss": 0.8871, "step": 700 }, { "epoch": 3.516209476309227, - "grad_norm": 0.71875, - "learning_rate": 0.0001891006524188368, - "loss": 0.9914, + "grad_norm": 0.380859375, + "learning_rate": 0.0001636078220277764, + "loss": 0.8843, "step": 705 }, { "epoch": 3.541147132169576, - "grad_norm": 0.5859375, - "learning_rate": 0.00018883502507930042, - "loss": 0.9809, + "grad_norm": 0.404296875, + "learning_rate": 0.00016293203910498376, + "loss": 0.8867, "step": 710 }, { "epoch": 3.5660847880299253, - "grad_norm": 0.55859375, - "learning_rate": 0.0001885663910042306, - "loss": 0.9818, + "grad_norm": 0.396484375, + "learning_rate": 0.00016225146366376198, + "loss": 0.889, "step": 715 }, { "epoch": 3.5910224438902745, - "grad_norm": 0.73046875, - "learning_rate": 0.00018829475928589271, - "loss": 0.9841, + "grad_norm": 0.380859375, + "learning_rate": 0.0001615661475325658, + "loss": 0.8902, "step": 720 }, { "epoch": 3.6159600997506236, - "grad_norm": 0.65625, - "learning_rate": 0.00018802013911801112, - "loss": 0.9796, + "grad_norm": 0.412109375, + "learning_rate": 0.00016087614290087208, + "loss": 0.8821, "step": 725 }, { "epoch": 3.6408977556109727, - "grad_norm": 0.609375, - "learning_rate": 0.0001877425397954582, - "loss": 0.9817, + "grad_norm": 0.37890625, + "learning_rate": 0.00016018150231520486, + "loss": 0.8948, "step": 730 }, { "epoch": 3.665835411471322, - "grad_norm": 0.62109375, - "learning_rate": 0.00018746197071393958, - "loss": 0.9848, + "grad_norm": 0.396484375, + "learning_rate": 0.00015948227867513415, + "loss": 0.8944, "step": 735 }, { "epoch": 3.690773067331671, - "grad_norm": 0.6796875, - "learning_rate": 0.00018717844136967624, - "loss": 0.9814, + "grad_norm": 0.37890625, + "learning_rate": 0.00015877852522924732, + "loss": 0.8931, "step": 740 }, { "epoch": 3.71571072319202, - "grad_norm": 0.609375, - "learning_rate": 0.00018689196135908304, - "loss": 0.9831, + "grad_norm": 0.38671875, + "learning_rate": 0.00015807029557109398, + "loss": 0.8817, "step": 745 }, { "epoch": 3.7406483790523692, - "grad_norm": 0.578125, - "learning_rate": 0.00018660254037844388, - "loss": 0.9829, + "grad_norm": 0.4296875, + "learning_rate": 0.0001573576436351046, + "loss": 0.8938, "step": 750 }, { "epoch": 3.765586034912718, - "grad_norm": 0.51953125, - "learning_rate": 0.00018631018822358363, - "loss": 0.9651, + "grad_norm": 0.345703125, + "learning_rate": 0.00015664062369248328, + "loss": 0.8953, "step": 755 }, { "epoch": 3.7905236907730675, - "grad_norm": 0.54296875, - "learning_rate": 0.00018601491478953657, - "loss": 0.9748, + "grad_norm": 0.37890625, + "learning_rate": 0.0001559192903470747, + "loss": 0.8887, "step": 760 }, { "epoch": 3.815461346633416, - "grad_norm": 0.51953125, - "learning_rate": 0.00018571673007021123, - "loss": 0.9896, + "grad_norm": 0.404296875, + "learning_rate": 0.0001551936985312058, + "loss": 0.8957, "step": 765 }, { "epoch": 3.8403990024937658, - "grad_norm": 0.56640625, - "learning_rate": 0.00018541564415805258, - "loss": 0.9782, + "grad_norm": 0.69140625, + "learning_rate": 0.00015446390350150273, + "loss": 0.9005, "step": 770 }, { "epoch": 3.8653366583541144, - "grad_norm": 0.5390625, - "learning_rate": 0.00018511166724369997, - "loss": 0.9824, + "grad_norm": 0.515625, + "learning_rate": 0.0001537299608346824, + "loss": 0.8937, "step": 775 }, { "epoch": 3.890274314214464, - "grad_norm": 0.5859375, - "learning_rate": 0.0001848048096156426, - "loss": 0.9884, + "grad_norm": 0.6484375, + "learning_rate": 0.0001529919264233205, + "loss": 0.8938, "step": 780 }, { "epoch": 3.9152119700748127, - "grad_norm": 0.609375, - "learning_rate": 0.00018449508165987105, - "loss": 0.9872, + "grad_norm": 0.44921875, + "learning_rate": 0.0001522498564715949, + "loss": 0.8859, "step": 785 }, { "epoch": 3.9401496259351623, - "grad_norm": 0.462890625, - "learning_rate": 0.00018418249385952575, - "loss": 0.9833, + "grad_norm": 0.34765625, + "learning_rate": 0.00015150380749100545, + "loss": 0.8847, "step": 790 }, { "epoch": 3.965087281795511, - "grad_norm": 0.5546875, - "learning_rate": 0.00018386705679454242, - "loss": 0.9718, + "grad_norm": 0.4609375, + "learning_rate": 0.00015075383629607042, + "loss": 0.89, "step": 795 }, { "epoch": 3.9900249376558605, - "grad_norm": 0.48828125, - "learning_rate": 0.00018354878114129367, - "loss": 0.9807, + "grad_norm": 0.353515625, + "learning_rate": 0.00015000000000000001, + "loss": 0.8872, "step": 800 }, { "epoch": 4.0, - "eval_loss": 2.4767043590545654, - "eval_runtime": 0.5377, - "eval_samples_per_second": 18.597, - "eval_steps_per_second": 1.86, + "eval_loss": 2.521277666091919, + "eval_runtime": 0.5369, + "eval_samples_per_second": 18.624, + "eval_steps_per_second": 1.862, "step": 802 }, { "epoch": 4.014962593516209, - "grad_norm": 0.64453125, - "learning_rate": 0.0001832276776722278, - "loss": 0.9718, + "grad_norm": 0.357421875, + "learning_rate": 0.00014924235601034672, + "loss": 0.8609, "step": 805 }, { "epoch": 4.039900249376559, - "grad_norm": 0.4609375, - "learning_rate": 0.00018290375725550417, - "loss": 0.9596, + "grad_norm": 0.3359375, + "learning_rate": 0.00014848096202463372, + "loss": 0.8593, "step": 810 }, { "epoch": 4.0648379052369075, - "grad_norm": 0.671875, - "learning_rate": 0.00018257703085462542, - "loss": 0.9552, + "grad_norm": 0.369140625, + "learning_rate": 0.00014771587602596084, + "loss": 0.8442, "step": 815 }, { "epoch": 4.089775561097257, - "grad_norm": 0.498046875, - "learning_rate": 0.00018224750952806624, - "loss": 0.9614, + "grad_norm": 0.37890625, + "learning_rate": 0.00014694715627858908, + "loss": 0.8414, "step": 820 }, { "epoch": 4.114713216957606, - "grad_norm": 0.546875, - "learning_rate": 0.0001819152044288992, - "loss": 0.9702, + "grad_norm": 0.390625, + "learning_rate": 0.00014617486132350343, + "loss": 0.8416, "step": 825 }, { "epoch": 4.139650872817955, - "grad_norm": 0.51171875, - "learning_rate": 0.00018158012680441723, - "loss": 0.9534, + "grad_norm": 0.3515625, + "learning_rate": 0.00014539904997395468, + "loss": 0.8449, "step": 830 }, { "epoch": 4.164588528678304, - "grad_norm": 0.48046875, - "learning_rate": 0.00018124228799575295, - "loss": 0.9643, + "grad_norm": 0.5546875, + "learning_rate": 0.00014461978131098088, + "loss": 0.8426, "step": 835 }, { "epoch": 4.1895261845386536, - "grad_norm": 0.494140625, - "learning_rate": 0.00018090169943749476, - "loss": 0.9464, + "grad_norm": 0.3984375, + "learning_rate": 0.00014383711467890774, + "loss": 0.8485, "step": 840 }, { "epoch": 4.214463840399002, - "grad_norm": 0.58984375, - "learning_rate": 0.00018055837265729994, - "loss": 0.9624, + "grad_norm": 0.39453125, + "learning_rate": 0.00014305110968082952, + "loss": 0.8504, "step": 845 }, { "epoch": 4.239401496259352, - "grad_norm": 0.49609375, - "learning_rate": 0.0001802123192755044, - "loss": 0.9606, + "grad_norm": 0.36328125, + "learning_rate": 0.00014226182617406996, + "loss": 0.8498, "step": 850 }, { "epoch": 4.2643391521197005, - "grad_norm": 0.64453125, - "learning_rate": 0.00017986355100472928, - "loss": 0.9548, + "grad_norm": 0.34375, + "learning_rate": 0.00014146932426562392, + "loss": 0.858, "step": 855 }, { "epoch": 4.28927680798005, - "grad_norm": 0.7109375, - "learning_rate": 0.0001795120796494848, - "loss": 0.9541, + "grad_norm": 0.38671875, + "learning_rate": 0.00014067366430758004, + "loss": 0.8497, "step": 860 }, { "epoch": 4.314214463840399, - "grad_norm": 0.498046875, - "learning_rate": 0.00017915791710577033, - "loss": 0.9537, + "grad_norm": 0.46875, + "learning_rate": 0.00013987490689252463, + "loss": 0.8493, "step": 865 }, { "epoch": 4.339152119700748, - "grad_norm": 0.51953125, - "learning_rate": 0.00017880107536067218, - "loss": 0.9691, + "grad_norm": 0.421875, + "learning_rate": 0.00013907311284892736, + "loss": 0.852, "step": 870 }, { "epoch": 4.364089775561097, - "grad_norm": 0.51953125, - "learning_rate": 0.00017844156649195759, - "loss": 0.9597, + "grad_norm": 0.38671875, + "learning_rate": 0.000138268343236509, + "loss": 0.8581, "step": 875 }, { "epoch": 4.389027431421447, - "grad_norm": 0.52734375, - "learning_rate": 0.00017807940266766593, - "loss": 0.9517, + "grad_norm": 0.388671875, + "learning_rate": 0.00013746065934159123, + "loss": 0.8518, "step": 880 }, { "epoch": 4.413965087281795, - "grad_norm": 0.46875, - "learning_rate": 0.0001777145961456971, - "loss": 0.9598, + "grad_norm": 0.375, + "learning_rate": 0.00013665012267242974, + "loss": 0.8556, "step": 885 }, { "epoch": 4.438902743142145, - "grad_norm": 0.6484375, - "learning_rate": 0.0001773471592733964, - "loss": 0.9617, + "grad_norm": 0.373046875, + "learning_rate": 0.00013583679495453, + "loss": 0.8618, "step": 890 }, { "epoch": 4.4638403990024935, - "grad_norm": 0.53125, - "learning_rate": 0.00017697710448713678, - "loss": 0.9513, + "grad_norm": 0.33203125, + "learning_rate": 0.00013502073812594675, + "loss": 0.8431, "step": 895 }, { "epoch": 4.488778054862843, - "grad_norm": 0.546875, - "learning_rate": 0.0001766044443118978, - "loss": 0.9637, + "grad_norm": 0.396484375, + "learning_rate": 0.00013420201433256689, + "loss": 0.8481, "step": 900 }, { "epoch": 4.513715710723192, - "grad_norm": 0.49609375, - "learning_rate": 0.00017622919136084183, - "loss": 0.9562, + "grad_norm": 0.4140625, + "learning_rate": 0.0001333806859233771, + "loss": 0.858, "step": 905 }, { "epoch": 4.538653366583541, - "grad_norm": 0.546875, - "learning_rate": 0.00017585135833488692, - "loss": 0.9632, + "grad_norm": 0.51171875, + "learning_rate": 0.00013255681544571568, + "loss": 0.8491, "step": 910 }, { "epoch": 4.56359102244389, - "grad_norm": 0.9453125, - "learning_rate": 0.00017547095802227723, - "loss": 0.9448, + "grad_norm": 0.3828125, + "learning_rate": 0.00013173046564050924, + "loss": 0.8536, "step": 915 }, { "epoch": 4.58852867830424, - "grad_norm": 0.53125, - "learning_rate": 0.00017508800329814995, - "loss": 0.963, + "grad_norm": 0.37109375, + "learning_rate": 0.00013090169943749476, + "loss": 0.8505, "step": 920 }, { "epoch": 4.613466334164588, - "grad_norm": 0.515625, - "learning_rate": 0.0001747025071240996, - "loss": 0.962, + "grad_norm": 0.365234375, + "learning_rate": 0.00013007057995042732, + "loss": 0.8447, "step": 925 }, { "epoch": 4.638403990024938, - "grad_norm": 0.478515625, - "learning_rate": 0.00017431448254773944, - "loss": 0.9627, + "grad_norm": 0.39453125, + "learning_rate": 0.00012923717047227368, + "loss": 0.8519, "step": 930 }, { "epoch": 4.6633416458852865, - "grad_norm": 0.51953125, - "learning_rate": 0.0001739239427022596, - "loss": 0.9574, + "grad_norm": 0.390625, + "learning_rate": 0.00012840153447039228, + "loss": 0.8503, "step": 935 }, { "epoch": 4.688279301745636, - "grad_norm": 0.4765625, - "learning_rate": 0.0001735309008059829, - "loss": 0.9675, + "grad_norm": 0.38671875, + "learning_rate": 0.0001275637355816999, + "loss": 0.8525, "step": 940 }, { "epoch": 4.713216957605985, - "grad_norm": 0.5078125, - "learning_rate": 0.00017313537016191706, - "loss": 0.9546, + "grad_norm": 0.396484375, + "learning_rate": 0.00012672383760782568, + "loss": 0.8422, "step": 945 }, { "epoch": 4.738154613466334, - "grad_norm": 0.63671875, - "learning_rate": 0.00017273736415730488, - "loss": 0.9472, + "grad_norm": 0.38671875, + "learning_rate": 0.00012588190451025207, + "loss": 0.8513, "step": 950 }, { "epoch": 4.763092269326683, - "grad_norm": 0.71484375, - "learning_rate": 0.0001723368962631708, - "loss": 0.9528, + "grad_norm": 0.373046875, + "learning_rate": 0.00012503800040544416, + "loss": 0.8574, "step": 955 }, { "epoch": 4.788029925187033, - "grad_norm": 0.48828125, - "learning_rate": 0.0001719339800338651, - "loss": 0.962, + "grad_norm": 0.34765625, + "learning_rate": 0.00012419218955996676, + "loss": 0.8386, "step": 960 }, { "epoch": 4.812967581047381, - "grad_norm": 0.625, - "learning_rate": 0.00017152862910660516, - "loss": 0.9602, + "grad_norm": 0.40234375, + "learning_rate": 0.00012334453638559057, + "loss": 0.8521, "step": 965 }, { "epoch": 4.837905236907731, - "grad_norm": 0.62890625, - "learning_rate": 0.00017112085720101373, - "loss": 0.9529, + "grad_norm": 0.4296875, + "learning_rate": 0.0001224951054343865, + "loss": 0.8469, "step": 970 }, { "epoch": 4.86284289276808, - "grad_norm": 0.51953125, - "learning_rate": 0.00017071067811865476, - "loss": 0.957, + "grad_norm": 0.365234375, + "learning_rate": 0.00012164396139381029, + "loss": 0.8506, "step": 975 }, { "epoch": 4.887780548628429, - "grad_norm": 0.5546875, - "learning_rate": 0.0001702981057425662, - "loss": 0.9597, + "grad_norm": 0.365234375, + "learning_rate": 0.00012079116908177593, + "loss": 0.8524, "step": 980 }, { "epoch": 4.912718204488778, - "grad_norm": 0.6328125, - "learning_rate": 0.00016988315403679, - "loss": 0.9653, + "grad_norm": 0.373046875, + "learning_rate": 0.00011993679344171973, + "loss": 0.846, "step": 985 }, { "epoch": 4.937655860349127, - "grad_norm": 0.51953125, - "learning_rate": 0.00016946583704589973, - "loss": 0.9486, + "grad_norm": 0.349609375, + "learning_rate": 0.00011908089953765449, + "loss": 0.8523, "step": 990 }, { "epoch": 4.962593516209476, - "grad_norm": 0.50390625, - "learning_rate": 0.00016904616889452497, - "loss": 0.9587, + "grad_norm": 0.376953125, + "learning_rate": 0.00011822355254921478, + "loss": 0.8515, "step": 995 }, { "epoch": 4.987531172069826, - "grad_norm": 0.4765625, - "learning_rate": 0.0001686241637868734, - "loss": 0.9508, + "grad_norm": 0.384765625, + "learning_rate": 0.00011736481776669306, + "loss": 0.8497, "step": 1000 }, { "epoch": 4.997506234413965, - "eval_loss": 2.4788339138031006, - "eval_runtime": 0.6532, - "eval_samples_per_second": 15.309, - "eval_steps_per_second": 1.531, + "eval_loss": 2.5718774795532227, + "eval_runtime": 0.8024, + "eval_samples_per_second": 12.462, + "eval_steps_per_second": 1.246, "step": 1002 }, { "epoch": 5.012468827930174, - "grad_norm": 0.45703125, - "learning_rate": 0.00016819983600624986, - "loss": 0.9493, + "grad_norm": 0.37109375, + "learning_rate": 0.00011650476058606777, + "loss": 0.8318, "step": 1005 }, { "epoch": 5.037406483790524, - "grad_norm": 0.5078125, - "learning_rate": 0.00016777319991457325, - "loss": 0.941, + "grad_norm": 0.380859375, + "learning_rate": 0.0001156434465040231, + "loss": 0.8014, "step": 1010 }, { "epoch": 5.062344139650873, - "grad_norm": 0.51953125, - "learning_rate": 0.00016734426995189004, - "loss": 0.9364, + "grad_norm": 0.37890625, + "learning_rate": 0.00011478094111296109, + "loss": 0.8068, "step": 1015 }, { "epoch": 5.087281795511222, - "grad_norm": 0.60546875, - "learning_rate": 0.00016691306063588583, - "loss": 0.9424, + "grad_norm": 0.353515625, + "learning_rate": 0.00011391731009600654, + "loss": 0.8065, "step": 1020 }, { "epoch": 5.112219451371571, - "grad_norm": 0.52734375, - "learning_rate": 0.00016647958656139378, - "loss": 0.9365, + "grad_norm": 0.408203125, + "learning_rate": 0.00011305261922200519, + "loss": 0.8112, "step": 1025 }, { "epoch": 5.13715710723192, - "grad_norm": 0.5, - "learning_rate": 0.00016604386239990078, - "loss": 0.9405, + "grad_norm": 0.384765625, + "learning_rate": 0.00011218693434051475, + "loss": 0.8188, "step": 1030 }, { "epoch": 5.162094763092269, - "grad_norm": 0.58203125, - "learning_rate": 0.00016560590289905073, - "loss": 0.9274, + "grad_norm": 0.392578125, + "learning_rate": 0.0001113203213767907, + "loss": 0.8073, "step": 1035 }, { "epoch": 5.187032418952619, - "grad_norm": 0.53515625, - "learning_rate": 0.00016516572288214552, - "loss": 0.9433, + "grad_norm": 0.376953125, + "learning_rate": 0.00011045284632676536, + "loss": 0.8124, "step": 1040 }, { "epoch": 5.211970074812967, - "grad_norm": 0.50390625, - "learning_rate": 0.00016472333724764325, - "loss": 0.9372, + "grad_norm": 0.39453125, + "learning_rate": 0.00010958457525202241, + "loss": 0.8194, "step": 1045 }, { "epoch": 5.236907730673317, - "grad_norm": 0.72265625, - "learning_rate": 0.00016427876096865394, - "loss": 0.948, + "grad_norm": 0.380859375, + "learning_rate": 0.00010871557427476583, + "loss": 0.805, "step": 1050 }, { "epoch": 5.261845386533666, - "grad_norm": 0.5, - "learning_rate": 0.00016383200909243285, - "loss": 0.9342, + "grad_norm": 0.359375, + "learning_rate": 0.0001078459095727845, + "loss": 0.8162, "step": 1055 }, { "epoch": 5.286783042394015, - "grad_norm": 0.546875, - "learning_rate": 0.00016338309673987101, - "loss": 0.944, + "grad_norm": 0.375, + "learning_rate": 0.00010697564737441252, + "loss": 0.8079, "step": 1060 }, { "epoch": 5.311720698254364, - "grad_norm": 0.51953125, - "learning_rate": 0.00016293203910498376, - "loss": 0.9438, + "grad_norm": 0.38671875, + "learning_rate": 0.00010610485395348571, + "loss": 0.8098, "step": 1065 }, { "epoch": 5.3366583541147135, - "grad_norm": 0.57421875, - "learning_rate": 0.000162478851454396, - "loss": 0.9319, + "grad_norm": 0.37890625, + "learning_rate": 0.0001052335956242944, + "loss": 0.8182, "step": 1070 }, { "epoch": 5.361596009975062, - "grad_norm": 0.65625, - "learning_rate": 0.000162023549126826, - "loss": 0.9411, + "grad_norm": 0.365234375, + "learning_rate": 0.00010436193873653361, + "loss": 0.8157, "step": 1075 }, { "epoch": 5.386533665835412, - "grad_norm": 0.57421875, - "learning_rate": 0.0001615661475325658, - "loss": 0.9316, + "grad_norm": 0.412109375, + "learning_rate": 0.00010348994967025012, + "loss": 0.8149, "step": 1080 }, { "epoch": 5.41147132169576, - "grad_norm": 0.73828125, - "learning_rate": 0.00016110666215295998, - "loss": 0.9386, + "grad_norm": 0.376953125, + "learning_rate": 0.00010261769483078733, + "loss": 0.8144, "step": 1085 }, { "epoch": 5.43640897755611, - "grad_norm": 0.75390625, - "learning_rate": 0.00016064510853988138, - "loss": 0.9429, + "grad_norm": 0.3671875, + "learning_rate": 0.00010174524064372837, + "loss": 0.8094, "step": 1090 }, { "epoch": 5.461346633416459, - "grad_norm": 0.55078125, - "learning_rate": 0.00016018150231520486, - "loss": 0.9378, + "grad_norm": 0.37109375, + "learning_rate": 0.0001008726535498374, + "loss": 0.8203, "step": 1095 }, { "epoch": 5.486284289276808, - "grad_norm": 0.458984375, - "learning_rate": 0.00015971585917027862, - "loss": 0.9416, + "grad_norm": 0.375, + "learning_rate": 0.0001, + "loss": 0.8134, "step": 1100 }, { "epoch": 5.511221945137157, - "grad_norm": 0.46484375, - "learning_rate": 0.00015924819486539307, - "loss": 0.9239, + "grad_norm": 0.3828125, + "learning_rate": 9.912734645016263e-05, + "loss": 0.8172, "step": 1105 }, { "epoch": 5.5361596009975065, - "grad_norm": 0.609375, - "learning_rate": 0.00015877852522924732, - "loss": 0.9264, + "grad_norm": 0.376953125, + "learning_rate": 9.825475935627165e-05, + "loss": 0.8193, "step": 1110 }, { "epoch": 5.561097256857855, - "grad_norm": 0.46875, - "learning_rate": 0.00015830686615841348, - "loss": 0.9382, + "grad_norm": 0.37890625, + "learning_rate": 9.73823051692127e-05, + "loss": 0.8154, "step": 1115 }, { "epoch": 5.586034912718205, - "grad_norm": 0.5, - "learning_rate": 0.00015783323361679864, - "loss": 0.9405, + "grad_norm": 0.51953125, + "learning_rate": 9.651005032974994e-05, + "loss": 0.8231, "step": 1120 }, { "epoch": 5.610972568578553, - "grad_norm": 0.56640625, - "learning_rate": 0.0001573576436351046, - "loss": 0.9466, + "grad_norm": 0.412109375, + "learning_rate": 9.563806126346642e-05, + "loss": 0.8181, "step": 1125 }, { "epoch": 5.635910224438903, - "grad_norm": 0.5703125, - "learning_rate": 0.00015688011231028518, - "loss": 0.9366, + "grad_norm": 0.337890625, + "learning_rate": 9.476640437570562e-05, + "loss": 0.8137, "step": 1130 }, { "epoch": 5.660847880299252, - "grad_norm": 0.56640625, - "learning_rate": 0.00015640065580500148, - "loss": 0.9419, + "grad_norm": 0.400390625, + "learning_rate": 9.38951460465143e-05, + "loss": 0.8228, "step": 1135 }, { "epoch": 5.685785536159601, - "grad_norm": 0.57421875, - "learning_rate": 0.0001559192903470747, - "loss": 0.9283, + "grad_norm": 0.421875, + "learning_rate": 9.302435262558747e-05, + "loss": 0.8143, "step": 1140 }, { "epoch": 5.71072319201995, - "grad_norm": 0.53125, - "learning_rate": 0.00015543603222893716, - "loss": 0.9384, + "grad_norm": 0.384765625, + "learning_rate": 9.215409042721552e-05, + "loss": 0.8166, "step": 1145 }, { "epoch": 5.7356608478802995, - "grad_norm": 0.474609375, - "learning_rate": 0.0001549508978070806, - "loss": 0.9373, + "grad_norm": 0.375, + "learning_rate": 9.128442572523417e-05, + "loss": 0.8238, "step": 1150 }, { "epoch": 5.760598503740648, - "grad_norm": 0.55078125, - "learning_rate": 0.00015446390350150273, - "loss": 0.9268, + "grad_norm": 0.392578125, + "learning_rate": 9.04154247479776e-05, + "loss": 0.8163, "step": 1155 }, { "epoch": 5.785536159600998, - "grad_norm": 0.46875, - "learning_rate": 0.0001539750657951513, - "loss": 0.9381, + "grad_norm": 0.3671875, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8254, "step": 1160 }, { "epoch": 5.8104738154613464, - "grad_norm": 0.490234375, - "learning_rate": 0.00015348440123336645, - "loss": 0.9321, + "grad_norm": 0.3984375, + "learning_rate": 8.867967862320934e-05, + "loss": 0.8141, "step": 1165 }, { "epoch": 5.835411471321696, - "grad_norm": 0.49609375, - "learning_rate": 0.0001529919264233205, - "loss": 0.9304, + "grad_norm": 0.380859375, + "learning_rate": 8.781306565948528e-05, + "loss": 0.8207, "step": 1170 }, { "epoch": 5.860349127182045, - "grad_norm": 0.59375, - "learning_rate": 0.000152497658033456, - "loss": 0.934, + "grad_norm": 0.36328125, + "learning_rate": 8.694738077799488e-05, + "loss": 0.8197, "step": 1175 }, { "epoch": 5.885286783042394, - "grad_norm": 0.5078125, - "learning_rate": 0.00015200161279292155, - "loss": 0.9493, + "grad_norm": 0.353515625, + "learning_rate": 8.608268990399349e-05, + "loss": 0.8162, "step": 1180 }, { "epoch": 5.910224438902743, - "grad_norm": 0.474609375, - "learning_rate": 0.00015150380749100545, - "loss": 0.9362, + "grad_norm": 0.3828125, + "learning_rate": 8.521905888703893e-05, + "loss": 0.8192, "step": 1185 }, { "epoch": 5.9351620947630925, - "grad_norm": 0.455078125, - "learning_rate": 0.00015100425897656753, - "loss": 0.9348, + "grad_norm": 0.388671875, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8216, "step": 1190 }, { "epoch": 5.960099750623441, - "grad_norm": 0.5078125, - "learning_rate": 0.000150502984157469, - "loss": 0.9406, + "grad_norm": 0.404296875, + "learning_rate": 8.349523941393224e-05, + "loss": 0.8127, "step": 1195 }, { "epoch": 5.985037406483791, - "grad_norm": 0.609375, - "learning_rate": 0.00015000000000000001, - "loss": 0.9256, + "grad_norm": 0.36328125, + "learning_rate": 8.263518223330697e-05, + "loss": 0.8085, "step": 1200 }, { "epoch": 6.0, - "eval_loss": 2.491248607635498, - "eval_runtime": 0.5413, - "eval_samples_per_second": 18.475, - "eval_steps_per_second": 1.848, + "eval_loss": 2.6190178394317627, + "eval_runtime": 0.5351, + "eval_samples_per_second": 18.688, + "eval_steps_per_second": 1.869, "step": 1203 }, { "epoch": 6.0099750623441395, - "grad_norm": 0.515625, - "learning_rate": 0.00014949532352830541, - "loss": 0.9255, + "grad_norm": 0.353515625, + "learning_rate": 8.177644745078526e-05, + "loss": 0.7925, "step": 1205 }, { "epoch": 6.034912718204489, - "grad_norm": 0.5546875, - "learning_rate": 0.0001489889718238087, - "loss": 0.9186, + "grad_norm": 0.5, + "learning_rate": 8.091910046234552e-05, + "loss": 0.7848, "step": 1210 }, { "epoch": 6.059850374064838, - "grad_norm": 0.6796875, - "learning_rate": 0.00014848096202463372, - "loss": 0.9201, + "grad_norm": 0.380859375, + "learning_rate": 8.00632065582803e-05, + "loss": 0.7776, "step": 1215 }, { "epoch": 6.084788029925187, - "grad_norm": 0.7578125, - "learning_rate": 0.00014797131132502465, - "loss": 0.9192, + "grad_norm": 0.392578125, + "learning_rate": 7.920883091822408e-05, + "loss": 0.7813, "step": 1220 }, { "epoch": 6.109725685785536, - "grad_norm": 0.5703125, - "learning_rate": 0.00014746003697476404, - "loss": 0.9178, + "grad_norm": 0.392578125, + "learning_rate": 7.835603860618972e-05, + "loss": 0.7775, "step": 1225 }, { "epoch": 6.134663341645886, - "grad_norm": 0.640625, - "learning_rate": 0.00014694715627858908, - "loss": 0.9112, + "grad_norm": 0.40234375, + "learning_rate": 7.750489456561352e-05, + "loss": 0.7795, "step": 1230 }, { "epoch": 6.159600997506234, - "grad_norm": 0.625, - "learning_rate": 0.00014643268659560572, - "loss": 0.9181, + "grad_norm": 0.40625, + "learning_rate": 7.66554636144095e-05, + "loss": 0.7802, "step": 1235 }, { "epoch": 6.184538653366584, - "grad_norm": 0.51171875, - "learning_rate": 0.00014591664533870118, - "loss": 0.9302, + "grad_norm": 0.39453125, + "learning_rate": 7.580781044003324e-05, + "loss": 0.7901, "step": 1240 }, { "epoch": 6.2094763092269325, - "grad_norm": 0.55859375, - "learning_rate": 0.00014539904997395468, - "loss": 0.9257, + "grad_norm": 0.375, + "learning_rate": 7.496199959455584e-05, + "loss": 0.7797, "step": 1245 }, { "epoch": 6.234413965087282, - "grad_norm": 0.5078125, - "learning_rate": 0.00014487991802004623, - "loss": 0.9188, + "grad_norm": 0.404296875, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7788, "step": 1250 }, { "epoch": 6.259351620947631, - "grad_norm": 0.5859375, - "learning_rate": 0.00014435926704766362, - "loss": 0.9218, + "grad_norm": 0.384765625, + "learning_rate": 7.327616239217431e-05, + "loss": 0.7815, "step": 1255 }, { "epoch": 6.28428927680798, - "grad_norm": 0.65234375, - "learning_rate": 0.00014383711467890774, - "loss": 0.9132, + "grad_norm": 0.486328125, + "learning_rate": 7.243626441830009e-05, + "loss": 0.7878, "step": 1260 }, { "epoch": 6.309226932668329, - "grad_norm": 0.58984375, - "learning_rate": 0.00014331347858669632, - "loss": 0.931, + "grad_norm": 0.38671875, + "learning_rate": 7.159846552960774e-05, + "loss": 0.7831, "step": 1265 }, { "epoch": 6.334164588528679, - "grad_norm": 0.546875, - "learning_rate": 0.00014278837649416544, - "loss": 0.9187, + "grad_norm": 0.388671875, + "learning_rate": 7.076282952772633e-05, + "loss": 0.7852, "step": 1270 }, { "epoch": 6.359102244389027, - "grad_norm": 0.60546875, - "learning_rate": 0.00014226182617406996, - "loss": 0.9258, + "grad_norm": 0.3984375, + "learning_rate": 6.992942004957271e-05, + "loss": 0.7967, "step": 1275 }, { "epoch": 6.384039900249377, - "grad_norm": 0.51171875, - "learning_rate": 0.0001417338454481818, - "loss": 0.9125, + "grad_norm": 0.3828125, + "learning_rate": 6.909830056250527e-05, + "loss": 0.7775, "step": 1280 }, { "epoch": 6.4089775561097255, - "grad_norm": 0.6015625, - "learning_rate": 0.00014120445218668686, - "loss": 0.9148, + "grad_norm": 0.38671875, + "learning_rate": 6.826953435949081e-05, + "loss": 0.7836, "step": 1285 }, { "epoch": 6.433915211970075, - "grad_norm": 0.58203125, - "learning_rate": 0.00014067366430758004, - "loss": 0.9149, + "grad_norm": 0.3984375, + "learning_rate": 6.744318455428436e-05, + "loss": 0.7802, "step": 1290 }, { "epoch": 6.458852867830424, - "grad_norm": 0.515625, - "learning_rate": 0.00014014149977605893, - "loss": 0.9119, + "grad_norm": 0.396484375, + "learning_rate": 6.661931407662292e-05, + "loss": 0.7923, "step": 1295 }, { "epoch": 6.483790523690773, - "grad_norm": 0.609375, - "learning_rate": 0.0001396079766039157, - "loss": 0.9313, + "grad_norm": 0.400390625, + "learning_rate": 6.579798566743314e-05, + "loss": 0.7813, "step": 1300 }, { "epoch": 6.508728179551122, - "grad_norm": 0.482421875, - "learning_rate": 0.00013907311284892736, - "loss": 0.9148, + "grad_norm": 0.419921875, + "learning_rate": 6.497926187405326e-05, + "loss": 0.7923, "step": 1305 }, { "epoch": 6.533665835411472, - "grad_norm": 0.6015625, - "learning_rate": 0.00013853692661424484, - "loss": 0.9253, + "grad_norm": 0.3671875, + "learning_rate": 6.416320504546997e-05, + "loss": 0.7941, "step": 1310 }, { "epoch": 6.55860349127182, - "grad_norm": 0.546875, - "learning_rate": 0.00013799943604777992, - "loss": 0.9204, + "grad_norm": 0.365234375, + "learning_rate": 6.334987732757029e-05, + "loss": 0.783, "step": 1315 }, { "epoch": 6.58354114713217, - "grad_norm": 0.50390625, - "learning_rate": 0.00013746065934159123, - "loss": 0.9219, + "grad_norm": 0.37109375, + "learning_rate": 6.25393406584088e-05, + "loss": 0.7837, "step": 1320 }, { "epoch": 6.6084788029925186, - "grad_norm": 0.486328125, - "learning_rate": 0.00013692061473126845, - "loss": 0.9162, + "grad_norm": 0.412109375, + "learning_rate": 6.173165676349103e-05, + "loss": 0.7885, "step": 1325 }, { "epoch": 6.633416458852868, - "grad_norm": 0.486328125, - "learning_rate": 0.00013637932049531516, - "loss": 0.9239, + "grad_norm": 0.375, + "learning_rate": 6.092688715107264e-05, + "loss": 0.7984, "step": 1330 }, { "epoch": 6.658354114713217, - "grad_norm": 0.546875, - "learning_rate": 0.00013583679495453, - "loss": 0.9217, + "grad_norm": 0.365234375, + "learning_rate": 6.012509310747538e-05, + "loss": 0.7971, "step": 1335 }, { "epoch": 6.683291770573566, - "grad_norm": 0.46484375, - "learning_rate": 0.00013529305647138687, - "loss": 0.9268, + "grad_norm": 0.376953125, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.798, "step": 1340 }, { "epoch": 6.708229426433915, - "grad_norm": 0.46875, - "learning_rate": 0.00013474812344941315, - "loss": 0.9238, + "grad_norm": 0.37890625, + "learning_rate": 5.853067573437612e-05, + "loss": 0.7786, "step": 1345 }, { "epoch": 6.733167082294265, - "grad_norm": 0.494140625, - "learning_rate": 0.00013420201433256689, - "loss": 0.9141, + "grad_norm": 0.404296875, + "learning_rate": 5.773817382593008e-05, + "loss": 0.7939, "step": 1350 }, { "epoch": 6.758104738154613, - "grad_norm": 0.51171875, - "learning_rate": 0.00013365474760461266, - "loss": 0.9219, + "grad_norm": 0.369140625, + "learning_rate": 5.694889031917047e-05, + "loss": 0.7881, "step": 1355 }, { "epoch": 6.783042394014963, - "grad_norm": 0.49609375, - "learning_rate": 0.0001331063417884958, - "loss": 0.9177, + "grad_norm": 0.3828125, + "learning_rate": 5.616288532109225e-05, + "loss": 0.7872, "step": 1360 }, { "epoch": 6.807980049875312, - "grad_norm": 0.453125, - "learning_rate": 0.00013255681544571568, - "loss": 0.9215, + "grad_norm": 0.365234375, + "learning_rate": 5.5380218689019125e-05, + "loss": 0.7919, "step": 1365 }, { "epoch": 6.832917705735661, - "grad_norm": 0.59765625, - "learning_rate": 0.00013200618717569714, - "loss": 0.9087, + "grad_norm": 0.41796875, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.7819, "step": 1370 }, { "epoch": 6.85785536159601, - "grad_norm": 0.470703125, - "learning_rate": 0.00013145447561516138, - "loss": 0.9276, + "grad_norm": 0.376953125, + "learning_rate": 5.382513867649663e-05, + "loss": 0.7805, "step": 1375 }, { "epoch": 6.882793017456359, - "grad_norm": 0.478515625, - "learning_rate": 0.00013090169943749476, - "loss": 0.9147, + "grad_norm": 0.384765625, + "learning_rate": 5.305284372141095e-05, + "loss": 0.7995, "step": 1380 }, { "epoch": 6.907730673316708, - "grad_norm": 0.478515625, - "learning_rate": 0.0001303478773521171, - "loss": 0.9209, + "grad_norm": 0.365234375, + "learning_rate": 5.2284123974039154e-05, + "loss": 0.7911, "step": 1385 }, { "epoch": 6.932668329177058, - "grad_norm": 0.51953125, - "learning_rate": 0.0001297930281038482, - "loss": 0.9248, + "grad_norm": 0.373046875, + "learning_rate": 5.15190379753663e-05, + "loss": 0.7821, "step": 1390 }, { "epoch": 6.957605985037406, - "grad_norm": 0.58984375, - "learning_rate": 0.00012923717047227368, - "loss": 0.9197, + "grad_norm": 0.376953125, + "learning_rate": 5.07576439896533e-05, + "loss": 0.7839, "step": 1395 }, { "epoch": 6.982543640897756, - "grad_norm": 0.5625, - "learning_rate": 0.00012868032327110904, - "loss": 0.9216, + "grad_norm": 0.375, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7905, "step": 1400 }, { "epoch": 6.997506234413965, - "eval_loss": 2.5038018226623535, - "eval_runtime": 0.6591, - "eval_samples_per_second": 15.172, - "eval_steps_per_second": 1.517, + "eval_loss": 2.6576273441314697, + "eval_runtime": 0.9682, + "eval_samples_per_second": 10.328, + "eval_steps_per_second": 1.033, "step": 1403 }, { "epoch": 7.007481296758105, - "grad_norm": 0.5625, - "learning_rate": 0.00012812250534756308, - "loss": 0.9115, + "grad_norm": 0.376953125, + "learning_rate": 4.924616370392961e-05, + "loss": 0.7696, "step": 1405 }, { "epoch": 7.032418952618454, - "grad_norm": 0.6015625, - "learning_rate": 0.0001275637355816999, - "loss": 0.9038, + "grad_norm": 0.392578125, + "learning_rate": 4.8496192508994576e-05, + "loss": 0.7561, "step": 1410 }, { "epoch": 7.057356608478803, - "grad_norm": 0.578125, - "learning_rate": 0.0001270040328858001, - "loss": 0.9088, + "grad_norm": 0.38671875, + "learning_rate": 4.7750143528405126e-05, + "loss": 0.7558, "step": 1415 }, { "epoch": 7.082294264339152, - "grad_norm": 0.4921875, - "learning_rate": 0.00012644341620372023, - "loss": 0.9016, + "grad_norm": 0.41796875, + "learning_rate": 4.700807357667952e-05, + "loss": 0.7591, "step": 1420 }, { "epoch": 7.107231920199501, - "grad_norm": 0.5390625, - "learning_rate": 0.00012588190451025207, - "loss": 0.9112, + "grad_norm": 0.3828125, + "learning_rate": 4.6270039165317605e-05, + "loss": 0.7628, "step": 1425 }, { "epoch": 7.132169576059851, - "grad_norm": 0.484375, - "learning_rate": 0.0001253195168104802, - "loss": 0.9081, + "grad_norm": 0.38671875, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.7595, "step": 1430 }, { "epoch": 7.157107231920199, - "grad_norm": 0.59375, - "learning_rate": 0.0001247562721391386, - "loss": 0.9094, + "grad_norm": 0.384765625, + "learning_rate": 4.480630146879419e-05, + "loss": 0.7667, "step": 1435 }, { "epoch": 7.182044887780549, - "grad_norm": 0.498046875, - "learning_rate": 0.00012419218955996676, - "loss": 0.8956, + "grad_norm": 0.37109375, + "learning_rate": 4.4080709652925336e-05, + "loss": 0.7551, "step": 1440 }, { "epoch": 7.206982543640898, - "grad_norm": 0.515625, - "learning_rate": 0.00012362728816506417, - "loss": 0.9044, + "grad_norm": 0.416015625, + "learning_rate": 4.335937630751674e-05, + "loss": 0.7631, "step": 1445 }, { "epoch": 7.231920199501247, - "grad_norm": 0.494140625, - "learning_rate": 0.00012306158707424403, - "loss": 0.9069, + "grad_norm": 0.412109375, + "learning_rate": 4.264235636489542e-05, + "loss": 0.7659, "step": 1450 }, { "epoch": 7.256857855361596, - "grad_norm": 0.58984375, - "learning_rate": 0.0001224951054343865, - "loss": 0.8949, + "grad_norm": 0.419921875, + "learning_rate": 4.1929704428906026e-05, + "loss": 0.7652, "step": 1455 }, { "epoch": 7.2817955112219455, - "grad_norm": 0.58984375, - "learning_rate": 0.00012192786241879033, - "loss": 0.901, + "grad_norm": 0.376953125, + "learning_rate": 4.12214747707527e-05, + "loss": 0.7697, "step": 1460 }, { "epoch": 7.306733167082294, - "grad_norm": 0.58203125, - "learning_rate": 0.00012135987722652402, - "loss": 0.8983, + "grad_norm": 0.36328125, + "learning_rate": 4.0517721324865884e-05, + "loss": 0.7646, "step": 1465 }, { "epoch": 7.331670822942644, - "grad_norm": 0.58203125, - "learning_rate": 0.00012079116908177593, - "loss": 0.9113, + "grad_norm": 0.373046875, + "learning_rate": 3.981849768479517e-05, + "loss": 0.7659, "step": 1470 }, { "epoch": 7.356608478802992, - "grad_norm": 0.482421875, - "learning_rate": 0.00012022175723320381, - "loss": 0.9026, + "grad_norm": 0.37109375, + "learning_rate": 3.9123857099127936e-05, + "loss": 0.7665, "step": 1475 }, { "epoch": 7.381546134663342, - "grad_norm": 0.51171875, - "learning_rate": 0.00011965166095328301, - "loss": 0.9075, + "grad_norm": 0.388671875, + "learning_rate": 3.843385246743417e-05, + "loss": 0.769, "step": 1480 }, { "epoch": 7.406483790523691, - "grad_norm": 0.58203125, - "learning_rate": 0.00011908089953765449, - "loss": 0.9045, + "grad_norm": 0.369140625, + "learning_rate": 3.774853633623806e-05, + "loss": 0.7728, "step": 1485 }, { "epoch": 7.43142144638404, - "grad_norm": 0.494140625, - "learning_rate": 0.00011850949230447145, - "loss": 0.9099, + "grad_norm": 0.37109375, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.7605, "step": 1490 }, { "epoch": 7.456359102244389, - "grad_norm": 0.447265625, - "learning_rate": 0.00011793745859374575, - "loss": 0.9035, + "grad_norm": 0.38671875, + "learning_rate": 3.6392177972223594e-05, + "loss": 0.7678, "step": 1495 }, { "epoch": 7.4812967581047385, - "grad_norm": 0.5234375, - "learning_rate": 0.00011736481776669306, - "loss": 0.9093, + "grad_norm": 0.37890625, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.7676, "step": 1500 }, { "epoch": 7.506234413965087, - "grad_norm": 0.5390625, - "learning_rate": 0.00011679158920507774, - "loss": 0.8925, + "grad_norm": 0.423828125, + "learning_rate": 3.5055195166981645e-05, + "loss": 0.7742, "step": 1505 }, { "epoch": 7.531172069825437, - "grad_norm": 0.5, - "learning_rate": 0.00011621779231055676, - "loss": 0.9054, + "grad_norm": 0.3671875, + "learning_rate": 3.439409710094929e-05, + "loss": 0.7642, "step": 1510 }, { "epoch": 7.556109725685785, - "grad_norm": 0.5390625, - "learning_rate": 0.0001156434465040231, - "loss": 0.9017, + "grad_norm": 0.3828125, + "learning_rate": 3.373799517842627e-05, + "loss": 0.7751, "step": 1515 }, { "epoch": 7.581047381546135, - "grad_norm": 0.53515625, - "learning_rate": 0.00011506857122494831, - "loss": 0.9125, + "grad_norm": 0.38671875, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7615, "step": 1520 }, { "epoch": 7.605985037406484, - "grad_norm": 0.57421875, - "learning_rate": 0.00011449318593072466, - "loss": 0.9051, + "grad_norm": 0.375, + "learning_rate": 3.244097923843398e-05, + "loss": 0.7666, "step": 1525 }, { "epoch": 7.630922693266833, - "grad_norm": 0.625, - "learning_rate": 0.00011391731009600654, - "loss": 0.894, + "grad_norm": 0.388671875, + "learning_rate": 3.1800163993750166e-05, + "loss": 0.7725, "step": 1530 }, { "epoch": 7.655860349127182, - "grad_norm": 0.462890625, - "learning_rate": 0.00011334096321205128, - "loss": 0.9096, + "grad_norm": 0.380859375, + "learning_rate": 3.116454243062459e-05, + "loss": 0.7635, "step": 1535 }, { "epoch": 7.6807980049875315, - "grad_norm": 0.58984375, - "learning_rate": 0.00011276416478605949, - "loss": 0.908, + "grad_norm": 0.384765625, + "learning_rate": 3.053416295410026e-05, + "loss": 0.7676, "step": 1540 }, { "epoch": 7.70573566084788, - "grad_norm": 0.46484375, - "learning_rate": 0.00011218693434051475, - "loss": 0.9118, + "grad_norm": 0.3828125, + "learning_rate": 2.9909073570014912e-05, + "loss": 0.7621, "step": 1545 }, { "epoch": 7.73067331670823, - "grad_norm": 0.578125, - "learning_rate": 0.00011160929141252303, - "loss": 0.8973, + "grad_norm": 0.375, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7496, "step": 1550 }, { "epoch": 7.7556109725685785, - "grad_norm": 0.66796875, - "learning_rate": 0.00011103125555315119, - "loss": 0.9025, + "grad_norm": 0.380859375, + "learning_rate": 2.8674955084581857e-05, + "loss": 0.7622, "step": 1555 }, { "epoch": 7.780548628428928, - "grad_norm": 0.62109375, - "learning_rate": 0.00011045284632676536, - "loss": 0.8963, + "grad_norm": 0.365234375, + "learning_rate": 2.8066019966134904e-05, + "loss": 0.7671, "step": 1560 }, { "epoch": 7.805486284289277, - "grad_norm": 0.609375, - "learning_rate": 0.00010987408331036879, - "loss": 0.9171, + "grad_norm": 0.37109375, + "learning_rate": 2.746256289877126e-05, + "loss": 0.7593, "step": 1565 }, { "epoch": 7.830423940149626, - "grad_norm": 0.52734375, - "learning_rate": 0.00010929498609293924, - "loss": 0.9225, + "grad_norm": 0.375, + "learning_rate": 2.6864629838082956e-05, + "loss": 0.7597, "step": 1570 }, { "epoch": 7.855361596009975, - "grad_norm": 0.474609375, - "learning_rate": 0.00010871557427476583, - "loss": 0.9087, + "grad_norm": 0.37109375, + "learning_rate": 2.6272266318987603e-05, + "loss": 0.7761, "step": 1575 }, { "epoch": 7.8802992518703245, - "grad_norm": 0.5546875, - "learning_rate": 0.00010813586746678583, - "loss": 0.9119, + "grad_norm": 0.373046875, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7646, "step": 1580 }, { "epoch": 7.905236907730673, - "grad_norm": 0.5078125, - "learning_rate": 0.00010755588528992082, - "loss": 0.8963, + "grad_norm": 0.373046875, + "learning_rate": 2.5104427921099782e-05, + "loss": 0.763, "step": 1585 }, { "epoch": 7.930174563591023, - "grad_norm": 0.5, - "learning_rate": 0.00010697564737441252, - "loss": 0.9115, + "grad_norm": 0.390625, + "learning_rate": 2.45290419777228e-05, + "loss": 0.7679, "step": 1590 }, { "epoch": 7.9551122194513715, - "grad_norm": 0.51171875, - "learning_rate": 0.00010639517335915856, - "loss": 0.903, + "grad_norm": 0.380859375, + "learning_rate": 2.3959403439996907e-05, + "loss": 0.7591, "step": 1595 }, { "epoch": 7.980049875311721, - "grad_norm": 0.5390625, - "learning_rate": 0.00010581448289104758, - "loss": 0.9094, + "grad_norm": 0.369140625, + "learning_rate": 2.339555568810221e-05, + "loss": 0.7684, "step": 1600 }, { "epoch": 8.0, - "eval_loss": 2.512359380722046, - "eval_runtime": 0.5399, - "eval_samples_per_second": 18.52, - "eval_steps_per_second": 1.852, + "eval_loss": 2.6891818046569824, + "eval_runtime": 0.5383, + "eval_samples_per_second": 18.576, + "eval_steps_per_second": 1.858, "step": 1604 }, { "epoch": 8.00498753117207, - "grad_norm": 0.60546875, - "learning_rate": 0.0001052335956242944, - "loss": 0.9066, + "grad_norm": 0.369140625, + "learning_rate": 2.2837541661228025e-05, + "loss": 0.7604, "step": 1605 }, { "epoch": 8.029925187032418, - "grad_norm": 0.56640625, - "learning_rate": 0.0001046525312197747, - "loss": 0.8932, + "grad_norm": 0.3671875, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.7569, "step": 1610 }, { "epoch": 8.054862842892769, - "grad_norm": 0.46484375, - "learning_rate": 0.0001040713093443596, - "loss": 0.8877, + "grad_norm": 0.373046875, + "learning_rate": 2.173918431475861e-05, + "loss": 0.7565, "step": 1615 }, { "epoch": 8.079800498753118, - "grad_norm": 0.53125, - "learning_rate": 0.00010348994967025012, - "loss": 0.9041, + "grad_norm": 0.3828125, + "learning_rate": 2.119892463932781e-05, + "loss": 0.7619, "step": 1620 }, { "epoch": 8.104738154613466, - "grad_norm": 0.515625, - "learning_rate": 0.00010290847187431113, - "loss": 0.8898, + "grad_norm": 0.37109375, + "learning_rate": 2.0664665970876496e-05, + "loss": 0.7531, "step": 1625 }, { "epoch": 8.129675810473815, - "grad_norm": 0.486328125, - "learning_rate": 0.00010232689563740563, - "loss": 0.8814, + "grad_norm": 0.375, + "learning_rate": 2.013644899527074e-05, + "loss": 0.7463, "step": 1630 }, { "epoch": 8.154613466334165, - "grad_norm": 0.515625, - "learning_rate": 0.00010174524064372837, - "loss": 0.8814, + "grad_norm": 0.3671875, + "learning_rate": 1.9614313938278272e-05, + "loss": 0.7611, "step": 1635 }, { "epoch": 8.179551122194514, - "grad_norm": 0.5234375, - "learning_rate": 0.00010116352658013973, - "loss": 0.8934, + "grad_norm": 0.3671875, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.7513, "step": 1640 }, { "epoch": 8.204488778054863, - "grad_norm": 0.470703125, - "learning_rate": 0.00010058177313549939, - "loss": 0.8979, + "grad_norm": 0.3671875, + "learning_rate": 1.858844816436809e-05, + "loss": 0.7477, "step": 1645 }, { "epoch": 8.229426433915211, - "grad_norm": 0.466796875, - "learning_rate": 0.0001, - "loss": 0.9019, + "grad_norm": 0.3671875, + "learning_rate": 1.808479557110081e-05, + "loss": 0.7548, "step": 1650 }, { "epoch": 8.254364089775562, - "grad_norm": 0.486328125, - "learning_rate": 9.94182268645006e-05, - "loss": 0.8859, + "grad_norm": 0.3671875, + "learning_rate": 1.7587381137798432e-05, + "loss": 0.7528, "step": 1655 }, { "epoch": 8.27930174563591, - "grad_norm": 0.51953125, - "learning_rate": 9.883647341986032e-05, - "loss": 0.8938, + "grad_norm": 0.37109375, + "learning_rate": 1.7096242744495837e-05, + "loss": 0.7551, "step": 1660 }, { "epoch": 8.30423940149626, - "grad_norm": 0.53125, - "learning_rate": 9.825475935627165e-05, - "loss": 0.892, + "grad_norm": 0.361328125, + "learning_rate": 1.661141779328319e-05, + "loss": 0.7474, "step": 1665 }, { "epoch": 8.329177057356608, - "grad_norm": 0.54296875, - "learning_rate": 9.767310436259438e-05, - "loss": 0.8955, + "grad_norm": 0.3828125, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.7582, "step": 1670 }, { "epoch": 8.354114713216958, - "grad_norm": 0.5078125, - "learning_rate": 9.709152812568886e-05, - "loss": 0.8929, + "grad_norm": 0.361328125, + "learning_rate": 1.566085541871145e-05, + "loss": 0.7383, "step": 1675 }, { "epoch": 8.379052369077307, - "grad_norm": 0.59375, - "learning_rate": 9.651005032974994e-05, - "loss": 0.8945, + "grad_norm": 0.3671875, + "learning_rate": 1.5195190384357404e-05, + "loss": 0.7522, "step": 1680 }, { "epoch": 8.403990024937656, - "grad_norm": 0.478515625, - "learning_rate": 9.592869065564043e-05, - "loss": 0.888, + "grad_norm": 0.38671875, + "learning_rate": 1.4735983564590783e-05, + "loss": 0.7516, "step": 1685 }, { "epoch": 8.428927680798004, - "grad_norm": 0.65625, - "learning_rate": 9.534746878022534e-05, - "loss": 0.8907, + "grad_norm": 0.375, + "learning_rate": 1.4283269929788779e-05, + "loss": 0.755, "step": 1690 }, { "epoch": 8.453865336658355, - "grad_norm": 0.515625, - "learning_rate": 9.476640437570562e-05, - "loss": 0.8885, + "grad_norm": 0.384765625, + "learning_rate": 1.3837083955847418e-05, + "loss": 0.7474, "step": 1695 }, { "epoch": 8.478802992518704, - "grad_norm": 0.515625, - "learning_rate": 9.418551710895243e-05, - "loss": 0.8867, + "grad_norm": 0.369140625, + "learning_rate": 1.339745962155613e-05, + "loss": 0.762, "step": 1700 }, { "epoch": 8.503740648379052, - "grad_norm": 0.4765625, - "learning_rate": 9.360482664084145e-05, - "loss": 0.8881, + "grad_norm": 0.373046875, + "learning_rate": 1.296443040601003e-05, + "loss": 0.7577, "step": 1705 }, { "epoch": 8.528678304239401, - "grad_norm": 0.490234375, - "learning_rate": 9.302435262558747e-05, - "loss": 0.8964, + "grad_norm": 0.365234375, + "learning_rate": 1.2538029286060426e-05, + "loss": 0.7583, "step": 1710 }, { "epoch": 8.553615960099751, - "grad_norm": 0.5390625, - "learning_rate": 9.244411471007922e-05, - "loss": 0.888, + "grad_norm": 0.38671875, + "learning_rate": 1.2118288733803473e-05, + "loss": 0.7482, "step": 1715 }, { "epoch": 8.5785536159601, - "grad_norm": 0.5703125, - "learning_rate": 9.186413253321418e-05, - "loss": 0.8806, + "grad_norm": 0.3671875, + "learning_rate": 1.1705240714107302e-05, + "loss": 0.7529, "step": 1720 }, { "epoch": 8.603491271820449, - "grad_norm": 0.52734375, - "learning_rate": 9.128442572523417e-05, - "loss": 0.8849, + "grad_norm": 0.380859375, + "learning_rate": 1.129891668217783e-05, + "loss": 0.7537, "step": 1725 }, { "epoch": 8.628428927680797, - "grad_norm": 0.5703125, - "learning_rate": 9.070501390706079e-05, - "loss": 0.8924, + "grad_norm": 0.37109375, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.7528, "step": 1730 }, { "epoch": 8.653366583541148, - "grad_norm": 0.470703125, - "learning_rate": 9.012591668963122e-05, - "loss": 0.8911, + "grad_norm": 0.37109375, + "learning_rate": 1.0506563839797501e-05, + "loss": 0.7498, "step": 1735 }, { "epoch": 8.678304239401497, - "grad_norm": 0.50390625, - "learning_rate": 8.954715367323468e-05, - "loss": 0.8956, + "grad_norm": 0.3671875, + "learning_rate": 1.0120595370083318e-05, + "loss": 0.7617, "step": 1740 }, { "epoch": 8.703241895261845, - "grad_norm": 0.53125, - "learning_rate": 8.896874444684883e-05, - "loss": 0.8981, + "grad_norm": 0.365234375, + "learning_rate": 9.74147156501396e-06, + "loss": 0.7514, "step": 1745 }, { "epoch": 8.728179551122194, - "grad_norm": 1.0390625, - "learning_rate": 8.839070858747697e-05, - "loss": 0.8989, + "grad_norm": 0.359375, + "learning_rate": 9.369221296335006e-06, + "loss": 0.7521, "step": 1750 }, { "epoch": 8.753117206982544, - "grad_norm": 0.470703125, - "learning_rate": 8.781306565948528e-05, - "loss": 0.9039, + "grad_norm": 0.369140625, + "learning_rate": 9.00387291234569e-06, + "loss": 0.7667, "step": 1755 }, { "epoch": 8.778054862842893, - "grad_norm": 0.515625, - "learning_rate": 8.723583521394054e-05, - "loss": 0.9023, + "grad_norm": 0.361328125, + "learning_rate": 8.645454235739903e-06, + "loss": 0.753, "step": 1760 }, { "epoch": 8.802992518703242, - "grad_norm": 0.45703125, - "learning_rate": 8.665903678794873e-05, - "loss": 0.9084, + "grad_norm": 0.369140625, + "learning_rate": 8.293992561487596e-06, + "loss": 0.7532, "step": 1765 }, { "epoch": 8.82793017456359, - "grad_norm": 0.470703125, - "learning_rate": 8.608268990399349e-05, - "loss": 0.8976, + "grad_norm": 0.37109375, + "learning_rate": 7.949514654755962e-06, + "loss": 0.7516, "step": 1770 }, { "epoch": 8.85286783042394, - "grad_norm": 0.58203125, - "learning_rate": 8.550681406927535e-05, - "loss": 0.8931, + "grad_norm": 0.365234375, + "learning_rate": 7.612046748871327e-06, + "loss": 0.7586, "step": 1775 }, { "epoch": 8.87780548628429, - "grad_norm": 0.5078125, - "learning_rate": 8.49314287750517e-05, - "loss": 0.8893, + "grad_norm": 0.375, + "learning_rate": 7.281614543321269e-06, + "loss": 0.7534, "step": 1780 }, { "epoch": 8.902743142144638, - "grad_norm": 0.48828125, - "learning_rate": 8.435655349597689e-05, - "loss": 0.9013, + "grad_norm": 0.359375, + "learning_rate": 6.958243201797554e-06, + "loss": 0.7483, "step": 1785 }, { "epoch": 8.927680798004987, - "grad_norm": 0.494140625, - "learning_rate": 8.378220768944327e-05, - "loss": 0.9058, + "grad_norm": 0.3671875, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7441, "step": 1790 }, { "epoch": 8.952618453865338, - "grad_norm": 0.48046875, - "learning_rate": 8.32084107949223e-05, - "loss": 0.9019, + "grad_norm": 0.373046875, + "learning_rate": 6.332781075160243e-06, + "loss": 0.7482, "step": 1795 }, { "epoch": 8.977556109725686, - "grad_norm": 0.451171875, - "learning_rate": 8.263518223330697e-05, - "loss": 0.8961, + "grad_norm": 0.373046875, + "learning_rate": 6.030737921409169e-06, + "loss": 0.7564, "step": 1800 }, { "epoch": 8.997506234413965, - "eval_loss": 2.5246224403381348, - "eval_runtime": 0.6529, - "eval_samples_per_second": 15.316, - "eval_steps_per_second": 1.532, + "eval_loss": 2.6970067024230957, + "eval_runtime": 0.8304, + "eval_samples_per_second": 12.042, + "eval_steps_per_second": 1.204, "step": 1804 }, { "epoch": 9.002493765586035, - "grad_norm": 0.462890625, - "learning_rate": 8.206254140625426e-05, - "loss": 0.8949, + "grad_norm": 0.39453125, + "learning_rate": 5.735850890782157e-06, + "loss": 0.7559, "step": 1805 }, { "epoch": 9.027431421446384, - "grad_norm": 0.478515625, - "learning_rate": 8.149050769552856e-05, - "loss": 0.8901, + "grad_norm": 0.359375, + "learning_rate": 5.448142440068316e-06, + "loss": 0.7497, "step": 1810 }, { "epoch": 9.052369077306734, - "grad_norm": 0.53515625, - "learning_rate": 8.091910046234552e-05, - "loss": 0.8822, + "grad_norm": 0.369140625, + "learning_rate": 5.167634479380068e-06, + "loss": 0.7512, "step": 1815 }, { "epoch": 9.077306733167083, - "grad_norm": 0.4765625, - "learning_rate": 8.034833904671698e-05, - "loss": 0.8846, + "grad_norm": 0.369140625, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.746, "step": 1820 }, { "epoch": 9.102244389027431, - "grad_norm": 0.5, - "learning_rate": 7.977824276679623e-05, - "loss": 0.8819, + "grad_norm": 0.37109375, + "learning_rate": 4.628304925177318e-06, + "loss": 0.7498, "step": 1825 }, { "epoch": 9.12718204488778, - "grad_norm": 0.482421875, - "learning_rate": 7.920883091822408e-05, - "loss": 0.888, + "grad_norm": 0.369140625, + "learning_rate": 4.369524403696457e-06, + "loss": 0.7535, "step": 1830 }, { "epoch": 9.15211970074813, - "grad_norm": 0.470703125, - "learning_rate": 7.864012277347602e-05, - "loss": 0.8682, + "grad_norm": 0.37109375, + "learning_rate": 4.118026513180695e-06, + "loss": 0.7456, "step": 1835 }, { "epoch": 9.17705735660848, - "grad_norm": 0.478515625, - "learning_rate": 7.807213758120966e-05, - "loss": 0.8772, + "grad_norm": 0.373046875, + "learning_rate": 3.873830406168111e-06, + "loss": 0.7535, "step": 1840 }, { "epoch": 9.201995012468828, - "grad_norm": 0.51953125, - "learning_rate": 7.750489456561352e-05, - "loss": 0.8871, + "grad_norm": 0.359375, + "learning_rate": 3.6369546791377052e-06, + "loss": 0.7559, "step": 1845 }, { "epoch": 9.226932668329177, - "grad_norm": 0.5078125, - "learning_rate": 7.693841292575598e-05, - "loss": 0.8856, + "grad_norm": 0.365234375, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7421, "step": 1850 }, { "epoch": 9.251870324189527, - "grad_norm": 0.515625, - "learning_rate": 7.637271183493586e-05, - "loss": 0.884, + "grad_norm": 0.365234375, + "learning_rate": 3.1852359621892367e-06, + "loss": 0.7535, "step": 1855 }, { "epoch": 9.276807980049876, - "grad_norm": 0.490234375, - "learning_rate": 7.580781044003324e-05, - "loss": 0.8885, + "grad_norm": 0.369140625, + "learning_rate": 2.970427372400353e-06, + "loss": 0.7479, "step": 1860 }, { "epoch": 9.301745635910224, - "grad_norm": 0.48828125, - "learning_rate": 7.524372786086142e-05, - "loss": 0.8825, + "grad_norm": 0.3671875, + "learning_rate": 2.7630079602323442e-06, + "loss": 0.7545, "step": 1865 }, { "epoch": 9.326683291770573, - "grad_norm": 0.486328125, - "learning_rate": 7.468048318951983e-05, - "loss": 0.8863, + "grad_norm": 0.359375, + "learning_rate": 2.5629935214764865e-06, + "loss": 0.7456, "step": 1870 }, { "epoch": 9.351620947630924, - "grad_norm": 0.5, - "learning_rate": 7.411809548974792e-05, - "loss": 0.8877, + "grad_norm": 0.37109375, + "learning_rate": 2.3703992880066638e-06, + "loss": 0.7569, "step": 1875 }, { "epoch": 9.376558603491272, - "grad_norm": 0.46875, - "learning_rate": 7.35565837962798e-05, - "loss": 0.8749, + "grad_norm": 0.3671875, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.7547, "step": 1880 }, { "epoch": 9.401496259351621, - "grad_norm": 0.470703125, - "learning_rate": 7.299596711419994e-05, - "loss": 0.8851, + "grad_norm": 0.365234375, + "learning_rate": 2.0075295379170412e-06, + "loss": 0.7544, "step": 1885 }, { "epoch": 9.42643391521197, - "grad_norm": 0.5, - "learning_rate": 7.243626441830009e-05, - "loss": 0.8957, + "grad_norm": 0.375, + "learning_rate": 1.8372816552336026e-06, + "loss": 0.7579, "step": 1890 }, { "epoch": 9.451371571072318, - "grad_norm": 0.46875, - "learning_rate": 7.187749465243693e-05, - "loss": 0.8904, + "grad_norm": 0.361328125, + "learning_rate": 1.6745092436045494e-06, + "loss": 0.7456, "step": 1895 }, { "epoch": 9.476309226932669, - "grad_norm": 0.466796875, - "learning_rate": 7.131967672889101e-05, - "loss": 0.8822, + "grad_norm": 0.361328125, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.7461, "step": 1900 }, { "epoch": 9.501246882793017, - "grad_norm": 0.49609375, - "learning_rate": 7.076282952772633e-05, - "loss": 0.8771, + "grad_norm": 0.359375, + "learning_rate": 1.3714398462768563e-06, + "loss": 0.7453, "step": 1905 }, { "epoch": 9.526184538653366, - "grad_norm": 0.462890625, - "learning_rate": 7.02069718961518e-05, - "loss": 0.8841, + "grad_norm": 0.365234375, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7542, "step": 1910 }, { "epoch": 9.551122194513717, - "grad_norm": 0.482421875, - "learning_rate": 6.965212264788297e-05, - "loss": 0.8857, + "grad_norm": 0.37109375, + "learning_rate": 1.0984136638083177e-06, + "loss": 0.7612, "step": 1915 }, { "epoch": 9.576059850374065, - "grad_norm": 0.494140625, - "learning_rate": 6.909830056250527e-05, - "loss": 0.8808, + "grad_norm": 0.373046875, + "learning_rate": 9.731931258429638e-07, + "loss": 0.7553, "step": 1920 }, { "epoch": 9.600997506234414, - "grad_norm": 0.482421875, - "learning_rate": 6.854552438483865e-05, - "loss": 0.8872, + "grad_norm": 0.365234375, + "learning_rate": 8.555138626189618e-07, + "loss": 0.7532, "step": 1925 }, { "epoch": 9.625935162094763, - "grad_norm": 0.46875, - "learning_rate": 6.799381282430284e-05, - "loss": 0.893, + "grad_norm": 0.3671875, + "learning_rate": 7.453848358678017e-07, + "loss": 0.7504, "step": 1930 }, { "epoch": 9.650872817955111, - "grad_norm": 0.4765625, - "learning_rate": 6.744318455428436e-05, - "loss": 0.87, + "grad_norm": 0.369140625, + "learning_rate": 6.428144323412544e-07, + "loss": 0.7573, "step": 1935 }, { "epoch": 9.675810473815462, - "grad_norm": 0.453125, - "learning_rate": 6.68936582115042e-05, - "loss": 0.9005, + "grad_norm": 0.365234375, + "learning_rate": 5.478104631726711e-07, + "loss": 0.7509, "step": 1940 }, { "epoch": 9.70074812967581, - "grad_norm": 0.50390625, - "learning_rate": 6.634525239538736e-05, - "loss": 0.8936, + "grad_norm": 0.369140625, + "learning_rate": 4.6038016328211476e-07, + "loss": 0.7465, "step": 1945 }, { "epoch": 9.72568578553616, - "grad_norm": 0.5859375, - "learning_rate": 6.579798566743314e-05, - "loss": 0.898, + "grad_norm": 0.369140625, + "learning_rate": 3.805301908254455e-07, + "loss": 0.7466, "step": 1950 }, { "epoch": 9.75062344139651, - "grad_norm": 0.53125, - "learning_rate": 6.525187655058686e-05, - "loss": 0.8844, + "grad_norm": 0.359375, + "learning_rate": 3.0826662668720364e-07, + "loss": 0.758, "step": 1955 }, { "epoch": 9.775561097256858, - "grad_norm": 0.46484375, - "learning_rate": 6.470694352861312e-05, - "loss": 0.8849, + "grad_norm": 0.37109375, + "learning_rate": 2.4359497401758024e-07, + "loss": 0.7448, "step": 1960 }, { "epoch": 9.800498753117207, - "grad_norm": 0.51171875, - "learning_rate": 6.416320504546997e-05, - "loss": 0.881, + "grad_norm": 0.3671875, + "learning_rate": 1.86520157813308e-07, + "loss": 0.7529, "step": 1965 }, { "epoch": 9.825436408977556, - "grad_norm": 0.447265625, - "learning_rate": 6.362067950468489e-05, - "loss": 0.8804, + "grad_norm": 0.36328125, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.7475, "step": 1970 }, { "epoch": 9.850374064837904, - "grad_norm": 0.455078125, - "learning_rate": 6.307938526873157e-05, - "loss": 0.8794, + "grad_norm": 0.361328125, + "learning_rate": 9.517784181422019e-08, + "loss": 0.7581, "step": 1975 }, { "epoch": 9.875311720698255, - "grad_norm": 0.474609375, - "learning_rate": 6.25393406584088e-05, - "loss": 0.8784, + "grad_norm": 0.365234375, + "learning_rate": 6.09172980904238e-08, + "loss": 0.7494, "step": 1980 }, { "epoch": 9.900249376558603, - "grad_norm": 0.5234375, - "learning_rate": 6.200056395222012e-05, - "loss": 0.8875, + "grad_norm": 0.369140625, + "learning_rate": 3.4267502444274015e-08, + "loss": 0.7507, "step": 1985 }, { "epoch": 9.925187032418952, - "grad_norm": 0.482421875, - "learning_rate": 6.146307338575519e-05, - "loss": 0.8774, + "grad_norm": 0.375, + "learning_rate": 1.5230484360873044e-08, + "loss": 0.7452, "step": 1990 }, { "epoch": 9.950124688279303, - "grad_norm": 0.470703125, - "learning_rate": 6.092688715107264e-05, - "loss": 0.8789, + "grad_norm": 0.365234375, + "learning_rate": 3.807693582869032e-09, + "loss": 0.7455, "step": 1995 }, { "epoch": 9.975062344139651, - "grad_norm": 0.48046875, - "learning_rate": 6.039202339608432e-05, - "loss": 0.8922, + "grad_norm": 0.380859375, + "learning_rate": 0.0, + "loss": 0.747, "step": 2000 }, { - "epoch": 10.0, - "grad_norm": 0.447265625, - "learning_rate": 5.985850022394106e-05, - "loss": 0.8816, - "step": 2005 - }, - { - "epoch": 10.0, - "eval_loss": 2.5341641902923584, - "eval_runtime": 0.5369, - "eval_samples_per_second": 18.625, - "eval_steps_per_second": 1.863, - "step": 2005 - }, - { - "epoch": 10.024937655860349, - "grad_norm": 0.4921875, - "learning_rate": 5.9326335692419995e-05, - "loss": 0.8711, - "step": 2010 - }, - { - "epoch": 10.049875311720697, - "grad_norm": 0.48828125, - "learning_rate": 5.879554781331317e-05, - "loss": 0.8889, - "step": 2015 - }, - { - "epoch": 10.074812967581048, - "grad_norm": 0.478515625, - "learning_rate": 5.8266154551818216e-05, - "loss": 0.8789, - "step": 2020 - }, - { - "epoch": 10.099750623441397, - "grad_norm": 0.45703125, - "learning_rate": 5.773817382593008e-05, - "loss": 0.8638, - "step": 2025 - }, - { - "epoch": 10.124688279301745, - "grad_norm": 0.5, - "learning_rate": 5.72116235058346e-05, - "loss": 0.874, - "step": 2030 - }, - { - "epoch": 10.149625935162096, - "grad_norm": 0.453125, - "learning_rate": 5.668652141330373e-05, - "loss": 0.8764, - "step": 2035 - }, - { - "epoch": 10.174563591022444, - "grad_norm": 0.466796875, - "learning_rate": 5.616288532109225e-05, - "loss": 0.8855, - "step": 2040 - }, - { - "epoch": 10.199501246882793, - "grad_norm": 0.5078125, - "learning_rate": 5.564073295233645e-05, - "loss": 0.8802, - "step": 2045 - }, - { - "epoch": 10.224438902743142, - "grad_norm": 0.498046875, - "learning_rate": 5.5120081979953785e-05, - "loss": 0.8738, - "step": 2050 - }, - { - "epoch": 10.24937655860349, - "grad_norm": 0.4921875, - "learning_rate": 5.4600950026045326e-05, - "loss": 0.8715, - "step": 2055 - }, - { - "epoch": 10.27431421446384, - "grad_norm": 0.470703125, - "learning_rate": 5.4083354661298814e-05, - "loss": 0.878, - "step": 2060 - }, - { - "epoch": 10.29925187032419, - "grad_norm": 0.453125, - "learning_rate": 5.356731340439431e-05, - "loss": 0.8851, - "step": 2065 + "epoch": 9.975062344139651, + "eval_loss": 2.6979904174804688, + "eval_runtime": 0.4975, + "eval_samples_per_second": 20.1, + "eval_steps_per_second": 2.01, + "step": 2000 }, { - "epoch": 10.324189526184538, - "grad_norm": 0.453125, - "learning_rate": 5.305284372141095e-05, - "loss": 0.8741, - "step": 2070 - }, - { - "epoch": 10.349127182044889, - "grad_norm": 0.4765625, - "learning_rate": 5.253996302523596e-05, - "loss": 0.8801, - "step": 2075 - }, - { - "epoch": 10.374064837905237, - "grad_norm": 0.474609375, - "learning_rate": 5.2028688674975415e-05, - "loss": 0.8729, - "step": 2080 - }, - { - "epoch": 10.399002493765586, - "grad_norm": 0.474609375, - "learning_rate": 5.15190379753663e-05, - "loss": 0.8726, - "step": 2085 - }, - { - "epoch": 10.423940149625935, - "grad_norm": 0.52734375, - "learning_rate": 5.101102817619131e-05, - "loss": 0.8908, - "step": 2090 - }, - { - "epoch": 10.448877805486283, - "grad_norm": 0.486328125, - "learning_rate": 5.05046764716946e-05, - "loss": 0.8868, - "step": 2095 - }, - { - "epoch": 10.473815461346634, - "grad_norm": 0.466796875, - "learning_rate": 5.000000000000002e-05, - "loss": 0.8743, - "step": 2100 - }, - { - "epoch": 10.498753117206983, - "grad_norm": 0.46875, - "learning_rate": 4.9497015842531026e-05, - "loss": 0.8756, - "step": 2105 - }, - { - "epoch": 10.523690773067331, - "grad_norm": 0.47265625, - "learning_rate": 4.899574102343247e-05, - "loss": 0.881, - "step": 2110 - }, - { - "epoch": 10.548628428927682, - "grad_norm": 0.52734375, - "learning_rate": 4.8496192508994576e-05, - "loss": 0.8769, - "step": 2115 - }, - { - "epoch": 10.57356608478803, - "grad_norm": 0.48046875, - "learning_rate": 4.799838720707846e-05, - "loss": 0.88, - "step": 2120 - }, - { - "epoch": 10.598503740648379, - "grad_norm": 0.46875, - "learning_rate": 4.7502341966544e-05, - "loss": 0.8787, - "step": 2125 - }, - { - "epoch": 10.623441396508728, - "grad_norm": 0.4765625, - "learning_rate": 4.700807357667952e-05, - "loss": 0.8774, - "step": 2130 - }, - { - "epoch": 10.648379052369076, - "grad_norm": 0.458984375, - "learning_rate": 4.6515598766633597e-05, - "loss": 0.8807, - "step": 2135 - }, - { - "epoch": 10.673316708229427, - "grad_norm": 0.44140625, - "learning_rate": 4.6024934204848745e-05, - "loss": 0.8702, - "step": 2140 - }, - { - "epoch": 10.698254364089776, - "grad_norm": 0.4921875, - "learning_rate": 4.5536096498497295e-05, - "loss": 0.873, - "step": 2145 - }, - { - "epoch": 10.723192019950124, - "grad_norm": 0.470703125, - "learning_rate": 4.50491021929194e-05, - "loss": 0.8785, - "step": 2150 - }, - { - "epoch": 10.748129675810475, - "grad_norm": 0.462890625, - "learning_rate": 4.456396777106285e-05, - "loss": 0.8851, - "step": 2155 - }, - { - "epoch": 10.773067331670823, - "grad_norm": 0.51171875, - "learning_rate": 4.4080709652925336e-05, - "loss": 0.8827, - "step": 2160 - }, - { - "epoch": 10.798004987531172, - "grad_norm": 0.490234375, - "learning_rate": 4.359934419499858e-05, - "loss": 0.8717, - "step": 2165 - }, - { - "epoch": 10.82294264339152, - "grad_norm": 0.43359375, - "learning_rate": 4.3119887689714844e-05, - "loss": 0.8737, - "step": 2170 - }, - { - "epoch": 10.84788029925187, - "grad_norm": 0.466796875, - "learning_rate": 4.264235636489542e-05, - "loss": 0.8703, - "step": 2175 - }, - { - "epoch": 10.87281795511222, - "grad_norm": 0.45703125, - "learning_rate": 4.216676638320135e-05, - "loss": 0.8657, - "step": 2180 - }, - { - "epoch": 10.897755610972569, - "grad_norm": 0.451171875, - "learning_rate": 4.169313384158653e-05, - "loss": 0.8798, - "step": 2185 - }, - { - "epoch": 10.922693266832917, - "grad_norm": 0.5, - "learning_rate": 4.12214747707527e-05, - "loss": 0.873, - "step": 2190 - }, - { - "epoch": 10.947630922693268, - "grad_norm": 0.470703125, - "learning_rate": 4.0751805134606944e-05, - "loss": 0.877, - "step": 2195 - }, - { - "epoch": 10.972568578553616, - "grad_norm": 0.455078125, - "learning_rate": 4.028414082972141e-05, - "loss": 0.8892, - "step": 2200 - }, - { - "epoch": 10.997506234413965, - "grad_norm": 0.435546875, - "learning_rate": 3.981849768479517e-05, - "loss": 0.8722, - "step": 2205 - }, - { - "epoch": 10.997506234413965, - "eval_loss": 2.534595012664795, - "eval_runtime": 0.6862, - "eval_samples_per_second": 14.573, - "eval_steps_per_second": 1.457, - "step": 2205 - }, - { - "epoch": 11.022443890274314, - "grad_norm": 0.451171875, - "learning_rate": 3.935489146011869e-05, - "loss": 0.8622, - "step": 2210 - }, - { - "epoch": 11.047381546134662, - "grad_norm": 0.478515625, - "learning_rate": 3.8893337847040025e-05, - "loss": 0.8802, - "step": 2215 - }, - { - "epoch": 11.072319201995013, - "grad_norm": 0.453125, - "learning_rate": 3.843385246743417e-05, - "loss": 0.8698, - "step": 2220 - }, - { - "epoch": 11.097256857855362, - "grad_norm": 0.470703125, - "learning_rate": 3.7976450873174005e-05, - "loss": 0.8726, - "step": 2225 - }, - { - "epoch": 11.12219451371571, - "grad_norm": 0.4453125, - "learning_rate": 3.7521148545604e-05, - "loss": 0.8788, - "step": 2230 - }, - { - "epoch": 11.147132169576059, - "grad_norm": 0.4375, - "learning_rate": 3.7067960895016275e-05, - "loss": 0.8712, - "step": 2235 - }, - { - "epoch": 11.17206982543641, - "grad_norm": 0.494140625, - "learning_rate": 3.661690326012897e-05, - "loss": 0.8716, - "step": 2240 - }, - { - "epoch": 11.197007481296758, - "grad_norm": 0.447265625, - "learning_rate": 3.61679909075672e-05, - "loss": 0.8845, - "step": 2245 - }, - { - "epoch": 11.221945137157107, - "grad_norm": 0.44921875, - "learning_rate": 3.5721239031346066e-05, - "loss": 0.874, - "step": 2250 - }, - { - "epoch": 11.246882793017456, - "grad_norm": 0.416015625, - "learning_rate": 3.527666275235677e-05, - "loss": 0.8713, - "step": 2255 - }, - { - "epoch": 11.271820448877806, - "grad_norm": 0.44140625, - "learning_rate": 3.483427711785449e-05, - "loss": 0.8764, - "step": 2260 - }, - { - "epoch": 11.296758104738155, - "grad_norm": 0.484375, - "learning_rate": 3.439409710094929e-05, - "loss": 0.8751, - "step": 2265 - }, - { - "epoch": 11.321695760598503, - "grad_norm": 0.46875, - "learning_rate": 3.395613760009925e-05, - "loss": 0.8752, - "step": 2270 - }, - { - "epoch": 11.346633416458852, - "grad_norm": 0.640625, - "learning_rate": 3.352041343860621e-05, - "loss": 0.8738, - "step": 2275 - }, - { - "epoch": 11.371571072319203, - "grad_norm": 0.455078125, - "learning_rate": 3.308693936411421e-05, - "loss": 0.869, - "step": 2280 - }, - { - "epoch": 11.396508728179551, - "grad_norm": 0.474609375, - "learning_rate": 3.265573004810997e-05, - "loss": 0.8755, - "step": 2285 - }, - { - "epoch": 11.4214463840399, - "grad_norm": 0.484375, - "learning_rate": 3.222680008542678e-05, - "loss": 0.8732, - "step": 2290 - }, - { - "epoch": 11.446384039900249, - "grad_norm": 0.443359375, - "learning_rate": 3.1800163993750166e-05, - "loss": 0.8741, - "step": 2295 - }, - { - "epoch": 11.471321695760599, - "grad_norm": 0.451171875, - "learning_rate": 3.137583621312665e-05, - "loss": 0.8636, - "step": 2300 - }, - { - "epoch": 11.496259351620948, - "grad_norm": 0.447265625, - "learning_rate": 3.095383110547506e-05, - "loss": 0.8758, - "step": 2305 - }, - { - "epoch": 11.521197007481296, - "grad_norm": 0.46875, - "learning_rate": 3.053416295410026e-05, - "loss": 0.8626, - "step": 2310 - }, - { - "epoch": 11.546134663341645, - "grad_norm": 0.45703125, - "learning_rate": 3.0116845963209993e-05, - "loss": 0.8797, - "step": 2315 - }, - { - "epoch": 11.571072319201996, - "grad_norm": 0.47265625, - "learning_rate": 2.9701894257433826e-05, - "loss": 0.8623, - "step": 2320 - }, - { - "epoch": 11.596009975062344, - "grad_norm": 0.443359375, - "learning_rate": 2.9289321881345254e-05, - "loss": 0.8736, - "step": 2325 - }, - { - "epoch": 11.620947630922693, - "grad_norm": 0.455078125, - "learning_rate": 2.8879142798986292e-05, - "loss": 0.8656, - "step": 2330 - }, - { - "epoch": 11.645885286783042, - "grad_norm": 0.44921875, - "learning_rate": 2.8471370893394866e-05, - "loss": 0.8637, - "step": 2335 - }, - { - "epoch": 11.670822942643392, - "grad_norm": 0.44921875, - "learning_rate": 2.8066019966134904e-05, - "loss": 0.875, - "step": 2340 - }, - { - "epoch": 11.69576059850374, - "grad_norm": 0.45703125, - "learning_rate": 2.7663103736829198e-05, - "loss": 0.8727, - "step": 2345 - }, - { - "epoch": 11.72069825436409, - "grad_norm": 0.474609375, - "learning_rate": 2.7262635842695127e-05, - "loss": 0.8767, - "step": 2350 - }, - { - "epoch": 11.745635910224438, - "grad_norm": 0.4375, - "learning_rate": 2.6864629838082956e-05, - "loss": 0.8718, - "step": 2355 - }, - { - "epoch": 11.770573566084789, - "grad_norm": 0.453125, - "learning_rate": 2.6469099194017143e-05, - "loss": 0.8805, - "step": 2360 - }, - { - "epoch": 11.795511221945137, - "grad_norm": 0.4609375, - "learning_rate": 2.6076057297740407e-05, - "loss": 0.8701, - "step": 2365 - }, - { - "epoch": 11.820448877805486, - "grad_norm": 0.455078125, - "learning_rate": 2.5685517452260567e-05, - "loss": 0.8661, - "step": 2370 - }, - { - "epoch": 11.845386533665835, - "grad_norm": 0.44140625, - "learning_rate": 2.529749287590042e-05, - "loss": 0.8684, - "step": 2375 - }, - { - "epoch": 11.870324189526185, - "grad_norm": 0.453125, - "learning_rate": 2.491199670185008e-05, - "loss": 0.8673, - "step": 2380 - }, - { - "epoch": 11.895261845386534, - "grad_norm": 0.4453125, - "learning_rate": 2.45290419777228e-05, - "loss": 0.8707, - "step": 2385 - }, - { - "epoch": 11.920199501246882, - "grad_norm": 0.431640625, - "learning_rate": 2.4148641665113113e-05, - "loss": 0.8744, - "step": 2390 - }, - { - "epoch": 11.945137157107231, - "grad_norm": 0.46484375, - "learning_rate": 2.3770808639158216e-05, - "loss": 0.8767, - "step": 2395 - }, - { - "epoch": 11.970074812967582, - "grad_norm": 0.447265625, - "learning_rate": 2.339555568810221e-05, - "loss": 0.8761, - "step": 2400 - }, - { - "epoch": 11.99501246882793, - "grad_norm": 0.455078125, - "learning_rate": 2.302289551286321e-05, - "loss": 0.8768, - "step": 2405 - }, - { - "epoch": 12.0, - "eval_loss": 2.5409655570983887, - "eval_runtime": 0.5391, - "eval_samples_per_second": 18.549, - "eval_steps_per_second": 1.855, - "step": 2406 - }, - { - "epoch": 12.019950124688279, - "grad_norm": 0.498046875, - "learning_rate": 2.265284072660362e-05, - "loss": 0.8646, - "step": 2410 - }, - { - "epoch": 12.044887780548628, - "grad_norm": 0.494140625, - "learning_rate": 2.2285403854302912e-05, - "loss": 0.8762, - "step": 2415 - }, - { - "epoch": 12.069825436408978, - "grad_norm": 0.439453125, - "learning_rate": 2.192059733233408e-05, - "loss": 0.8769, - "step": 2420 - }, - { - "epoch": 12.094763092269327, - "grad_norm": 0.44921875, - "learning_rate": 2.155843350804243e-05, - "loss": 0.865, - "step": 2425 - }, - { - "epoch": 12.119700748129675, - "grad_norm": 0.46875, - "learning_rate": 2.119892463932781e-05, - "loss": 0.8751, - "step": 2430 - }, - { - "epoch": 12.144638403990024, - "grad_norm": 0.4453125, - "learning_rate": 2.0842082894229676e-05, - "loss": 0.8704, - "step": 2435 - }, - { - "epoch": 12.169576059850375, - "grad_norm": 0.439453125, - "learning_rate": 2.0487920350515212e-05, - "loss": 0.8698, - "step": 2440 - }, - { - "epoch": 12.194513715710723, - "grad_norm": 0.455078125, - "learning_rate": 2.013644899527074e-05, - "loss": 0.8773, - "step": 2445 - }, - { - "epoch": 12.219451371571072, - "grad_norm": 0.4375, - "learning_rate": 1.9787680724495617e-05, - "loss": 0.869, - "step": 2450 - }, - { - "epoch": 12.24438902743142, - "grad_norm": 0.478515625, - "learning_rate": 1.9441627342700065e-05, - "loss": 0.8817, - "step": 2455 - }, - { - "epoch": 12.269326683291771, - "grad_norm": 0.447265625, - "learning_rate": 1.9098300562505266e-05, - "loss": 0.8671, - "step": 2460 - }, - { - "epoch": 12.29426433915212, - "grad_norm": 0.431640625, - "learning_rate": 1.8757712004247096e-05, - "loss": 0.8663, - "step": 2465 - }, - { - "epoch": 12.319201995012468, - "grad_norm": 0.478515625, - "learning_rate": 1.8419873195582814e-05, - "loss": 0.8712, - "step": 2470 - }, - { - "epoch": 12.344139650872817, - "grad_norm": 0.4609375, - "learning_rate": 1.808479557110081e-05, - "loss": 0.8737, - "step": 2475 - }, - { - "epoch": 12.369077306733168, - "grad_norm": 0.482421875, - "learning_rate": 1.775249047193377e-05, - "loss": 0.866, - "step": 2480 - }, - { - "epoch": 12.394014962593516, - "grad_norm": 0.48046875, - "learning_rate": 1.7422969145374592e-05, - "loss": 0.8699, - "step": 2485 - }, - { - "epoch": 12.418952618453865, - "grad_norm": 0.455078125, - "learning_rate": 1.7096242744495837e-05, - "loss": 0.875, - "step": 2490 - }, - { - "epoch": 12.443890274314214, - "grad_norm": 0.4609375, - "learning_rate": 1.677232232777224e-05, - "loss": 0.8704, - "step": 2495 - }, - { - "epoch": 12.468827930174564, - "grad_norm": 0.4375, - "learning_rate": 1.6451218858706374e-05, - "loss": 0.869, - "step": 2500 - }, - { - "epoch": 12.493765586034913, - "grad_norm": 0.4453125, - "learning_rate": 1.6132943205457606e-05, - "loss": 0.8732, - "step": 2505 - }, - { - "epoch": 12.518703241895262, - "grad_norm": 0.4375, - "learning_rate": 1.5817506140474247e-05, - "loss": 0.8706, - "step": 2510 - }, - { - "epoch": 12.54364089775561, - "grad_norm": 0.455078125, - "learning_rate": 1.550491834012898e-05, - "loss": 0.8665, - "step": 2515 - }, - { - "epoch": 12.56857855361596, - "grad_norm": 0.4765625, - "learning_rate": 1.5195190384357404e-05, - "loss": 0.8715, - "step": 2520 - }, - { - "epoch": 12.59351620947631, - "grad_norm": 0.44921875, - "learning_rate": 1.4888332756300027e-05, - "loss": 0.873, - "step": 2525 - }, - { - "epoch": 12.618453865336658, - "grad_norm": 0.44140625, - "learning_rate": 1.458435584194745e-05, - "loss": 0.8663, - "step": 2530 - }, - { - "epoch": 12.643391521197007, - "grad_norm": 0.453125, - "learning_rate": 1.4283269929788779e-05, - "loss": 0.8715, - "step": 2535 - }, - { - "epoch": 12.668329177057357, - "grad_norm": 0.447265625, - "learning_rate": 1.3985085210463477e-05, - "loss": 0.8625, - "step": 2540 - }, - { - "epoch": 12.693266832917706, - "grad_norm": 0.484375, - "learning_rate": 1.3689811776416362e-05, - "loss": 0.872, - "step": 2545 - }, - { - "epoch": 12.718204488778055, - "grad_norm": 0.43359375, - "learning_rate": 1.339745962155613e-05, - "loss": 0.8621, - "step": 2550 - }, - { - "epoch": 12.743142144638403, - "grad_norm": 0.443359375, - "learning_rate": 1.3108038640916986e-05, - "loss": 0.8709, - "step": 2555 - }, - { - "epoch": 12.768079800498754, - "grad_norm": 0.46875, - "learning_rate": 1.2821558630323772e-05, - "loss": 0.8675, - "step": 2560 - }, - { - "epoch": 12.793017456359102, - "grad_norm": 0.43359375, - "learning_rate": 1.2538029286060426e-05, - "loss": 0.8683, - "step": 2565 - }, - { - "epoch": 12.817955112219451, - "grad_norm": 0.45703125, - "learning_rate": 1.2257460204541794e-05, - "loss": 0.8646, - "step": 2570 - }, - { - "epoch": 12.8428927680798, - "grad_norm": 0.447265625, - "learning_rate": 1.1979860881988902e-05, - "loss": 0.8703, - "step": 2575 - }, - { - "epoch": 12.86783042394015, - "grad_norm": 0.45703125, - "learning_rate": 1.1705240714107302e-05, - "loss": 0.8732, - "step": 2580 - }, - { - "epoch": 12.892768079800499, - "grad_norm": 0.455078125, - "learning_rate": 1.1433608995769395e-05, - "loss": 0.8722, - "step": 2585 - }, - { - "epoch": 12.917705735660848, - "grad_norm": 0.447265625, - "learning_rate": 1.116497492069961e-05, - "loss": 0.8724, - "step": 2590 - }, - { - "epoch": 12.942643391521196, - "grad_norm": 0.42578125, - "learning_rate": 1.0899347581163221e-05, - "loss": 0.867, - "step": 2595 - }, - { - "epoch": 12.967581047381547, - "grad_norm": 0.4609375, - "learning_rate": 1.0636735967658784e-05, - "loss": 0.8667, - "step": 2600 - }, - { - "epoch": 12.992518703241895, - "grad_norm": 0.435546875, - "learning_rate": 1.0377148968613658e-05, - "loss": 0.8694, - "step": 2605 - }, - { - "epoch": 12.997506234413965, - "eval_loss": 2.5414979457855225, - "eval_runtime": 0.6532, - "eval_samples_per_second": 15.31, - "eval_steps_per_second": 1.531, - "step": 2606 - }, - { - "epoch": 13.017456359102244, - "grad_norm": 0.439453125, - "learning_rate": 1.0120595370083318e-05, - "loss": 0.8778, - "step": 2610 - }, - { - "epoch": 13.042394014962593, - "grad_norm": 0.486328125, - "learning_rate": 9.867083855453774e-06, - "loss": 0.8707, - "step": 2615 - }, - { - "epoch": 13.067331670822943, - "grad_norm": 0.451171875, - "learning_rate": 9.616623005147951e-06, - "loss": 0.8675, - "step": 2620 - }, - { - "epoch": 13.092269326683292, - "grad_norm": 0.4375, - "learning_rate": 9.369221296335006e-06, - "loss": 0.8655, - "step": 2625 - }, - { - "epoch": 13.11720698254364, - "grad_norm": 0.43359375, - "learning_rate": 9.124887102643575e-06, - "loss": 0.8662, - "step": 2630 - }, - { - "epoch": 13.14214463840399, - "grad_norm": 0.45703125, - "learning_rate": 8.883628693878298e-06, - "loss": 0.8656, - "step": 2635 - }, - { - "epoch": 13.16708229426434, - "grad_norm": 0.455078125, - "learning_rate": 8.645454235739903e-06, - "loss": 0.8677, - "step": 2640 - }, - { - "epoch": 13.192019950124688, - "grad_norm": 0.474609375, - "learning_rate": 8.41037178954891e-06, - "loss": 0.8673, - "step": 2645 - }, - { - "epoch": 13.216957605985037, - "grad_norm": 0.46875, - "learning_rate": 8.178389311972612e-06, - "loss": 0.8726, - "step": 2650 - }, - { - "epoch": 13.241895261845386, - "grad_norm": 0.47265625, - "learning_rate": 7.949514654755962e-06, - "loss": 0.8622, - "step": 2655 - }, - { - "epoch": 13.266832917705736, - "grad_norm": 0.4453125, - "learning_rate": 7.72375556445577e-06, - "loss": 0.8703, - "step": 2660 - }, - { - "epoch": 13.291770573566085, - "grad_norm": 0.439453125, - "learning_rate": 7.501119682178393e-06, - "loss": 0.8723, - "step": 2665 - }, - { - "epoch": 13.316708229426434, - "grad_norm": 0.4453125, - "learning_rate": 7.281614543321269e-06, - "loss": 0.8673, - "step": 2670 - }, - { - "epoch": 13.341645885286782, - "grad_norm": 0.431640625, - "learning_rate": 7.0652475773177464e-06, - "loss": 0.867, - "step": 2675 - }, - { - "epoch": 13.366583541147133, - "grad_norm": 0.4453125, - "learning_rate": 6.852026107385756e-06, - "loss": 0.8627, - "step": 2680 - }, - { - "epoch": 13.391521197007481, - "grad_norm": 0.453125, - "learning_rate": 6.6419573502798374e-06, - "loss": 0.8693, - "step": 2685 - }, - { - "epoch": 13.41645885286783, - "grad_norm": 0.44921875, - "learning_rate": 6.435048416046863e-06, - "loss": 0.8745, - "step": 2690 - }, - { - "epoch": 13.441396508728179, - "grad_norm": 0.4453125, - "learning_rate": 6.231306307785522e-06, - "loss": 0.8654, - "step": 2695 - }, - { - "epoch": 13.46633416458853, - "grad_norm": 0.451171875, - "learning_rate": 6.030737921409169e-06, - "loss": 0.8763, - "step": 2700 - }, - { - "epoch": 13.491271820448878, - "grad_norm": 0.443359375, - "learning_rate": 5.833350045412478e-06, - "loss": 0.8773, - "step": 2705 - }, - { - "epoch": 13.516209476309227, - "grad_norm": 0.466796875, - "learning_rate": 5.639149360641649e-06, - "loss": 0.8763, - "step": 2710 - }, - { - "epoch": 13.541147132169575, - "grad_norm": 0.478515625, - "learning_rate": 5.448142440068316e-06, - "loss": 0.8732, - "step": 2715 - }, - { - "epoch": 13.566084788029926, - "grad_norm": 0.42578125, - "learning_rate": 5.26033574856708e-06, - "loss": 0.8718, - "step": 2720 - }, - { - "epoch": 13.591022443890274, - "grad_norm": 0.447265625, - "learning_rate": 5.075735642696611e-06, - "loss": 0.8776, - "step": 2725 - }, - { - "epoch": 13.615960099750623, - "grad_norm": 0.4453125, - "learning_rate": 4.8943483704846475e-06, - "loss": 0.8643, - "step": 2730 - }, - { - "epoch": 13.640897755610972, - "grad_norm": 0.443359375, - "learning_rate": 4.7161800712163806e-06, - "loss": 0.8767, - "step": 2735 - }, - { - "epoch": 13.665835411471322, - "grad_norm": 0.435546875, - "learning_rate": 4.541236775226809e-06, - "loss": 0.8656, - "step": 2740 - }, - { - "epoch": 13.690773067331671, - "grad_norm": 0.45703125, - "learning_rate": 4.369524403696457e-06, - "loss": 0.8735, - "step": 2745 - }, - { - "epoch": 13.71571072319202, - "grad_norm": 0.451171875, - "learning_rate": 4.20104876845111e-06, - "loss": 0.8678, - "step": 2750 - }, - { - "epoch": 13.740648379052368, - "grad_norm": 0.44140625, - "learning_rate": 4.0358155717650895e-06, - "loss": 0.8591, - "step": 2755 - }, - { - "epoch": 13.765586034912719, - "grad_norm": 0.462890625, - "learning_rate": 3.873830406168111e-06, - "loss": 0.8646, - "step": 2760 - }, - { - "epoch": 13.790523690773068, - "grad_norm": 0.451171875, - "learning_rate": 3.715098754256241e-06, - "loss": 0.8643, - "step": 2765 - }, - { - "epoch": 13.815461346633416, - "grad_norm": 0.451171875, - "learning_rate": 3.5596259885061102e-06, - "loss": 0.8702, - "step": 2770 - }, - { - "epoch": 13.840399002493765, - "grad_norm": 0.447265625, - "learning_rate": 3.40741737109318e-06, - "loss": 0.8669, - "step": 2775 - }, - { - "epoch": 13.865336658354115, - "grad_norm": 0.486328125, - "learning_rate": 3.2584780537136207e-06, - "loss": 0.8818, - "step": 2780 - }, - { - "epoch": 13.890274314214464, - "grad_norm": 0.447265625, - "learning_rate": 3.1128130774099262e-06, - "loss": 0.8771, - "step": 2785 - }, - { - "epoch": 13.915211970074813, - "grad_norm": 0.458984375, - "learning_rate": 2.970427372400353e-06, - "loss": 0.8643, - "step": 2790 - }, - { - "epoch": 13.940149625935161, - "grad_norm": 0.453125, - "learning_rate": 2.8313257579119843e-06, - "loss": 0.8569, - "step": 2795 - }, - { - "epoch": 13.965087281795512, - "grad_norm": 0.46484375, - "learning_rate": 2.6955129420176196e-06, - "loss": 0.8664, - "step": 2800 - }, - { - "epoch": 13.99002493765586, - "grad_norm": 0.4453125, - "learning_rate": 2.5629935214764865e-06, - "loss": 0.8709, - "step": 2805 - }, - { - "epoch": 14.0, - "eval_loss": 2.5418429374694824, - "eval_runtime": 0.5336, - "eval_samples_per_second": 18.742, - "eval_steps_per_second": 1.874, - "step": 2807 - }, - { - "epoch": 14.01496259351621, - "grad_norm": 0.44140625, - "learning_rate": 2.433771981578581e-06, - "loss": 0.8642, - "step": 2810 - }, - { - "epoch": 14.039900249376558, - "grad_norm": 0.447265625, - "learning_rate": 2.307852695992907e-06, - "loss": 0.8614, - "step": 2815 - }, - { - "epoch": 14.064837905236908, - "grad_norm": 0.447265625, - "learning_rate": 2.1852399266194314e-06, - "loss": 0.8539, - "step": 2820 - }, - { - "epoch": 14.089775561097257, - "grad_norm": 0.4296875, - "learning_rate": 2.0659378234448525e-06, - "loss": 0.867, - "step": 2825 - }, - { - "epoch": 14.114713216957606, - "grad_norm": 0.4609375, - "learning_rate": 1.9499504244020693e-06, - "loss": 0.8612, - "step": 2830 - }, - { - "epoch": 14.139650872817954, - "grad_norm": 0.443359375, - "learning_rate": 1.8372816552336026e-06, - "loss": 0.8633, - "step": 2835 - }, - { - "epoch": 14.164588528678305, - "grad_norm": 0.44140625, - "learning_rate": 1.7279353293586765e-06, - "loss": 0.8609, - "step": 2840 - }, - { - "epoch": 14.189526184538654, - "grad_norm": 0.4453125, - "learning_rate": 1.621915147744124e-06, - "loss": 0.8658, - "step": 2845 - }, - { - "epoch": 14.214463840399002, - "grad_norm": 0.47265625, - "learning_rate": 1.5192246987791981e-06, - "loss": 0.8608, - "step": 2850 - }, - { - "epoch": 14.239401496259351, - "grad_norm": 0.4296875, - "learning_rate": 1.4198674581540339e-06, - "loss": 0.8689, - "step": 2855 - }, - { - "epoch": 14.264339152119701, - "grad_norm": 0.44921875, - "learning_rate": 1.323846788742078e-06, - "loss": 0.8778, - "step": 2860 - }, - { - "epoch": 14.28927680798005, - "grad_norm": 0.44921875, - "learning_rate": 1.231165940486234e-06, - "loss": 0.8685, - "step": 2865 - }, - { - "epoch": 14.314214463840399, - "grad_norm": 0.4609375, - "learning_rate": 1.14182805028884e-06, - "loss": 0.8658, - "step": 2870 - }, - { - "epoch": 14.339152119700747, - "grad_norm": 0.439453125, - "learning_rate": 1.055836141905553e-06, - "loss": 0.8771, - "step": 2875 - }, - { - "epoch": 14.364089775561098, - "grad_norm": 0.44140625, - "learning_rate": 9.731931258429638e-07, - "loss": 0.8816, - "step": 2880 - }, - { - "epoch": 14.389027431421447, - "grad_norm": 0.4453125, - "learning_rate": 8.939017992601328e-07, - "loss": 0.8699, - "step": 2885 - }, - { - "epoch": 14.413965087281795, - "grad_norm": 0.447265625, - "learning_rate": 8.17964845873831e-07, - "loss": 0.8734, - "step": 2890 - }, - { - "epoch": 14.438902743142144, - "grad_norm": 0.462890625, - "learning_rate": 7.453848358678017e-07, - "loss": 0.8724, - "step": 2895 - }, - { - "epoch": 14.463840399002494, - "grad_norm": 0.453125, - "learning_rate": 6.761642258056978e-07, - "loss": 0.8784, - "step": 2900 - }, - { - "epoch": 14.488778054862843, - "grad_norm": 0.455078125, - "learning_rate": 6.103053585480023e-07, - "loss": 0.8644, - "step": 2905 - }, - { - "epoch": 14.513715710723192, - "grad_norm": 0.435546875, - "learning_rate": 5.478104631726711e-07, - "loss": 0.8615, - "step": 2910 - }, - { - "epoch": 14.53865336658354, - "grad_norm": 0.4453125, - "learning_rate": 4.88681654899692e-07, - "loss": 0.8662, - "step": 2915 - }, - { - "epoch": 14.563591022443891, - "grad_norm": 0.4765625, - "learning_rate": 4.329209350195651e-07, - "loss": 0.8694, - "step": 2920 - }, - { - "epoch": 14.58852867830424, - "grad_norm": 0.447265625, - "learning_rate": 3.805301908254455e-07, - "loss": 0.8729, - "step": 2925 - }, - { - "epoch": 14.613466334164588, - "grad_norm": 0.431640625, - "learning_rate": 3.315111955493944e-07, - "loss": 0.8758, - "step": 2930 - }, - { - "epoch": 14.638403990024937, - "grad_norm": 0.490234375, - "learning_rate": 2.858656083022604e-07, - "loss": 0.8569, - "step": 2935 - }, - { - "epoch": 14.663341645885287, - "grad_norm": 0.447265625, - "learning_rate": 2.4359497401758024e-07, - "loss": 0.8734, - "step": 2940 - }, - { - "epoch": 14.688279301745636, - "grad_norm": 0.458984375, - "learning_rate": 2.0470072339926484e-07, - "loss": 0.868, - "step": 2945 - }, - { - "epoch": 14.713216957605985, - "grad_norm": 0.51171875, - "learning_rate": 1.6918417287318245e-07, - "loss": 0.8797, - "step": 2950 - }, - { - "epoch": 14.738154613466333, - "grad_norm": 0.43359375, - "learning_rate": 1.3704652454261668e-07, - "loss": 0.8734, - "step": 2955 - }, - { - "epoch": 14.763092269326684, - "grad_norm": 0.44140625, - "learning_rate": 1.0828886614754341e-07, - "loss": 0.88, - "step": 2960 - }, - { - "epoch": 14.788029925187033, - "grad_norm": 0.494140625, - "learning_rate": 8.29121710278713e-08, - "loss": 0.8667, - "step": 2965 - }, - { - "epoch": 14.812967581047381, - "grad_norm": 0.458984375, - "learning_rate": 6.09172980904238e-08, - "loss": 0.8642, - "step": 2970 - }, - { - "epoch": 14.83790523690773, - "grad_norm": 0.431640625, - "learning_rate": 4.230499177994007e-08, - "loss": 0.8733, - "step": 2975 - }, - { - "epoch": 14.86284289276808, - "grad_norm": 0.431640625, - "learning_rate": 2.7075882053828605e-08, - "loss": 0.8704, - "step": 2980 - }, - { - "epoch": 14.88778054862843, - "grad_norm": 0.455078125, - "learning_rate": 1.5230484360873044e-08, - "loss": 0.8678, - "step": 2985 - }, - { - "epoch": 14.912718204488778, - "grad_norm": 0.4296875, - "learning_rate": 6.769199623779532e-09, - "loss": 0.8684, - "step": 2990 - }, - { - "epoch": 14.937655860349127, - "grad_norm": 0.44921875, - "learning_rate": 1.6923142255764746e-09, - "loss": 0.8586, - "step": 2995 - }, - { - "epoch": 14.962593516209477, - "grad_norm": 0.44140625, - "learning_rate": 0.0, - "loss": 0.8781, - "step": 3000 - }, - { - "epoch": 14.962593516209477, - "eval_loss": 2.542728900909424, - "eval_runtime": 0.4926, - "eval_samples_per_second": 20.301, - "eval_steps_per_second": 2.03, - "step": 3000 - }, - { - "epoch": 14.962593516209477, - "step": 3000, - "total_flos": 1.7621200159784305e+18, - "train_loss": 0.9822600702444713, - "train_runtime": 10311.9233, - "train_samples_per_second": 13.994, - "train_steps_per_second": 0.291 + "epoch": 9.975062344139651, + "step": 2000, + "total_flos": 1.1920978083462513e+18, + "train_loss": 0.9040022449493408, + "train_runtime": 12134.7289, + "train_samples_per_second": 7.919, + "train_steps_per_second": 0.165 } ], "logging_steps": 5, - "max_steps": 3000, + "max_steps": 2000, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 10, "save_steps": 100, - "total_flos": 1.7621200159784305e+18, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1920978083462513e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null