{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.10345, "eval_steps": 400, "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-05, "loss": 2.4874, "step": 25 }, { "epoch": 0.0, "learning_rate": 0.0001, "loss": 1.5625, "step": 50 }, { "epoch": 0.0, "learning_rate": 0.00015, "loss": 0.8084, "step": 75 }, { "epoch": 0.01, "learning_rate": 0.0002, "loss": 0.181, "step": 100 }, { "epoch": 0.01, "learning_rate": 0.00025, "loss": 0.1378, "step": 125 }, { "epoch": 0.01, "learning_rate": 0.0003, "loss": 0.1267, "step": 150 }, { "epoch": 0.01, "learning_rate": 0.00035, "loss": 0.1118, "step": 175 }, { "epoch": 0.01, "learning_rate": 0.0004, "loss": 0.1189, "step": 200 }, { "epoch": 0.01, "learning_rate": 0.00045000000000000004, "loss": 0.11, "step": 225 }, { "epoch": 0.01, "learning_rate": 0.0005, "loss": 0.1131, "step": 250 }, { "epoch": 0.01, "learning_rate": 0.00055, "loss": 0.1196, "step": 275 }, { "epoch": 0.01, "learning_rate": 0.0006, "loss": 0.0998, "step": 300 }, { "epoch": 0.02, "learning_rate": 0.0006500000000000001, "loss": 0.1087, "step": 325 }, { "epoch": 0.02, "learning_rate": 0.0007, "loss": 0.1098, "step": 350 }, { "epoch": 0.02, "learning_rate": 0.00075, "loss": 0.1138, "step": 375 }, { "epoch": 0.02, "learning_rate": 0.0008, "loss": 0.1212, "step": 400 }, { "epoch": 0.02, "eval_loss": 0.5537986159324646, "eval_runtime": 216.4127, "eval_samples_per_second": 4.621, "eval_steps_per_second": 0.291, "step": 400 }, { "epoch": 0.02, "learning_rate": 0.00085, "loss": 0.1228, "step": 425 }, { "epoch": 0.02, "learning_rate": 0.0009000000000000001, "loss": 0.1177, "step": 450 }, { "epoch": 0.02, "learning_rate": 0.00095, "loss": 0.1196, "step": 475 }, { "epoch": 0.03, "learning_rate": 0.001, "loss": 0.1214, "step": 500 }, { "epoch": 0.03, "learning_rate": 0.0009987179487179487, "loss": 0.1415, "step": 525 }, { "epoch": 0.03, "learning_rate": 0.0009974358974358974, "loss": 0.1491, "step": 550 }, { "epoch": 0.03, "learning_rate": 0.0009961538461538463, "loss": 0.1467, "step": 575 }, { "epoch": 0.03, "learning_rate": 0.000994871794871795, "loss": 0.1455, "step": 600 }, { "epoch": 0.03, "learning_rate": 0.0009935897435897436, "loss": 0.1393, "step": 625 }, { "epoch": 0.03, "learning_rate": 0.0009923076923076923, "loss": 0.1355, "step": 650 }, { "epoch": 0.03, "learning_rate": 0.0009910256410256412, "loss": 0.1299, "step": 675 }, { "epoch": 0.04, "learning_rate": 0.0009897435897435899, "loss": 0.124, "step": 700 }, { "epoch": 0.04, "learning_rate": 0.0009884615384615385, "loss": 0.1209, "step": 725 }, { "epoch": 0.04, "learning_rate": 0.0009871794871794872, "loss": 0.1247, "step": 750 }, { "epoch": 0.04, "learning_rate": 0.0009858974358974359, "loss": 0.1238, "step": 775 }, { "epoch": 0.04, "learning_rate": 0.0009846153846153848, "loss": 0.1211, "step": 800 }, { "epoch": 0.04, "eval_loss": 1.2810015678405762, "eval_runtime": 217.0046, "eval_samples_per_second": 4.608, "eval_steps_per_second": 0.29, "step": 800 }, { "epoch": 0.04, "learning_rate": 0.0009833333333333332, "loss": 0.1226, "step": 825 }, { "epoch": 0.04, "learning_rate": 0.0009820512820512821, "loss": 0.1267, "step": 850 }, { "epoch": 0.04, "learning_rate": 0.000980820512820513, "loss": 0.1039, "step": 875 }, { "epoch": 0.04, "learning_rate": 0.0009795384615384616, "loss": 0.1142, "step": 900 }, { "epoch": 0.05, "learning_rate": 0.0009782564102564103, "loss": 0.1062, "step": 925 }, { "epoch": 0.05, "learning_rate": 0.000976974358974359, "loss": 0.115, "step": 950 }, { "epoch": 0.05, "learning_rate": 0.0009756923076923077, "loss": 0.1131, "step": 975 }, { "epoch": 0.05, "learning_rate": 0.0009744102564102564, "loss": 0.1173, "step": 1000 }, { "epoch": 0.05, "learning_rate": 0.0009731282051282051, "loss": 0.1114, "step": 1025 }, { "epoch": 0.05, "learning_rate": 0.0009718461538461539, "loss": 0.1123, "step": 1050 }, { "epoch": 0.05, "learning_rate": 0.0009705641025641025, "loss": 0.1258, "step": 1075 }, { "epoch": 0.06, "learning_rate": 0.0009692820512820512, "loss": 0.1168, "step": 1100 }, { "epoch": 0.06, "learning_rate": 0.000968, "loss": 0.1193, "step": 1125 }, { "epoch": 0.06, "learning_rate": 0.0009667179487179487, "loss": 0.1216, "step": 1150 }, { "epoch": 0.06, "learning_rate": 0.0009654358974358975, "loss": 0.1257, "step": 1175 }, { "epoch": 0.06, "learning_rate": 0.0009641538461538461, "loss": 0.1219, "step": 1200 }, { "epoch": 0.06, "eval_loss": 1.4006367921829224, "eval_runtime": 216.1116, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.292, "step": 1200 }, { "epoch": 0.06, "learning_rate": 0.0009628717948717949, "loss": 0.1096, "step": 1225 }, { "epoch": 0.06, "learning_rate": 0.0009615897435897436, "loss": 0.1234, "step": 1250 }, { "epoch": 0.06, "learning_rate": 0.0009603076923076923, "loss": 0.1406, "step": 1275 }, { "epoch": 0.07, "learning_rate": 0.0009590256410256411, "loss": 0.1189, "step": 1300 }, { "epoch": 0.07, "learning_rate": 0.0009577435897435897, "loss": 0.1211, "step": 1325 }, { "epoch": 0.07, "learning_rate": 0.0009564615384615385, "loss": 0.1153, "step": 1350 }, { "epoch": 0.07, "learning_rate": 0.0009551794871794872, "loss": 0.1161, "step": 1375 }, { "epoch": 0.07, "learning_rate": 0.000953897435897436, "loss": 0.1179, "step": 1400 }, { "epoch": 0.07, "learning_rate": 0.0009526153846153847, "loss": 0.1102, "step": 1425 }, { "epoch": 0.07, "learning_rate": 0.0009513333333333334, "loss": 0.123, "step": 1450 }, { "epoch": 0.07, "learning_rate": 0.0009500512820512821, "loss": 0.1386, "step": 1475 }, { "epoch": 0.07, "learning_rate": 0.0009487692307692308, "loss": 0.0922, "step": 1500 }, { "epoch": 0.08, "learning_rate": 0.0009474871794871796, "loss": 0.1243, "step": 1525 }, { "epoch": 0.08, "learning_rate": 0.0009462051282051282, "loss": 0.0956, "step": 1550 }, { "epoch": 0.08, "learning_rate": 0.000944923076923077, "loss": 0.1027, "step": 1575 }, { "epoch": 0.08, "learning_rate": 0.0009436410256410256, "loss": 0.1142, "step": 1600 }, { "epoch": 0.08, "eval_loss": 0.10953158885240555, "eval_runtime": 216.1306, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.291, "step": 1600 }, { "epoch": 0.08, "learning_rate": 0.0009423589743589744, "loss": 0.117, "step": 1625 }, { "epoch": 0.08, "learning_rate": 0.0009410769230769231, "loss": 0.1146, "step": 1650 }, { "epoch": 0.08, "learning_rate": 0.0009397948717948717, "loss": 0.0974, "step": 1675 }, { "epoch": 0.09, "learning_rate": 0.0009385128205128205, "loss": 0.1041, "step": 1700 }, { "epoch": 0.09, "learning_rate": 0.0009372307692307692, "loss": 0.1048, "step": 1725 }, { "epoch": 0.09, "learning_rate": 0.000935948717948718, "loss": 0.0934, "step": 1750 }, { "epoch": 0.09, "learning_rate": 0.0009346666666666667, "loss": 0.0982, "step": 1775 }, { "epoch": 0.09, "learning_rate": 0.0009333846153846154, "loss": 0.1014, "step": 1800 }, { "epoch": 0.09, "learning_rate": 0.0009321025641025641, "loss": 0.1304, "step": 1825 }, { "epoch": 0.09, "learning_rate": 0.0009308205128205128, "loss": 0.0995, "step": 1850 }, { "epoch": 0.09, "learning_rate": 0.0009295384615384616, "loss": 0.0912, "step": 1875 }, { "epoch": 0.1, "learning_rate": 0.0009282564102564102, "loss": 0.0963, "step": 1900 }, { "epoch": 0.1, "learning_rate": 0.000926974358974359, "loss": 0.1051, "step": 1925 }, { "epoch": 0.1, "learning_rate": 0.0009256923076923077, "loss": 0.0987, "step": 1950 }, { "epoch": 0.1, "learning_rate": 0.0009244102564102565, "loss": 0.0974, "step": 1975 }, { "epoch": 0.1, "learning_rate": 0.0009231282051282052, "loss": 0.1087, "step": 2000 }, { "epoch": 0.1, "eval_loss": 0.10780761390924454, "eval_runtime": 216.1847, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.291, "step": 2000 }, { "epoch": 0.1, "learning_rate": 0.0009218461538461538, "loss": 0.1035, "step": 2025 }, { "epoch": 0.1, "learning_rate": 0.0009205641025641026, "loss": 0.1094, "step": 2050 }, { "epoch": 0.1, "learning_rate": 0.0009192820512820513, "loss": 0.0851, "step": 2075 }, { "epoch": 0.1, "learning_rate": 0.0009180000000000001, "loss": 0.0741, "step": 2100 }, { "epoch": 0.11, "learning_rate": 0.0009167179487179488, "loss": 0.0881, "step": 2125 }, { "epoch": 0.11, "learning_rate": 0.0009154358974358975, "loss": 0.0794, "step": 2150 }, { "epoch": 0.11, "learning_rate": 0.0009141538461538462, "loss": 0.1199, "step": 2175 }, { "epoch": 0.11, "learning_rate": 0.0009128717948717948, "loss": 0.1307, "step": 2200 }, { "epoch": 0.11, "learning_rate": 0.0009115897435897436, "loss": 0.108, "step": 2225 }, { "epoch": 0.11, "learning_rate": 0.0009103076923076923, "loss": 0.078, "step": 2250 }, { "epoch": 0.11, "learning_rate": 0.000909025641025641, "loss": 0.0853, "step": 2275 }, { "epoch": 0.12, "learning_rate": 0.0009077435897435897, "loss": 0.0852, "step": 2300 }, { "epoch": 0.12, "learning_rate": 0.0009064615384615385, "loss": 0.0856, "step": 2325 }, { "epoch": 0.12, "learning_rate": 0.0009051794871794872, "loss": 0.0663, "step": 2350 }, { "epoch": 0.12, "learning_rate": 0.0009038974358974358, "loss": 0.0818, "step": 2375 }, { "epoch": 0.12, "learning_rate": 0.0009026153846153846, "loss": 0.0724, "step": 2400 }, { "epoch": 0.12, "eval_loss": 0.11282841861248016, "eval_runtime": 215.7164, "eval_samples_per_second": 4.636, "eval_steps_per_second": 0.292, "step": 2400 }, { "epoch": 0.12, "learning_rate": 0.0009013333333333333, "loss": 0.0824, "step": 2425 }, { "epoch": 0.12, "learning_rate": 0.0009000512820512821, "loss": 0.091, "step": 2450 }, { "epoch": 0.12, "learning_rate": 0.0008987692307692308, "loss": 0.0697, "step": 2475 }, { "epoch": 0.12, "learning_rate": 0.0008974871794871795, "loss": 0.076, "step": 2500 }, { "epoch": 0.13, "learning_rate": 0.0008962051282051282, "loss": 0.0769, "step": 2525 }, { "epoch": 0.13, "learning_rate": 0.0008949230769230769, "loss": 0.0724, "step": 2550 }, { "epoch": 0.13, "learning_rate": 0.0008936410256410257, "loss": 0.068, "step": 2575 }, { "epoch": 0.13, "learning_rate": 0.0008923589743589744, "loss": 0.0632, "step": 2600 }, { "epoch": 0.13, "learning_rate": 0.0008910769230769231, "loss": 0.0825, "step": 2625 }, { "epoch": 0.13, "learning_rate": 0.0008897948717948718, "loss": 0.0831, "step": 2650 }, { "epoch": 0.13, "learning_rate": 0.0008885128205128206, "loss": 0.0689, "step": 2675 }, { "epoch": 0.14, "learning_rate": 0.0008872307692307693, "loss": 0.0741, "step": 2700 }, { "epoch": 0.14, "learning_rate": 0.000885948717948718, "loss": 0.0798, "step": 2725 }, { "epoch": 0.14, "learning_rate": 0.0008846666666666667, "loss": 0.0817, "step": 2750 }, { "epoch": 0.14, "learning_rate": 0.0008833846153846154, "loss": 0.081, "step": 2775 }, { "epoch": 0.14, "learning_rate": 0.0008821025641025642, "loss": 0.0867, "step": 2800 }, { "epoch": 0.14, "eval_loss": 0.10396511852741241, "eval_runtime": 216.3083, "eval_samples_per_second": 4.623, "eval_steps_per_second": 0.291, "step": 2800 }, { "epoch": 0.14, "learning_rate": 0.0008808205128205128, "loss": 0.091, "step": 2825 }, { "epoch": 0.14, "learning_rate": 0.0008795384615384616, "loss": 0.0801, "step": 2850 }, { "epoch": 0.14, "learning_rate": 0.0008782564102564102, "loss": 0.1067, "step": 2875 }, { "epoch": 0.14, "learning_rate": 0.0008769743589743589, "loss": 0.0848, "step": 2900 }, { "epoch": 0.15, "learning_rate": 0.0008756923076923077, "loss": 0.0846, "step": 2925 }, { "epoch": 0.15, "learning_rate": 0.0008744102564102564, "loss": 0.076, "step": 2950 }, { "epoch": 0.15, "learning_rate": 0.0008731282051282051, "loss": 0.0729, "step": 2975 }, { "epoch": 0.15, "learning_rate": 0.0008718461538461538, "loss": 0.0994, "step": 3000 }, { "epoch": 0.15, "learning_rate": 0.0008705641025641026, "loss": 0.0901, "step": 3025 }, { "epoch": 0.15, "learning_rate": 0.0008692820512820513, "loss": 0.0735, "step": 3050 }, { "epoch": 0.15, "learning_rate": 0.0008680000000000001, "loss": 0.0796, "step": 3075 }, { "epoch": 0.15, "learning_rate": 0.0008667179487179487, "loss": 0.0888, "step": 3100 }, { "epoch": 0.16, "learning_rate": 0.0008654358974358974, "loss": 0.09, "step": 3125 }, { "epoch": 0.16, "learning_rate": 0.0008641538461538462, "loss": 0.0698, "step": 3150 }, { "epoch": 0.16, "learning_rate": 0.0008628717948717949, "loss": 0.074, "step": 3175 }, { "epoch": 0.16, "learning_rate": 0.0008615897435897437, "loss": 0.0644, "step": 3200 }, { "epoch": 0.16, "eval_loss": 0.09922421723604202, "eval_runtime": 218.0471, "eval_samples_per_second": 4.586, "eval_steps_per_second": 0.289, "step": 3200 }, { "epoch": 0.16, "learning_rate": 0.0008603076923076923, "loss": 0.0788, "step": 3225 }, { "epoch": 0.16, "learning_rate": 0.0008590256410256411, "loss": 0.0797, "step": 3250 }, { "epoch": 0.16, "learning_rate": 0.0008577948717948718, "loss": 0.11, "step": 3275 }, { "epoch": 0.17, "learning_rate": 0.0008565128205128206, "loss": 0.1003, "step": 3300 }, { "epoch": 0.17, "learning_rate": 0.0008552307692307693, "loss": 0.091, "step": 3325 }, { "epoch": 0.17, "learning_rate": 0.0008539487179487179, "loss": 0.1001, "step": 3350 }, { "epoch": 0.17, "learning_rate": 0.0008526666666666667, "loss": 0.087, "step": 3375 }, { "epoch": 0.17, "learning_rate": 0.0008513846153846154, "loss": 0.0857, "step": 3400 }, { "epoch": 0.17, "learning_rate": 0.0008501025641025642, "loss": 0.1035, "step": 3425 }, { "epoch": 0.17, "learning_rate": 0.0008488205128205129, "loss": 0.0913, "step": 3450 }, { "epoch": 0.17, "learning_rate": 0.0008475384615384616, "loss": 0.1002, "step": 3475 }, { "epoch": 0.17, "learning_rate": 0.0008462564102564103, "loss": 0.1144, "step": 3500 }, { "epoch": 0.18, "learning_rate": 0.000844974358974359, "loss": 0.1348, "step": 3525 }, { "epoch": 0.18, "learning_rate": 0.0008436923076923078, "loss": 0.0958, "step": 3550 }, { "epoch": 0.18, "learning_rate": 0.0008424102564102565, "loss": 0.116, "step": 3575 }, { "epoch": 0.18, "learning_rate": 0.0008411282051282051, "loss": 0.0895, "step": 3600 }, { "epoch": 0.18, "eval_loss": 0.1009831354022026, "eval_runtime": 216.8096, "eval_samples_per_second": 4.612, "eval_steps_per_second": 0.291, "step": 3600 }, { "epoch": 0.18, "learning_rate": 0.0008398461538461538, "loss": 0.0824, "step": 3625 }, { "epoch": 0.18, "learning_rate": 0.0008385641025641026, "loss": 0.1105, "step": 3650 }, { "epoch": 0.18, "learning_rate": 0.0008372820512820513, "loss": 0.1328, "step": 3675 }, { "epoch": 0.18, "learning_rate": 0.0008359999999999999, "loss": 0.1774, "step": 3700 }, { "epoch": 0.19, "learning_rate": 0.0008347179487179487, "loss": 0.1158, "step": 3725 }, { "epoch": 0.19, "learning_rate": 0.0008334358974358974, "loss": 0.1067, "step": 3750 }, { "epoch": 0.19, "learning_rate": 0.0008321538461538462, "loss": 0.0855, "step": 3775 }, { "epoch": 0.19, "learning_rate": 0.0008308717948717949, "loss": 0.0917, "step": 3800 }, { "epoch": 0.19, "learning_rate": 0.0008295897435897436, "loss": 0.0945, "step": 3825 }, { "epoch": 0.19, "learning_rate": 0.0008283076923076923, "loss": 0.0863, "step": 3850 }, { "epoch": 0.19, "learning_rate": 0.000827025641025641, "loss": 0.1196, "step": 3875 }, { "epoch": 0.2, "learning_rate": 0.0008257435897435898, "loss": 0.0989, "step": 3900 }, { "epoch": 0.2, "learning_rate": 0.0008244615384615385, "loss": 0.1076, "step": 3925 }, { "epoch": 0.2, "learning_rate": 0.0008231794871794872, "loss": 0.1138, "step": 3950 }, { "epoch": 0.2, "learning_rate": 0.0008218974358974359, "loss": 0.1353, "step": 3975 }, { "epoch": 1.0, "learning_rate": 0.0008206153846153847, "loss": 0.236, "step": 4000 }, { "epoch": 1.0, "eval_loss": 0.9967941641807556, "eval_runtime": 215.9827, "eval_samples_per_second": 4.63, "eval_steps_per_second": 0.292, "step": 4000 }, { "epoch": 1.0, "learning_rate": 0.0008193846153846154, "loss": 0.382, "step": 4025 }, { "epoch": 1.0, "learning_rate": 0.0008181025641025642, "loss": 0.3524, "step": 4050 }, { "epoch": 1.0, "learning_rate": 0.0008168205128205128, "loss": 0.1807, "step": 4075 }, { "epoch": 1.01, "learning_rate": 0.0008155384615384615, "loss": 0.1568, "step": 4100 }, { "epoch": 1.01, "learning_rate": 0.0008142564102564103, "loss": 0.1533, "step": 4125 }, { "epoch": 1.01, "learning_rate": 0.000812974358974359, "loss": 0.1324, "step": 4150 }, { "epoch": 1.01, "learning_rate": 0.0008116923076923078, "loss": 0.1379, "step": 4175 }, { "epoch": 1.01, "learning_rate": 0.0008104102564102564, "loss": 0.1332, "step": 4200 }, { "epoch": 1.01, "learning_rate": 0.0008091282051282052, "loss": 0.1431, "step": 4225 }, { "epoch": 1.01, "learning_rate": 0.0008078461538461539, "loss": 0.1466, "step": 4250 }, { "epoch": 1.01, "learning_rate": 0.0008065641025641026, "loss": 0.1188, "step": 4275 }, { "epoch": 1.02, "learning_rate": 0.0008052820512820514, "loss": 0.1295, "step": 4300 }, { "epoch": 1.02, "learning_rate": 0.000804, "loss": 0.1378, "step": 4325 }, { "epoch": 1.02, "learning_rate": 0.0008027179487179488, "loss": 0.1197, "step": 4350 }, { "epoch": 1.02, "learning_rate": 0.0008014358974358974, "loss": 0.1231, "step": 4375 }, { "epoch": 1.02, "learning_rate": 0.0008001538461538462, "loss": 0.1268, "step": 4400 }, { "epoch": 1.02, "eval_loss": 1.367616891860962, "eval_runtime": 215.6124, "eval_samples_per_second": 4.638, "eval_steps_per_second": 0.292, "step": 4400 }, { "epoch": 1.02, "learning_rate": 0.0007988717948717948, "loss": 0.1283, "step": 4425 }, { "epoch": 1.02, "learning_rate": 0.0007975897435897435, "loss": 0.112, "step": 4450 }, { "epoch": 1.02, "learning_rate": 0.0007963076923076923, "loss": 0.1102, "step": 4475 }, { "epoch": 1.03, "learning_rate": 0.000795025641025641, "loss": 0.1225, "step": 4500 }, { "epoch": 1.03, "learning_rate": 0.0007937435897435898, "loss": 0.1155, "step": 4525 }, { "epoch": 1.03, "learning_rate": 0.0007924615384615384, "loss": 0.1176, "step": 4550 }, { "epoch": 1.03, "learning_rate": 0.0007911794871794872, "loss": 0.1049, "step": 4575 }, { "epoch": 1.03, "learning_rate": 0.0007898974358974359, "loss": 0.1204, "step": 4600 }, { "epoch": 1.03, "learning_rate": 0.0007886153846153847, "loss": 0.1112, "step": 4625 }, { "epoch": 1.03, "learning_rate": 0.0007873333333333334, "loss": 0.1064, "step": 4650 }, { "epoch": 1.03, "learning_rate": 0.000786051282051282, "loss": 0.1108, "step": 4675 }, { "epoch": 1.04, "learning_rate": 0.0007847692307692308, "loss": 0.1067, "step": 4700 }, { "epoch": 1.04, "learning_rate": 0.0007834871794871795, "loss": 0.0994, "step": 4725 }, { "epoch": 1.04, "learning_rate": 0.0007822051282051283, "loss": 0.105, "step": 4750 }, { "epoch": 1.04, "learning_rate": 0.000780923076923077, "loss": 0.0964, "step": 4775 }, { "epoch": 1.04, "learning_rate": 0.0007796410256410257, "loss": 0.1118, "step": 4800 }, { "epoch": 1.04, "eval_loss": 1.3230507373809814, "eval_runtime": 217.4474, "eval_samples_per_second": 4.599, "eval_steps_per_second": 0.29, "step": 4800 }, { "epoch": 1.04, "learning_rate": 0.0007783589743589744, "loss": 0.1062, "step": 4825 }, { "epoch": 1.04, "learning_rate": 0.0007770769230769231, "loss": 0.0876, "step": 4850 }, { "epoch": 1.04, "learning_rate": 0.0007757948717948719, "loss": 0.0958, "step": 4875 }, { "epoch": 1.05, "learning_rate": 0.0007745128205128205, "loss": 0.0915, "step": 4900 }, { "epoch": 1.05, "learning_rate": 0.0007732307692307693, "loss": 0.0935, "step": 4925 }, { "epoch": 1.05, "learning_rate": 0.000771948717948718, "loss": 0.0947, "step": 4950 }, { "epoch": 1.05, "learning_rate": 0.0007706666666666668, "loss": 0.0985, "step": 4975 }, { "epoch": 1.05, "learning_rate": 0.0007693846153846154, "loss": 0.0913, "step": 5000 }, { "epoch": 1.05, "learning_rate": 0.000768102564102564, "loss": 0.0933, "step": 5025 }, { "epoch": 1.05, "learning_rate": 0.0007668205128205128, "loss": 0.0953, "step": 5050 }, { "epoch": 1.05, "learning_rate": 0.0007655384615384615, "loss": 0.0958, "step": 5075 }, { "epoch": 1.06, "learning_rate": 0.0007642564102564103, "loss": 0.0938, "step": 5100 }, { "epoch": 1.06, "learning_rate": 0.000762974358974359, "loss": 0.0886, "step": 5125 }, { "epoch": 1.06, "learning_rate": 0.0007616923076923077, "loss": 0.0897, "step": 5150 }, { "epoch": 1.06, "learning_rate": 0.0007604102564102564, "loss": 0.0925, "step": 5175 }, { "epoch": 1.06, "learning_rate": 0.0007591282051282051, "loss": 0.0868, "step": 5200 }, { "epoch": 1.06, "eval_loss": 1.243537187576294, "eval_runtime": 216.704, "eval_samples_per_second": 4.615, "eval_steps_per_second": 0.291, "step": 5200 }, { "epoch": 1.06, "learning_rate": 0.0007578461538461539, "loss": 0.0964, "step": 5225 }, { "epoch": 1.06, "learning_rate": 0.0007565641025641025, "loss": 0.094, "step": 5250 }, { "epoch": 1.06, "learning_rate": 0.0007552820512820513, "loss": 0.0932, "step": 5275 }, { "epoch": 1.07, "learning_rate": 0.000754, "loss": 0.0908, "step": 5300 }, { "epoch": 1.07, "learning_rate": 0.0007527179487179488, "loss": 0.0903, "step": 5325 }, { "epoch": 1.07, "learning_rate": 0.0007514358974358975, "loss": 0.0914, "step": 5350 }, { "epoch": 1.07, "learning_rate": 0.0007501538461538461, "loss": 0.0947, "step": 5375 }, { "epoch": 1.07, "learning_rate": 0.0007488717948717949, "loss": 0.0861, "step": 5400 }, { "epoch": 1.07, "learning_rate": 0.0007475897435897436, "loss": 0.087, "step": 5425 }, { "epoch": 1.07, "learning_rate": 0.0007463076923076924, "loss": 0.1056, "step": 5450 }, { "epoch": 1.07, "learning_rate": 0.0007450256410256411, "loss": 0.0759, "step": 5475 }, { "epoch": 1.08, "learning_rate": 0.0007437435897435898, "loss": 0.0963, "step": 5500 }, { "epoch": 1.08, "learning_rate": 0.0007424615384615385, "loss": 0.079, "step": 5525 }, { "epoch": 1.08, "learning_rate": 0.0007411794871794872, "loss": 0.0808, "step": 5550 }, { "epoch": 1.08, "learning_rate": 0.000739897435897436, "loss": 0.0941, "step": 5575 }, { "epoch": 1.08, "learning_rate": 0.0007386153846153845, "loss": 0.0985, "step": 5600 }, { "epoch": 1.08, "eval_loss": 0.10466726869344711, "eval_runtime": 220.9516, "eval_samples_per_second": 4.526, "eval_steps_per_second": 0.285, "step": 5600 }, { "epoch": 1.08, "learning_rate": 0.0007373333333333333, "loss": 0.096, "step": 5625 }, { "epoch": 1.08, "learning_rate": 0.000736051282051282, "loss": 0.0836, "step": 5650 }, { "epoch": 1.08, "learning_rate": 0.0007347692307692308, "loss": 0.0871, "step": 5675 }, { "epoch": 1.09, "learning_rate": 0.0007334871794871795, "loss": 0.0806, "step": 5700 }, { "epoch": 1.09, "learning_rate": 0.0007322051282051281, "loss": 0.0743, "step": 5725 }, { "epoch": 1.09, "learning_rate": 0.0007309230769230769, "loss": 0.0764, "step": 5750 }, { "epoch": 1.09, "learning_rate": 0.0007296410256410256, "loss": 0.0785, "step": 5775 }, { "epoch": 1.09, "learning_rate": 0.0007283589743589744, "loss": 0.1074, "step": 5800 }, { "epoch": 1.09, "learning_rate": 0.0007270769230769231, "loss": 0.085, "step": 5825 }, { "epoch": 1.09, "learning_rate": 0.0007257948717948718, "loss": 0.0737, "step": 5850 }, { "epoch": 1.09, "learning_rate": 0.0007245128205128205, "loss": 0.0804, "step": 5875 }, { "epoch": 1.1, "learning_rate": 0.0007232307692307692, "loss": 0.0889, "step": 5900 }, { "epoch": 1.1, "learning_rate": 0.000721948717948718, "loss": 0.0962, "step": 5925 }, { "epoch": 1.1, "learning_rate": 0.0007206666666666667, "loss": 0.0869, "step": 5950 }, { "epoch": 1.1, "learning_rate": 0.0007193846153846154, "loss": 0.0913, "step": 5975 }, { "epoch": 1.1, "learning_rate": 0.0007181025641025641, "loss": 0.086, "step": 6000 }, { "epoch": 1.1, "eval_loss": 0.09825620800256729, "eval_runtime": 217.8898, "eval_samples_per_second": 4.589, "eval_steps_per_second": 0.289, "step": 6000 }, { "epoch": 1.1, "learning_rate": 0.0007168205128205129, "loss": 0.0859, "step": 6025 }, { "epoch": 1.1, "learning_rate": 0.0007155384615384616, "loss": 0.0757, "step": 6050 }, { "epoch": 1.1, "learning_rate": 0.0007142564102564103, "loss": 0.0643, "step": 6075 }, { "epoch": 1.11, "learning_rate": 0.000712974358974359, "loss": 0.0771, "step": 6100 }, { "epoch": 1.11, "learning_rate": 0.0007116923076923077, "loss": 0.0666, "step": 6125 }, { "epoch": 1.11, "learning_rate": 0.0007104102564102565, "loss": 0.0963, "step": 6150 }, { "epoch": 1.11, "learning_rate": 0.0007091282051282052, "loss": 0.1062, "step": 6175 }, { "epoch": 1.11, "learning_rate": 0.0007078461538461538, "loss": 0.0881, "step": 6200 }, { "epoch": 1.11, "learning_rate": 0.0007065641025641025, "loss": 0.0587, "step": 6225 }, { "epoch": 1.11, "learning_rate": 0.0007052820512820512, "loss": 0.0673, "step": 6250 }, { "epoch": 1.11, "learning_rate": 0.000704, "loss": 0.069, "step": 6275 }, { "epoch": 1.12, "learning_rate": 0.0007027179487179487, "loss": 0.0673, "step": 6300 }, { "epoch": 1.12, "learning_rate": 0.0007014358974358974, "loss": 0.0513, "step": 6325 }, { "epoch": 1.12, "learning_rate": 0.0007001538461538461, "loss": 0.0614, "step": 6350 }, { "epoch": 1.12, "learning_rate": 0.0006988717948717949, "loss": 0.0498, "step": 6375 }, { "epoch": 1.12, "learning_rate": 0.0006975897435897436, "loss": 0.0635, "step": 6400 }, { "epoch": 1.12, "eval_loss": 0.09686123579740524, "eval_runtime": 217.5581, "eval_samples_per_second": 4.596, "eval_steps_per_second": 0.29, "step": 6400 }, { "epoch": 1.12, "learning_rate": 0.0006963076923076924, "loss": 0.0656, "step": 6425 }, { "epoch": 1.12, "learning_rate": 0.000695025641025641, "loss": 0.0574, "step": 6450 }, { "epoch": 1.12, "learning_rate": 0.0006937435897435897, "loss": 0.0583, "step": 6475 }, { "epoch": 1.13, "learning_rate": 0.0006924615384615385, "loss": 0.0594, "step": 6500 }, { "epoch": 1.13, "learning_rate": 0.0006911794871794872, "loss": 0.0607, "step": 6525 }, { "epoch": 1.13, "learning_rate": 0.000689897435897436, "loss": 0.0544, "step": 6550 }, { "epoch": 1.13, "learning_rate": 0.0006886153846153846, "loss": 0.0466, "step": 6575 }, { "epoch": 1.13, "learning_rate": 0.0006873333333333334, "loss": 0.0574, "step": 6600 }, { "epoch": 1.13, "learning_rate": 0.0006860512820512821, "loss": 0.0655, "step": 6625 }, { "epoch": 1.13, "learning_rate": 0.0006847692307692308, "loss": 0.051, "step": 6650 }, { "epoch": 1.13, "learning_rate": 0.0006834871794871796, "loss": 0.0582, "step": 6675 }, { "epoch": 1.14, "learning_rate": 0.0006822051282051282, "loss": 0.0593, "step": 6700 }, { "epoch": 1.14, "learning_rate": 0.000680923076923077, "loss": 0.0597, "step": 6725 }, { "epoch": 1.14, "learning_rate": 0.0006796410256410257, "loss": 0.0603, "step": 6750 }, { "epoch": 1.14, "learning_rate": 0.0006783589743589745, "loss": 0.0655, "step": 6775 }, { "epoch": 1.14, "learning_rate": 0.0006770769230769231, "loss": 0.0664, "step": 6800 }, { "epoch": 1.14, "eval_loss": 0.0915135070681572, "eval_runtime": 217.3938, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.29, "step": 6800 }, { "epoch": 1.14, "learning_rate": 0.0006757948717948717, "loss": 0.0604, "step": 6825 }, { "epoch": 1.14, "learning_rate": 0.0006745128205128205, "loss": 0.0765, "step": 6850 }, { "epoch": 1.14, "learning_rate": 0.0006732307692307692, "loss": 0.0622, "step": 6875 }, { "epoch": 1.15, "learning_rate": 0.000671948717948718, "loss": 0.0572, "step": 6900 }, { "epoch": 1.15, "learning_rate": 0.0006706666666666666, "loss": 0.0423, "step": 6925 }, { "epoch": 1.15, "learning_rate": 0.0006693846153846154, "loss": 0.0504, "step": 6950 }, { "epoch": 1.15, "learning_rate": 0.0006681025641025641, "loss": 0.0747, "step": 6975 }, { "epoch": 1.15, "learning_rate": 0.0006668205128205128, "loss": 0.0651, "step": 7000 }, { "epoch": 1.15, "learning_rate": 0.0006655384615384616, "loss": 0.0549, "step": 7025 }, { "epoch": 1.15, "learning_rate": 0.0006642564102564102, "loss": 0.0512, "step": 7050 }, { "epoch": 1.15, "learning_rate": 0.000662974358974359, "loss": 0.067, "step": 7075 }, { "epoch": 1.16, "learning_rate": 0.0006616923076923077, "loss": 0.0638, "step": 7100 }, { "epoch": 1.16, "learning_rate": 0.0006604102564102565, "loss": 0.0512, "step": 7125 }, { "epoch": 1.16, "learning_rate": 0.0006591282051282051, "loss": 0.0557, "step": 7150 }, { "epoch": 1.16, "learning_rate": 0.0006578461538461538, "loss": 0.0497, "step": 7175 }, { "epoch": 1.16, "learning_rate": 0.0006565641025641026, "loss": 0.0605, "step": 7200 }, { "epoch": 1.16, "eval_loss": 0.09085912257432938, "eval_runtime": 217.4684, "eval_samples_per_second": 4.598, "eval_steps_per_second": 0.29, "step": 7200 }, { "epoch": 1.16, "learning_rate": 0.0006552820512820513, "loss": 0.0573, "step": 7225 }, { "epoch": 1.16, "learning_rate": 0.0006540000000000001, "loss": 0.0714, "step": 7250 }, { "epoch": 1.16, "learning_rate": 0.0006527179487179487, "loss": 0.0813, "step": 7275 }, { "epoch": 1.17, "learning_rate": 0.0006514358974358975, "loss": 0.0664, "step": 7300 }, { "epoch": 1.17, "learning_rate": 0.0006501538461538462, "loss": 0.0789, "step": 7325 }, { "epoch": 1.17, "learning_rate": 0.0006488717948717949, "loss": 0.0711, "step": 7350 }, { "epoch": 1.17, "learning_rate": 0.0006475897435897437, "loss": 0.0635, "step": 7375 }, { "epoch": 1.17, "learning_rate": 0.0006463076923076923, "loss": 0.0794, "step": 7400 }, { "epoch": 1.17, "learning_rate": 0.000645025641025641, "loss": 0.0742, "step": 7425 }, { "epoch": 1.17, "learning_rate": 0.0006437435897435897, "loss": 0.0763, "step": 7450 }, { "epoch": 1.17, "learning_rate": 0.0006424615384615385, "loss": 0.0783, "step": 7475 }, { "epoch": 1.18, "learning_rate": 0.0006411794871794871, "loss": 0.1092, "step": 7500 }, { "epoch": 1.18, "learning_rate": 0.0006398974358974358, "loss": 0.0765, "step": 7525 }, { "epoch": 1.18, "learning_rate": 0.0006386153846153846, "loss": 0.0926, "step": 7550 }, { "epoch": 1.18, "learning_rate": 0.0006373333333333333, "loss": 0.0669, "step": 7575 }, { "epoch": 1.18, "learning_rate": 0.0006360512820512821, "loss": 0.0645, "step": 7600 }, { "epoch": 1.18, "eval_loss": 0.10231851041316986, "eval_runtime": 216.1204, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.292, "step": 7600 }, { "epoch": 1.18, "learning_rate": 0.0006347692307692307, "loss": 0.0898, "step": 7625 }, { "epoch": 1.18, "learning_rate": 0.0006334871794871795, "loss": 0.103, "step": 7650 }, { "epoch": 1.18, "learning_rate": 0.0006322051282051282, "loss": 0.1355, "step": 7675 }, { "epoch": 1.19, "learning_rate": 0.0006309230769230769, "loss": 0.0953, "step": 7700 }, { "epoch": 1.19, "learning_rate": 0.0006296410256410257, "loss": 0.0807, "step": 7725 }, { "epoch": 1.19, "learning_rate": 0.0006283589743589743, "loss": 0.069, "step": 7750 }, { "epoch": 1.19, "learning_rate": 0.0006271282051282051, "loss": 0.0823, "step": 7775 }, { "epoch": 1.19, "learning_rate": 0.0006258461538461538, "loss": 0.0687, "step": 7800 }, { "epoch": 1.19, "learning_rate": 0.0006245641025641026, "loss": 0.0701, "step": 7825 }, { "epoch": 1.19, "learning_rate": 0.0006232820512820513, "loss": 0.089, "step": 7850 }, { "epoch": 1.19, "learning_rate": 0.000622, "loss": 0.0734, "step": 7875 }, { "epoch": 1.2, "learning_rate": 0.0006207179487179487, "loss": 0.0798, "step": 7900 }, { "epoch": 1.2, "learning_rate": 0.0006194358974358975, "loss": 0.0835, "step": 7925 }, { "epoch": 1.2, "learning_rate": 0.0006181538461538462, "loss": 0.1016, "step": 7950 }, { "epoch": 2.0, "learning_rate": 0.0006168717948717949, "loss": 0.1292, "step": 7975 }, { "epoch": 2.0, "learning_rate": 0.0006155897435897436, "loss": 0.1077, "step": 8000 }, { "epoch": 2.0, "eval_loss": 0.8477774858474731, "eval_runtime": 216.383, "eval_samples_per_second": 4.621, "eval_steps_per_second": 0.291, "step": 8000 }, { "epoch": 2.0, "learning_rate": 0.0006143076923076923, "loss": 0.109, "step": 8025 }, { "epoch": 2.0, "learning_rate": 0.0006130256410256411, "loss": 0.1169, "step": 8050 }, { "epoch": 2.01, "learning_rate": 0.0006117435897435898, "loss": 0.1108, "step": 8075 }, { "epoch": 2.01, "learning_rate": 0.0006104615384615386, "loss": 0.1014, "step": 8100 }, { "epoch": 2.01, "learning_rate": 0.0006091794871794872, "loss": 0.0945, "step": 8125 }, { "epoch": 2.01, "learning_rate": 0.0006078974358974359, "loss": 0.1005, "step": 8150 }, { "epoch": 2.01, "learning_rate": 0.0006066153846153847, "loss": 0.0888, "step": 8175 }, { "epoch": 2.01, "learning_rate": 0.0006053333333333333, "loss": 0.0959, "step": 8200 }, { "epoch": 2.01, "learning_rate": 0.000604051282051282, "loss": 0.1038, "step": 8225 }, { "epoch": 2.01, "learning_rate": 0.0006027692307692307, "loss": 0.0909, "step": 8250 }, { "epoch": 2.02, "learning_rate": 0.0006014871794871795, "loss": 0.0994, "step": 8275 }, { "epoch": 2.02, "learning_rate": 0.0006002051282051282, "loss": 0.0996, "step": 8300 }, { "epoch": 2.02, "learning_rate": 0.0005989230769230769, "loss": 0.0922, "step": 8325 }, { "epoch": 2.02, "learning_rate": 0.0005976410256410256, "loss": 0.099, "step": 8350 }, { "epoch": 2.02, "learning_rate": 0.0005963589743589743, "loss": 0.0924, "step": 8375 }, { "epoch": 2.02, "learning_rate": 0.0005950769230769231, "loss": 0.0891, "step": 8400 }, { "epoch": 2.02, "eval_loss": 0.8391351699829102, "eval_runtime": 216.0468, "eval_samples_per_second": 4.629, "eval_steps_per_second": 0.292, "step": 8400 }, { "epoch": 2.02, "learning_rate": 0.0005937948717948718, "loss": 0.0856, "step": 8425 }, { "epoch": 2.02, "learning_rate": 0.0005925128205128206, "loss": 0.082, "step": 8450 }, { "epoch": 2.03, "learning_rate": 0.0005912307692307692, "loss": 0.0887, "step": 8475 }, { "epoch": 2.03, "learning_rate": 0.0005899487179487179, "loss": 0.0836, "step": 8500 }, { "epoch": 2.03, "learning_rate": 0.0005886666666666667, "loss": 0.0841, "step": 8525 }, { "epoch": 2.03, "learning_rate": 0.0005873846153846154, "loss": 0.0814, "step": 8550 }, { "epoch": 2.03, "learning_rate": 0.0005861025641025642, "loss": 0.0903, "step": 8575 }, { "epoch": 2.03, "learning_rate": 0.0005848205128205128, "loss": 0.0846, "step": 8600 }, { "epoch": 2.03, "learning_rate": 0.0005835384615384616, "loss": 0.0759, "step": 8625 }, { "epoch": 2.03, "learning_rate": 0.0005822564102564103, "loss": 0.0817, "step": 8650 }, { "epoch": 2.04, "learning_rate": 0.000580974358974359, "loss": 0.0768, "step": 8675 }, { "epoch": 2.04, "learning_rate": 0.0005796923076923078, "loss": 0.0718, "step": 8700 }, { "epoch": 2.04, "learning_rate": 0.0005784102564102564, "loss": 0.0737, "step": 8725 }, { "epoch": 2.04, "learning_rate": 0.0005771282051282052, "loss": 0.0712, "step": 8750 }, { "epoch": 2.04, "learning_rate": 0.0005758461538461539, "loss": 0.0768, "step": 8775 }, { "epoch": 2.04, "learning_rate": 0.0005745641025641027, "loss": 0.0803, "step": 8800 }, { "epoch": 2.04, "eval_loss": 0.759874165058136, "eval_runtime": 215.1962, "eval_samples_per_second": 4.647, "eval_steps_per_second": 0.293, "step": 8800 }, { "epoch": 2.04, "learning_rate": 0.0005732820512820512, "loss": 0.068, "step": 8825 }, { "epoch": 2.04, "learning_rate": 0.0005719999999999999, "loss": 0.0679, "step": 8850 }, { "epoch": 2.05, "learning_rate": 0.0005707179487179487, "loss": 0.0687, "step": 8875 }, { "epoch": 2.05, "learning_rate": 0.0005694358974358974, "loss": 0.0675, "step": 8900 }, { "epoch": 2.05, "learning_rate": 0.0005681538461538462, "loss": 0.0736, "step": 8925 }, { "epoch": 2.05, "learning_rate": 0.0005668717948717948, "loss": 0.0737, "step": 8950 }, { "epoch": 2.05, "learning_rate": 0.0005655897435897436, "loss": 0.0737, "step": 8975 }, { "epoch": 2.05, "learning_rate": 0.0005643076923076923, "loss": 0.0674, "step": 9000 }, { "epoch": 2.05, "learning_rate": 0.000563025641025641, "loss": 0.0703, "step": 9025 }, { "epoch": 2.05, "learning_rate": 0.0005617435897435898, "loss": 0.0698, "step": 9050 }, { "epoch": 2.06, "learning_rate": 0.0005604615384615384, "loss": 0.0723, "step": 9075 }, { "epoch": 2.06, "learning_rate": 0.0005591794871794872, "loss": 0.0688, "step": 9100 }, { "epoch": 2.06, "learning_rate": 0.0005578974358974359, "loss": 0.0664, "step": 9125 }, { "epoch": 2.06, "learning_rate": 0.0005566153846153847, "loss": 0.0677, "step": 9150 }, { "epoch": 2.06, "learning_rate": 0.0005553333333333334, "loss": 0.0624, "step": 9175 }, { "epoch": 2.06, "learning_rate": 0.000554051282051282, "loss": 0.068, "step": 9200 }, { "epoch": 2.06, "eval_loss": 0.77803635597229, "eval_runtime": 216.146, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.291, "step": 9200 }, { "epoch": 2.06, "learning_rate": 0.0005527692307692308, "loss": 0.0717, "step": 9225 }, { "epoch": 2.06, "learning_rate": 0.0005514871794871795, "loss": 0.0692, "step": 9250 }, { "epoch": 2.07, "learning_rate": 0.0005502051282051283, "loss": 0.0657, "step": 9275 }, { "epoch": 2.07, "learning_rate": 0.000548923076923077, "loss": 0.064, "step": 9300 }, { "epoch": 2.07, "learning_rate": 0.0005476410256410257, "loss": 0.0661, "step": 9325 }, { "epoch": 2.07, "learning_rate": 0.0005463589743589744, "loss": 0.0604, "step": 9350 }, { "epoch": 2.07, "learning_rate": 0.0005450769230769232, "loss": 0.066, "step": 9375 }, { "epoch": 2.07, "learning_rate": 0.0005437948717948719, "loss": 0.0617, "step": 9400 }, { "epoch": 2.07, "learning_rate": 0.0005425128205128204, "loss": 0.0622, "step": 9425 }, { "epoch": 2.07, "learning_rate": 0.0005412307692307692, "loss": 0.056, "step": 9450 }, { "epoch": 2.08, "learning_rate": 0.0005399487179487179, "loss": 0.0621, "step": 9475 }, { "epoch": 2.08, "learning_rate": 0.0005386666666666667, "loss": 0.0578, "step": 9500 }, { "epoch": 2.08, "learning_rate": 0.0005373846153846154, "loss": 0.0526, "step": 9525 }, { "epoch": 2.08, "learning_rate": 0.0005361025641025641, "loss": 0.0703, "step": 9550 }, { "epoch": 2.08, "learning_rate": 0.0005348205128205128, "loss": 0.0629, "step": 9575 }, { "epoch": 2.08, "learning_rate": 0.0005335384615384615, "loss": 0.0723, "step": 9600 }, { "epoch": 2.08, "eval_loss": 0.09210965782403946, "eval_runtime": 216.3525, "eval_samples_per_second": 4.622, "eval_steps_per_second": 0.291, "step": 9600 }, { "epoch": 2.08, "learning_rate": 0.0005322564102564103, "loss": 0.0606, "step": 9625 }, { "epoch": 2.08, "learning_rate": 0.000530974358974359, "loss": 0.0577, "step": 9650 }, { "epoch": 2.09, "learning_rate": 0.0005296923076923077, "loss": 0.0595, "step": 9675 }, { "epoch": 2.09, "learning_rate": 0.0005284102564102564, "loss": 0.0561, "step": 9700 }, { "epoch": 2.09, "learning_rate": 0.0005271282051282052, "loss": 0.0558, "step": 9725 }, { "epoch": 2.09, "learning_rate": 0.0005258461538461539, "loss": 0.0562, "step": 9750 }, { "epoch": 2.09, "learning_rate": 0.0005245641025641025, "loss": 0.0756, "step": 9775 }, { "epoch": 2.09, "learning_rate": 0.0005232820512820513, "loss": 0.0624, "step": 9800 }, { "epoch": 2.09, "learning_rate": 0.000522, "loss": 0.0532, "step": 9825 }, { "epoch": 2.09, "learning_rate": 0.0005207179487179488, "loss": 0.0546, "step": 9850 }, { "epoch": 2.1, "learning_rate": 0.0005194358974358975, "loss": 0.0603, "step": 9875 }, { "epoch": 2.1, "learning_rate": 0.0005181538461538462, "loss": 0.0633, "step": 9900 }, { "epoch": 2.1, "learning_rate": 0.0005168717948717949, "loss": 0.0598, "step": 9925 }, { "epoch": 2.1, "learning_rate": 0.0005155897435897436, "loss": 0.0635, "step": 9950 }, { "epoch": 2.1, "learning_rate": 0.0005143076923076924, "loss": 0.0587, "step": 9975 }, { "epoch": 2.1, "learning_rate": 0.0005130256410256411, "loss": 0.0615, "step": 10000 }, { "epoch": 2.1, "eval_loss": 0.09180190414190292, "eval_runtime": 215.773, "eval_samples_per_second": 4.635, "eval_steps_per_second": 0.292, "step": 10000 }, { "epoch": 2.1, "learning_rate": 0.0005117435897435897, "loss": 0.0519, "step": 10025 }, { "epoch": 2.1, "learning_rate": 0.0005104615384615384, "loss": 0.0445, "step": 10050 }, { "epoch": 2.11, "learning_rate": 0.0005091794871794872, "loss": 0.0528, "step": 10075 }, { "epoch": 2.11, "learning_rate": 0.0005078974358974359, "loss": 0.0458, "step": 10100 }, { "epoch": 2.11, "learning_rate": 0.0005066153846153845, "loss": 0.0629, "step": 10125 }, { "epoch": 2.11, "learning_rate": 0.0005053333333333333, "loss": 0.08, "step": 10150 }, { "epoch": 2.11, "learning_rate": 0.000504051282051282, "loss": 0.0623, "step": 10175 }, { "epoch": 2.11, "learning_rate": 0.0005027692307692308, "loss": 0.0479, "step": 10200 }, { "epoch": 2.11, "learning_rate": 0.0005014871794871795, "loss": 0.052, "step": 10225 }, { "epoch": 2.11, "learning_rate": 0.0005002051282051282, "loss": 0.0546, "step": 10250 }, { "epoch": 2.12, "learning_rate": 0.0004989230769230769, "loss": 0.0495, "step": 10275 }, { "epoch": 2.12, "learning_rate": 0.0004976410256410256, "loss": 0.0403, "step": 10300 }, { "epoch": 2.12, "learning_rate": 0.0004963589743589744, "loss": 0.0468, "step": 10325 }, { "epoch": 2.12, "learning_rate": 0.0004950769230769231, "loss": 0.0393, "step": 10350 }, { "epoch": 2.12, "learning_rate": 0.0004937948717948718, "loss": 0.0454, "step": 10375 }, { "epoch": 2.12, "learning_rate": 0.0004925128205128205, "loss": 0.0493, "step": 10400 }, { "epoch": 2.12, "eval_loss": 0.09357196092605591, "eval_runtime": 218.7372, "eval_samples_per_second": 4.572, "eval_steps_per_second": 0.288, "step": 10400 }, { "epoch": 2.12, "learning_rate": 0.0004912307692307693, "loss": 0.0402, "step": 10425 }, { "epoch": 2.12, "learning_rate": 0.000489948717948718, "loss": 0.0379, "step": 10450 }, { "epoch": 2.13, "learning_rate": 0.0004886666666666667, "loss": 0.0403, "step": 10475 }, { "epoch": 2.13, "learning_rate": 0.0004873846153846154, "loss": 0.0404, "step": 10500 }, { "epoch": 2.13, "learning_rate": 0.0004861025641025641, "loss": 0.0376, "step": 10525 }, { "epoch": 2.13, "learning_rate": 0.0004848205128205128, "loss": 0.0336, "step": 10550 }, { "epoch": 2.13, "learning_rate": 0.0004835384615384615, "loss": 0.039, "step": 10575 }, { "epoch": 2.13, "learning_rate": 0.00048225641025641025, "loss": 0.0459, "step": 10600 }, { "epoch": 2.13, "learning_rate": 0.000480974358974359, "loss": 0.036, "step": 10625 }, { "epoch": 2.13, "learning_rate": 0.0004796923076923077, "loss": 0.0392, "step": 10650 }, { "epoch": 2.14, "learning_rate": 0.00047841025641025644, "loss": 0.0443, "step": 10675 }, { "epoch": 2.14, "learning_rate": 0.00047712820512820517, "loss": 0.0413, "step": 10700 }, { "epoch": 2.14, "learning_rate": 0.0004758461538461539, "loss": 0.0412, "step": 10725 }, { "epoch": 2.14, "learning_rate": 0.0004745641025641026, "loss": 0.0454, "step": 10750 }, { "epoch": 2.14, "learning_rate": 0.0004732820512820513, "loss": 0.0439, "step": 10775 }, { "epoch": 2.14, "learning_rate": 0.000472, "loss": 0.0412, "step": 10800 }, { "epoch": 2.14, "eval_loss": 0.08619751781225204, "eval_runtime": 217.221, "eval_samples_per_second": 4.604, "eval_steps_per_second": 0.29, "step": 10800 }, { "epoch": 2.14, "learning_rate": 0.0004707179487179487, "loss": 0.0537, "step": 10825 }, { "epoch": 2.14, "learning_rate": 0.00046943589743589744, "loss": 0.0436, "step": 10850 }, { "epoch": 2.15, "learning_rate": 0.00046815384615384617, "loss": 0.0404, "step": 10875 }, { "epoch": 2.15, "learning_rate": 0.0004668717948717949, "loss": 0.0303, "step": 10900 }, { "epoch": 2.15, "learning_rate": 0.0004655897435897436, "loss": 0.0332, "step": 10925 }, { "epoch": 2.15, "learning_rate": 0.0004643076923076923, "loss": 0.0501, "step": 10950 }, { "epoch": 2.15, "learning_rate": 0.00046302564102564104, "loss": 0.0483, "step": 10975 }, { "epoch": 2.15, "learning_rate": 0.00046174358974358977, "loss": 0.038, "step": 11000 }, { "epoch": 2.15, "learning_rate": 0.0004604615384615385, "loss": 0.034, "step": 11025 }, { "epoch": 2.15, "learning_rate": 0.0004591794871794872, "loss": 0.0446, "step": 11050 }, { "epoch": 2.16, "learning_rate": 0.00045789743589743595, "loss": 0.0476, "step": 11075 }, { "epoch": 2.16, "learning_rate": 0.0004566153846153846, "loss": 0.0368, "step": 11100 }, { "epoch": 2.16, "learning_rate": 0.0004553333333333333, "loss": 0.0411, "step": 11125 }, { "epoch": 2.16, "learning_rate": 0.00045405128205128204, "loss": 0.0349, "step": 11150 }, { "epoch": 2.16, "learning_rate": 0.00045276923076923077, "loss": 0.0417, "step": 11175 }, { "epoch": 2.16, "learning_rate": 0.0004514871794871795, "loss": 0.0402, "step": 11200 }, { "epoch": 2.16, "eval_loss": 0.08717386424541473, "eval_runtime": 217.4573, "eval_samples_per_second": 4.599, "eval_steps_per_second": 0.29, "step": 11200 }, { "epoch": 2.16, "learning_rate": 0.0004502051282051282, "loss": 0.0464, "step": 11225 }, { "epoch": 2.16, "learning_rate": 0.00044892307692307696, "loss": 0.0563, "step": 11250 }, { "epoch": 2.17, "learning_rate": 0.00044764102564102563, "loss": 0.0448, "step": 11275 }, { "epoch": 2.17, "learning_rate": 0.00044635897435897436, "loss": 0.0525, "step": 11300 }, { "epoch": 2.17, "learning_rate": 0.0004450769230769231, "loss": 0.0475, "step": 11325 }, { "epoch": 2.17, "learning_rate": 0.0004437948717948718, "loss": 0.0429, "step": 11350 }, { "epoch": 2.17, "learning_rate": 0.00044251282051282055, "loss": 0.0523, "step": 11375 }, { "epoch": 2.17, "learning_rate": 0.0004412307692307692, "loss": 0.0494, "step": 11400 }, { "epoch": 2.17, "learning_rate": 0.00043994871794871796, "loss": 0.0507, "step": 11425 }, { "epoch": 2.17, "learning_rate": 0.00043866666666666663, "loss": 0.0486, "step": 11450 }, { "epoch": 2.18, "learning_rate": 0.00043738461538461536, "loss": 0.0738, "step": 11475 }, { "epoch": 2.18, "learning_rate": 0.0004361025641025641, "loss": 0.0443, "step": 11500 }, { "epoch": 2.18, "learning_rate": 0.0004348205128205128, "loss": 0.06, "step": 11525 }, { "epoch": 2.18, "learning_rate": 0.00043353846153846155, "loss": 0.0438, "step": 11550 }, { "epoch": 2.18, "learning_rate": 0.0004322564102564103, "loss": 0.0393, "step": 11575 }, { "epoch": 2.18, "learning_rate": 0.000430974358974359, "loss": 0.0561, "step": 11600 }, { "epoch": 2.18, "eval_loss": 0.08517900854349136, "eval_runtime": 215.8861, "eval_samples_per_second": 4.632, "eval_steps_per_second": 0.292, "step": 11600 }, { "epoch": 2.18, "learning_rate": 0.00042969230769230774, "loss": 0.0633, "step": 11625 }, { "epoch": 2.18, "learning_rate": 0.0004284102564102564, "loss": 0.0937, "step": 11650 }, { "epoch": 2.19, "learning_rate": 0.00042712820512820515, "loss": 0.0647, "step": 11675 }, { "epoch": 2.19, "learning_rate": 0.0004258461538461538, "loss": 0.0517, "step": 11700 }, { "epoch": 2.19, "learning_rate": 0.00042456410256410255, "loss": 0.044, "step": 11725 }, { "epoch": 2.19, "learning_rate": 0.0004232820512820513, "loss": 0.048, "step": 11750 }, { "epoch": 2.19, "learning_rate": 0.000422, "loss": 0.0429, "step": 11775 }, { "epoch": 2.19, "learning_rate": 0.00042071794871794874, "loss": 0.0452, "step": 11800 }, { "epoch": 2.19, "learning_rate": 0.0004194358974358974, "loss": 0.0535, "step": 11825 }, { "epoch": 2.19, "learning_rate": 0.00041815384615384615, "loss": 0.049, "step": 11850 }, { "epoch": 2.2, "learning_rate": 0.0004168717948717949, "loss": 0.0464, "step": 11875 }, { "epoch": 2.2, "learning_rate": 0.0004155897435897436, "loss": 0.0564, "step": 11900 }, { "epoch": 2.2, "learning_rate": 0.00041430769230769234, "loss": 0.0673, "step": 11925 }, { "epoch": 3.0, "learning_rate": 0.00041302564102564107, "loss": 0.0679, "step": 11950 }, { "epoch": 3.0, "learning_rate": 0.0004117435897435898, "loss": 0.0642, "step": 11975 }, { "epoch": 3.0, "learning_rate": 0.00041046153846153847, "loss": 0.0664, "step": 12000 }, { "epoch": 3.0, "eval_loss": 0.10094589740037918, "eval_runtime": 219.2666, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.287, "step": 12000 }, { "epoch": 3.0, "learning_rate": 0.00040917948717948715, "loss": 0.0683, "step": 12025 }, { "epoch": 3.01, "learning_rate": 0.0004078974358974359, "loss": 0.0699, "step": 12050 }, { "epoch": 3.01, "learning_rate": 0.0004066153846153846, "loss": 0.0645, "step": 12075 }, { "epoch": 3.01, "learning_rate": 0.00040533333333333334, "loss": 0.0548, "step": 12100 }, { "epoch": 3.01, "learning_rate": 0.00040405128205128207, "loss": 0.0603, "step": 12125 }, { "epoch": 3.01, "learning_rate": 0.0004027692307692308, "loss": 0.0552, "step": 12150 }, { "epoch": 3.01, "learning_rate": 0.0004014871794871795, "loss": 0.0555, "step": 12175 }, { "epoch": 3.01, "learning_rate": 0.0004002051282051282, "loss": 0.0632, "step": 12200 }, { "epoch": 3.01, "learning_rate": 0.00039892307692307693, "loss": 0.0572, "step": 12225 }, { "epoch": 3.02, "learning_rate": 0.00039764102564102566, "loss": 0.0557, "step": 12250 }, { "epoch": 3.02, "learning_rate": 0.0003963589743589744, "loss": 0.0599, "step": 12275 }, { "epoch": 3.02, "learning_rate": 0.0003950769230769231, "loss": 0.056, "step": 12300 }, { "epoch": 3.02, "learning_rate": 0.0003937948717948718, "loss": 0.0583, "step": 12325 }, { "epoch": 3.02, "learning_rate": 0.00039251282051282053, "loss": 0.0564, "step": 12350 }, { "epoch": 3.02, "learning_rate": 0.0003912307692307692, "loss": 0.0538, "step": 12375 }, { "epoch": 3.02, "learning_rate": 0.00038994871794871793, "loss": 0.0525, "step": 12400 }, { "epoch": 3.02, "eval_loss": 0.13796697556972504, "eval_runtime": 216.1893, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.291, "step": 12400 }, { "epoch": 3.02, "learning_rate": 0.00038866666666666666, "loss": 0.0508, "step": 12425 }, { "epoch": 3.03, "learning_rate": 0.0003873846153846154, "loss": 0.0562, "step": 12450 }, { "epoch": 3.03, "learning_rate": 0.0003861025641025641, "loss": 0.0489, "step": 12475 }, { "epoch": 3.03, "learning_rate": 0.00038482051282051285, "loss": 0.0565, "step": 12500 }, { "epoch": 3.03, "learning_rate": 0.0003835384615384616, "loss": 0.0522, "step": 12525 }, { "epoch": 3.03, "learning_rate": 0.00038225641025641026, "loss": 0.0534, "step": 12550 }, { "epoch": 3.03, "learning_rate": 0.000380974358974359, "loss": 0.0539, "step": 12575 }, { "epoch": 3.03, "learning_rate": 0.0003796923076923077, "loss": 0.0457, "step": 12600 }, { "epoch": 3.03, "learning_rate": 0.0003784102564102564, "loss": 0.0509, "step": 12625 }, { "epoch": 3.04, "learning_rate": 0.0003771282051282051, "loss": 0.0512, "step": 12650 }, { "epoch": 3.04, "learning_rate": 0.00037584615384615385, "loss": 0.043, "step": 12675 }, { "epoch": 3.04, "learning_rate": 0.0003745641025641026, "loss": 0.0513, "step": 12700 }, { "epoch": 3.04, "learning_rate": 0.00037328205128205126, "loss": 0.0423, "step": 12725 }, { "epoch": 3.04, "learning_rate": 0.000372, "loss": 0.0482, "step": 12750 }, { "epoch": 3.04, "learning_rate": 0.0003707179487179487, "loss": 0.0517, "step": 12775 }, { "epoch": 3.04, "learning_rate": 0.00036943589743589745, "loss": 0.0414, "step": 12800 }, { "epoch": 3.04, "eval_loss": 0.1226244568824768, "eval_runtime": 216.9336, "eval_samples_per_second": 4.61, "eval_steps_per_second": 0.29, "step": 12800 }, { "epoch": 3.04, "learning_rate": 0.0003681538461538462, "loss": 0.0451, "step": 12825 }, { "epoch": 3.05, "learning_rate": 0.0003668717948717949, "loss": 0.0432, "step": 12850 }, { "epoch": 3.05, "learning_rate": 0.00036558974358974364, "loss": 0.0428, "step": 12875 }, { "epoch": 3.05, "learning_rate": 0.0003643076923076923, "loss": 0.0486, "step": 12900 }, { "epoch": 3.05, "learning_rate": 0.000363025641025641, "loss": 0.0459, "step": 12925 }, { "epoch": 3.05, "learning_rate": 0.0003617435897435897, "loss": 0.0465, "step": 12950 }, { "epoch": 3.05, "learning_rate": 0.00036046153846153845, "loss": 0.0417, "step": 12975 }, { "epoch": 3.05, "learning_rate": 0.0003591794871794872, "loss": 0.0417, "step": 13000 }, { "epoch": 3.05, "learning_rate": 0.00035794871794871797, "loss": 0.0433, "step": 13025 }, { "epoch": 3.06, "learning_rate": 0.0003566666666666667, "loss": 0.0494, "step": 13050 }, { "epoch": 3.06, "learning_rate": 0.00035538461538461543, "loss": 0.0407, "step": 13075 }, { "epoch": 3.06, "learning_rate": 0.0003541025641025641, "loss": 0.0372, "step": 13100 }, { "epoch": 3.06, "learning_rate": 0.0003528205128205128, "loss": 0.0455, "step": 13125 }, { "epoch": 3.06, "learning_rate": 0.0003515384615384615, "loss": 0.0375, "step": 13150 }, { "epoch": 3.06, "learning_rate": 0.00035025641025641024, "loss": 0.0435, "step": 13175 }, { "epoch": 3.06, "learning_rate": 0.00034897435897435897, "loss": 0.0437, "step": 13200 }, { "epoch": 3.06, "eval_loss": 0.1415424644947052, "eval_runtime": 215.4136, "eval_samples_per_second": 4.642, "eval_steps_per_second": 0.292, "step": 13200 }, { "epoch": 3.06, "learning_rate": 0.0003476923076923077, "loss": 0.0432, "step": 13225 }, { "epoch": 3.07, "learning_rate": 0.00034641025641025643, "loss": 0.0393, "step": 13250 }, { "epoch": 3.07, "learning_rate": 0.00034512820512820516, "loss": 0.0446, "step": 13275 }, { "epoch": 3.07, "learning_rate": 0.0003438461538461539, "loss": 0.0417, "step": 13300 }, { "epoch": 3.07, "learning_rate": 0.00034256410256410256, "loss": 0.0386, "step": 13325 }, { "epoch": 3.07, "learning_rate": 0.0003412820512820513, "loss": 0.0395, "step": 13350 }, { "epoch": 3.07, "learning_rate": 0.00034, "loss": 0.0394, "step": 13375 }, { "epoch": 3.07, "learning_rate": 0.0003387179487179487, "loss": 0.0364, "step": 13400 }, { "epoch": 3.07, "learning_rate": 0.00033743589743589743, "loss": 0.0334, "step": 13425 }, { "epoch": 3.08, "learning_rate": 0.00033615384615384616, "loss": 0.0374, "step": 13450 }, { "epoch": 3.08, "learning_rate": 0.0003348717948717949, "loss": 0.0352, "step": 13475 }, { "epoch": 3.08, "learning_rate": 0.00033358974358974357, "loss": 0.0317, "step": 13500 }, { "epoch": 3.08, "learning_rate": 0.0003323076923076923, "loss": 0.0427, "step": 13525 }, { "epoch": 3.08, "learning_rate": 0.000331025641025641, "loss": 0.038, "step": 13550 }, { "epoch": 3.08, "learning_rate": 0.00032974358974358976, "loss": 0.0414, "step": 13575 }, { "epoch": 3.08, "learning_rate": 0.0003284615384615385, "loss": 0.0368, "step": 13600 }, { "epoch": 3.08, "eval_loss": 0.08724867552518845, "eval_runtime": 216.1216, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.292, "step": 13600 }, { "epoch": 3.08, "learning_rate": 0.0003271794871794872, "loss": 0.0357, "step": 13625 }, { "epoch": 3.09, "learning_rate": 0.00032589743589743594, "loss": 0.0337, "step": 13650 }, { "epoch": 3.09, "learning_rate": 0.0003246153846153846, "loss": 0.0337, "step": 13675 }, { "epoch": 3.09, "learning_rate": 0.0003233333333333333, "loss": 0.033, "step": 13700 }, { "epoch": 3.09, "learning_rate": 0.000322051282051282, "loss": 0.0326, "step": 13725 }, { "epoch": 3.09, "learning_rate": 0.00032076923076923076, "loss": 0.0434, "step": 13750 }, { "epoch": 3.09, "learning_rate": 0.0003194871794871795, "loss": 0.0365, "step": 13775 }, { "epoch": 3.09, "learning_rate": 0.0003182051282051282, "loss": 0.0327, "step": 13800 }, { "epoch": 3.09, "learning_rate": 0.00031692307692307695, "loss": 0.0324, "step": 13825 }, { "epoch": 3.1, "learning_rate": 0.0003156410256410256, "loss": 0.038, "step": 13850 }, { "epoch": 3.1, "learning_rate": 0.00031435897435897435, "loss": 0.0379, "step": 13875 }, { "epoch": 3.1, "learning_rate": 0.0003130769230769231, "loss": 0.0409, "step": 13900 }, { "epoch": 3.1, "learning_rate": 0.0003117948717948718, "loss": 0.0391, "step": 13925 }, { "epoch": 3.1, "learning_rate": 0.00031051282051282054, "loss": 0.036, "step": 13950 }, { "epoch": 3.1, "learning_rate": 0.00030923076923076927, "loss": 0.038, "step": 13975 }, { "epoch": 3.1, "learning_rate": 0.000307948717948718, "loss": 0.0311, "step": 14000 }, { "epoch": 3.1, "eval_loss": 0.08812109380960464, "eval_runtime": 216.1669, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.291, "step": 14000 } ], "logging_steps": 25, "max_steps": 20000, "num_train_epochs": 9223372036854775807, "save_steps": 400, "total_flos": 1.441894654420992e+21, "trial_name": null, "trial_params": null }