{ "best_metric": null, "best_model_checkpoint": null, "epoch": 58.67970660146699, "eval_steps": 2000, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "learning_rate": 8e-05, "loss": 0.8756, "step": 10 }, { "epoch": 0.39, "learning_rate": 8e-05, "loss": 0.8401, "step": 20 }, { "epoch": 0.59, "learning_rate": 8e-05, "loss": 0.8574, "step": 30 }, { "epoch": 0.78, "learning_rate": 8e-05, "loss": 0.8282, "step": 40 }, { "epoch": 0.98, "learning_rate": 8e-05, "loss": 0.7797, "step": 50 }, { "epoch": 1.17, "learning_rate": 8e-05, "loss": 0.7198, "step": 60 }, { "epoch": 1.37, "learning_rate": 8e-05, "loss": 0.6546, "step": 70 }, { "epoch": 1.56, "learning_rate": 8e-05, "loss": 0.6477, "step": 80 }, { "epoch": 1.76, "learning_rate": 8e-05, "loss": 0.6173, "step": 90 }, { "epoch": 1.96, "learning_rate": 8e-05, "loss": 0.6119, "step": 100 }, { "epoch": 2.15, "learning_rate": 8e-05, "loss": 0.5535, "step": 110 }, { "epoch": 2.35, "learning_rate": 8e-05, "loss": 0.4959, "step": 120 }, { "epoch": 2.54, "learning_rate": 8e-05, "loss": 0.5077, "step": 130 }, { "epoch": 2.74, "learning_rate": 8e-05, "loss": 0.5021, "step": 140 }, { "epoch": 2.93, "learning_rate": 8e-05, "loss": 0.4717, "step": 150 }, { "epoch": 3.13, "learning_rate": 8e-05, "loss": 0.427, "step": 160 }, { "epoch": 3.33, "learning_rate": 8e-05, "loss": 0.361, "step": 170 }, { "epoch": 3.52, "learning_rate": 8e-05, "loss": 0.3622, "step": 180 }, { "epoch": 3.72, "learning_rate": 8e-05, "loss": 0.3425, "step": 190 }, { "epoch": 3.91, "learning_rate": 8e-05, "loss": 0.3531, "step": 200 }, { "epoch": 4.11, "learning_rate": 8e-05, "loss": 0.3376, "step": 210 }, { "epoch": 4.3, "learning_rate": 8e-05, "loss": 0.2356, "step": 220 }, { "epoch": 4.5, "learning_rate": 8e-05, "loss": 0.2514, "step": 230 }, { "epoch": 4.69, "learning_rate": 8e-05, "loss": 0.2369, "step": 240 }, { "epoch": 4.89, "learning_rate": 8e-05, "loss": 0.2485, "step": 250 }, { "epoch": 5.09, "learning_rate": 8e-05, "loss": 0.2164, "step": 260 }, { "epoch": 5.28, "learning_rate": 8e-05, "loss": 0.1498, "step": 270 }, { "epoch": 5.48, "learning_rate": 8e-05, "loss": 0.1495, "step": 280 }, { "epoch": 5.67, "learning_rate": 8e-05, "loss": 0.166, "step": 290 }, { "epoch": 5.87, "learning_rate": 8e-05, "loss": 0.2012, "step": 300 }, { "epoch": 6.06, "learning_rate": 8e-05, "loss": 0.1569, "step": 310 }, { "epoch": 6.26, "learning_rate": 8e-05, "loss": 0.0878, "step": 320 }, { "epoch": 6.45, "learning_rate": 8e-05, "loss": 0.0969, "step": 330 }, { "epoch": 6.65, "learning_rate": 8e-05, "loss": 0.1229, "step": 340 }, { "epoch": 6.85, "learning_rate": 8e-05, "loss": 0.1199, "step": 350 }, { "epoch": 7.04, "learning_rate": 8e-05, "loss": 0.0996, "step": 360 }, { "epoch": 7.24, "learning_rate": 8e-05, "loss": 0.0568, "step": 370 }, { "epoch": 7.43, "learning_rate": 8e-05, "loss": 0.0792, "step": 380 }, { "epoch": 7.63, "learning_rate": 8e-05, "loss": 0.0781, "step": 390 }, { "epoch": 7.82, "learning_rate": 8e-05, "loss": 0.0852, "step": 400 }, { "epoch": 8.02, "learning_rate": 8e-05, "loss": 0.0611, "step": 410 }, { "epoch": 8.22, "learning_rate": 8e-05, "loss": 0.0399, "step": 420 }, { "epoch": 8.41, "learning_rate": 8e-05, "loss": 0.0597, "step": 430 }, { "epoch": 8.61, "learning_rate": 8e-05, "loss": 0.0502, "step": 440 }, { "epoch": 8.8, "learning_rate": 8e-05, "loss": 0.0535, "step": 450 }, { "epoch": 9.0, "learning_rate": 8e-05, "loss": 0.0347, "step": 460 }, { "epoch": 9.19, "learning_rate": 8e-05, "loss": 0.0323, "step": 470 }, { "epoch": 9.39, "learning_rate": 8e-05, "loss": 0.039, "step": 480 }, { "epoch": 9.58, "learning_rate": 8e-05, "loss": 0.0335, "step": 490 }, { "epoch": 9.78, "learning_rate": 8e-05, "loss": 0.0364, "step": 500 }, { "epoch": 9.98, "learning_rate": 8e-05, "loss": 0.0268, "step": 510 }, { "epoch": 10.17, "learning_rate": 8e-05, "loss": 0.0245, "step": 520 }, { "epoch": 10.37, "learning_rate": 8e-05, "loss": 0.0229, "step": 530 }, { "epoch": 10.56, "learning_rate": 8e-05, "loss": 0.0256, "step": 540 }, { "epoch": 10.76, "learning_rate": 8e-05, "loss": 0.0217, "step": 550 }, { "epoch": 10.95, "learning_rate": 8e-05, "loss": 0.0204, "step": 560 }, { "epoch": 11.15, "learning_rate": 8e-05, "loss": 0.0173, "step": 570 }, { "epoch": 11.34, "learning_rate": 8e-05, "loss": 0.0173, "step": 580 }, { "epoch": 11.54, "learning_rate": 8e-05, "loss": 0.0171, "step": 590 }, { "epoch": 11.74, "learning_rate": 8e-05, "loss": 0.0123, "step": 600 }, { "epoch": 11.93, "learning_rate": 8e-05, "loss": 0.0146, "step": 610 }, { "epoch": 12.13, "learning_rate": 8e-05, "loss": 0.0145, "step": 620 }, { "epoch": 12.32, "learning_rate": 8e-05, "loss": 0.0156, "step": 630 }, { "epoch": 12.52, "learning_rate": 8e-05, "loss": 0.0096, "step": 640 }, { "epoch": 12.71, "learning_rate": 8e-05, "loss": 0.0103, "step": 650 }, { "epoch": 12.91, "learning_rate": 8e-05, "loss": 0.0104, "step": 660 }, { "epoch": 13.11, "learning_rate": 8e-05, "loss": 0.0096, "step": 670 }, { "epoch": 13.3, "learning_rate": 8e-05, "loss": 0.0074, "step": 680 }, { "epoch": 13.5, "learning_rate": 8e-05, "loss": 0.0095, "step": 690 }, { "epoch": 13.69, "learning_rate": 8e-05, "loss": 0.0087, "step": 700 }, { "epoch": 13.89, "learning_rate": 8e-05, "loss": 0.0084, "step": 710 }, { "epoch": 14.08, "learning_rate": 8e-05, "loss": 0.0081, "step": 720 }, { "epoch": 14.28, "learning_rate": 8e-05, "loss": 0.0048, "step": 730 }, { "epoch": 14.47, "learning_rate": 8e-05, "loss": 0.0044, "step": 740 }, { "epoch": 14.67, "learning_rate": 8e-05, "loss": 0.008, "step": 750 }, { "epoch": 14.87, "learning_rate": 8e-05, "loss": 0.0067, "step": 760 }, { "epoch": 15.06, "learning_rate": 8e-05, "loss": 0.0065, "step": 770 }, { "epoch": 15.26, "learning_rate": 8e-05, "loss": 0.0061, "step": 780 }, { "epoch": 15.45, "learning_rate": 8e-05, "loss": 0.0074, "step": 790 }, { "epoch": 15.65, "learning_rate": 8e-05, "loss": 0.0062, "step": 800 }, { "epoch": 15.84, "learning_rate": 8e-05, "loss": 0.008, "step": 810 }, { "epoch": 16.04, "learning_rate": 8e-05, "loss": 0.0071, "step": 820 }, { "epoch": 16.23, "learning_rate": 8e-05, "loss": 0.0055, "step": 830 }, { "epoch": 16.43, "learning_rate": 8e-05, "loss": 0.0057, "step": 840 }, { "epoch": 16.63, "learning_rate": 8e-05, "loss": 0.0073, "step": 850 }, { "epoch": 16.82, "learning_rate": 8e-05, "loss": 0.0066, "step": 860 }, { "epoch": 17.02, "learning_rate": 8e-05, "loss": 0.0067, "step": 870 }, { "epoch": 17.21, "learning_rate": 8e-05, "loss": 0.0039, "step": 880 }, { "epoch": 17.41, "learning_rate": 8e-05, "loss": 0.0062, "step": 890 }, { "epoch": 17.6, "learning_rate": 8e-05, "loss": 0.0062, "step": 900 }, { "epoch": 17.8, "learning_rate": 8e-05, "loss": 0.0065, "step": 910 }, { "epoch": 18.0, "learning_rate": 8e-05, "loss": 0.0047, "step": 920 }, { "epoch": 18.19, "learning_rate": 8e-05, "loss": 0.0051, "step": 930 }, { "epoch": 18.39, "learning_rate": 8e-05, "loss": 0.0044, "step": 940 }, { "epoch": 18.58, "learning_rate": 8e-05, "loss": 0.0041, "step": 950 }, { "epoch": 18.78, "learning_rate": 8e-05, "loss": 0.0039, "step": 960 }, { "epoch": 18.97, "learning_rate": 8e-05, "loss": 0.0027, "step": 970 }, { "epoch": 19.17, "learning_rate": 8e-05, "loss": 0.0046, "step": 980 }, { "epoch": 19.36, "learning_rate": 8e-05, "loss": 0.0051, "step": 990 }, { "epoch": 19.56, "learning_rate": 8e-05, "loss": 0.0071, "step": 1000 }, { "epoch": 19.76, "learning_rate": 8e-05, "loss": 0.0064, "step": 1010 }, { "epoch": 19.95, "learning_rate": 8e-05, "loss": 0.0065, "step": 1020 }, { "epoch": 20.15, "learning_rate": 8e-05, "loss": 0.0056, "step": 1030 }, { "epoch": 20.34, "learning_rate": 8e-05, "loss": 0.0073, "step": 1040 }, { "epoch": 20.54, "learning_rate": 8e-05, "loss": 0.0078, "step": 1050 }, { "epoch": 20.73, "learning_rate": 8e-05, "loss": 0.0062, "step": 1060 }, { "epoch": 20.93, "learning_rate": 8e-05, "loss": 0.0045, "step": 1070 }, { "epoch": 21.12, "learning_rate": 8e-05, "loss": 0.0048, "step": 1080 }, { "epoch": 21.32, "learning_rate": 8e-05, "loss": 0.0036, "step": 1090 }, { "epoch": 21.52, "learning_rate": 8e-05, "loss": 0.005, "step": 1100 }, { "epoch": 21.71, "learning_rate": 8e-05, "loss": 0.0048, "step": 1110 }, { "epoch": 21.91, "learning_rate": 8e-05, "loss": 0.007, "step": 1120 }, { "epoch": 22.1, "learning_rate": 8e-05, "loss": 0.0061, "step": 1130 }, { "epoch": 22.3, "learning_rate": 8e-05, "loss": 0.0086, "step": 1140 }, { "epoch": 22.49, "learning_rate": 8e-05, "loss": 0.0049, "step": 1150 }, { "epoch": 22.69, "learning_rate": 8e-05, "loss": 0.008, "step": 1160 }, { "epoch": 22.89, "learning_rate": 8e-05, "loss": 0.0058, "step": 1170 }, { "epoch": 23.08, "learning_rate": 8e-05, "loss": 0.0078, "step": 1180 }, { "epoch": 23.28, "learning_rate": 8e-05, "loss": 0.0051, "step": 1190 }, { "epoch": 23.47, "learning_rate": 8e-05, "loss": 0.0051, "step": 1200 }, { "epoch": 23.67, "learning_rate": 8e-05, "loss": 0.0061, "step": 1210 }, { "epoch": 23.86, "learning_rate": 8e-05, "loss": 0.0072, "step": 1220 }, { "epoch": 24.06, "learning_rate": 8e-05, "loss": 0.0048, "step": 1230 }, { "epoch": 24.25, "learning_rate": 8e-05, "loss": 0.0041, "step": 1240 }, { "epoch": 24.45, "learning_rate": 8e-05, "loss": 0.0036, "step": 1250 }, { "epoch": 24.65, "learning_rate": 8e-05, "loss": 0.0043, "step": 1260 }, { "epoch": 24.84, "learning_rate": 8e-05, "loss": 0.0046, "step": 1270 }, { "epoch": 25.04, "learning_rate": 8e-05, "loss": 0.0055, "step": 1280 }, { "epoch": 25.23, "learning_rate": 8e-05, "loss": 0.0026, "step": 1290 }, { "epoch": 25.43, "learning_rate": 8e-05, "loss": 0.0043, "step": 1300 }, { "epoch": 25.62, "learning_rate": 8e-05, "loss": 0.0044, "step": 1310 }, { "epoch": 25.82, "learning_rate": 8e-05, "loss": 0.0041, "step": 1320 }, { "epoch": 26.01, "learning_rate": 8e-05, "loss": 0.0041, "step": 1330 }, { "epoch": 26.21, "learning_rate": 8e-05, "loss": 0.003, "step": 1340 }, { "epoch": 26.41, "learning_rate": 8e-05, "loss": 0.0021, "step": 1350 }, { "epoch": 26.6, "learning_rate": 8e-05, "loss": 0.0028, "step": 1360 }, { "epoch": 26.8, "learning_rate": 8e-05, "loss": 0.0036, "step": 1370 }, { "epoch": 26.99, "learning_rate": 8e-05, "loss": 0.0036, "step": 1380 }, { "epoch": 27.19, "learning_rate": 8e-05, "loss": 0.0024, "step": 1390 }, { "epoch": 27.38, "learning_rate": 8e-05, "loss": 0.0031, "step": 1400 }, { "epoch": 27.58, "learning_rate": 8e-05, "loss": 0.0021, "step": 1410 }, { "epoch": 27.78, "learning_rate": 8e-05, "loss": 0.0032, "step": 1420 }, { "epoch": 27.97, "learning_rate": 8e-05, "loss": 0.0024, "step": 1430 }, { "epoch": 28.17, "learning_rate": 8e-05, "loss": 0.0027, "step": 1440 }, { "epoch": 28.36, "learning_rate": 8e-05, "loss": 0.0014, "step": 1450 }, { "epoch": 28.56, "learning_rate": 8e-05, "loss": 0.0022, "step": 1460 }, { "epoch": 28.75, "learning_rate": 8e-05, "loss": 0.0025, "step": 1470 }, { "epoch": 28.95, "learning_rate": 8e-05, "loss": 0.0021, "step": 1480 }, { "epoch": 29.14, "learning_rate": 8e-05, "loss": 0.0024, "step": 1490 }, { "epoch": 29.34, "learning_rate": 8e-05, "loss": 0.0024, "step": 1500 }, { "epoch": 29.54, "learning_rate": 8e-05, "loss": 0.0023, "step": 1510 }, { "epoch": 29.73, "learning_rate": 8e-05, "loss": 0.0015, "step": 1520 }, { "epoch": 29.93, "learning_rate": 8e-05, "loss": 0.0013, "step": 1530 }, { "epoch": 30.12, "learning_rate": 8e-05, "loss": 0.0018, "step": 1540 }, { "epoch": 30.32, "learning_rate": 8e-05, "loss": 0.001, "step": 1550 }, { "epoch": 30.51, "learning_rate": 8e-05, "loss": 0.0019, "step": 1560 }, { "epoch": 30.71, "learning_rate": 8e-05, "loss": 0.0014, "step": 1570 }, { "epoch": 30.9, "learning_rate": 8e-05, "loss": 0.0019, "step": 1580 }, { "epoch": 31.1, "learning_rate": 8e-05, "loss": 0.0031, "step": 1590 }, { "epoch": 31.3, "learning_rate": 8e-05, "loss": 0.0014, "step": 1600 }, { "epoch": 31.49, "learning_rate": 8e-05, "loss": 0.0007, "step": 1610 }, { "epoch": 31.69, "learning_rate": 8e-05, "loss": 0.0016, "step": 1620 }, { "epoch": 31.88, "learning_rate": 8e-05, "loss": 0.0018, "step": 1630 }, { "epoch": 32.08, "learning_rate": 8e-05, "loss": 0.0017, "step": 1640 }, { "epoch": 32.27, "learning_rate": 8e-05, "loss": 0.0011, "step": 1650 }, { "epoch": 32.47, "learning_rate": 8e-05, "loss": 0.0021, "step": 1660 }, { "epoch": 32.67, "learning_rate": 8e-05, "loss": 0.0023, "step": 1670 }, { "epoch": 32.86, "learning_rate": 8e-05, "loss": 0.0013, "step": 1680 }, { "epoch": 33.06, "learning_rate": 8e-05, "loss": 0.002, "step": 1690 }, { "epoch": 33.25, "learning_rate": 8e-05, "loss": 0.0017, "step": 1700 }, { "epoch": 33.45, "learning_rate": 8e-05, "loss": 0.0034, "step": 1710 }, { "epoch": 33.64, "learning_rate": 8e-05, "loss": 0.0024, "step": 1720 }, { "epoch": 33.84, "learning_rate": 8e-05, "loss": 0.003, "step": 1730 }, { "epoch": 34.03, "learning_rate": 8e-05, "loss": 0.0036, "step": 1740 }, { "epoch": 34.23, "learning_rate": 8e-05, "loss": 0.0029, "step": 1750 }, { "epoch": 34.43, "learning_rate": 8e-05, "loss": 0.0035, "step": 1760 }, { "epoch": 34.62, "learning_rate": 8e-05, "loss": 0.004, "step": 1770 }, { "epoch": 34.82, "learning_rate": 8e-05, "loss": 0.0035, "step": 1780 }, { "epoch": 35.01, "learning_rate": 8e-05, "loss": 0.0051, "step": 1790 }, { "epoch": 35.21, "learning_rate": 8e-05, "loss": 0.0038, "step": 1800 }, { "epoch": 35.4, "learning_rate": 8e-05, "loss": 0.0035, "step": 1810 }, { "epoch": 35.6, "learning_rate": 8e-05, "loss": 0.004, "step": 1820 }, { "epoch": 35.79, "learning_rate": 8e-05, "loss": 0.0037, "step": 1830 }, { "epoch": 35.99, "learning_rate": 8e-05, "loss": 0.0055, "step": 1840 }, { "epoch": 36.19, "learning_rate": 8e-05, "loss": 0.0112, "step": 1850 }, { "epoch": 36.38, "learning_rate": 8e-05, "loss": 0.0031, "step": 1860 }, { "epoch": 36.58, "learning_rate": 8e-05, "loss": 0.0044, "step": 1870 }, { "epoch": 36.77, "learning_rate": 8e-05, "loss": 0.0057, "step": 1880 }, { "epoch": 36.97, "learning_rate": 8e-05, "loss": 0.0083, "step": 1890 }, { "epoch": 37.16, "learning_rate": 8e-05, "loss": 0.0043, "step": 1900 }, { "epoch": 37.36, "learning_rate": 8e-05, "loss": 0.0047, "step": 1910 }, { "epoch": 37.56, "learning_rate": 8e-05, "loss": 0.0048, "step": 1920 }, { "epoch": 37.75, "learning_rate": 8e-05, "loss": 0.0068, "step": 1930 }, { "epoch": 37.95, "learning_rate": 8e-05, "loss": 0.0068, "step": 1940 }, { "epoch": 38.14, "learning_rate": 8e-05, "loss": 0.006, "step": 1950 }, { "epoch": 38.34, "learning_rate": 8e-05, "loss": 0.0069, "step": 1960 }, { "epoch": 38.53, "learning_rate": 8e-05, "loss": 0.0069, "step": 1970 }, { "epoch": 38.73, "learning_rate": 8e-05, "loss": 0.005, "step": 1980 }, { "epoch": 38.92, "learning_rate": 8e-05, "loss": 0.0084, "step": 1990 }, { "epoch": 39.12, "learning_rate": 8e-05, "loss": 0.0067, "step": 2000 }, { "epoch": 39.12, "eval_loss": 0.8936046361923218, "eval_runtime": 0.3385, "eval_samples_per_second": 2.954, "eval_steps_per_second": 2.954, "step": 2000 }, { "epoch": 39.32, "learning_rate": 8e-05, "loss": 0.0051, "step": 2010 }, { "epoch": 39.51, "learning_rate": 8e-05, "loss": 0.0059, "step": 2020 }, { "epoch": 39.71, "learning_rate": 8e-05, "loss": 0.0069, "step": 2030 }, { "epoch": 39.9, "learning_rate": 8e-05, "loss": 0.0076, "step": 2040 }, { "epoch": 40.1, "learning_rate": 8e-05, "loss": 0.0061, "step": 2050 }, { "epoch": 40.29, "learning_rate": 8e-05, "loss": 0.0075, "step": 2060 }, { "epoch": 40.49, "learning_rate": 8e-05, "loss": 0.0066, "step": 2070 }, { "epoch": 40.68, "learning_rate": 8e-05, "loss": 0.0081, "step": 2080 }, { "epoch": 40.88, "learning_rate": 8e-05, "loss": 0.0062, "step": 2090 }, { "epoch": 41.08, "learning_rate": 8e-05, "loss": 0.0059, "step": 2100 }, { "epoch": 41.27, "learning_rate": 8e-05, "loss": 0.0035, "step": 2110 }, { "epoch": 41.47, "learning_rate": 8e-05, "loss": 0.0032, "step": 2120 }, { "epoch": 41.66, "learning_rate": 8e-05, "loss": 0.0061, "step": 2130 }, { "epoch": 41.86, "learning_rate": 8e-05, "loss": 0.0048, "step": 2140 }, { "epoch": 42.05, "learning_rate": 8e-05, "loss": 0.0079, "step": 2150 }, { "epoch": 42.25, "learning_rate": 8e-05, "loss": 0.0055, "step": 2160 }, { "epoch": 42.44, "learning_rate": 8e-05, "loss": 0.0038, "step": 2170 }, { "epoch": 42.64, "learning_rate": 8e-05, "loss": 0.0047, "step": 2180 }, { "epoch": 42.84, "learning_rate": 8e-05, "loss": 0.0032, "step": 2190 }, { "epoch": 43.03, "learning_rate": 8e-05, "loss": 0.0073, "step": 2200 }, { "epoch": 43.23, "learning_rate": 8e-05, "loss": 0.004, "step": 2210 }, { "epoch": 43.42, "learning_rate": 8e-05, "loss": 0.0058, "step": 2220 }, { "epoch": 43.62, "learning_rate": 8e-05, "loss": 0.0042, "step": 2230 }, { "epoch": 43.81, "learning_rate": 8e-05, "loss": 0.0045, "step": 2240 }, { "epoch": 44.01, "learning_rate": 8e-05, "loss": 0.0033, "step": 2250 }, { "epoch": 44.21, "learning_rate": 8e-05, "loss": 0.0025, "step": 2260 }, { "epoch": 44.4, "learning_rate": 8e-05, "loss": 0.0051, "step": 2270 }, { "epoch": 44.6, "learning_rate": 8e-05, "loss": 0.0049, "step": 2280 }, { "epoch": 44.79, "learning_rate": 8e-05, "loss": 0.004, "step": 2290 }, { "epoch": 44.99, "learning_rate": 8e-05, "loss": 0.0035, "step": 2300 }, { "epoch": 45.18, "learning_rate": 8e-05, "loss": 0.0023, "step": 2310 }, { "epoch": 45.38, "learning_rate": 8e-05, "loss": 0.0036, "step": 2320 }, { "epoch": 45.57, "learning_rate": 8e-05, "loss": 0.0035, "step": 2330 }, { "epoch": 45.77, "learning_rate": 8e-05, "loss": 0.0043, "step": 2340 }, { "epoch": 45.97, "learning_rate": 8e-05, "loss": 0.0041, "step": 2350 }, { "epoch": 46.16, "learning_rate": 8e-05, "loss": 0.0032, "step": 2360 }, { "epoch": 46.36, "learning_rate": 8e-05, "loss": 0.0026, "step": 2370 }, { "epoch": 46.55, "learning_rate": 8e-05, "loss": 0.0055, "step": 2380 }, { "epoch": 46.75, "learning_rate": 8e-05, "loss": 0.0038, "step": 2390 }, { "epoch": 46.94, "learning_rate": 8e-05, "loss": 0.0038, "step": 2400 }, { "epoch": 47.14, "learning_rate": 8e-05, "loss": 0.0017, "step": 2410 }, { "epoch": 47.33, "learning_rate": 8e-05, "loss": 0.002, "step": 2420 }, { "epoch": 47.53, "learning_rate": 8e-05, "loss": 0.0021, "step": 2430 }, { "epoch": 47.73, "learning_rate": 8e-05, "loss": 0.0018, "step": 2440 }, { "epoch": 47.92, "learning_rate": 8e-05, "loss": 0.002, "step": 2450 }, { "epoch": 48.12, "learning_rate": 8e-05, "loss": 0.003, "step": 2460 }, { "epoch": 48.31, "learning_rate": 8e-05, "loss": 0.0023, "step": 2470 }, { "epoch": 48.51, "learning_rate": 8e-05, "loss": 0.0019, "step": 2480 }, { "epoch": 48.7, "learning_rate": 8e-05, "loss": 0.0025, "step": 2490 }, { "epoch": 48.9, "learning_rate": 8e-05, "loss": 0.0017, "step": 2500 }, { "epoch": 49.1, "learning_rate": 8e-05, "loss": 0.0014, "step": 2510 }, { "epoch": 49.29, "learning_rate": 8e-05, "loss": 0.0015, "step": 2520 }, { "epoch": 49.49, "learning_rate": 8e-05, "loss": 0.0012, "step": 2530 }, { "epoch": 49.68, "learning_rate": 8e-05, "loss": 0.0024, "step": 2540 }, { "epoch": 49.88, "learning_rate": 8e-05, "loss": 0.0019, "step": 2550 }, { "epoch": 50.07, "learning_rate": 8e-05, "loss": 0.0013, "step": 2560 }, { "epoch": 50.27, "learning_rate": 8e-05, "loss": 0.0023, "step": 2570 }, { "epoch": 50.46, "learning_rate": 8e-05, "loss": 0.0008, "step": 2580 }, { "epoch": 50.66, "learning_rate": 8e-05, "loss": 0.0013, "step": 2590 }, { "epoch": 50.86, "learning_rate": 8e-05, "loss": 0.0012, "step": 2600 }, { "epoch": 51.05, "learning_rate": 8e-05, "loss": 0.0025, "step": 2610 }, { "epoch": 51.25, "learning_rate": 8e-05, "loss": 0.002, "step": 2620 }, { "epoch": 51.44, "learning_rate": 8e-05, "loss": 0.0013, "step": 2630 }, { "epoch": 51.64, "learning_rate": 8e-05, "loss": 0.0018, "step": 2640 }, { "epoch": 51.83, "learning_rate": 8e-05, "loss": 0.0015, "step": 2650 }, { "epoch": 52.03, "learning_rate": 8e-05, "loss": 0.0011, "step": 2660 }, { "epoch": 52.22, "learning_rate": 8e-05, "loss": 0.0008, "step": 2670 }, { "epoch": 52.42, "learning_rate": 8e-05, "loss": 0.0014, "step": 2680 }, { "epoch": 52.62, "learning_rate": 8e-05, "loss": 0.0012, "step": 2690 }, { "epoch": 52.81, "learning_rate": 8e-05, "loss": 0.0011, "step": 2700 }, { "epoch": 53.01, "learning_rate": 8e-05, "loss": 0.0019, "step": 2710 }, { "epoch": 53.2, "learning_rate": 8e-05, "loss": 0.0008, "step": 2720 }, { "epoch": 53.4, "learning_rate": 8e-05, "loss": 0.0013, "step": 2730 }, { "epoch": 53.59, "learning_rate": 8e-05, "loss": 0.0009, "step": 2740 }, { "epoch": 53.79, "learning_rate": 8e-05, "loss": 0.001, "step": 2750 }, { "epoch": 53.99, "learning_rate": 8e-05, "loss": 0.001, "step": 2760 }, { "epoch": 54.18, "learning_rate": 8e-05, "loss": 0.0009, "step": 2770 }, { "epoch": 54.38, "learning_rate": 8e-05, "loss": 0.0008, "step": 2780 }, { "epoch": 54.57, "learning_rate": 8e-05, "loss": 0.0008, "step": 2790 }, { "epoch": 54.77, "learning_rate": 8e-05, "loss": 0.0004, "step": 2800 }, { "epoch": 54.96, "learning_rate": 8e-05, "loss": 0.0009, "step": 2810 }, { "epoch": 55.16, "learning_rate": 8e-05, "loss": 0.0004, "step": 2820 }, { "epoch": 55.35, "learning_rate": 8e-05, "loss": 0.0005, "step": 2830 }, { "epoch": 55.55, "learning_rate": 8e-05, "loss": 0.0008, "step": 2840 }, { "epoch": 55.75, "learning_rate": 8e-05, "loss": 0.0009, "step": 2850 }, { "epoch": 55.94, "learning_rate": 8e-05, "loss": 0.0008, "step": 2860 }, { "epoch": 56.14, "learning_rate": 8e-05, "loss": 0.0006, "step": 2870 }, { "epoch": 56.33, "learning_rate": 8e-05, "loss": 0.0006, "step": 2880 }, { "epoch": 56.53, "learning_rate": 8e-05, "loss": 0.0005, "step": 2890 }, { "epoch": 56.72, "learning_rate": 8e-05, "loss": 0.0003, "step": 2900 }, { "epoch": 56.92, "learning_rate": 8e-05, "loss": 0.0026, "step": 2910 }, { "epoch": 57.11, "learning_rate": 8e-05, "loss": 0.0008, "step": 2920 }, { "epoch": 57.31, "learning_rate": 8e-05, "loss": 0.0008, "step": 2930 }, { "epoch": 57.51, "learning_rate": 8e-05, "loss": 0.0016, "step": 2940 }, { "epoch": 57.7, "learning_rate": 8e-05, "loss": 0.0004, "step": 2950 }, { "epoch": 57.9, "learning_rate": 8e-05, "loss": 0.0003, "step": 2960 }, { "epoch": 58.09, "learning_rate": 8e-05, "loss": 0.0007, "step": 2970 }, { "epoch": 58.29, "learning_rate": 8e-05, "loss": 0.0004, "step": 2980 }, { "epoch": 58.48, "learning_rate": 8e-05, "loss": 0.0002, "step": 2990 }, { "epoch": 58.68, "learning_rate": 8e-05, "loss": 0.0007, "step": 3000 } ], "logging_steps": 10, "max_steps": 100000, "num_train_epochs": 1961, "save_steps": 500, "total_flos": 2.34588386134997e+18, "trial_name": null, "trial_params": null }