{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8001484046500122, "global_step": 3639, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.253431117230021e-06, "loss": 3.9812, "step": 10 }, { "epoch": 0.01, "learning_rate": 3.7916371566987085e-06, "loss": 0.5473, "step": 20 }, { "epoch": 0.01, "learning_rate": 4.506862234460042e-06, "loss": 0.3243, "step": 30 }, { "epoch": 0.02, "learning_rate": 4.977968949561282e-06, "loss": 0.3058, "step": 40 }, { "epoch": 0.02, "learning_rate": 5.3298431961673955e-06, "loss": 0.2495, "step": 50 }, { "epoch": 0.03, "learning_rate": 5.610809355829126e-06, "loss": 0.3345, "step": 60 }, { "epoch": 0.03, "learning_rate": 5.844707706136422e-06, "loss": 0.2784, "step": 70 }, { "epoch": 0.04, "learning_rate": 6.04506827392873e-06, "loss": 0.2421, "step": 80 }, { "epoch": 0.04, "learning_rate": 6.220313630636092e-06, "loss": 0.2661, "step": 90 }, { "epoch": 0.05, "learning_rate": 6.376044588167711e-06, "loss": 0.2831, "step": 100 }, { "epoch": 0.05, "learning_rate": 6.516174989029968e-06, "loss": 0.2793, "step": 110 }, { "epoch": 0.06, "learning_rate": 6.643547802988659e-06, "loss": 0.2488, "step": 120 }, { "epoch": 0.06, "learning_rate": 6.7602933516900655e-06, "loss": 0.2379, "step": 130 }, { "epoch": 0.07, "learning_rate": 6.868049235636083e-06, "loss": 0.2388, "step": 140 }, { "epoch": 0.07, "learning_rate": 6.968101466168447e-06, "loss": 0.2273, "step": 150 }, { "epoch": 0.08, "learning_rate": 7.061478425531149e-06, "loss": 0.2571, "step": 160 }, { "epoch": 0.08, "learning_rate": 7.149015395297812e-06, "loss": 0.2466, "step": 170 }, { "epoch": 0.09, "learning_rate": 7.231400066791303e-06, "loss": 0.2315, "step": 180 }, { "epoch": 0.09, "learning_rate": 7.309205386775784e-06, "loss": 0.2101, "step": 190 }, { "epoch": 0.1, "learning_rate": 7.375715089769526e-06, "loss": 0.2425, "step": 200 }, { "epoch": 0.1, "learning_rate": 7.44608846772297e-06, "loss": 0.243, "step": 210 }, { "epoch": 0.11, "learning_rate": 7.513093380702437e-06, "loss": 0.2521, "step": 220 }, { "epoch": 0.11, "learning_rate": 7.57703761945179e-06, "loss": 0.2401, "step": 230 }, { "epoch": 0.12, "learning_rate": 7.638188625267883e-06, "loss": 0.2342, "step": 240 }, { "epoch": 0.12, "learning_rate": 7.696780248446552e-06, "loss": 0.2439, "step": 250 }, { "epoch": 0.13, "learning_rate": 7.753018148386997e-06, "loss": 0.2553, "step": 260 }, { "epoch": 0.13, "learning_rate": 7.80708415034011e-06, "loss": 0.2547, "step": 270 }, { "epoch": 0.14, "learning_rate": 7.859139791732239e-06, "loss": 0.2316, "step": 280 }, { "epoch": 0.14, "learning_rate": 7.909329232496527e-06, "loss": 0.2227, "step": 290 }, { "epoch": 0.15, "learning_rate": 7.957781661555314e-06, "loss": 0.21, "step": 300 }, { "epoch": 0.15, "learning_rate": 8e-06, "loss": 0.2145, "step": 310 }, { "epoch": 0.16, "learning_rate": 7.98611111111111e-06, "loss": 0.2101, "step": 320 }, { "epoch": 0.16, "learning_rate": 7.972222222222223e-06, "loss": 0.1999, "step": 330 }, { "epoch": 0.17, "learning_rate": 7.958333333333333e-06, "loss": 0.2254, "step": 340 }, { "epoch": 0.17, "learning_rate": 7.944444444444444e-06, "loss": 0.2597, "step": 350 }, { "epoch": 0.18, "learning_rate": 7.930555555555554e-06, "loss": 0.1814, "step": 360 }, { "epoch": 0.18, "learning_rate": 7.916666666666667e-06, "loss": 0.2173, "step": 370 }, { "epoch": 0.19, "learning_rate": 7.902777777777777e-06, "loss": 0.2136, "step": 380 }, { "epoch": 0.19, "learning_rate": 7.88888888888889e-06, "loss": 0.2015, "step": 390 }, { "epoch": 0.2, "learning_rate": 7.875e-06, "loss": 0.2059, "step": 400 }, { "epoch": 0.2, "learning_rate": 7.86111111111111e-06, "loss": 0.23, "step": 410 }, { "epoch": 0.21, "learning_rate": 7.847222222222221e-06, "loss": 0.166, "step": 420 }, { "epoch": 0.21, "learning_rate": 7.833333333333333e-06, "loss": 0.2403, "step": 430 }, { "epoch": 0.22, "learning_rate": 7.819444444444444e-06, "loss": 0.2111, "step": 440 }, { "epoch": 0.22, "learning_rate": 7.805555555555555e-06, "loss": 0.2309, "step": 450 }, { "epoch": 0.23, "learning_rate": 7.791666666666667e-06, "loss": 0.2117, "step": 460 }, { "epoch": 0.23, "learning_rate": 7.777777777777777e-06, "loss": 0.2085, "step": 470 }, { "epoch": 0.24, "learning_rate": 7.76388888888889e-06, "loss": 0.1904, "step": 480 }, { "epoch": 0.24, "learning_rate": 7.75e-06, "loss": 0.1954, "step": 490 }, { "epoch": 0.25, "learning_rate": 7.736111111111111e-06, "loss": 0.2056, "step": 500 }, { "epoch": 0.25, "learning_rate": 7.722222222222222e-06, "loss": 0.169, "step": 510 }, { "epoch": 0.26, "learning_rate": 7.708333333333332e-06, "loss": 0.1814, "step": 520 }, { "epoch": 0.26, "learning_rate": 7.694444444444444e-06, "loss": 0.21, "step": 530 }, { "epoch": 0.27, "learning_rate": 7.680555555555555e-06, "loss": 0.2285, "step": 540 }, { "epoch": 0.27, "learning_rate": 7.666666666666667e-06, "loss": 0.2074, "step": 550 }, { "epoch": 0.28, "learning_rate": 7.652777777777778e-06, "loss": 0.173, "step": 560 }, { "epoch": 0.28, "learning_rate": 7.638888888888888e-06, "loss": 0.187, "step": 570 }, { "epoch": 0.29, "learning_rate": 7.625e-06, "loss": 0.2002, "step": 580 }, { "epoch": 0.29, "learning_rate": 7.6111111111111104e-06, "loss": 0.2257, "step": 590 }, { "epoch": 0.3, "learning_rate": 7.597222222222222e-06, "loss": 0.2068, "step": 600 }, { "epoch": 0.3, "learning_rate": 7.5833333333333324e-06, "loss": 0.1968, "step": 610 }, { "epoch": 0.31, "learning_rate": 7.569444444444445e-06, "loss": 0.2122, "step": 620 }, { "epoch": 0.31, "learning_rate": 7.555555555555555e-06, "loss": 0.2306, "step": 630 }, { "epoch": 0.32, "learning_rate": 7.541666666666667e-06, "loss": 0.2108, "step": 640 }, { "epoch": 0.32, "learning_rate": 7.527777777777777e-06, "loss": 0.1923, "step": 650 }, { "epoch": 0.33, "learning_rate": 7.513888888888889e-06, "loss": 0.1989, "step": 660 }, { "epoch": 0.33, "learning_rate": 7.499999999999999e-06, "loss": 0.2179, "step": 670 }, { "epoch": 0.34, "learning_rate": 7.486111111111111e-06, "loss": 0.1915, "step": 680 }, { "epoch": 0.34, "learning_rate": 7.472222222222222e-06, "loss": 0.1965, "step": 690 }, { "epoch": 0.35, "learning_rate": 7.458333333333333e-06, "loss": 0.2425, "step": 700 }, { "epoch": 0.35, "learning_rate": 7.444444444444444e-06, "loss": 0.1933, "step": 710 }, { "epoch": 0.36, "learning_rate": 7.430555555555555e-06, "loss": 0.2314, "step": 720 }, { "epoch": 0.36, "learning_rate": 7.416666666666666e-06, "loss": 0.1901, "step": 730 }, { "epoch": 0.37, "learning_rate": 7.402777777777778e-06, "loss": 0.1852, "step": 740 }, { "epoch": 0.37, "learning_rate": 7.388888888888889e-06, "loss": 0.201, "step": 750 }, { "epoch": 0.38, "learning_rate": 7.375e-06, "loss": 0.1636, "step": 760 }, { "epoch": 0.38, "learning_rate": 7.36111111111111e-06, "loss": 0.184, "step": 770 }, { "epoch": 0.39, "learning_rate": 7.347222222222222e-06, "loss": 0.1618, "step": 780 }, { "epoch": 0.39, "learning_rate": 7.333333333333332e-06, "loss": 0.1867, "step": 790 }, { "epoch": 0.4, "learning_rate": 7.3194444444444446e-06, "loss": 0.1877, "step": 800 }, { "epoch": 0.4, "learning_rate": 7.305555555555555e-06, "loss": 0.1569, "step": 810 }, { "epoch": 0.41, "learning_rate": 7.291666666666667e-06, "loss": 0.1651, "step": 820 }, { "epoch": 0.41, "learning_rate": 7.277777777777777e-06, "loss": 0.215, "step": 830 }, { "epoch": 0.42, "learning_rate": 7.263888888888889e-06, "loss": 0.1609, "step": 840 }, { "epoch": 0.42, "learning_rate": 7.25e-06, "loss": 0.1935, "step": 850 }, { "epoch": 0.43, "learning_rate": 7.236111111111111e-06, "loss": 0.1688, "step": 860 }, { "epoch": 0.43, "learning_rate": 7.222222222222222e-06, "loss": 0.1645, "step": 870 }, { "epoch": 0.44, "learning_rate": 7.208333333333333e-06, "loss": 0.162, "step": 880 }, { "epoch": 0.44, "learning_rate": 7.194444444444444e-06, "loss": 0.1467, "step": 890 }, { "epoch": 0.45, "learning_rate": 7.180555555555555e-06, "loss": 0.1579, "step": 900 }, { "epoch": 0.45, "learning_rate": 7.166666666666667e-06, "loss": 0.1774, "step": 910 }, { "epoch": 0.46, "learning_rate": 7.1527777777777775e-06, "loss": 0.1931, "step": 920 }, { "epoch": 0.46, "learning_rate": 7.138888888888889e-06, "loss": 0.1505, "step": 930 }, { "epoch": 0.47, "learning_rate": 7.1249999999999995e-06, "loss": 0.1632, "step": 940 }, { "epoch": 0.47, "learning_rate": 7.11111111111111e-06, "loss": 0.169, "step": 950 }, { "epoch": 0.47, "learning_rate": 7.097222222222222e-06, "loss": 0.1762, "step": 960 }, { "epoch": 0.48, "learning_rate": 7.083333333333333e-06, "loss": 0.1838, "step": 970 }, { "epoch": 0.48, "learning_rate": 7.0694444444444444e-06, "loss": 0.1622, "step": 980 }, { "epoch": 0.49, "learning_rate": 7.055555555555555e-06, "loss": 0.1562, "step": 990 }, { "epoch": 0.49, "learning_rate": 7.0416666666666664e-06, "loss": 0.1475, "step": 1000 }, { "epoch": 0.5, "learning_rate": 7.027777777777777e-06, "loss": 0.1557, "step": 1010 }, { "epoch": 0.5, "learning_rate": 7.013888888888889e-06, "loss": 0.1851, "step": 1020 }, { "epoch": 0.51, "learning_rate": 7e-06, "loss": 0.1607, "step": 1030 }, { "epoch": 0.51, "learning_rate": 6.9861111111111105e-06, "loss": 0.1541, "step": 1040 }, { "epoch": 0.52, "learning_rate": 6.972222222222222e-06, "loss": 0.1509, "step": 1050 }, { "epoch": 0.52, "learning_rate": 6.9583333333333325e-06, "loss": 0.1754, "step": 1060 }, { "epoch": 0.53, "learning_rate": 6.944444444444444e-06, "loss": 0.205, "step": 1070 }, { "epoch": 0.53, "learning_rate": 6.930555555555555e-06, "loss": 0.2037, "step": 1080 }, { "epoch": 0.54, "learning_rate": 6.916666666666667e-06, "loss": 0.1424, "step": 1090 }, { "epoch": 0.54, "learning_rate": 6.902777777777777e-06, "loss": 0.1594, "step": 1100 }, { "epoch": 0.55, "learning_rate": 6.888888888888889e-06, "loss": 0.1838, "step": 1110 }, { "epoch": 0.55, "learning_rate": 6.874999999999999e-06, "loss": 0.1626, "step": 1120 }, { "epoch": 0.56, "learning_rate": 6.86111111111111e-06, "loss": 0.1689, "step": 1130 }, { "epoch": 0.56, "learning_rate": 6.847222222222222e-06, "loss": 0.1548, "step": 1140 }, { "epoch": 0.57, "learning_rate": 6.833333333333333e-06, "loss": 0.1435, "step": 1150 }, { "epoch": 0.57, "learning_rate": 6.819444444444444e-06, "loss": 0.1939, "step": 1160 }, { "epoch": 0.58, "learning_rate": 6.805555555555555e-06, "loss": 0.1802, "step": 1170 }, { "epoch": 0.58, "learning_rate": 6.791666666666666e-06, "loss": 0.1865, "step": 1180 }, { "epoch": 0.59, "learning_rate": 6.777777777777778e-06, "loss": 0.18, "step": 1190 }, { "epoch": 0.59, "learning_rate": 6.763888888888889e-06, "loss": 0.1862, "step": 1200 }, { "epoch": 0.6, "learning_rate": 6.75e-06, "loss": 0.183, "step": 1210 }, { "epoch": 0.6, "eval_loss": 0.234619140625, "eval_runtime": 601.4014, "eval_samples_per_second": 39.832, "eval_steps_per_second": 4.98, "step": 1213 }, { "epoch": 0.6, "learning_rate": 6.73611111111111e-06, "loss": 0.1954, "step": 1220 }, { "epoch": 0.61, "learning_rate": 6.722222222222222e-06, "loss": 0.1433, "step": 1230 }, { "epoch": 0.61, "learning_rate": 6.708333333333332e-06, "loss": 0.1732, "step": 1240 }, { "epoch": 0.62, "learning_rate": 6.694444444444445e-06, "loss": 0.1985, "step": 1250 }, { "epoch": 0.62, "learning_rate": 6.680555555555555e-06, "loss": 0.1786, "step": 1260 }, { "epoch": 0.63, "learning_rate": 6.666666666666667e-06, "loss": 0.167, "step": 1270 }, { "epoch": 0.63, "learning_rate": 6.652777777777777e-06, "loss": 0.1548, "step": 1280 }, { "epoch": 0.64, "learning_rate": 6.638888888888889e-06, "loss": 0.1559, "step": 1290 }, { "epoch": 0.64, "learning_rate": 6.625e-06, "loss": 0.176, "step": 1300 }, { "epoch": 0.65, "learning_rate": 6.611111111111111e-06, "loss": 0.1687, "step": 1310 }, { "epoch": 0.65, "learning_rate": 6.597222222222222e-06, "loss": 0.1497, "step": 1320 }, { "epoch": 0.66, "learning_rate": 6.583333333333333e-06, "loss": 0.1891, "step": 1330 }, { "epoch": 0.66, "learning_rate": 6.569444444444444e-06, "loss": 0.1783, "step": 1340 }, { "epoch": 0.67, "learning_rate": 6.555555555555555e-06, "loss": 0.168, "step": 1350 }, { "epoch": 0.67, "learning_rate": 6.541666666666667e-06, "loss": 0.1871, "step": 1360 }, { "epoch": 0.68, "learning_rate": 6.527777777777778e-06, "loss": 0.1601, "step": 1370 }, { "epoch": 0.68, "learning_rate": 6.513888888888889e-06, "loss": 0.1466, "step": 1380 }, { "epoch": 0.69, "learning_rate": 6.5e-06, "loss": 0.1799, "step": 1390 }, { "epoch": 0.69, "learning_rate": 6.48611111111111e-06, "loss": 0.1448, "step": 1400 }, { "epoch": 0.7, "learning_rate": 6.472222222222222e-06, "loss": 0.1459, "step": 1410 }, { "epoch": 0.7, "learning_rate": 6.458333333333333e-06, "loss": 0.1677, "step": 1420 }, { "epoch": 0.71, "learning_rate": 6.4444444444444445e-06, "loss": 0.1878, "step": 1430 }, { "epoch": 0.71, "learning_rate": 6.430555555555555e-06, "loss": 0.1494, "step": 1440 }, { "epoch": 0.72, "learning_rate": 6.4166666666666665e-06, "loss": 0.1406, "step": 1450 }, { "epoch": 0.72, "learning_rate": 6.402777777777777e-06, "loss": 0.1538, "step": 1460 }, { "epoch": 0.73, "learning_rate": 6.390277777777778e-06, "loss": 0.169, "step": 1470 }, { "epoch": 0.73, "learning_rate": 6.376388888888889e-06, "loss": 0.1444, "step": 1480 }, { "epoch": 0.74, "learning_rate": 6.3625e-06, "loss": 0.1561, "step": 1490 }, { "epoch": 0.74, "learning_rate": 6.348611111111111e-06, "loss": 0.1493, "step": 1500 }, { "epoch": 0.75, "learning_rate": 6.334722222222222e-06, "loss": 0.1547, "step": 1510 }, { "epoch": 0.75, "learning_rate": 6.320833333333333e-06, "loss": 0.1749, "step": 1520 }, { "epoch": 0.76, "learning_rate": 6.3069444444444445e-06, "loss": 0.1903, "step": 1530 }, { "epoch": 0.76, "learning_rate": 6.293055555555555e-06, "loss": 0.18, "step": 1540 }, { "epoch": 0.77, "learning_rate": 6.2791666666666665e-06, "loss": 0.1491, "step": 1550 }, { "epoch": 0.77, "learning_rate": 6.265277777777777e-06, "loss": 0.1777, "step": 1560 }, { "epoch": 0.78, "learning_rate": 6.2513888888888886e-06, "loss": 0.1226, "step": 1570 }, { "epoch": 0.78, "learning_rate": 6.237499999999999e-06, "loss": 0.1822, "step": 1580 }, { "epoch": 0.79, "learning_rate": 6.2236111111111114e-06, "loss": 0.143, "step": 1590 }, { "epoch": 0.79, "learning_rate": 6.209722222222222e-06, "loss": 0.1411, "step": 1600 }, { "epoch": 0.8, "learning_rate": 6.1958333333333334e-06, "loss": 0.1452, "step": 1610 }, { "epoch": 0.8, "learning_rate": 6.181944444444444e-06, "loss": 0.1673, "step": 1620 }, { "epoch": 0.81, "learning_rate": 6.169444444444444e-06, "loss": 0.1706, "step": 1630 }, { "epoch": 0.81, "learning_rate": 6.155555555555556e-06, "loss": 0.1755, "step": 1640 }, { "epoch": 0.82, "learning_rate": 6.141666666666667e-06, "loss": 0.1831, "step": 1650 }, { "epoch": 0.82, "learning_rate": 6.127777777777778e-06, "loss": 0.1402, "step": 1660 }, { "epoch": 0.83, "learning_rate": 6.113888888888889e-06, "loss": 0.1434, "step": 1670 }, { "epoch": 0.83, "learning_rate": 6.099999999999999e-06, "loss": 0.1347, "step": 1680 }, { "epoch": 0.84, "learning_rate": 6.086111111111111e-06, "loss": 0.1614, "step": 1690 }, { "epoch": 0.84, "learning_rate": 6.072222222222222e-06, "loss": 0.1533, "step": 1700 }, { "epoch": 0.85, "learning_rate": 6.0583333333333335e-06, "loss": 0.1666, "step": 1710 }, { "epoch": 0.85, "learning_rate": 6.044444444444444e-06, "loss": 0.1725, "step": 1720 }, { "epoch": 0.86, "learning_rate": 6.0305555555555555e-06, "loss": 0.1819, "step": 1730 }, { "epoch": 0.86, "learning_rate": 6.016666666666666e-06, "loss": 0.1739, "step": 1740 }, { "epoch": 0.87, "learning_rate": 6.0027777777777775e-06, "loss": 0.1573, "step": 1750 }, { "epoch": 0.87, "learning_rate": 5.988888888888889e-06, "loss": 0.1458, "step": 1760 }, { "epoch": 0.88, "learning_rate": 5.9749999999999995e-06, "loss": 0.1243, "step": 1770 }, { "epoch": 0.88, "learning_rate": 5.961111111111111e-06, "loss": 0.1657, "step": 1780 }, { "epoch": 0.89, "learning_rate": 5.9472222222222216e-06, "loss": 0.1871, "step": 1790 }, { "epoch": 0.89, "learning_rate": 5.933333333333333e-06, "loss": 0.1365, "step": 1800 }, { "epoch": 0.9, "learning_rate": 5.9194444444444444e-06, "loss": 0.0987, "step": 1810 }, { "epoch": 0.9, "learning_rate": 5.905555555555556e-06, "loss": 0.1396, "step": 1820 }, { "epoch": 0.91, "learning_rate": 5.8916666666666664e-06, "loss": 0.1693, "step": 1830 }, { "epoch": 0.91, "learning_rate": 5.877777777777778e-06, "loss": 0.1391, "step": 1840 }, { "epoch": 0.92, "learning_rate": 5.8638888888888885e-06, "loss": 0.1439, "step": 1850 }, { "epoch": 0.92, "learning_rate": 5.849999999999999e-06, "loss": 0.1503, "step": 1860 }, { "epoch": 0.93, "learning_rate": 5.836111111111111e-06, "loss": 0.1359, "step": 1870 }, { "epoch": 0.93, "learning_rate": 5.822222222222222e-06, "loss": 0.1423, "step": 1880 }, { "epoch": 0.93, "learning_rate": 5.808333333333333e-06, "loss": 0.1317, "step": 1890 }, { "epoch": 0.94, "learning_rate": 5.794444444444444e-06, "loss": 0.1431, "step": 1900 }, { "epoch": 0.94, "learning_rate": 5.780555555555555e-06, "loss": 0.1235, "step": 1910 }, { "epoch": 0.95, "learning_rate": 5.766666666666666e-06, "loss": 0.1397, "step": 1920 }, { "epoch": 0.95, "learning_rate": 5.752777777777778e-06, "loss": 0.1375, "step": 1930 }, { "epoch": 0.96, "learning_rate": 5.738888888888889e-06, "loss": 0.1341, "step": 1940 }, { "epoch": 0.96, "learning_rate": 5.724999999999999e-06, "loss": 0.1408, "step": 1950 }, { "epoch": 0.97, "learning_rate": 5.711111111111111e-06, "loss": 0.1766, "step": 1960 }, { "epoch": 0.97, "learning_rate": 5.697222222222221e-06, "loss": 0.177, "step": 1970 }, { "epoch": 0.98, "learning_rate": 5.683333333333334e-06, "loss": 0.1586, "step": 1980 }, { "epoch": 0.98, "learning_rate": 5.669444444444444e-06, "loss": 0.1447, "step": 1990 }, { "epoch": 0.99, "learning_rate": 5.655555555555556e-06, "loss": 0.1452, "step": 2000 }, { "epoch": 0.99, "learning_rate": 5.641666666666666e-06, "loss": 0.1078, "step": 2010 }, { "epoch": 1.0, "learning_rate": 5.627777777777778e-06, "loss": 0.1133, "step": 2020 }, { "epoch": 1.0, "learning_rate": 5.613888888888888e-06, "loss": 0.1288, "step": 2030 }, { "epoch": 1.01, "learning_rate": 5.6e-06, "loss": 0.1128, "step": 2040 }, { "epoch": 1.01, "learning_rate": 5.586111111111111e-06, "loss": 0.1298, "step": 2050 }, { "epoch": 1.02, "learning_rate": 5.572222222222222e-06, "loss": 0.1189, "step": 2060 }, { "epoch": 1.02, "learning_rate": 5.558333333333333e-06, "loss": 0.1478, "step": 2070 }, { "epoch": 1.03, "learning_rate": 5.544444444444444e-06, "loss": 0.1179, "step": 2080 }, { "epoch": 1.03, "learning_rate": 5.530555555555556e-06, "loss": 0.1119, "step": 2090 }, { "epoch": 1.04, "learning_rate": 5.516666666666667e-06, "loss": 0.1355, "step": 2100 }, { "epoch": 1.04, "learning_rate": 5.502777777777778e-06, "loss": 0.1105, "step": 2110 }, { "epoch": 1.05, "learning_rate": 5.488888888888889e-06, "loss": 0.1289, "step": 2120 }, { "epoch": 1.05, "learning_rate": 5.474999999999999e-06, "loss": 0.1127, "step": 2130 }, { "epoch": 1.06, "learning_rate": 5.461111111111111e-06, "loss": 0.1814, "step": 2140 }, { "epoch": 1.06, "learning_rate": 5.447222222222222e-06, "loss": 0.1554, "step": 2150 }, { "epoch": 1.07, "learning_rate": 5.4333333333333335e-06, "loss": 0.1038, "step": 2160 }, { "epoch": 1.07, "learning_rate": 5.419444444444444e-06, "loss": 0.1085, "step": 2170 }, { "epoch": 1.08, "learning_rate": 5.4055555555555556e-06, "loss": 0.1364, "step": 2180 }, { "epoch": 1.08, "learning_rate": 5.391666666666666e-06, "loss": 0.1089, "step": 2190 }, { "epoch": 1.09, "learning_rate": 5.377777777777778e-06, "loss": 0.1484, "step": 2200 }, { "epoch": 1.09, "learning_rate": 5.363888888888889e-06, "loss": 0.1254, "step": 2210 }, { "epoch": 1.1, "learning_rate": 5.35e-06, "loss": 0.1228, "step": 2220 }, { "epoch": 1.1, "learning_rate": 5.336111111111111e-06, "loss": 0.154, "step": 2230 }, { "epoch": 1.11, "learning_rate": 5.322222222222222e-06, "loss": 0.1745, "step": 2240 }, { "epoch": 1.11, "learning_rate": 5.308333333333333e-06, "loss": 0.1031, "step": 2250 }, { "epoch": 1.12, "learning_rate": 5.294444444444444e-06, "loss": 0.1359, "step": 2260 }, { "epoch": 1.12, "learning_rate": 5.280555555555556e-06, "loss": 0.1194, "step": 2270 }, { "epoch": 1.13, "learning_rate": 5.2666666666666665e-06, "loss": 0.1398, "step": 2280 }, { "epoch": 1.13, "learning_rate": 5.252777777777778e-06, "loss": 0.1277, "step": 2290 }, { "epoch": 1.14, "learning_rate": 5.2388888888888885e-06, "loss": 0.1121, "step": 2300 }, { "epoch": 1.14, "learning_rate": 5.224999999999999e-06, "loss": 0.1067, "step": 2310 }, { "epoch": 1.15, "learning_rate": 5.211111111111111e-06, "loss": 0.1101, "step": 2320 }, { "epoch": 1.15, "learning_rate": 5.197222222222222e-06, "loss": 0.1192, "step": 2330 }, { "epoch": 1.16, "learning_rate": 5.183333333333333e-06, "loss": 0.1055, "step": 2340 }, { "epoch": 1.16, "learning_rate": 5.169444444444444e-06, "loss": 0.1232, "step": 2350 }, { "epoch": 1.17, "learning_rate": 5.155555555555555e-06, "loss": 0.1096, "step": 2360 }, { "epoch": 1.17, "learning_rate": 5.141666666666666e-06, "loss": 0.0831, "step": 2370 }, { "epoch": 1.18, "learning_rate": 5.127777777777778e-06, "loss": 0.149, "step": 2380 }, { "epoch": 1.18, "learning_rate": 5.113888888888889e-06, "loss": 0.1328, "step": 2390 }, { "epoch": 1.19, "learning_rate": 5.0999999999999995e-06, "loss": 0.1193, "step": 2400 }, { "epoch": 1.19, "learning_rate": 5.086111111111111e-06, "loss": 0.1222, "step": 2410 }, { "epoch": 1.2, "learning_rate": 5.0722222222222215e-06, "loss": 0.1395, "step": 2420 }, { "epoch": 1.2, "eval_loss": 0.26611328125, "eval_runtime": 595.4698, "eval_samples_per_second": 40.229, "eval_steps_per_second": 5.03, "step": 2426 }, { "epoch": 1.2, "learning_rate": 5.058333333333334e-06, "loss": 0.1261, "step": 2430 }, { "epoch": 1.21, "learning_rate": 5.044444444444444e-06, "loss": 0.1416, "step": 2440 }, { "epoch": 1.21, "learning_rate": 5.030555555555556e-06, "loss": 0.1453, "step": 2450 }, { "epoch": 1.22, "learning_rate": 5.016666666666666e-06, "loss": 0.114, "step": 2460 }, { "epoch": 1.22, "learning_rate": 5.002777777777778e-06, "loss": 0.133, "step": 2470 }, { "epoch": 1.23, "learning_rate": 4.988888888888888e-06, "loss": 0.1207, "step": 2480 }, { "epoch": 1.23, "learning_rate": 4.975e-06, "loss": 0.1123, "step": 2490 }, { "epoch": 1.24, "learning_rate": 4.961111111111111e-06, "loss": 0.1039, "step": 2500 }, { "epoch": 1.24, "learning_rate": 4.947222222222222e-06, "loss": 0.091, "step": 2510 }, { "epoch": 1.25, "learning_rate": 4.933333333333333e-06, "loss": 0.1588, "step": 2520 }, { "epoch": 1.25, "learning_rate": 4.919444444444444e-06, "loss": 0.098, "step": 2530 }, { "epoch": 1.26, "learning_rate": 4.905555555555555e-06, "loss": 0.1033, "step": 2540 }, { "epoch": 1.26, "learning_rate": 4.891666666666667e-06, "loss": 0.1473, "step": 2550 }, { "epoch": 1.27, "learning_rate": 4.877777777777778e-06, "loss": 0.1331, "step": 2560 }, { "epoch": 1.27, "learning_rate": 4.863888888888889e-06, "loss": 0.1196, "step": 2570 }, { "epoch": 1.28, "learning_rate": 4.849999999999999e-06, "loss": 0.1379, "step": 2580 }, { "epoch": 1.28, "learning_rate": 4.836111111111111e-06, "loss": 0.124, "step": 2590 }, { "epoch": 1.29, "learning_rate": 4.822222222222222e-06, "loss": 0.1058, "step": 2600 }, { "epoch": 1.29, "learning_rate": 4.808333333333334e-06, "loss": 0.1132, "step": 2610 }, { "epoch": 1.3, "learning_rate": 4.794444444444444e-06, "loss": 0.1077, "step": 2620 }, { "epoch": 1.3, "learning_rate": 4.780555555555556e-06, "loss": 0.1266, "step": 2630 }, { "epoch": 1.31, "learning_rate": 4.766666666666666e-06, "loss": 0.1292, "step": 2640 }, { "epoch": 1.31, "learning_rate": 4.752777777777778e-06, "loss": 0.0969, "step": 2650 }, { "epoch": 1.32, "learning_rate": 4.738888888888889e-06, "loss": 0.0946, "step": 2660 }, { "epoch": 1.32, "learning_rate": 4.725e-06, "loss": 0.1479, "step": 2670 }, { "epoch": 1.33, "learning_rate": 4.711111111111111e-06, "loss": 0.133, "step": 2680 }, { "epoch": 1.33, "learning_rate": 4.697222222222222e-06, "loss": 0.1118, "step": 2690 }, { "epoch": 1.34, "learning_rate": 4.683333333333333e-06, "loss": 0.1362, "step": 2700 }, { "epoch": 1.34, "learning_rate": 4.669444444444444e-06, "loss": 0.1384, "step": 2710 }, { "epoch": 1.35, "learning_rate": 4.655555555555556e-06, "loss": 0.1275, "step": 2720 }, { "epoch": 1.35, "learning_rate": 4.6416666666666666e-06, "loss": 0.1416, "step": 2730 }, { "epoch": 1.36, "learning_rate": 4.627777777777778e-06, "loss": 0.1119, "step": 2740 }, { "epoch": 1.36, "learning_rate": 4.6138888888888886e-06, "loss": 0.1079, "step": 2750 }, { "epoch": 1.37, "learning_rate": 4.599999999999999e-06, "loss": 0.1197, "step": 2760 }, { "epoch": 1.37, "learning_rate": 4.5861111111111114e-06, "loss": 0.1115, "step": 2770 }, { "epoch": 1.38, "learning_rate": 4.572222222222222e-06, "loss": 0.1023, "step": 2780 }, { "epoch": 1.38, "learning_rate": 4.5583333333333335e-06, "loss": 0.1109, "step": 2790 }, { "epoch": 1.39, "learning_rate": 4.544444444444444e-06, "loss": 0.1184, "step": 2800 }, { "epoch": 1.39, "learning_rate": 4.5305555555555555e-06, "loss": 0.1103, "step": 2810 }, { "epoch": 1.4, "learning_rate": 4.516666666666666e-06, "loss": 0.1074, "step": 2820 }, { "epoch": 1.4, "learning_rate": 4.502777777777778e-06, "loss": 0.1139, "step": 2830 }, { "epoch": 1.4, "learning_rate": 4.488888888888889e-06, "loss": 0.1107, "step": 2840 }, { "epoch": 1.41, "learning_rate": 4.4749999999999995e-06, "loss": 0.0777, "step": 2850 }, { "epoch": 1.41, "learning_rate": 4.461111111111111e-06, "loss": 0.1348, "step": 2860 }, { "epoch": 1.42, "learning_rate": 4.4472222222222215e-06, "loss": 0.1362, "step": 2870 }, { "epoch": 1.42, "learning_rate": 4.433333333333333e-06, "loss": 0.1374, "step": 2880 }, { "epoch": 1.43, "learning_rate": 4.419444444444444e-06, "loss": 0.1135, "step": 2890 }, { "epoch": 1.43, "learning_rate": 4.405555555555556e-06, "loss": 0.1166, "step": 2900 }, { "epoch": 1.44, "learning_rate": 4.391666666666666e-06, "loss": 0.1202, "step": 2910 }, { "epoch": 1.44, "learning_rate": 4.377777777777778e-06, "loss": 0.1455, "step": 2920 }, { "epoch": 1.45, "learning_rate": 4.3638888888888884e-06, "loss": 0.0831, "step": 2930 }, { "epoch": 1.45, "learning_rate": 4.35e-06, "loss": 0.1211, "step": 2940 }, { "epoch": 1.46, "learning_rate": 4.336111111111111e-06, "loss": 0.1194, "step": 2950 }, { "epoch": 1.46, "learning_rate": 4.322222222222222e-06, "loss": 0.1124, "step": 2960 }, { "epoch": 1.47, "learning_rate": 4.308333333333333e-06, "loss": 0.0947, "step": 2970 }, { "epoch": 1.47, "learning_rate": 4.294444444444444e-06, "loss": 0.1152, "step": 2980 }, { "epoch": 1.48, "learning_rate": 4.280555555555555e-06, "loss": 0.0858, "step": 2990 }, { "epoch": 1.48, "learning_rate": 4.266666666666667e-06, "loss": 0.087, "step": 3000 }, { "epoch": 1.49, "learning_rate": 4.252777777777778e-06, "loss": 0.1237, "step": 3010 }, { "epoch": 1.49, "learning_rate": 4.238888888888889e-06, "loss": 0.0809, "step": 3020 }, { "epoch": 1.5, "learning_rate": 4.224999999999999e-06, "loss": 0.1022, "step": 3030 }, { "epoch": 1.5, "learning_rate": 4.211111111111111e-06, "loss": 0.101, "step": 3040 }, { "epoch": 1.51, "learning_rate": 4.197222222222221e-06, "loss": 0.1327, "step": 3050 }, { "epoch": 1.51, "learning_rate": 4.183333333333334e-06, "loss": 0.1056, "step": 3060 }, { "epoch": 1.52, "learning_rate": 4.169444444444444e-06, "loss": 0.1129, "step": 3070 }, { "epoch": 1.52, "learning_rate": 4.155555555555556e-06, "loss": 0.1119, "step": 3080 }, { "epoch": 1.53, "learning_rate": 4.141666666666666e-06, "loss": 0.1287, "step": 3090 }, { "epoch": 1.53, "learning_rate": 4.127777777777778e-06, "loss": 0.1179, "step": 3100 }, { "epoch": 1.54, "learning_rate": 4.113888888888889e-06, "loss": 0.1152, "step": 3110 }, { "epoch": 1.54, "learning_rate": 4.1e-06, "loss": 0.0833, "step": 3120 }, { "epoch": 1.55, "learning_rate": 4.086111111111111e-06, "loss": 0.1304, "step": 3130 }, { "epoch": 1.55, "learning_rate": 4.072222222222222e-06, "loss": 0.0813, "step": 3140 }, { "epoch": 1.56, "learning_rate": 4.058333333333333e-06, "loss": 0.1068, "step": 3150 }, { "epoch": 1.56, "learning_rate": 4.044444444444444e-06, "loss": 0.0998, "step": 3160 }, { "epoch": 1.57, "learning_rate": 4.030555555555556e-06, "loss": 0.1109, "step": 3170 }, { "epoch": 1.57, "learning_rate": 4.016666666666667e-06, "loss": 0.1193, "step": 3180 }, { "epoch": 1.58, "learning_rate": 4.002777777777778e-06, "loss": 0.1234, "step": 3190 }, { "epoch": 1.58, "learning_rate": 3.988888888888889e-06, "loss": 0.1049, "step": 3200 }, { "epoch": 1.59, "learning_rate": 3.975e-06, "loss": 0.1017, "step": 3210 }, { "epoch": 1.59, "learning_rate": 3.9611111111111115e-06, "loss": 0.1044, "step": 3220 }, { "epoch": 1.6, "learning_rate": 3.947222222222222e-06, "loss": 0.1263, "step": 3230 }, { "epoch": 1.6, "learning_rate": 3.933333333333333e-06, "loss": 0.1419, "step": 3240 }, { "epoch": 1.61, "learning_rate": 3.919444444444444e-06, "loss": 0.1294, "step": 3250 }, { "epoch": 1.61, "learning_rate": 3.9055555555555555e-06, "loss": 0.0939, "step": 3260 }, { "epoch": 1.62, "learning_rate": 3.891666666666666e-06, "loss": 0.1374, "step": 3270 }, { "epoch": 1.62, "learning_rate": 3.8777777777777775e-06, "loss": 0.0959, "step": 3280 }, { "epoch": 1.63, "learning_rate": 3.863888888888889e-06, "loss": 0.1009, "step": 3290 }, { "epoch": 1.63, "learning_rate": 3.8499999999999996e-06, "loss": 0.1305, "step": 3300 }, { "epoch": 1.64, "learning_rate": 3.836111111111111e-06, "loss": 0.1303, "step": 3310 }, { "epoch": 1.64, "learning_rate": 3.8222222222222224e-06, "loss": 0.1282, "step": 3320 }, { "epoch": 1.65, "learning_rate": 3.808333333333333e-06, "loss": 0.1053, "step": 3330 }, { "epoch": 1.65, "learning_rate": 3.794444444444444e-06, "loss": 0.1042, "step": 3340 }, { "epoch": 1.66, "learning_rate": 3.7805555555555555e-06, "loss": 0.0849, "step": 3350 }, { "epoch": 1.66, "learning_rate": 3.7666666666666665e-06, "loss": 0.1274, "step": 3360 }, { "epoch": 1.67, "learning_rate": 3.7527777777777775e-06, "loss": 0.1228, "step": 3370 }, { "epoch": 1.67, "learning_rate": 3.738888888888889e-06, "loss": 0.1129, "step": 3380 }, { "epoch": 1.68, "learning_rate": 3.725e-06, "loss": 0.1128, "step": 3390 }, { "epoch": 1.68, "learning_rate": 3.711111111111111e-06, "loss": 0.1317, "step": 3400 }, { "epoch": 1.69, "learning_rate": 3.6972222222222224e-06, "loss": 0.1246, "step": 3410 }, { "epoch": 1.69, "learning_rate": 3.683333333333333e-06, "loss": 0.0806, "step": 3420 }, { "epoch": 1.7, "learning_rate": 3.669444444444444e-06, "loss": 0.119, "step": 3430 }, { "epoch": 1.7, "learning_rate": 3.6555555555555554e-06, "loss": 0.0893, "step": 3440 }, { "epoch": 1.71, "learning_rate": 3.6416666666666664e-06, "loss": 0.1058, "step": 3450 }, { "epoch": 1.71, "learning_rate": 3.6277777777777774e-06, "loss": 0.0944, "step": 3460 }, { "epoch": 1.72, "learning_rate": 3.613888888888889e-06, "loss": 0.126, "step": 3470 }, { "epoch": 1.72, "learning_rate": 3.6e-06, "loss": 0.1104, "step": 3480 }, { "epoch": 1.73, "learning_rate": 3.5861111111111113e-06, "loss": 0.1108, "step": 3490 }, { "epoch": 1.73, "learning_rate": 3.5722222222222223e-06, "loss": 0.1197, "step": 3500 }, { "epoch": 1.74, "learning_rate": 3.558333333333333e-06, "loss": 0.1182, "step": 3510 }, { "epoch": 1.74, "learning_rate": 3.5444444444444443e-06, "loss": 0.0988, "step": 3520 }, { "epoch": 1.75, "learning_rate": 3.5305555555555553e-06, "loss": 0.0949, "step": 3530 }, { "epoch": 1.75, "learning_rate": 3.5166666666666663e-06, "loss": 0.1102, "step": 3540 }, { "epoch": 1.76, "learning_rate": 3.5027777777777777e-06, "loss": 0.1203, "step": 3550 }, { "epoch": 1.76, "learning_rate": 3.4888888888888888e-06, "loss": 0.1343, "step": 3560 }, { "epoch": 1.77, "learning_rate": 3.4749999999999998e-06, "loss": 0.0851, "step": 3570 }, { "epoch": 1.77, "learning_rate": 3.461111111111111e-06, "loss": 0.1181, "step": 3580 }, { "epoch": 1.78, "learning_rate": 3.447222222222222e-06, "loss": 0.0931, "step": 3590 }, { "epoch": 1.78, "learning_rate": 3.433333333333333e-06, "loss": 0.1068, "step": 3600 }, { "epoch": 1.79, "learning_rate": 3.4194444444444442e-06, "loss": 0.0884, "step": 3610 }, { "epoch": 1.79, "learning_rate": 3.4055555555555552e-06, "loss": 0.1326, "step": 3620 }, { "epoch": 1.8, "learning_rate": 3.3916666666666667e-06, "loss": 0.1307, "step": 3630 }, { "epoch": 1.8, "eval_loss": 0.249267578125, "eval_runtime": 590.255, "eval_samples_per_second": 40.584, "eval_steps_per_second": 5.074, "step": 3639 } ], "max_steps": 6063, "num_train_epochs": 3, "total_flos": 134441240494080.0, "trial_name": null, "trial_params": null }