{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.995790064552343, "eval_steps": 500, "global_step": 2225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.3333333333333334e-06, "loss": 1.5563, "step": 8 }, { "epoch": 0.04, "learning_rate": 2.666666666666667e-06, "loss": 1.4423, "step": 16 }, { "epoch": 0.05, "learning_rate": 4.000000000000001e-06, "loss": 1.3027, "step": 24 }, { "epoch": 0.07, "learning_rate": 5.333333333333334e-06, "loss": 1.2535, "step": 32 }, { "epoch": 0.09, "learning_rate": 6.666666666666667e-06, "loss": 1.2352, "step": 40 }, { "epoch": 0.11, "learning_rate": 8.000000000000001e-06, "loss": 1.2077, "step": 48 }, { "epoch": 0.13, "learning_rate": 9.333333333333334e-06, "loss": 1.1871, "step": 56 }, { "epoch": 0.14, "learning_rate": 1.0666666666666667e-05, "loss": 1.1708, "step": 64 }, { "epoch": 0.16, "learning_rate": 1.2e-05, "loss": 1.1666, "step": 72 }, { "epoch": 0.18, "learning_rate": 1.3333333333333333e-05, "loss": 1.1394, "step": 80 }, { "epoch": 0.2, "learning_rate": 1.4666666666666668e-05, "loss": 1.1644, "step": 88 }, { "epoch": 0.22, "learning_rate": 1.6000000000000003e-05, "loss": 1.1521, "step": 96 }, { "epoch": 0.23, "learning_rate": 1.7333333333333336e-05, "loss": 1.142, "step": 104 }, { "epoch": 0.25, "learning_rate": 1.866666666666667e-05, "loss": 1.1314, "step": 112 }, { "epoch": 0.27, "learning_rate": 2e-05, "loss": 1.1495, "step": 120 }, { "epoch": 0.29, "learning_rate": 2.1333333333333335e-05, "loss": 1.1468, "step": 128 }, { "epoch": 0.31, "learning_rate": 2.2666666666666668e-05, "loss": 1.1457, "step": 136 }, { "epoch": 0.32, "learning_rate": 2.4e-05, "loss": 1.115, "step": 144 }, { "epoch": 0.34, "learning_rate": 2.5333333333333337e-05, "loss": 1.1279, "step": 152 }, { "epoch": 0.36, "learning_rate": 2.6666666666666667e-05, "loss": 1.1145, "step": 160 }, { "epoch": 0.38, "learning_rate": 2.8000000000000003e-05, "loss": 1.1209, "step": 168 }, { "epoch": 0.4, "learning_rate": 2.9333333333333336e-05, "loss": 1.1295, "step": 176 }, { "epoch": 0.41, "learning_rate": 3.066666666666667e-05, "loss": 1.1249, "step": 184 }, { "epoch": 0.43, "learning_rate": 3.2000000000000005e-05, "loss": 1.1294, "step": 192 }, { "epoch": 0.45, "learning_rate": 3.3333333333333335e-05, "loss": 1.103, "step": 200 }, { "epoch": 0.47, "learning_rate": 3.466666666666667e-05, "loss": 1.1241, "step": 208 }, { "epoch": 0.48, "learning_rate": 3.6e-05, "loss": 1.1306, "step": 216 }, { "epoch": 0.5, "learning_rate": 3.733333333333334e-05, "loss": 1.1096, "step": 224 }, { "epoch": 0.52, "learning_rate": 3.866666666666667e-05, "loss": 1.1242, "step": 232 }, { "epoch": 0.54, "learning_rate": 4e-05, "loss": 1.1109, "step": 240 }, { "epoch": 0.56, "learning_rate": 4.133333333333333e-05, "loss": 1.1091, "step": 248 }, { "epoch": 0.57, "learning_rate": 4.266666666666667e-05, "loss": 1.1304, "step": 256 }, { "epoch": 0.59, "learning_rate": 4.4000000000000006e-05, "loss": 1.1149, "step": 264 }, { "epoch": 0.61, "learning_rate": 4.5333333333333335e-05, "loss": 1.106, "step": 272 }, { "epoch": 0.63, "learning_rate": 4.666666666666667e-05, "loss": 1.1139, "step": 280 }, { "epoch": 0.65, "learning_rate": 4.8e-05, "loss": 1.1174, "step": 288 }, { "epoch": 0.66, "learning_rate": 4.933333333333334e-05, "loss": 1.0849, "step": 296 }, { "epoch": 0.68, "learning_rate": 5e-05, "loss": 1.1107, "step": 304 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 1.0828, "step": 312 }, { "epoch": 
0.72, "learning_rate": 5e-05, "loss": 1.1032, "step": 320 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 1.15, "step": 328 }, { "epoch": 0.75, "learning_rate": 5e-05, "loss": 1.1088, "step": 336 }, { "epoch": 0.77, "learning_rate": 5e-05, "loss": 1.1008, "step": 344 }, { "epoch": 0.79, "learning_rate": 5e-05, "loss": 1.1174, "step": 352 }, { "epoch": 0.81, "learning_rate": 5e-05, "loss": 1.1043, "step": 360 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 1.1196, "step": 368 }, { "epoch": 0.84, "learning_rate": 5e-05, "loss": 1.1339, "step": 376 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 1.1186, "step": 384 }, { "epoch": 0.88, "learning_rate": 5e-05, "loss": 1.0837, "step": 392 }, { "epoch": 0.9, "learning_rate": 5e-05, "loss": 1.1105, "step": 400 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 1.1249, "step": 408 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 1.0846, "step": 416 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 1.0925, "step": 424 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 1.0917, "step": 432 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 1.1177, "step": 440 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 1.0964, "step": 448 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 1.0494, "step": 456 }, { "epoch": 1.04, "learning_rate": 5e-05, "loss": 1.0444, "step": 464 }, { "epoch": 1.06, "learning_rate": 5e-05, "loss": 1.043, "step": 472 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 1.0303, "step": 480 }, { "epoch": 1.1, "learning_rate": 5e-05, "loss": 1.0862, "step": 488 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 1.03, "step": 496 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 1.0122, "step": 504 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 1.0202, "step": 512 }, { "epoch": 1.17, "learning_rate": 5e-05, "loss": 1.0331, "step": 520 }, { "epoch": 1.19, "learning_rate": 5e-05, "loss": 1.0644, "step": 528 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 1.0172, "step": 536 }, { "epoch": 1.22, "learning_rate": 5e-05, "loss": 1.0436, "step": 544 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 1.0495, "step": 552 }, { "epoch": 1.26, "learning_rate": 5e-05, "loss": 1.0474, "step": 560 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 1.0436, "step": 568 }, { "epoch": 1.29, "learning_rate": 5e-05, "loss": 1.0226, "step": 576 }, { "epoch": 1.31, "learning_rate": 5e-05, "loss": 1.0431, "step": 584 }, { "epoch": 1.33, "learning_rate": 5e-05, "loss": 1.0407, "step": 592 }, { "epoch": 1.35, "learning_rate": 5e-05, "loss": 1.033, "step": 600 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 1.0202, "step": 608 }, { "epoch": 1.38, "learning_rate": 5e-05, "loss": 1.0791, "step": 616 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 1.0281, "step": 624 }, { "epoch": 1.42, "learning_rate": 5e-05, "loss": 1.0793, "step": 632 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 1.0557, "step": 640 }, { "epoch": 1.45, "learning_rate": 5e-05, "loss": 1.0413, "step": 648 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 1.0679, "step": 656 }, { "epoch": 1.49, "learning_rate": 5e-05, "loss": 1.0334, "step": 664 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 1.0163, "step": 672 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 1.0454, "step": 680 }, { "epoch": 1.54, "learning_rate": 5e-05, "loss": 1.0489, "step": 688 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 1.0437, "step": 696 }, { "epoch": 1.58, "learning_rate": 5e-05, "loss": 1.0278, "step": 704 }, { "epoch": 1.6, "learning_rate": 5e-05, "loss": 
1.0165, "step": 712 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 1.0673, "step": 720 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 1.0469, "step": 728 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 1.0453, "step": 736 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 1.0523, "step": 744 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 1.0433, "step": 752 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 1.0156, "step": 760 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 1.0424, "step": 768 }, { "epoch": 1.74, "learning_rate": 5e-05, "loss": 1.0216, "step": 776 }, { "epoch": 1.76, "learning_rate": 5e-05, "loss": 1.0369, "step": 784 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 1.0509, "step": 792 }, { "epoch": 1.8, "learning_rate": 5e-05, "loss": 1.0619, "step": 800 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 1.0164, "step": 808 }, { "epoch": 1.83, "learning_rate": 5e-05, "loss": 1.0256, "step": 816 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 1.0028, "step": 824 }, { "epoch": 1.87, "learning_rate": 5e-05, "loss": 1.0244, "step": 832 }, { "epoch": 1.89, "learning_rate": 5e-05, "loss": 1.0494, "step": 840 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 1.0387, "step": 848 }, { "epoch": 1.92, "learning_rate": 5e-05, "loss": 1.0168, "step": 856 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 1.0455, "step": 864 }, { "epoch": 1.96, "learning_rate": 5e-05, "loss": 1.0622, "step": 872 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 1.0564, "step": 880 }, { "epoch": 1.99, "learning_rate": 5e-05, "loss": 1.0656, "step": 888 }, { "epoch": 2.01, "learning_rate": 5e-05, "loss": 0.9658, "step": 896 }, { "epoch": 2.03, "learning_rate": 5e-05, "loss": 0.9094, "step": 904 }, { "epoch": 2.05, "learning_rate": 5e-05, "loss": 0.9343, "step": 912 }, { "epoch": 2.07, "learning_rate": 5e-05, "loss": 0.8941, "step": 920 }, { "epoch": 2.08, "learning_rate": 5e-05, "loss": 0.8903, "step": 928 }, { "epoch": 2.1, "learning_rate": 5e-05, "loss": 0.892, "step": 936 }, { "epoch": 2.12, "learning_rate": 5e-05, "loss": 0.9425, "step": 944 }, { "epoch": 2.14, "learning_rate": 5e-05, "loss": 0.932, "step": 952 }, { "epoch": 2.16, "learning_rate": 5e-05, "loss": 0.9171, "step": 960 }, { "epoch": 2.17, "learning_rate": 5e-05, "loss": 0.8976, "step": 968 }, { "epoch": 2.19, "learning_rate": 5e-05, "loss": 0.9055, "step": 976 }, { "epoch": 2.21, "learning_rate": 5e-05, "loss": 0.9442, "step": 984 }, { "epoch": 2.23, "learning_rate": 5e-05, "loss": 0.8948, "step": 992 }, { "epoch": 2.25, "learning_rate": 5e-05, "loss": 0.9297, "step": 1000 }, { "epoch": 2.26, "learning_rate": 5e-05, "loss": 0.8957, "step": 1008 }, { "epoch": 2.28, "learning_rate": 5e-05, "loss": 0.9264, "step": 1016 }, { "epoch": 2.3, "learning_rate": 5e-05, "loss": 0.9391, "step": 1024 }, { "epoch": 2.32, "learning_rate": 5e-05, "loss": 0.8936, "step": 1032 }, { "epoch": 2.34, "learning_rate": 5e-05, "loss": 0.9537, "step": 1040 }, { "epoch": 2.35, "learning_rate": 5e-05, "loss": 0.9168, "step": 1048 }, { "epoch": 2.37, "learning_rate": 5e-05, "loss": 0.9092, "step": 1056 }, { "epoch": 2.39, "learning_rate": 5e-05, "loss": 0.9198, "step": 1064 }, { "epoch": 2.41, "learning_rate": 5e-05, "loss": 0.9284, "step": 1072 }, { "epoch": 2.42, "learning_rate": 5e-05, "loss": 0.8926, "step": 1080 }, { "epoch": 2.44, "learning_rate": 5e-05, "loss": 0.9338, "step": 1088 }, { "epoch": 2.46, "learning_rate": 5e-05, "loss": 0.91, "step": 1096 }, { "epoch": 2.48, "learning_rate": 5e-05, "loss": 0.9444, "step": 1104 
}, { "epoch": 2.5, "learning_rate": 5e-05, "loss": 0.9326, "step": 1112 }, { "epoch": 2.51, "learning_rate": 5e-05, "loss": 0.901, "step": 1120 }, { "epoch": 2.53, "learning_rate": 5e-05, "loss": 0.9344, "step": 1128 }, { "epoch": 2.55, "learning_rate": 5e-05, "loss": 0.9344, "step": 1136 }, { "epoch": 2.57, "learning_rate": 5e-05, "loss": 0.9063, "step": 1144 }, { "epoch": 2.59, "learning_rate": 5e-05, "loss": 0.9123, "step": 1152 }, { "epoch": 2.6, "learning_rate": 5e-05, "loss": 0.9287, "step": 1160 }, { "epoch": 2.62, "learning_rate": 5e-05, "loss": 0.8998, "step": 1168 }, { "epoch": 2.64, "learning_rate": 5e-05, "loss": 0.9101, "step": 1176 }, { "epoch": 2.66, "learning_rate": 5e-05, "loss": 0.9353, "step": 1184 }, { "epoch": 2.68, "learning_rate": 5e-05, "loss": 0.942, "step": 1192 }, { "epoch": 2.69, "learning_rate": 5e-05, "loss": 0.9194, "step": 1200 }, { "epoch": 2.71, "learning_rate": 5e-05, "loss": 0.9221, "step": 1208 }, { "epoch": 2.73, "learning_rate": 5e-05, "loss": 0.9227, "step": 1216 }, { "epoch": 2.75, "learning_rate": 5e-05, "loss": 0.9409, "step": 1224 }, { "epoch": 2.77, "learning_rate": 5e-05, "loss": 0.9192, "step": 1232 }, { "epoch": 2.78, "learning_rate": 5e-05, "loss": 0.9274, "step": 1240 }, { "epoch": 2.8, "learning_rate": 5e-05, "loss": 0.9422, "step": 1248 }, { "epoch": 2.82, "learning_rate": 5e-05, "loss": 0.9236, "step": 1256 }, { "epoch": 2.84, "learning_rate": 5e-05, "loss": 0.9185, "step": 1264 }, { "epoch": 2.86, "learning_rate": 5e-05, "loss": 0.9179, "step": 1272 }, { "epoch": 2.87, "learning_rate": 5e-05, "loss": 0.9186, "step": 1280 }, { "epoch": 2.89, "learning_rate": 5e-05, "loss": 0.9117, "step": 1288 }, { "epoch": 2.91, "learning_rate": 5e-05, "loss": 0.9058, "step": 1296 }, { "epoch": 2.93, "learning_rate": 5e-05, "loss": 0.9486, "step": 1304 }, { "epoch": 2.95, "learning_rate": 5e-05, "loss": 0.9157, "step": 1312 }, { "epoch": 2.96, "learning_rate": 5e-05, "loss": 0.9119, "step": 1320 }, { "epoch": 2.98, "learning_rate": 5e-05, "loss": 0.9238, "step": 1328 }, { "epoch": 3.0, "learning_rate": 5e-05, "loss": 0.9201, "step": 1336 }, { "epoch": 3.02, "learning_rate": 5e-05, "loss": 0.7697, "step": 1344 }, { "epoch": 3.04, "learning_rate": 5e-05, "loss": 0.7608, "step": 1352 }, { "epoch": 3.05, "learning_rate": 5e-05, "loss": 0.7581, "step": 1360 }, { "epoch": 3.07, "learning_rate": 5e-05, "loss": 0.7518, "step": 1368 }, { "epoch": 3.09, "learning_rate": 5e-05, "loss": 0.7121, "step": 1376 }, { "epoch": 3.11, "learning_rate": 5e-05, "loss": 0.7284, "step": 1384 }, { "epoch": 3.13, "learning_rate": 5e-05, "loss": 0.7513, "step": 1392 }, { "epoch": 3.14, "learning_rate": 5e-05, "loss": 0.7529, "step": 1400 }, { "epoch": 3.16, "learning_rate": 5e-05, "loss": 0.7443, "step": 1408 }, { "epoch": 3.18, "learning_rate": 5e-05, "loss": 0.7603, "step": 1416 }, { "epoch": 3.2, "learning_rate": 5e-05, "loss": 0.7242, "step": 1424 }, { "epoch": 3.22, "learning_rate": 5e-05, "loss": 0.7504, "step": 1432 }, { "epoch": 3.23, "learning_rate": 5e-05, "loss": 0.7363, "step": 1440 }, { "epoch": 3.25, "learning_rate": 5e-05, "loss": 0.7307, "step": 1448 }, { "epoch": 3.27, "learning_rate": 5e-05, "loss": 0.757, "step": 1456 }, { "epoch": 3.29, "learning_rate": 5e-05, "loss": 0.7337, "step": 1464 }, { "epoch": 3.31, "learning_rate": 5e-05, "loss": 0.7338, "step": 1472 }, { "epoch": 3.32, "learning_rate": 5e-05, "loss": 0.7632, "step": 1480 }, { "epoch": 3.34, "learning_rate": 5e-05, "loss": 0.7643, "step": 1488 }, { "epoch": 3.36, "learning_rate": 5e-05, "loss": 
0.7458, "step": 1496 }, { "epoch": 3.38, "learning_rate": 5e-05, "loss": 0.7704, "step": 1504 }, { "epoch": 3.39, "learning_rate": 5e-05, "loss": 0.755, "step": 1512 }, { "epoch": 3.41, "learning_rate": 5e-05, "loss": 0.7182, "step": 1520 }, { "epoch": 3.43, "learning_rate": 5e-05, "loss": 0.7533, "step": 1528 }, { "epoch": 3.45, "learning_rate": 5e-05, "loss": 0.7475, "step": 1536 }, { "epoch": 3.47, "learning_rate": 5e-05, "loss": 0.7671, "step": 1544 }, { "epoch": 3.48, "learning_rate": 5e-05, "loss": 0.7541, "step": 1552 }, { "epoch": 3.5, "learning_rate": 5e-05, "loss": 0.7758, "step": 1560 }, { "epoch": 3.52, "learning_rate": 5e-05, "loss": 0.7387, "step": 1568 }, { "epoch": 3.54, "learning_rate": 5e-05, "loss": 0.7928, "step": 1576 }, { "epoch": 3.56, "learning_rate": 5e-05, "loss": 0.7494, "step": 1584 }, { "epoch": 3.57, "learning_rate": 5e-05, "loss": 0.7716, "step": 1592 }, { "epoch": 3.59, "learning_rate": 5e-05, "loss": 0.7735, "step": 1600 }, { "epoch": 3.61, "learning_rate": 5e-05, "loss": 0.7688, "step": 1608 }, { "epoch": 3.63, "learning_rate": 5e-05, "loss": 0.7852, "step": 1616 }, { "epoch": 3.65, "learning_rate": 5e-05, "loss": 0.7501, "step": 1624 }, { "epoch": 3.66, "learning_rate": 5e-05, "loss": 0.7754, "step": 1632 }, { "epoch": 3.68, "learning_rate": 5e-05, "loss": 0.756, "step": 1640 }, { "epoch": 3.7, "learning_rate": 5e-05, "loss": 0.7778, "step": 1648 }, { "epoch": 3.72, "learning_rate": 5e-05, "loss": 0.7835, "step": 1656 }, { "epoch": 3.74, "learning_rate": 5e-05, "loss": 0.7714, "step": 1664 }, { "epoch": 3.75, "learning_rate": 5e-05, "loss": 0.7417, "step": 1672 }, { "epoch": 3.77, "learning_rate": 5e-05, "loss": 0.7803, "step": 1680 }, { "epoch": 3.79, "learning_rate": 5e-05, "loss": 0.7616, "step": 1688 }, { "epoch": 3.81, "learning_rate": 5e-05, "loss": 0.7574, "step": 1696 }, { "epoch": 3.83, "learning_rate": 5e-05, "loss": 0.7895, "step": 1704 }, { "epoch": 3.84, "learning_rate": 5e-05, "loss": 0.7773, "step": 1712 }, { "epoch": 3.86, "learning_rate": 5e-05, "loss": 0.7839, "step": 1720 }, { "epoch": 3.88, "learning_rate": 5e-05, "loss": 0.7675, "step": 1728 }, { "epoch": 3.9, "learning_rate": 5e-05, "loss": 0.7649, "step": 1736 }, { "epoch": 3.92, "learning_rate": 5e-05, "loss": 0.7656, "step": 1744 }, { "epoch": 3.93, "learning_rate": 5e-05, "loss": 0.7725, "step": 1752 }, { "epoch": 3.95, "learning_rate": 5e-05, "loss": 0.7609, "step": 1760 }, { "epoch": 3.97, "learning_rate": 5e-05, "loss": 0.7664, "step": 1768 }, { "epoch": 3.99, "learning_rate": 5e-05, "loss": 0.7945, "step": 1776 }, { "epoch": 4.01, "learning_rate": 5e-05, "loss": 0.7065, "step": 1784 }, { "epoch": 4.02, "learning_rate": 5e-05, "loss": 0.5463, "step": 1792 }, { "epoch": 4.04, "learning_rate": 5e-05, "loss": 0.549, "step": 1800 }, { "epoch": 4.06, "learning_rate": 5e-05, "loss": 0.571, "step": 1808 }, { "epoch": 4.08, "learning_rate": 5e-05, "loss": 0.5538, "step": 1816 }, { "epoch": 4.1, "learning_rate": 5e-05, "loss": 0.5649, "step": 1824 }, { "epoch": 4.11, "learning_rate": 5e-05, "loss": 0.5719, "step": 1832 }, { "epoch": 4.13, "learning_rate": 5e-05, "loss": 0.5273, "step": 1840 }, { "epoch": 4.15, "learning_rate": 5e-05, "loss": 0.5482, "step": 1848 }, { "epoch": 4.17, "learning_rate": 5e-05, "loss": 0.5708, "step": 1856 }, { "epoch": 4.19, "learning_rate": 5e-05, "loss": 0.5643, "step": 1864 }, { "epoch": 4.2, "learning_rate": 5e-05, "loss": 0.5372, "step": 1872 }, { "epoch": 4.22, "learning_rate": 5e-05, "loss": 0.5623, "step": 1880 }, { "epoch": 4.24, "learning_rate": 
5e-05, "loss": 0.5869, "step": 1888 }, { "epoch": 4.26, "learning_rate": 5e-05, "loss": 0.5929, "step": 1896 }, { "epoch": 4.28, "learning_rate": 5e-05, "loss": 0.5645, "step": 1904 }, { "epoch": 4.29, "learning_rate": 5e-05, "loss": 0.5901, "step": 1912 }, { "epoch": 4.31, "learning_rate": 5e-05, "loss": 0.5585, "step": 1920 }, { "epoch": 4.33, "learning_rate": 5e-05, "loss": 0.571, "step": 1928 }, { "epoch": 4.35, "learning_rate": 5e-05, "loss": 0.5824, "step": 1936 }, { "epoch": 4.36, "learning_rate": 5e-05, "loss": 0.5801, "step": 1944 }, { "epoch": 4.38, "learning_rate": 5e-05, "loss": 0.5736, "step": 1952 }, { "epoch": 4.4, "learning_rate": 5e-05, "loss": 0.5874, "step": 1960 }, { "epoch": 4.42, "learning_rate": 5e-05, "loss": 0.5795, "step": 1968 }, { "epoch": 4.44, "learning_rate": 5e-05, "loss": 0.5868, "step": 1976 }, { "epoch": 4.45, "learning_rate": 5e-05, "loss": 0.5649, "step": 1984 }, { "epoch": 4.47, "learning_rate": 5e-05, "loss": 0.5769, "step": 1992 }, { "epoch": 4.49, "learning_rate": 5e-05, "loss": 0.5944, "step": 2000 }, { "epoch": 4.51, "learning_rate": 5e-05, "loss": 0.5876, "step": 2008 }, { "epoch": 4.53, "learning_rate": 5e-05, "loss": 0.6052, "step": 2016 }, { "epoch": 4.54, "learning_rate": 5e-05, "loss": 0.5994, "step": 2024 }, { "epoch": 4.56, "learning_rate": 5e-05, "loss": 0.5684, "step": 2032 }, { "epoch": 4.58, "learning_rate": 5e-05, "loss": 0.5927, "step": 2040 }, { "epoch": 4.6, "learning_rate": 5e-05, "loss": 0.5857, "step": 2048 }, { "epoch": 4.62, "learning_rate": 5e-05, "loss": 0.566, "step": 2056 }, { "epoch": 4.63, "learning_rate": 5e-05, "loss": 0.5763, "step": 2064 }, { "epoch": 4.65, "learning_rate": 5e-05, "loss": 0.5997, "step": 2072 }, { "epoch": 4.67, "learning_rate": 5e-05, "loss": 0.5655, "step": 2080 }, { "epoch": 4.69, "learning_rate": 5e-05, "loss": 0.5859, "step": 2088 }, { "epoch": 4.71, "learning_rate": 5e-05, "loss": 0.5636, "step": 2096 }, { "epoch": 4.72, "learning_rate": 5e-05, "loss": 0.5964, "step": 2104 }, { "epoch": 4.74, "learning_rate": 5e-05, "loss": 0.5534, "step": 2112 }, { "epoch": 4.76, "learning_rate": 5e-05, "loss": 0.5858, "step": 2120 }, { "epoch": 4.78, "learning_rate": 5e-05, "loss": 0.5916, "step": 2128 }, { "epoch": 4.8, "learning_rate": 5e-05, "loss": 0.5862, "step": 2136 }, { "epoch": 4.81, "learning_rate": 5e-05, "loss": 0.6118, "step": 2144 }, { "epoch": 4.83, "learning_rate": 5e-05, "loss": 0.6023, "step": 2152 }, { "epoch": 4.85, "learning_rate": 5e-05, "loss": 0.5927, "step": 2160 }, { "epoch": 4.87, "learning_rate": 5e-05, "loss": 0.5911, "step": 2168 }, { "epoch": 4.89, "learning_rate": 5e-05, "loss": 0.6043, "step": 2176 }, { "epoch": 4.9, "learning_rate": 5e-05, "loss": 0.5779, "step": 2184 }, { "epoch": 4.92, "learning_rate": 5e-05, "loss": 0.6112, "step": 2192 }, { "epoch": 4.94, "learning_rate": 5e-05, "loss": 0.5893, "step": 2200 }, { "epoch": 4.96, "learning_rate": 5e-05, "loss": 0.597, "step": 2208 }, { "epoch": 4.98, "learning_rate": 5e-05, "loss": 0.5992, "step": 2216 }, { "epoch": 4.99, "learning_rate": 5e-05, "loss": 0.5933, "step": 2224 } ], "logging_steps": 8, "max_steps": 2225, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.5950431556342907e+18, "trial_name": null, "trial_params": null }