{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.414368184733804, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 4.959910198845414e-05, "loss": 13.3635, "step": 500 }, { "epoch": 0.16, "learning_rate": 4.919820397690827e-05, "loss": 3.169, "step": 1000 }, { "epoch": 0.16, "eval_loss": 0.662490725517273, "eval_runtime": 80.6872, "eval_samples_per_second": 62.922, "eval_steps_per_second": 31.467, "step": 1000 }, { "epoch": 0.24, "learning_rate": 4.8797305965362415e-05, "loss": 0.578, "step": 1500 }, { "epoch": 0.32, "learning_rate": 4.839640795381655e-05, "loss": 0.2929, "step": 2000 }, { "epoch": 0.32, "eval_loss": 0.18057043850421906, "eval_runtime": 80.3642, "eval_samples_per_second": 63.175, "eval_steps_per_second": 31.594, "step": 2000 }, { "epoch": 0.4, "learning_rate": 4.799550994227069e-05, "loss": 0.2481, "step": 2500 }, { "epoch": 0.48, "learning_rate": 4.759461193072483e-05, "loss": 0.2159, "step": 3000 }, { "epoch": 0.48, "eval_loss": 0.1447182148694992, "eval_runtime": 80.0634, "eval_samples_per_second": 63.412, "eval_steps_per_second": 31.712, "step": 3000 }, { "epoch": 0.56, "learning_rate": 4.719371391917896e-05, "loss": 0.2131, "step": 3500 }, { "epoch": 0.64, "learning_rate": 4.6792815907633104e-05, "loss": 0.1955, "step": 4000 }, { "epoch": 0.64, "eval_loss": 0.13617677986621857, "eval_runtime": 80.8812, "eval_samples_per_second": 62.771, "eval_steps_per_second": 31.392, "step": 4000 }, { "epoch": 0.72, "learning_rate": 4.639191789608724e-05, "loss": 0.1799, "step": 4500 }, { "epoch": 0.8, "learning_rate": 4.5991019884541375e-05, "loss": 0.1785, "step": 5000 }, { "epoch": 0.8, "eval_loss": 0.12113407254219055, "eval_runtime": 81.0258, "eval_samples_per_second": 62.659, "eval_steps_per_second": 31.336, "step": 5000 }, { "epoch": 0.88, "learning_rate": 4.559012187299551e-05, "loss": 0.1664, "step": 5500 }, { "epoch": 0.96, "learning_rate": 4.5189223861449645e-05, "loss": 0.1661, "step": 6000 }, { "epoch": 0.96, "eval_loss": 0.1217775046825409, "eval_runtime": 80.4348, "eval_samples_per_second": 63.119, "eval_steps_per_second": 31.566, "step": 6000 }, { "epoch": 1.04, "learning_rate": 4.478832584990379e-05, "loss": 0.152, "step": 6500 }, { "epoch": 1.12, "learning_rate": 4.438742783835793e-05, "loss": 0.1429, "step": 7000 }, { "epoch": 1.12, "eval_loss": 0.11830633133649826, "eval_runtime": 80.5584, "eval_samples_per_second": 63.023, "eval_steps_per_second": 31.518, "step": 7000 }, { "epoch": 1.2, "learning_rate": 4.3986529826812064e-05, "loss": 0.1436, "step": 7500 }, { "epoch": 1.28, "learning_rate": 4.35856318152662e-05, "loss": 0.138, "step": 8000 }, { "epoch": 1.28, "eval_loss": 0.11041316390037537, "eval_runtime": 80.3972, "eval_samples_per_second": 63.149, "eval_steps_per_second": 31.581, "step": 8000 }, { "epoch": 1.36, "learning_rate": 4.3184733803720335e-05, "loss": 0.139, "step": 8500 }, { "epoch": 1.44, "learning_rate": 4.278383579217447e-05, "loss": 0.1328, "step": 9000 }, { "epoch": 1.44, "eval_loss": 0.11453410983085632, "eval_runtime": 80.9914, "eval_samples_per_second": 62.686, "eval_steps_per_second": 31.349, "step": 9000 }, { "epoch": 1.52, "learning_rate": 4.238293778062861e-05, "loss": 0.1336, "step": 9500 }, { "epoch": 1.6, "learning_rate": 4.198203976908275e-05, "loss": 0.1257, "step": 10000 }, { "epoch": 1.6, "eval_loss": 0.11141891777515411, "eval_runtime": 80.764, "eval_samples_per_second": 62.862, "eval_steps_per_second": 31.437, "step": 10000 }, { "epoch": 1.68, "learning_rate": 4.158114175753688e-05, "loss": 0.129, "step": 10500 }, { "epoch": 1.76, "learning_rate": 4.118024374599102e-05, "loss": 0.1283, "step": 11000 }, { "epoch": 1.76, "eval_loss": 0.10540181398391724, "eval_runtime": 80.8099, "eval_samples_per_second": 62.826, "eval_steps_per_second": 31.419, "step": 11000 }, { "epoch": 1.84, "learning_rate": 4.077934573444516e-05, "loss": 0.1329, "step": 11500 }, { "epoch": 1.92, "learning_rate": 4.03784477228993e-05, "loss": 0.1233, "step": 12000 }, { "epoch": 1.92, "eval_loss": 0.10773167759180069, "eval_runtime": 80.5271, "eval_samples_per_second": 63.047, "eval_steps_per_second": 31.53, "step": 12000 }, { "epoch": 2.0, "learning_rate": 3.9977549711353436e-05, "loss": 0.121, "step": 12500 }, { "epoch": 2.08, "learning_rate": 3.957665169980757e-05, "loss": 0.1022, "step": 13000 }, { "epoch": 2.08, "eval_loss": 0.1054486483335495, "eval_runtime": 80.727, "eval_samples_per_second": 62.891, "eval_steps_per_second": 31.452, "step": 13000 }, { "epoch": 2.16, "learning_rate": 3.917575368826171e-05, "loss": 0.113, "step": 13500 }, { "epoch": 2.25, "learning_rate": 3.877485567671584e-05, "loss": 0.1015, "step": 14000 }, { "epoch": 2.25, "eval_loss": 0.10687392950057983, "eval_runtime": 80.3483, "eval_samples_per_second": 63.187, "eval_steps_per_second": 31.6, "step": 14000 }, { "epoch": 2.33, "learning_rate": 3.8373957665169984e-05, "loss": 0.1052, "step": 14500 }, { "epoch": 2.41, "learning_rate": 3.797305965362412e-05, "loss": 0.1013, "step": 15000 }, { "epoch": 2.41, "eval_loss": 0.0990850031375885, "eval_runtime": 80.592, "eval_samples_per_second": 62.996, "eval_steps_per_second": 31.504, "step": 15000 }, { "epoch": 2.49, "learning_rate": 3.7572161642078254e-05, "loss": 0.1107, "step": 15500 }, { "epoch": 2.57, "learning_rate": 3.7171263630532396e-05, "loss": 0.1024, "step": 16000 }, { "epoch": 2.57, "eval_loss": 0.10092990845441818, "eval_runtime": 81.1078, "eval_samples_per_second": 62.596, "eval_steps_per_second": 31.304, "step": 16000 }, { "epoch": 2.65, "learning_rate": 3.677036561898653e-05, "loss": 0.0986, "step": 16500 }, { "epoch": 2.73, "learning_rate": 3.6369467607440674e-05, "loss": 0.1011, "step": 17000 }, { "epoch": 2.73, "eval_loss": 0.09937073290348053, "eval_runtime": 79.8753, "eval_samples_per_second": 63.562, "eval_steps_per_second": 31.787, "step": 17000 }, { "epoch": 2.81, "learning_rate": 3.596856959589481e-05, "loss": 0.1009, "step": 17500 }, { "epoch": 2.89, "learning_rate": 3.5567671584348944e-05, "loss": 0.1011, "step": 18000 }, { "epoch": 2.89, "eval_loss": 0.09842894226312637, "eval_runtime": 80.4877, "eval_samples_per_second": 63.078, "eval_steps_per_second": 31.545, "step": 18000 }, { "epoch": 2.97, "learning_rate": 3.516677357280308e-05, "loss": 0.1008, "step": 18500 }, { "epoch": 3.05, "learning_rate": 3.4765875561257214e-05, "loss": 0.0935, "step": 19000 }, { "epoch": 3.05, "eval_loss": 0.1000189483165741, "eval_runtime": 80.5915, "eval_samples_per_second": 62.997, "eval_steps_per_second": 31.505, "step": 19000 }, { "epoch": 3.13, "learning_rate": 3.436497754971135e-05, "loss": 0.0847, "step": 19500 }, { "epoch": 3.21, "learning_rate": 3.396407953816549e-05, "loss": 0.086, "step": 20000 }, { "epoch": 3.21, "eval_loss": 0.10353543609380722, "eval_runtime": 80.3489, "eval_samples_per_second": 63.187, "eval_steps_per_second": 31.6, "step": 20000 }, { "epoch": 3.29, "learning_rate": 3.3563181526619633e-05, "loss": 0.0858, "step": 20500 }, { "epoch": 3.37, "learning_rate": 3.316228351507377e-05, "loss": 0.0868, "step": 21000 }, { "epoch": 3.37, "eval_loss": 0.10058429092168808, "eval_runtime": 80.7731, "eval_samples_per_second": 62.855, "eval_steps_per_second": 31.434, "step": 21000 }, { "epoch": 3.45, "learning_rate": 3.2761385503527904e-05, "loss": 0.0822, "step": 21500 }, { "epoch": 3.53, "learning_rate": 3.236048749198204e-05, "loss": 0.0854, "step": 22000 }, { "epoch": 3.53, "eval_loss": 0.09908822923898697, "eval_runtime": 80.4062, "eval_samples_per_second": 63.142, "eval_steps_per_second": 31.577, "step": 22000 }, { "epoch": 3.61, "learning_rate": 3.195958948043618e-05, "loss": 0.0835, "step": 22500 }, { "epoch": 3.69, "learning_rate": 3.1558691468890316e-05, "loss": 0.0898, "step": 23000 }, { "epoch": 3.69, "eval_loss": 0.09145330637693405, "eval_runtime": 81.1442, "eval_samples_per_second": 62.568, "eval_steps_per_second": 31.29, "step": 23000 }, { "epoch": 3.77, "learning_rate": 3.115779345734445e-05, "loss": 0.0876, "step": 23500 }, { "epoch": 3.85, "learning_rate": 3.0756895445798587e-05, "loss": 0.08, "step": 24000 }, { "epoch": 3.85, "eval_loss": 0.09377244859933853, "eval_runtime": 80.6911, "eval_samples_per_second": 62.919, "eval_steps_per_second": 31.466, "step": 24000 }, { "epoch": 3.93, "learning_rate": 3.0355997434252725e-05, "loss": 0.091, "step": 24500 }, { "epoch": 4.01, "learning_rate": 2.9955099422706867e-05, "loss": 0.0778, "step": 25000 }, { "epoch": 4.01, "eval_loss": 0.09943201392889023, "eval_runtime": 80.4527, "eval_samples_per_second": 63.105, "eval_steps_per_second": 31.559, "step": 25000 }, { "epoch": 4.09, "learning_rate": 2.9554201411161002e-05, "loss": 0.0704, "step": 25500 }, { "epoch": 4.17, "learning_rate": 2.915330339961514e-05, "loss": 0.0699, "step": 26000 }, { "epoch": 4.17, "eval_loss": 0.09294962137937546, "eval_runtime": 80.2201, "eval_samples_per_second": 63.288, "eval_steps_per_second": 31.65, "step": 26000 }, { "epoch": 4.25, "learning_rate": 2.8752405388069276e-05, "loss": 0.079, "step": 26500 }, { "epoch": 4.33, "learning_rate": 2.835150737652341e-05, "loss": 0.0754, "step": 27000 }, { "epoch": 4.33, "eval_loss": 0.09366082400083542, "eval_runtime": 80.2272, "eval_samples_per_second": 63.283, "eval_steps_per_second": 31.648, "step": 27000 }, { "epoch": 4.41, "learning_rate": 2.7950609364977553e-05, "loss": 0.0707, "step": 27500 }, { "epoch": 4.49, "learning_rate": 2.754971135343169e-05, "loss": 0.0727, "step": 28000 }, { "epoch": 4.49, "eval_loss": 0.09205422550439835, "eval_runtime": 80.3418, "eval_samples_per_second": 63.193, "eval_steps_per_second": 31.602, "step": 28000 }, { "epoch": 4.57, "learning_rate": 2.7148813341885827e-05, "loss": 0.0723, "step": 28500 }, { "epoch": 4.65, "learning_rate": 2.6747915330339962e-05, "loss": 0.0718, "step": 29000 }, { "epoch": 4.65, "eval_loss": 0.09299702942371368, "eval_runtime": 80.7661, "eval_samples_per_second": 62.861, "eval_steps_per_second": 31.436, "step": 29000 }, { "epoch": 4.73, "learning_rate": 2.6347017318794097e-05, "loss": 0.0725, "step": 29500 }, { "epoch": 4.81, "learning_rate": 2.594611930724824e-05, "loss": 0.0741, "step": 30000 }, { "epoch": 4.81, "eval_loss": 0.09144437313079834, "eval_runtime": 80.7469, "eval_samples_per_second": 62.875, "eval_steps_per_second": 31.444, "step": 30000 }, { "epoch": 4.89, "learning_rate": 2.5545221295702375e-05, "loss": 0.0708, "step": 30500 }, { "epoch": 4.97, "learning_rate": 2.5144323284156513e-05, "loss": 0.0687, "step": 31000 }, { "epoch": 4.97, "eval_loss": 0.0920889675617218, "eval_runtime": 80.5972, "eval_samples_per_second": 62.992, "eval_steps_per_second": 31.502, "step": 31000 }, { "epoch": 5.05, "learning_rate": 2.474342527261065e-05, "loss": 0.0677, "step": 31500 }, { "epoch": 5.13, "learning_rate": 2.4342527261064787e-05, "loss": 0.0617, "step": 32000 }, { "epoch": 5.13, "eval_loss": 0.09358620643615723, "eval_runtime": 80.47, "eval_samples_per_second": 63.092, "eval_steps_per_second": 31.552, "step": 32000 }, { "epoch": 5.21, "learning_rate": 2.3941629249518922e-05, "loss": 0.0591, "step": 32500 }, { "epoch": 5.29, "learning_rate": 2.354073123797306e-05, "loss": 0.0609, "step": 33000 }, { "epoch": 5.29, "eval_loss": 0.09032619744539261, "eval_runtime": 80.985, "eval_samples_per_second": 62.691, "eval_steps_per_second": 31.351, "step": 33000 }, { "epoch": 5.37, "learning_rate": 2.31398332264272e-05, "loss": 0.0657, "step": 33500 }, { "epoch": 5.45, "learning_rate": 2.2738935214881335e-05, "loss": 0.0652, "step": 34000 }, { "epoch": 5.45, "eval_loss": 0.0909246951341629, "eval_runtime": 81.1024, "eval_samples_per_second": 62.6, "eval_steps_per_second": 31.306, "step": 34000 }, { "epoch": 5.53, "learning_rate": 2.233803720333547e-05, "loss": 0.0629, "step": 34500 }, { "epoch": 5.61, "learning_rate": 2.1937139191789612e-05, "loss": 0.0641, "step": 35000 }, { "epoch": 5.61, "eval_loss": 0.09035832434892654, "eval_runtime": 80.259, "eval_samples_per_second": 63.258, "eval_steps_per_second": 31.635, "step": 35000 }, { "epoch": 5.69, "learning_rate": 2.1536241180243747e-05, "loss": 0.0642, "step": 35500 }, { "epoch": 5.77, "learning_rate": 2.1135343168697885e-05, "loss": 0.06, "step": 36000 }, { "epoch": 5.77, "eval_loss": 0.09021405130624771, "eval_runtime": 80.1923, "eval_samples_per_second": 63.31, "eval_steps_per_second": 31.661, "step": 36000 }, { "epoch": 5.85, "learning_rate": 2.073444515715202e-05, "loss": 0.063, "step": 36500 }, { "epoch": 5.93, "learning_rate": 2.033354714560616e-05, "loss": 0.0643, "step": 37000 }, { "epoch": 5.93, "eval_loss": 0.0934990718960762, "eval_runtime": 79.9682, "eval_samples_per_second": 63.488, "eval_steps_per_second": 31.75, "step": 37000 }, { "epoch": 6.01, "learning_rate": 1.9932649134060298e-05, "loss": 0.0641, "step": 37500 }, { "epoch": 6.09, "learning_rate": 1.9531751122514433e-05, "loss": 0.0552, "step": 38000 }, { "epoch": 6.09, "eval_loss": 0.09127607196569443, "eval_runtime": 81.2206, "eval_samples_per_second": 62.509, "eval_steps_per_second": 31.261, "step": 38000 }, { "epoch": 6.17, "learning_rate": 1.913085311096857e-05, "loss": 0.0586, "step": 38500 }, { "epoch": 6.25, "learning_rate": 1.8729955099422707e-05, "loss": 0.0517, "step": 39000 }, { "epoch": 6.25, "eval_loss": 0.09465406835079193, "eval_runtime": 80.2376, "eval_samples_per_second": 63.275, "eval_steps_per_second": 31.644, "step": 39000 }, { "epoch": 6.33, "learning_rate": 1.8329057087876845e-05, "loss": 0.055, "step": 39500 }, { "epoch": 6.41, "learning_rate": 1.7928159076330984e-05, "loss": 0.0592, "step": 40000 }, { "epoch": 6.41, "eval_loss": 0.08911187946796417, "eval_runtime": 80.0512, "eval_samples_per_second": 63.422, "eval_steps_per_second": 31.717, "step": 40000 } ], "max_steps": 62360, "num_train_epochs": 10, "total_flos": 1.9182220110515405e+17, "trial_name": null, "trial_params": null }