{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9583172175775787, "global_step": 198864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.9950413059216727e-05, "loss": 14.0627, "step": 500 }, { "epoch": 0.01, "learning_rate": 1.9900826118433453e-05, "loss": 6.3799, "step": 1000 }, { "epoch": 0.02, "learning_rate": 1.9851239177650176e-05, "loss": 5.31, "step": 1500 }, { "epoch": 0.03, "learning_rate": 1.9801652236866898e-05, "loss": 4.9781, "step": 2000 }, { "epoch": 0.04, "learning_rate": 1.9752065296083624e-05, "loss": 4.7392, "step": 2500 }, { "epoch": 0.04, "learning_rate": 1.970247835530035e-05, "loss": 4.5779, "step": 3000 }, { "epoch": 0.05, "learning_rate": 1.9652891414517075e-05, "loss": 4.4691, "step": 3500 }, { "epoch": 0.06, "learning_rate": 1.96033044737338e-05, "loss": 4.3745, "step": 4000 }, { "epoch": 0.07, "learning_rate": 1.9553717532950524e-05, "loss": 4.2883, "step": 4500 }, { "epoch": 0.07, "learning_rate": 1.9504130592167246e-05, "loss": 4.2342, "step": 5000 }, { "epoch": 0.08, "learning_rate": 1.9454543651383972e-05, "loss": 4.1614, "step": 5500 }, { "epoch": 0.09, "learning_rate": 1.9404956710600698e-05, "loss": 4.1279, "step": 6000 }, { "epoch": 0.1, "learning_rate": 1.9355369769817423e-05, "loss": 4.0802, "step": 6500 }, { "epoch": 0.1, "learning_rate": 1.930578282903415e-05, "loss": 4.0298, "step": 7000 }, { "epoch": 0.11, "learning_rate": 1.925619588825087e-05, "loss": 3.9697, "step": 7500 }, { "epoch": 0.12, "learning_rate": 1.9206608947467594e-05, "loss": 3.9584, "step": 8000 }, { "epoch": 0.13, "learning_rate": 1.915702200668432e-05, "loss": 3.9196, "step": 8500 }, { "epoch": 0.13, "learning_rate": 1.9107435065901046e-05, "loss": 3.9081, "step": 9000 }, { "epoch": 0.14, "learning_rate": 1.905784812511777e-05, "loss": 3.8419, "step": 9500 }, { "epoch": 0.15, "learning_rate": 1.9008261184334497e-05, "loss": 3.8363, "step": 10000 }, { "epoch": 0.16, "learning_rate": 1.895867424355122e-05, "loss": 3.8047, "step": 10500 }, { "epoch": 0.16, "learning_rate": 1.8909087302767945e-05, "loss": 3.7728, "step": 11000 }, { "epoch": 0.17, "learning_rate": 1.8859500361984668e-05, "loss": 3.7731, "step": 11500 }, { "epoch": 0.18, "learning_rate": 1.8809913421201393e-05, "loss": 3.7408, "step": 12000 }, { "epoch": 0.19, "learning_rate": 1.876032648041812e-05, "loss": 3.7027, "step": 12500 }, { "epoch": 0.19, "learning_rate": 1.8710739539634845e-05, "loss": 3.6865, "step": 13000 }, { "epoch": 0.2, "learning_rate": 1.8661152598851567e-05, "loss": 3.6456, "step": 13500 }, { "epoch": 0.21, "learning_rate": 1.8611565658068293e-05, "loss": 3.6539, "step": 14000 }, { "epoch": 0.22, "learning_rate": 1.8561978717285016e-05, "loss": 3.6222, "step": 14500 }, { "epoch": 0.22, "learning_rate": 1.851239177650174e-05, "loss": 3.6127, "step": 15000 }, { "epoch": 0.23, "learning_rate": 1.8462804835718467e-05, "loss": 3.6133, "step": 15500 }, { "epoch": 0.24, "learning_rate": 1.8413217894935193e-05, "loss": 3.5863, "step": 16000 }, { "epoch": 0.25, "learning_rate": 1.8363630954151915e-05, "loss": 3.5669, "step": 16500 }, { "epoch": 0.25, "learning_rate": 1.831404401336864e-05, "loss": 3.5518, "step": 17000 }, { "epoch": 0.26, "learning_rate": 1.8264457072585367e-05, "loss": 3.5368, "step": 17500 }, { "epoch": 0.27, "learning_rate": 1.821487013180209e-05, "loss": 3.5294, "step": 18000 }, { "epoch": 0.28, "learning_rate": 1.8165283191018815e-05, "loss": 3.5097, "step": 18500 }, { "epoch": 0.28, "learning_rate": 1.811569625023554e-05, "loss": 3.5198, "step": 19000 }, { "epoch": 0.29, "learning_rate": 1.8066109309452263e-05, "loss": 3.4702, "step": 19500 }, { "epoch": 0.3, "learning_rate": 1.801652236866899e-05, "loss": 3.485, "step": 20000 }, { "epoch": 0.3, "learning_rate": 1.7966935427885715e-05, "loss": 3.4853, "step": 20500 }, { "epoch": 0.31, "learning_rate": 1.7917348487102437e-05, "loss": 3.4395, "step": 21000 }, { "epoch": 0.32, "learning_rate": 1.7867761546319163e-05, "loss": 3.4515, "step": 21500 }, { "epoch": 0.33, "learning_rate": 1.781817460553589e-05, "loss": 3.4307, "step": 22000 }, { "epoch": 0.33, "learning_rate": 1.776858766475261e-05, "loss": 3.4343, "step": 22500 }, { "epoch": 0.34, "learning_rate": 1.7719000723969337e-05, "loss": 3.4053, "step": 23000 }, { "epoch": 0.35, "learning_rate": 1.7669413783186063e-05, "loss": 3.4008, "step": 23500 }, { "epoch": 0.36, "learning_rate": 1.7619826842402785e-05, "loss": 3.3951, "step": 24000 }, { "epoch": 0.36, "learning_rate": 1.757023990161951e-05, "loss": 3.3871, "step": 24500 }, { "epoch": 0.37, "learning_rate": 1.7520652960836234e-05, "loss": 3.3822, "step": 25000 }, { "epoch": 0.38, "learning_rate": 1.747106602005296e-05, "loss": 3.3816, "step": 25500 }, { "epoch": 0.39, "learning_rate": 1.7421479079269685e-05, "loss": 3.3759, "step": 26000 }, { "epoch": 0.39, "learning_rate": 1.737189213848641e-05, "loss": 3.3624, "step": 26500 }, { "epoch": 0.4, "learning_rate": 1.7322305197703137e-05, "loss": 3.3535, "step": 27000 }, { "epoch": 0.41, "learning_rate": 1.727271825691986e-05, "loss": 3.3366, "step": 27500 }, { "epoch": 0.42, "learning_rate": 1.722313131613658e-05, "loss": 3.3245, "step": 28000 }, { "epoch": 0.42, "learning_rate": 1.7173544375353307e-05, "loss": 3.3575, "step": 28500 }, { "epoch": 0.43, "learning_rate": 1.7123957434570033e-05, "loss": 3.3133, "step": 29000 }, { "epoch": 0.44, "learning_rate": 1.707437049378676e-05, "loss": 3.3124, "step": 29500 }, { "epoch": 0.45, "learning_rate": 1.7024783553003485e-05, "loss": 3.3295, "step": 30000 }, { "epoch": 0.45, "learning_rate": 1.6975196612220207e-05, "loss": 3.3192, "step": 30500 }, { "epoch": 0.46, "learning_rate": 1.692560967143693e-05, "loss": 3.3241, "step": 31000 }, { "epoch": 0.47, "learning_rate": 1.6876022730653655e-05, "loss": 3.2989, "step": 31500 }, { "epoch": 0.48, "learning_rate": 1.682643578987038e-05, "loss": 3.2956, "step": 32000 }, { "epoch": 0.48, "learning_rate": 1.6776848849087107e-05, "loss": 3.2889, "step": 32500 }, { "epoch": 0.49, "learning_rate": 1.6727261908303833e-05, "loss": 3.2934, "step": 33000 }, { "epoch": 0.5, "learning_rate": 1.6677674967520555e-05, "loss": 3.2642, "step": 33500 }, { "epoch": 0.51, "learning_rate": 1.6628088026737277e-05, "loss": 3.2513, "step": 34000 }, { "epoch": 0.51, "learning_rate": 1.6578501085954003e-05, "loss": 3.2584, "step": 34500 }, { "epoch": 0.52, "learning_rate": 1.652891414517073e-05, "loss": 3.2576, "step": 35000 }, { "epoch": 0.53, "learning_rate": 1.6479327204387455e-05, "loss": 3.2532, "step": 35500 }, { "epoch": 0.54, "learning_rate": 1.642974026360418e-05, "loss": 3.2349, "step": 36000 }, { "epoch": 0.54, "learning_rate": 1.6380153322820903e-05, "loss": 3.2349, "step": 36500 }, { "epoch": 0.55, "learning_rate": 1.6330566382037625e-05, "loss": 3.2158, "step": 37000 }, { "epoch": 0.56, "learning_rate": 1.628097944125435e-05, "loss": 3.2309, "step": 37500 }, { "epoch": 0.57, "learning_rate": 1.6231392500471077e-05, "loss": 3.2227, "step": 38000 }, { "epoch": 0.57, "learning_rate": 1.6181805559687803e-05, "loss": 3.2134, "step": 38500 }, { "epoch": 0.58, "learning_rate": 1.613221861890453e-05, "loss": 3.2206, "step": 39000 }, { "epoch": 0.59, "learning_rate": 1.608263167812125e-05, "loss": 3.2002, "step": 39500 }, { "epoch": 0.6, "learning_rate": 1.6033044737337973e-05, "loss": 3.1988, "step": 40000 }, { "epoch": 0.6, "learning_rate": 1.59834577965547e-05, "loss": 3.2081, "step": 40500 }, { "epoch": 0.61, "learning_rate": 1.5933870855771425e-05, "loss": 3.1891, "step": 41000 }, { "epoch": 0.62, "learning_rate": 1.588428391498815e-05, "loss": 3.2007, "step": 41500 }, { "epoch": 0.62, "learning_rate": 1.5834696974204877e-05, "loss": 3.1948, "step": 42000 }, { "epoch": 0.63, "learning_rate": 1.57851100334216e-05, "loss": 3.1673, "step": 42500 }, { "epoch": 0.64, "learning_rate": 1.5735523092638325e-05, "loss": 3.158, "step": 43000 }, { "epoch": 0.65, "learning_rate": 1.5685936151855047e-05, "loss": 3.1561, "step": 43500 }, { "epoch": 0.65, "learning_rate": 1.5636349211071773e-05, "loss": 3.1734, "step": 44000 }, { "epoch": 0.66, "learning_rate": 1.55867622702885e-05, "loss": 3.1401, "step": 44500 }, { "epoch": 0.67, "learning_rate": 1.5537175329505225e-05, "loss": 3.1463, "step": 45000 }, { "epoch": 0.68, "learning_rate": 1.5487588388721947e-05, "loss": 3.1431, "step": 45500 }, { "epoch": 0.68, "learning_rate": 1.5438001447938673e-05, "loss": 3.1316, "step": 46000 }, { "epoch": 0.69, "learning_rate": 1.5388414507155395e-05, "loss": 3.1606, "step": 46500 }, { "epoch": 0.7, "learning_rate": 1.533882756637212e-05, "loss": 3.1362, "step": 47000 }, { "epoch": 0.71, "learning_rate": 1.5289240625588847e-05, "loss": 3.1335, "step": 47500 }, { "epoch": 0.71, "learning_rate": 1.523965368480557e-05, "loss": 3.149, "step": 48000 }, { "epoch": 0.72, "learning_rate": 1.5190066744022297e-05, "loss": 3.1293, "step": 48500 }, { "epoch": 0.73, "learning_rate": 1.514047980323902e-05, "loss": 3.1286, "step": 49000 }, { "epoch": 0.74, "learning_rate": 1.5090892862455743e-05, "loss": 3.1196, "step": 49500 }, { "epoch": 0.74, "learning_rate": 1.5041305921672469e-05, "loss": 3.1238, "step": 50000 }, { "epoch": 0.75, "learning_rate": 1.4991718980889195e-05, "loss": 3.1033, "step": 50500 }, { "epoch": 0.76, "learning_rate": 1.4942132040105919e-05, "loss": 3.1112, "step": 51000 }, { "epoch": 0.77, "learning_rate": 1.4892545099322645e-05, "loss": 3.0936, "step": 51500 }, { "epoch": 0.77, "learning_rate": 1.4842958158539369e-05, "loss": 3.107, "step": 52000 }, { "epoch": 0.78, "learning_rate": 1.4793371217756094e-05, "loss": 3.1063, "step": 52500 }, { "epoch": 0.79, "learning_rate": 1.4743784276972817e-05, "loss": 3.0639, "step": 53000 }, { "epoch": 0.8, "learning_rate": 1.4694197336189543e-05, "loss": 3.1028, "step": 53500 }, { "epoch": 0.8, "learning_rate": 1.4644610395406267e-05, "loss": 3.0821, "step": 54000 }, { "epoch": 0.81, "learning_rate": 1.4595023454622992e-05, "loss": 3.0596, "step": 54500 }, { "epoch": 0.82, "learning_rate": 1.4545436513839717e-05, "loss": 3.0787, "step": 55000 }, { "epoch": 0.83, "learning_rate": 1.4495849573056442e-05, "loss": 3.0755, "step": 55500 }, { "epoch": 0.83, "learning_rate": 1.4446262632273165e-05, "loss": 3.066, "step": 56000 }, { "epoch": 0.84, "learning_rate": 1.439667569148989e-05, "loss": 3.0695, "step": 56500 }, { "epoch": 0.85, "learning_rate": 1.4347088750706615e-05, "loss": 3.059, "step": 57000 }, { "epoch": 0.86, "learning_rate": 1.429750180992334e-05, "loss": 3.0628, "step": 57500 }, { "epoch": 0.86, "learning_rate": 1.4247914869140065e-05, "loss": 3.0733, "step": 58000 }, { "epoch": 0.87, "learning_rate": 1.419832792835679e-05, "loss": 3.0591, "step": 58500 }, { "epoch": 0.88, "learning_rate": 1.4148740987573514e-05, "loss": 3.0468, "step": 59000 }, { "epoch": 0.89, "learning_rate": 1.4099154046790237e-05, "loss": 3.0265, "step": 59500 }, { "epoch": 0.89, "learning_rate": 1.4049567106006963e-05, "loss": 3.0282, "step": 60000 }, { "epoch": 0.9, "learning_rate": 1.3999980165223688e-05, "loss": 3.0222, "step": 60500 }, { "epoch": 0.91, "learning_rate": 1.3950393224440413e-05, "loss": 3.0275, "step": 61000 }, { "epoch": 0.91, "learning_rate": 1.3900806283657138e-05, "loss": 3.0277, "step": 61500 }, { "epoch": 0.92, "learning_rate": 1.3851219342873862e-05, "loss": 3.0551, "step": 62000 }, { "epoch": 0.93, "learning_rate": 1.3801632402090585e-05, "loss": 3.0205, "step": 62500 }, { "epoch": 0.94, "learning_rate": 1.375204546130731e-05, "loss": 3.023, "step": 63000 }, { "epoch": 0.94, "learning_rate": 1.3702458520524036e-05, "loss": 3.0244, "step": 63500 }, { "epoch": 0.95, "learning_rate": 1.365287157974076e-05, "loss": 3.0116, "step": 64000 }, { "epoch": 0.96, "learning_rate": 1.3603284638957486e-05, "loss": 3.0141, "step": 64500 }, { "epoch": 0.97, "learning_rate": 1.355369769817421e-05, "loss": 3.0284, "step": 65000 }, { "epoch": 0.97, "learning_rate": 1.3504110757390933e-05, "loss": 3.0236, "step": 65500 }, { "epoch": 0.98, "learning_rate": 1.3454523816607659e-05, "loss": 3.013, "step": 66000 }, { "epoch": 0.99, "learning_rate": 1.3404936875824384e-05, "loss": 3.0027, "step": 66500 }, { "epoch": 1.0, "learning_rate": 1.3355349935041108e-05, "loss": 3.0155, "step": 67000 }, { "epoch": 1.0, "eval_bleu": 11.298551127218651, "eval_loss": 2.3749005794525146, "eval_runtime": 4929.9601, "eval_samples_per_second": 8.201, "eval_steps_per_second": 0.513, "step": 67222 }, { "epoch": 1.0, "learning_rate": 1.3305762994257834e-05, "loss": 3.0195, "step": 67500 }, { "epoch": 1.01, "learning_rate": 1.3256176053474558e-05, "loss": 2.9924, "step": 68000 }, { "epoch": 1.02, "learning_rate": 1.3206589112691284e-05, "loss": 2.997, "step": 68500 }, { "epoch": 1.03, "learning_rate": 1.3157002171908007e-05, "loss": 2.9694, "step": 69000 }, { "epoch": 1.03, "learning_rate": 1.3107415231124732e-05, "loss": 2.9804, "step": 69500 }, { "epoch": 1.04, "learning_rate": 1.3057828290341456e-05, "loss": 2.9879, "step": 70000 }, { "epoch": 1.05, "learning_rate": 1.3008241349558182e-05, "loss": 2.9919, "step": 70500 }, { "epoch": 1.06, "learning_rate": 1.2958654408774906e-05, "loss": 2.9875, "step": 71000 }, { "epoch": 1.06, "learning_rate": 1.2909067467991632e-05, "loss": 2.9912, "step": 71500 }, { "epoch": 1.07, "learning_rate": 1.2859480527208354e-05, "loss": 2.974, "step": 72000 }, { "epoch": 1.08, "learning_rate": 1.280989358642508e-05, "loss": 2.9581, "step": 72500 }, { "epoch": 1.09, "learning_rate": 1.2760306645641804e-05, "loss": 2.975, "step": 73000 }, { "epoch": 1.09, "learning_rate": 1.271071970485853e-05, "loss": 2.9737, "step": 73500 }, { "epoch": 1.1, "learning_rate": 1.2661132764075254e-05, "loss": 2.9722, "step": 74000 }, { "epoch": 1.11, "learning_rate": 1.261154582329198e-05, "loss": 2.9727, "step": 74500 }, { "epoch": 1.12, "learning_rate": 1.2561958882508702e-05, "loss": 2.9618, "step": 75000 }, { "epoch": 1.12, "learning_rate": 1.2512371941725428e-05, "loss": 2.9554, "step": 75500 }, { "epoch": 1.13, "learning_rate": 1.2462785000942152e-05, "loss": 2.961, "step": 76000 }, { "epoch": 1.14, "learning_rate": 1.2413198060158878e-05, "loss": 2.9627, "step": 76500 }, { "epoch": 1.15, "learning_rate": 1.2363611119375602e-05, "loss": 2.9896, "step": 77000 }, { "epoch": 1.15, "learning_rate": 1.2314024178592328e-05, "loss": 2.9433, "step": 77500 }, { "epoch": 1.16, "learning_rate": 1.2264437237809052e-05, "loss": 2.9329, "step": 78000 }, { "epoch": 1.17, "learning_rate": 1.2214850297025776e-05, "loss": 2.9552, "step": 78500 }, { "epoch": 1.18, "learning_rate": 1.21652633562425e-05, "loss": 2.9382, "step": 79000 }, { "epoch": 1.18, "learning_rate": 1.2115676415459226e-05, "loss": 2.9629, "step": 79500 }, { "epoch": 1.19, "learning_rate": 1.206608947467595e-05, "loss": 2.9555, "step": 80000 }, { "epoch": 1.2, "learning_rate": 1.2016502533892676e-05, "loss": 2.9364, "step": 80500 }, { "epoch": 1.2, "learning_rate": 1.19669155931094e-05, "loss": 2.9296, "step": 81000 }, { "epoch": 1.21, "learning_rate": 1.1917328652326124e-05, "loss": 2.9483, "step": 81500 }, { "epoch": 1.22, "learning_rate": 1.1867741711542848e-05, "loss": 2.9605, "step": 82000 }, { "epoch": 1.23, "learning_rate": 1.1818154770759574e-05, "loss": 2.928, "step": 82500 }, { "epoch": 1.23, "learning_rate": 1.1768567829976298e-05, "loss": 2.9216, "step": 83000 }, { "epoch": 1.24, "learning_rate": 1.1718980889193024e-05, "loss": 2.9402, "step": 83500 }, { "epoch": 1.25, "learning_rate": 1.1669393948409748e-05, "loss": 2.9311, "step": 84000 }, { "epoch": 1.26, "learning_rate": 1.1619807007626474e-05, "loss": 2.9537, "step": 84500 }, { "epoch": 1.26, "learning_rate": 1.1570220066843196e-05, "loss": 2.919, "step": 85000 }, { "epoch": 1.27, "learning_rate": 1.1520633126059922e-05, "loss": 2.918, "step": 85500 }, { "epoch": 1.28, "learning_rate": 1.1471046185276646e-05, "loss": 2.9339, "step": 86000 }, { "epoch": 1.29, "learning_rate": 1.1421459244493372e-05, "loss": 2.9071, "step": 86500 }, { "epoch": 1.29, "learning_rate": 1.1371872303710096e-05, "loss": 2.9397, "step": 87000 }, { "epoch": 1.3, "learning_rate": 1.1322285362926822e-05, "loss": 2.9225, "step": 87500 }, { "epoch": 1.31, "learning_rate": 1.1272698422143544e-05, "loss": 2.9248, "step": 88000 }, { "epoch": 1.32, "learning_rate": 1.122311148136027e-05, "loss": 2.9132, "step": 88500 }, { "epoch": 1.32, "learning_rate": 1.1173524540576994e-05, "loss": 2.8945, "step": 89000 }, { "epoch": 1.33, "learning_rate": 1.112393759979372e-05, "loss": 2.905, "step": 89500 }, { "epoch": 1.34, "learning_rate": 1.1074350659010444e-05, "loss": 2.9256, "step": 90000 }, { "epoch": 1.35, "learning_rate": 1.102476371822717e-05, "loss": 2.9089, "step": 90500 }, { "epoch": 1.35, "learning_rate": 1.0975176777443892e-05, "loss": 2.9104, "step": 91000 }, { "epoch": 1.36, "learning_rate": 1.0925589836660618e-05, "loss": 2.9226, "step": 91500 }, { "epoch": 1.37, "learning_rate": 1.0876002895877342e-05, "loss": 2.902, "step": 92000 }, { "epoch": 1.38, "learning_rate": 1.0826415955094068e-05, "loss": 2.8831, "step": 92500 }, { "epoch": 1.38, "learning_rate": 1.0776829014310792e-05, "loss": 2.906, "step": 93000 }, { "epoch": 1.39, "learning_rate": 1.0727242073527518e-05, "loss": 2.906, "step": 93500 }, { "epoch": 1.4, "learning_rate": 1.0677655132744242e-05, "loss": 2.8901, "step": 94000 }, { "epoch": 1.41, "learning_rate": 1.0628068191960966e-05, "loss": 2.9063, "step": 94500 }, { "epoch": 1.41, "learning_rate": 1.057848125117769e-05, "loss": 2.8765, "step": 95000 }, { "epoch": 1.42, "learning_rate": 1.0528894310394416e-05, "loss": 2.9022, "step": 95500 }, { "epoch": 1.43, "learning_rate": 1.047930736961114e-05, "loss": 2.8906, "step": 96000 }, { "epoch": 1.44, "learning_rate": 1.0429720428827866e-05, "loss": 2.8627, "step": 96500 }, { "epoch": 1.44, "learning_rate": 1.038013348804459e-05, "loss": 2.8789, "step": 97000 }, { "epoch": 1.45, "learning_rate": 1.0330546547261314e-05, "loss": 2.8782, "step": 97500 }, { "epoch": 1.46, "learning_rate": 1.0280959606478038e-05, "loss": 2.8706, "step": 98000 }, { "epoch": 1.47, "learning_rate": 1.0231372665694764e-05, "loss": 2.8434, "step": 98500 }, { "epoch": 1.47, "learning_rate": 1.0181785724911488e-05, "loss": 2.8851, "step": 99000 }, { "epoch": 1.48, "learning_rate": 1.0132198784128214e-05, "loss": 2.8806, "step": 99500 }, { "epoch": 1.49, "learning_rate": 1.0082611843344938e-05, "loss": 2.8695, "step": 100000 }, { "epoch": 1.5, "learning_rate": 1.0033024902561664e-05, "loss": 2.8775, "step": 100500 }, { "epoch": 1.5, "learning_rate": 9.983437961778388e-06, "loss": 2.8717, "step": 101000 }, { "epoch": 1.51, "learning_rate": 9.933851020995112e-06, "loss": 2.8616, "step": 101500 }, { "epoch": 1.52, "learning_rate": 9.884264080211836e-06, "loss": 2.8656, "step": 102000 }, { "epoch": 1.52, "learning_rate": 9.834677139428562e-06, "loss": 2.8867, "step": 102500 }, { "epoch": 1.53, "learning_rate": 9.785090198645286e-06, "loss": 2.8491, "step": 103000 }, { "epoch": 1.54, "learning_rate": 9.73550325786201e-06, "loss": 2.8716, "step": 103500 }, { "epoch": 1.55, "learning_rate": 9.685916317078736e-06, "loss": 2.8743, "step": 104000 }, { "epoch": 1.55, "learning_rate": 9.63632937629546e-06, "loss": 2.8503, "step": 104500 }, { "epoch": 1.56, "learning_rate": 9.586742435512184e-06, "loss": 2.8625, "step": 105000 }, { "epoch": 1.57, "learning_rate": 9.53715549472891e-06, "loss": 2.8237, "step": 105500 }, { "epoch": 1.58, "learning_rate": 9.487568553945634e-06, "loss": 2.8619, "step": 106000 }, { "epoch": 1.58, "learning_rate": 9.437981613162358e-06, "loss": 2.8629, "step": 106500 }, { "epoch": 1.59, "learning_rate": 9.388394672379084e-06, "loss": 2.8441, "step": 107000 }, { "epoch": 1.6, "learning_rate": 9.338807731595808e-06, "loss": 2.8569, "step": 107500 }, { "epoch": 1.61, "learning_rate": 9.289220790812532e-06, "loss": 2.8511, "step": 108000 }, { "epoch": 1.61, "learning_rate": 9.239633850029258e-06, "loss": 2.8701, "step": 108500 }, { "epoch": 1.62, "learning_rate": 9.190046909245982e-06, "loss": 2.8572, "step": 109000 }, { "epoch": 1.63, "learning_rate": 9.140459968462706e-06, "loss": 2.8673, "step": 109500 }, { "epoch": 1.64, "learning_rate": 9.090873027679432e-06, "loss": 2.8621, "step": 110000 }, { "epoch": 1.64, "learning_rate": 9.041286086896156e-06, "loss": 2.8592, "step": 110500 }, { "epoch": 1.65, "learning_rate": 8.99169914611288e-06, "loss": 2.8582, "step": 111000 }, { "epoch": 1.66, "learning_rate": 8.942112205329606e-06, "loss": 2.8666, "step": 111500 }, { "epoch": 1.67, "learning_rate": 8.89252526454633e-06, "loss": 2.8588, "step": 112000 }, { "epoch": 1.67, "learning_rate": 8.842938323763054e-06, "loss": 2.8475, "step": 112500 }, { "epoch": 1.68, "learning_rate": 8.79335138297978e-06, "loss": 2.8357, "step": 113000 }, { "epoch": 1.69, "learning_rate": 8.743764442196504e-06, "loss": 2.8608, "step": 113500 }, { "epoch": 1.7, "learning_rate": 8.69417750141323e-06, "loss": 2.8532, "step": 114000 }, { "epoch": 1.7, "learning_rate": 8.644590560629953e-06, "loss": 2.8545, "step": 114500 }, { "epoch": 1.71, "learning_rate": 8.595003619846678e-06, "loss": 2.8277, "step": 115000 }, { "epoch": 1.72, "learning_rate": 8.545416679063403e-06, "loss": 2.8509, "step": 115500 }, { "epoch": 1.73, "learning_rate": 8.495829738280127e-06, "loss": 2.8413, "step": 116000 }, { "epoch": 1.73, "learning_rate": 8.446242797496852e-06, "loss": 2.838, "step": 116500 }, { "epoch": 1.74, "learning_rate": 8.396655856713577e-06, "loss": 2.8543, "step": 117000 }, { "epoch": 1.75, "learning_rate": 8.347068915930301e-06, "loss": 2.8347, "step": 117500 }, { "epoch": 1.76, "learning_rate": 8.297481975147026e-06, "loss": 2.8669, "step": 118000 }, { "epoch": 1.76, "learning_rate": 8.247895034363751e-06, "loss": 2.8228, "step": 118500 }, { "epoch": 1.77, "learning_rate": 8.198308093580475e-06, "loss": 2.8385, "step": 119000 }, { "epoch": 1.78, "learning_rate": 8.1487211527972e-06, "loss": 2.8257, "step": 119500 }, { "epoch": 1.79, "learning_rate": 8.099134212013925e-06, "loss": 2.8362, "step": 120000 }, { "epoch": 1.79, "learning_rate": 8.04954727123065e-06, "loss": 2.8319, "step": 120500 }, { "epoch": 1.8, "learning_rate": 7.999960330447374e-06, "loss": 2.8356, "step": 121000 }, { "epoch": 1.81, "learning_rate": 7.9503733896641e-06, "loss": 2.8199, "step": 121500 }, { "epoch": 1.81, "learning_rate": 7.900786448880823e-06, "loss": 2.8039, "step": 122000 }, { "epoch": 1.82, "learning_rate": 7.851199508097548e-06, "loss": 2.832, "step": 122500 }, { "epoch": 1.83, "learning_rate": 7.801612567314273e-06, "loss": 2.8125, "step": 123000 }, { "epoch": 1.84, "learning_rate": 7.752025626530997e-06, "loss": 2.8005, "step": 123500 }, { "epoch": 1.84, "learning_rate": 7.702438685747721e-06, "loss": 2.8402, "step": 124000 }, { "epoch": 1.85, "learning_rate": 7.652851744964447e-06, "loss": 2.8186, "step": 124500 }, { "epoch": 1.86, "learning_rate": 7.603264804181172e-06, "loss": 2.8296, "step": 125000 }, { "epoch": 1.87, "learning_rate": 7.5536778633978955e-06, "loss": 2.8193, "step": 125500 }, { "epoch": 1.87, "learning_rate": 7.50409092261462e-06, "loss": 2.8093, "step": 126000 }, { "epoch": 1.88, "learning_rate": 7.454503981831346e-06, "loss": 2.8383, "step": 126500 }, { "epoch": 1.89, "learning_rate": 7.4049170410480695e-06, "loss": 2.821, "step": 127000 }, { "epoch": 1.9, "learning_rate": 7.355330100264794e-06, "loss": 2.7976, "step": 127500 }, { "epoch": 1.9, "learning_rate": 7.30574315948152e-06, "loss": 2.8183, "step": 128000 }, { "epoch": 1.91, "learning_rate": 7.2561562186982434e-06, "loss": 2.8089, "step": 128500 }, { "epoch": 1.92, "learning_rate": 7.206569277914968e-06, "loss": 2.818, "step": 129000 }, { "epoch": 1.93, "learning_rate": 7.156982337131694e-06, "loss": 2.8052, "step": 129500 }, { "epoch": 1.93, "learning_rate": 7.107395396348419e-06, "loss": 2.8183, "step": 130000 }, { "epoch": 1.94, "learning_rate": 7.057808455565142e-06, "loss": 2.8098, "step": 130500 }, { "epoch": 1.95, "learning_rate": 7.008221514781868e-06, "loss": 2.8155, "step": 131000 }, { "epoch": 1.96, "learning_rate": 6.958634573998593e-06, "loss": 2.8074, "step": 131500 }, { "epoch": 1.96, "learning_rate": 6.909047633215316e-06, "loss": 2.7913, "step": 132000 }, { "epoch": 1.97, "learning_rate": 6.859460692432042e-06, "loss": 2.8122, "step": 132500 }, { "epoch": 1.98, "learning_rate": 6.809873751648767e-06, "loss": 2.8327, "step": 133000 }, { "epoch": 1.99, "learning_rate": 6.76028681086549e-06, "loss": 2.7897, "step": 133500 }, { "epoch": 1.99, "learning_rate": 6.710699870082215e-06, "loss": 2.7777, "step": 134000 }, { "epoch": 2.0, "eval_bleu": 13.585366050482984, "eval_loss": 2.2518081665039062, "eval_runtime": 4182.4693, "eval_samples_per_second": 9.667, "eval_steps_per_second": 0.604, "step": 134444 }, { "epoch": 2.0, "learning_rate": 6.661112929298941e-06, "loss": 2.7994, "step": 134500 }, { "epoch": 2.01, "learning_rate": 6.611525988515664e-06, "loss": 2.8167, "step": 135000 }, { "epoch": 2.02, "learning_rate": 6.561939047732389e-06, "loss": 2.8123, "step": 135500 }, { "epoch": 2.02, "learning_rate": 6.512352106949115e-06, "loss": 2.7844, "step": 136000 }, { "epoch": 2.03, "learning_rate": 6.462765166165838e-06, "loss": 2.7956, "step": 136500 }, { "epoch": 2.04, "learning_rate": 6.413178225382563e-06, "loss": 2.7968, "step": 137000 }, { "epoch": 2.05, "learning_rate": 6.363591284599289e-06, "loss": 2.7916, "step": 137500 }, { "epoch": 2.05, "learning_rate": 6.314004343816014e-06, "loss": 2.7958, "step": 138000 }, { "epoch": 2.06, "learning_rate": 6.264417403032737e-06, "loss": 2.7855, "step": 138500 }, { "epoch": 2.07, "learning_rate": 6.214830462249463e-06, "loss": 2.7876, "step": 139000 }, { "epoch": 2.08, "learning_rate": 6.165243521466188e-06, "loss": 2.7724, "step": 139500 }, { "epoch": 2.08, "learning_rate": 6.115656580682911e-06, "loss": 2.8021, "step": 140000 }, { "epoch": 2.09, "learning_rate": 6.066069639899637e-06, "loss": 2.8024, "step": 140500 }, { "epoch": 2.1, "learning_rate": 6.016482699116362e-06, "loss": 2.7891, "step": 141000 }, { "epoch": 2.1, "learning_rate": 5.966895758333085e-06, "loss": 2.7592, "step": 141500 }, { "epoch": 2.11, "learning_rate": 5.917308817549811e-06, "loss": 2.7895, "step": 142000 }, { "epoch": 2.12, "learning_rate": 5.867721876766536e-06, "loss": 2.8106, "step": 142500 }, { "epoch": 2.13, "learning_rate": 5.818134935983259e-06, "loss": 2.7985, "step": 143000 }, { "epoch": 2.13, "learning_rate": 5.768547995199985e-06, "loss": 2.8137, "step": 143500 }, { "epoch": 2.14, "learning_rate": 5.71896105441671e-06, "loss": 2.7824, "step": 144000 }, { "epoch": 2.15, "learning_rate": 5.669374113633433e-06, "loss": 2.7878, "step": 144500 }, { "epoch": 2.16, "learning_rate": 5.619787172850158e-06, "loss": 2.7861, "step": 145000 }, { "epoch": 2.16, "learning_rate": 5.570200232066884e-06, "loss": 2.7886, "step": 145500 }, { "epoch": 2.17, "learning_rate": 5.520613291283607e-06, "loss": 2.7798, "step": 146000 }, { "epoch": 2.18, "learning_rate": 5.471026350500332e-06, "loss": 2.8015, "step": 146500 }, { "epoch": 2.19, "learning_rate": 5.421439409717058e-06, "loss": 2.8013, "step": 147000 }, { "epoch": 2.19, "learning_rate": 5.371852468933783e-06, "loss": 2.7567, "step": 147500 }, { "epoch": 2.2, "learning_rate": 5.322265528150506e-06, "loss": 2.8004, "step": 148000 }, { "epoch": 2.21, "learning_rate": 5.272678587367232e-06, "loss": 2.7817, "step": 148500 }, { "epoch": 2.22, "learning_rate": 5.223091646583957e-06, "loss": 2.7674, "step": 149000 }, { "epoch": 2.22, "learning_rate": 5.17350470580068e-06, "loss": 2.7882, "step": 149500 }, { "epoch": 2.23, "learning_rate": 5.123917765017406e-06, "loss": 2.7695, "step": 150000 }, { "epoch": 2.24, "learning_rate": 5.074330824234131e-06, "loss": 2.7708, "step": 150500 }, { "epoch": 2.25, "learning_rate": 5.024743883450854e-06, "loss": 2.7791, "step": 151000 }, { "epoch": 2.25, "learning_rate": 4.97515694266758e-06, "loss": 2.7998, "step": 151500 }, { "epoch": 2.26, "learning_rate": 4.925570001884304e-06, "loss": 2.7546, "step": 152000 }, { "epoch": 2.27, "learning_rate": 4.875983061101029e-06, "loss": 2.7576, "step": 152500 }, { "epoch": 2.28, "learning_rate": 4.826396120317754e-06, "loss": 2.7917, "step": 153000 }, { "epoch": 2.28, "learning_rate": 4.776809179534479e-06, "loss": 2.7828, "step": 153500 }, { "epoch": 2.29, "learning_rate": 4.727222238751203e-06, "loss": 2.7884, "step": 154000 }, { "epoch": 2.3, "learning_rate": 4.677635297967928e-06, "loss": 2.7807, "step": 154500 }, { "epoch": 2.31, "learning_rate": 4.628048357184653e-06, "loss": 2.7652, "step": 155000 }, { "epoch": 2.31, "learning_rate": 4.578461416401377e-06, "loss": 2.7918, "step": 155500 }, { "epoch": 2.32, "learning_rate": 4.528874475618102e-06, "loss": 2.7771, "step": 156000 }, { "epoch": 2.33, "learning_rate": 4.479287534834827e-06, "loss": 2.7561, "step": 156500 }, { "epoch": 2.34, "learning_rate": 4.429700594051551e-06, "loss": 2.7789, "step": 157000 }, { "epoch": 2.34, "learning_rate": 4.380113653268276e-06, "loss": 2.7629, "step": 157500 }, { "epoch": 2.35, "learning_rate": 4.330526712485001e-06, "loss": 2.7672, "step": 158000 }, { "epoch": 2.36, "learning_rate": 4.280939771701725e-06, "loss": 2.7643, "step": 158500 }, { "epoch": 2.37, "learning_rate": 4.23135283091845e-06, "loss": 2.7836, "step": 159000 }, { "epoch": 2.37, "learning_rate": 4.181765890135175e-06, "loss": 2.7742, "step": 159500 }, { "epoch": 2.38, "learning_rate": 4.132178949351899e-06, "loss": 2.7504, "step": 160000 }, { "epoch": 2.39, "learning_rate": 4.082592008568624e-06, "loss": 2.7738, "step": 160500 }, { "epoch": 2.4, "learning_rate": 4.033005067785349e-06, "loss": 2.7741, "step": 161000 }, { "epoch": 2.4, "learning_rate": 3.983418127002074e-06, "loss": 2.768, "step": 161500 }, { "epoch": 2.41, "learning_rate": 3.933831186218798e-06, "loss": 2.7874, "step": 162000 }, { "epoch": 2.42, "learning_rate": 3.884244245435523e-06, "loss": 2.7733, "step": 162500 }, { "epoch": 2.42, "learning_rate": 3.834657304652248e-06, "loss": 2.7677, "step": 163000 }, { "epoch": 2.43, "learning_rate": 3.7850703638689717e-06, "loss": 2.7601, "step": 163500 }, { "epoch": 2.44, "learning_rate": 3.735483423085696e-06, "loss": 2.7832, "step": 164000 }, { "epoch": 2.45, "learning_rate": 3.685896482302421e-06, "loss": 2.7704, "step": 164500 }, { "epoch": 2.45, "learning_rate": 3.6363095415191457e-06, "loss": 2.7698, "step": 165000 }, { "epoch": 2.46, "learning_rate": 3.5867226007358706e-06, "loss": 2.7724, "step": 165500 }, { "epoch": 2.47, "learning_rate": 3.537135659952595e-06, "loss": 2.7647, "step": 166000 }, { "epoch": 2.48, "learning_rate": 3.4875487191693196e-06, "loss": 2.7719, "step": 166500 }, { "epoch": 2.48, "learning_rate": 3.4379617783860446e-06, "loss": 2.7578, "step": 167000 }, { "epoch": 2.49, "learning_rate": 3.388374837602769e-06, "loss": 2.7478, "step": 167500 }, { "epoch": 2.5, "learning_rate": 3.3387878968194936e-06, "loss": 2.7747, "step": 168000 }, { "epoch": 2.51, "learning_rate": 3.2892009560362186e-06, "loss": 2.7533, "step": 168500 }, { "epoch": 2.51, "learning_rate": 3.239614015252943e-06, "loss": 2.7488, "step": 169000 }, { "epoch": 2.52, "learning_rate": 3.190027074469668e-06, "loss": 2.7577, "step": 169500 }, { "epoch": 2.53, "learning_rate": 3.1404401336863925e-06, "loss": 2.7646, "step": 170000 }, { "epoch": 2.54, "learning_rate": 3.090853192903117e-06, "loss": 2.7556, "step": 170500 }, { "epoch": 2.54, "learning_rate": 3.041266252119842e-06, "loss": 2.7578, "step": 171000 }, { "epoch": 2.55, "learning_rate": 2.9916793113365665e-06, "loss": 2.7428, "step": 171500 }, { "epoch": 2.56, "learning_rate": 2.942092370553291e-06, "loss": 2.7487, "step": 172000 }, { "epoch": 2.57, "learning_rate": 2.892505429770016e-06, "loss": 2.7457, "step": 172500 }, { "epoch": 2.57, "learning_rate": 2.8429184889867405e-06, "loss": 2.7366, "step": 173000 }, { "epoch": 2.58, "learning_rate": 2.7933315482034655e-06, "loss": 2.7497, "step": 173500 }, { "epoch": 2.59, "learning_rate": 2.74374460742019e-06, "loss": 2.7467, "step": 174000 }, { "epoch": 2.6, "learning_rate": 2.6941576666369145e-06, "loss": 2.7633, "step": 174500 }, { "epoch": 2.6, "learning_rate": 2.6445707258536394e-06, "loss": 2.7681, "step": 175000 }, { "epoch": 2.61, "learning_rate": 2.594983785070364e-06, "loss": 2.7552, "step": 175500 }, { "epoch": 2.62, "learning_rate": 2.5453968442870885e-06, "loss": 2.7539, "step": 176000 }, { "epoch": 2.63, "learning_rate": 2.4958099035038134e-06, "loss": 2.7393, "step": 176500 }, { "epoch": 2.63, "learning_rate": 2.446222962720538e-06, "loss": 2.7727, "step": 177000 }, { "epoch": 2.64, "learning_rate": 2.396636021937263e-06, "loss": 2.736, "step": 177500 }, { "epoch": 2.65, "learning_rate": 2.3470490811539874e-06, "loss": 2.7546, "step": 178000 }, { "epoch": 2.66, "learning_rate": 2.297462140370712e-06, "loss": 2.7601, "step": 178500 }, { "epoch": 2.66, "learning_rate": 2.247875199587437e-06, "loss": 2.7456, "step": 179000 }, { "epoch": 2.67, "learning_rate": 2.1982882588041614e-06, "loss": 2.76, "step": 179500 }, { "epoch": 2.68, "learning_rate": 2.1487013180208863e-06, "loss": 2.7396, "step": 180000 }, { "epoch": 2.69, "learning_rate": 2.099114377237611e-06, "loss": 2.761, "step": 180500 }, { "epoch": 2.69, "learning_rate": 2.0495274364543354e-06, "loss": 2.7603, "step": 181000 }, { "epoch": 2.7, "learning_rate": 1.9999404956710603e-06, "loss": 2.7614, "step": 181500 }, { "epoch": 2.71, "learning_rate": 1.950353554887785e-06, "loss": 2.7638, "step": 182000 }, { "epoch": 2.71, "learning_rate": 1.9007666141045096e-06, "loss": 2.7806, "step": 182500 }, { "epoch": 2.72, "learning_rate": 1.8511796733212343e-06, "loss": 2.7561, "step": 183000 }, { "epoch": 2.73, "learning_rate": 1.801592732537959e-06, "loss": 2.7473, "step": 183500 }, { "epoch": 2.74, "learning_rate": 1.7520057917546838e-06, "loss": 2.7405, "step": 184000 }, { "epoch": 2.74, "learning_rate": 1.7024188509714083e-06, "loss": 2.742, "step": 184500 }, { "epoch": 2.75, "learning_rate": 1.652831910188133e-06, "loss": 2.7387, "step": 185000 }, { "epoch": 2.76, "learning_rate": 1.6032449694048577e-06, "loss": 2.7681, "step": 185500 }, { "epoch": 2.77, "learning_rate": 1.5536580286215825e-06, "loss": 2.7599, "step": 186000 }, { "epoch": 2.77, "learning_rate": 1.504071087838307e-06, "loss": 2.7497, "step": 186500 }, { "epoch": 2.78, "learning_rate": 1.4544841470550317e-06, "loss": 2.7417, "step": 187000 }, { "epoch": 2.79, "learning_rate": 1.4048972062717565e-06, "loss": 2.739, "step": 187500 }, { "epoch": 2.8, "learning_rate": 1.3553102654884812e-06, "loss": 2.7633, "step": 188000 }, { "epoch": 2.8, "learning_rate": 1.3057233247052057e-06, "loss": 2.7516, "step": 188500 }, { "epoch": 2.81, "learning_rate": 1.2561363839219304e-06, "loss": 2.7638, "step": 189000 }, { "epoch": 2.82, "learning_rate": 1.2065494431386552e-06, "loss": 2.7397, "step": 189500 }, { "epoch": 2.83, "learning_rate": 1.1569625023553797e-06, "loss": 2.7614, "step": 190000 }, { "epoch": 2.83, "learning_rate": 1.1073755615721044e-06, "loss": 2.7235, "step": 190500 }, { "epoch": 2.84, "learning_rate": 1.0577886207888292e-06, "loss": 2.7401, "step": 191000 }, { "epoch": 2.85, "learning_rate": 1.0082016800055539e-06, "loss": 2.7573, "step": 191500 }, { "epoch": 2.86, "learning_rate": 9.586147392222784e-07, "loss": 2.7452, "step": 192000 }, { "epoch": 2.86, "learning_rate": 9.090277984390032e-07, "loss": 2.7505, "step": 192500 }, { "epoch": 2.87, "learning_rate": 8.594408576557279e-07, "loss": 2.7233, "step": 193000 }, { "epoch": 2.88, "learning_rate": 8.098539168724526e-07, "loss": 2.7409, "step": 193500 }, { "epoch": 2.89, "learning_rate": 7.602669760891772e-07, "loss": 2.7424, "step": 194000 }, { "epoch": 2.89, "learning_rate": 7.10680035305902e-07, "loss": 2.7563, "step": 194500 }, { "epoch": 2.9, "learning_rate": 6.610930945226266e-07, "loss": 2.7478, "step": 195000 }, { "epoch": 2.91, "learning_rate": 6.115061537393512e-07, "loss": 2.7555, "step": 195500 }, { "epoch": 2.92, "learning_rate": 5.619192129560759e-07, "loss": 2.7679, "step": 196000 }, { "epoch": 2.92, "learning_rate": 5.123322721728006e-07, "loss": 2.7219, "step": 196500 }, { "epoch": 2.93, "learning_rate": 4.6274533138952524e-07, "loss": 2.7283, "step": 197000 }, { "epoch": 2.94, "learning_rate": 4.131583906062499e-07, "loss": 2.7703, "step": 197500 }, { "epoch": 2.95, "learning_rate": 3.6357144982297465e-07, "loss": 2.7606, "step": 198000 }, { "epoch": 2.95, "learning_rate": 3.1398450903969933e-07, "loss": 2.7247, "step": 198500 } ], "max_steps": 201666, "num_train_epochs": 3, "total_flos": 2.3390721608830157e+17, "trial_name": null, "trial_params": null }