{ "best_metric": 1.8157883882522583, "best_model_checkpoint": "/tmp/tst-gun-gub-pt/checkpoint-40000", "epoch": 3.0, "eval_steps": 4000, "global_step": 40752, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 4.938653317628583e-05, "loss": 3.3185, "step": 500 }, { "epoch": 0.07, "learning_rate": 4.877306635257165e-05, "loss": 2.967, "step": 1000 }, { "epoch": 0.11, "learning_rate": 4.815959952885748e-05, "loss": 2.803, "step": 1500 }, { "epoch": 0.15, "learning_rate": 4.754613270514331e-05, "loss": 2.6964, "step": 2000 }, { "epoch": 0.18, "learning_rate": 4.6932665881429136e-05, "loss": 2.6199, "step": 2500 }, { "epoch": 0.22, "learning_rate": 4.631919905771496e-05, "loss": 2.5435, "step": 3000 }, { "epoch": 0.26, "learning_rate": 4.5705732234000786e-05, "loss": 2.4687, "step": 3500 }, { "epoch": 0.29, "learning_rate": 4.509226541028662e-05, "loss": 2.4421, "step": 4000 }, { "epoch": 0.29, "eval_bleu": 3.3785, "eval_gen_len": 68.48, "eval_loss": 2.5284605026245117, "eval_runtime": 39.0713, "eval_samples_per_second": 6.399, "eval_steps_per_second": 0.41, "step": 4000 }, { "epoch": 0.33, "learning_rate": 4.4478798586572437e-05, "loss": 2.3817, "step": 4500 }, { "epoch": 0.37, "learning_rate": 4.386533176285827e-05, "loss": 2.3424, "step": 5000 }, { "epoch": 0.4, "learning_rate": 4.3251864939144094e-05, "loss": 2.3047, "step": 5500 }, { "epoch": 0.44, "learning_rate": 4.263839811542992e-05, "loss": 2.2823, "step": 6000 }, { "epoch": 0.48, "learning_rate": 4.2024931291715744e-05, "loss": 2.2436, "step": 6500 }, { "epoch": 0.52, "learning_rate": 4.141146446800157e-05, "loss": 2.2266, "step": 7000 }, { "epoch": 0.55, "learning_rate": 4.07979976442874e-05, "loss": 2.1898, "step": 7500 }, { "epoch": 0.59, "learning_rate": 4.018453082057323e-05, "loss": 2.1667, "step": 8000 }, { "epoch": 0.59, "eval_bleu": 4.5883, "eval_gen_len": 58.6, "eval_loss": 2.3017916679382324, "eval_runtime": 34.6446, "eval_samples_per_second": 7.216, "eval_steps_per_second": 0.462, "step": 8000 }, { "epoch": 0.63, "learning_rate": 3.957106399685905e-05, "loss": 2.1403, "step": 8500 }, { "epoch": 0.66, "learning_rate": 3.895759717314488e-05, "loss": 2.1188, "step": 9000 }, { "epoch": 0.7, "learning_rate": 3.834413034943071e-05, "loss": 2.1002, "step": 9500 }, { "epoch": 0.74, "learning_rate": 3.773066352571653e-05, "loss": 2.0948, "step": 10000 }, { "epoch": 0.77, "learning_rate": 3.711719670200235e-05, "loss": 2.0678, "step": 10500 }, { "epoch": 0.81, "learning_rate": 3.6503729878288185e-05, "loss": 2.0502, "step": 11000 }, { "epoch": 0.85, "learning_rate": 3.589026305457401e-05, "loss": 2.0416, "step": 11500 }, { "epoch": 0.88, "learning_rate": 3.5276796230859836e-05, "loss": 2.0255, "step": 12000 }, { "epoch": 0.88, "eval_bleu": 5.1052, "eval_gen_len": 67.3, "eval_loss": 2.1290316581726074, "eval_runtime": 40.7019, "eval_samples_per_second": 6.142, "eval_steps_per_second": 0.393, "step": 12000 }, { "epoch": 0.92, "learning_rate": 3.466332940714566e-05, "loss": 2.0057, "step": 12500 }, { "epoch": 0.96, "learning_rate": 3.404986258343149e-05, "loss": 1.9979, "step": 13000 }, { "epoch": 0.99, "learning_rate": 3.343639575971731e-05, "loss": 1.9726, "step": 13500 }, { "epoch": 1.03, "learning_rate": 3.2822928936003144e-05, "loss": 1.9261, "step": 14000 }, { "epoch": 1.07, "learning_rate": 3.220946211228897e-05, "loss": 1.9133, "step": 14500 }, { "epoch": 1.1, "learning_rate": 3.15959952885748e-05, "loss": 1.9109, "step": 15000 }, { "epoch": 1.14, "learning_rate": 3.098252846486062e-05, "loss": 1.9171, "step": 15500 }, { "epoch": 1.18, "learning_rate": 3.0369061641146445e-05, "loss": 1.8995, "step": 16000 }, { "epoch": 1.18, "eval_bleu": 7.8429, "eval_gen_len": 55.48, "eval_loss": 2.0535314083099365, "eval_runtime": 32.581, "eval_samples_per_second": 7.673, "eval_steps_per_second": 0.491, "step": 16000 }, { "epoch": 1.21, "learning_rate": 2.9755594817432277e-05, "loss": 1.8771, "step": 16500 }, { "epoch": 1.25, "learning_rate": 2.91421279937181e-05, "loss": 1.8841, "step": 17000 }, { "epoch": 1.29, "learning_rate": 2.8528661170003927e-05, "loss": 1.8787, "step": 17500 }, { "epoch": 1.33, "learning_rate": 2.7915194346289753e-05, "loss": 1.8647, "step": 18000 }, { "epoch": 1.36, "learning_rate": 2.730172752257558e-05, "loss": 1.8488, "step": 18500 }, { "epoch": 1.4, "learning_rate": 2.6688260698861407e-05, "loss": 1.8342, "step": 19000 }, { "epoch": 1.44, "learning_rate": 2.6074793875147235e-05, "loss": 1.833, "step": 19500 }, { "epoch": 1.47, "learning_rate": 2.546132705143306e-05, "loss": 1.8322, "step": 20000 }, { "epoch": 1.47, "eval_bleu": 7.2663, "eval_gen_len": 58.24, "eval_loss": 1.9960261583328247, "eval_runtime": 35.7526, "eval_samples_per_second": 6.993, "eval_steps_per_second": 0.448, "step": 20000 }, { "epoch": 1.51, "learning_rate": 2.4847860227718886e-05, "loss": 1.8187, "step": 20500 }, { "epoch": 1.55, "learning_rate": 2.4234393404004714e-05, "loss": 1.8118, "step": 21000 }, { "epoch": 1.58, "learning_rate": 2.362092658029054e-05, "loss": 1.8144, "step": 21500 }, { "epoch": 1.62, "learning_rate": 2.3007459756576365e-05, "loss": 1.7989, "step": 22000 }, { "epoch": 1.66, "learning_rate": 2.2393992932862194e-05, "loss": 1.8068, "step": 22500 }, { "epoch": 1.69, "learning_rate": 2.178052610914802e-05, "loss": 1.8043, "step": 23000 }, { "epoch": 1.73, "learning_rate": 2.1167059285433844e-05, "loss": 1.7863, "step": 23500 }, { "epoch": 1.77, "learning_rate": 2.055359246171967e-05, "loss": 1.7868, "step": 24000 }, { "epoch": 1.77, "eval_bleu": 7.0981, "eval_gen_len": 66.34, "eval_loss": 1.922366976737976, "eval_runtime": 40.7924, "eval_samples_per_second": 6.129, "eval_steps_per_second": 0.392, "step": 24000 }, { "epoch": 1.8, "learning_rate": 1.9940125638005498e-05, "loss": 1.7759, "step": 24500 }, { "epoch": 1.84, "learning_rate": 1.9326658814291323e-05, "loss": 1.7653, "step": 25000 }, { "epoch": 1.88, "learning_rate": 1.871319199057715e-05, "loss": 1.7689, "step": 25500 }, { "epoch": 1.91, "learning_rate": 1.8099725166862977e-05, "loss": 1.758, "step": 26000 }, { "epoch": 1.95, "learning_rate": 1.7486258343148802e-05, "loss": 1.754, "step": 26500 }, { "epoch": 1.99, "learning_rate": 1.687279151943463e-05, "loss": 1.7393, "step": 27000 }, { "epoch": 2.02, "learning_rate": 1.6259324695720456e-05, "loss": 1.7151, "step": 27500 }, { "epoch": 2.06, "learning_rate": 1.5645857872006285e-05, "loss": 1.7012, "step": 28000 }, { "epoch": 2.06, "eval_bleu": 7.5657, "eval_gen_len": 60.3, "eval_loss": 1.8868523836135864, "eval_runtime": 38.0123, "eval_samples_per_second": 6.577, "eval_steps_per_second": 0.421, "step": 28000 }, { "epoch": 2.1, "learning_rate": 1.503239104829211e-05, "loss": 1.6993, "step": 28500 }, { "epoch": 2.13, "learning_rate": 1.4418924224577934e-05, "loss": 1.6944, "step": 29000 }, { "epoch": 2.17, "learning_rate": 1.380545740086376e-05, "loss": 1.7064, "step": 29500 }, { "epoch": 2.21, "learning_rate": 1.3191990577149588e-05, "loss": 1.6993, "step": 30000 }, { "epoch": 2.25, "learning_rate": 1.2578523753435415e-05, "loss": 1.6893, "step": 30500 }, { "epoch": 2.28, "learning_rate": 1.1965056929721242e-05, "loss": 1.6874, "step": 31000 }, { "epoch": 2.32, "learning_rate": 1.1351590106007069e-05, "loss": 1.6793, "step": 31500 }, { "epoch": 2.36, "learning_rate": 1.0738123282292894e-05, "loss": 1.6773, "step": 32000 }, { "epoch": 2.36, "eval_bleu": 7.9888, "eval_gen_len": 61.18, "eval_loss": 1.8613367080688477, "eval_runtime": 38.7116, "eval_samples_per_second": 6.458, "eval_steps_per_second": 0.413, "step": 32000 }, { "epoch": 2.39, "learning_rate": 1.012465645857872e-05, "loss": 1.6798, "step": 32500 }, { "epoch": 2.43, "learning_rate": 9.511189634864546e-06, "loss": 1.6823, "step": 33000 }, { "epoch": 2.47, "learning_rate": 8.897722811150373e-06, "loss": 1.6711, "step": 33500 }, { "epoch": 2.5, "learning_rate": 8.2842559874362e-06, "loss": 1.679, "step": 34000 }, { "epoch": 2.54, "learning_rate": 7.670789163722027e-06, "loss": 1.6662, "step": 34500 }, { "epoch": 2.58, "learning_rate": 7.057322340007853e-06, "loss": 1.6716, "step": 35000 }, { "epoch": 2.61, "learning_rate": 6.4438555162936784e-06, "loss": 1.6569, "step": 35500 }, { "epoch": 2.65, "learning_rate": 5.830388692579505e-06, "loss": 1.6631, "step": 36000 }, { "epoch": 2.65, "eval_bleu": 8.0862, "eval_gen_len": 60.5, "eval_loss": 1.8354450464248657, "eval_runtime": 38.0466, "eval_samples_per_second": 6.571, "eval_steps_per_second": 0.421, "step": 36000 }, { "epoch": 2.69, "learning_rate": 5.216921868865332e-06, "loss": 1.6602, "step": 36500 }, { "epoch": 2.72, "learning_rate": 4.6034550451511585e-06, "loss": 1.6549, "step": 37000 }, { "epoch": 2.76, "learning_rate": 3.989988221436985e-06, "loss": 1.6521, "step": 37500 }, { "epoch": 2.8, "learning_rate": 3.3765213977228115e-06, "loss": 1.6643, "step": 38000 }, { "epoch": 2.83, "learning_rate": 2.7630545740086376e-06, "loss": 1.6533, "step": 38500 }, { "epoch": 2.87, "learning_rate": 2.149587750294464e-06, "loss": 1.6596, "step": 39000 }, { "epoch": 2.91, "learning_rate": 1.5361209265802905e-06, "loss": 1.6664, "step": 39500 }, { "epoch": 2.94, "learning_rate": 9.22654102866117e-07, "loss": 1.6379, "step": 40000 }, { "epoch": 2.94, "eval_bleu": 8.4077, "eval_gen_len": 60.18, "eval_loss": 1.8157883882522583, "eval_runtime": 35.7813, "eval_samples_per_second": 6.987, "eval_steps_per_second": 0.447, "step": 40000 }, { "epoch": 2.98, "learning_rate": 3.091872791519435e-07, "loss": 1.6493, "step": 40500 }, { "epoch": 3.0, "step": 40752, "total_flos": 3.4728164062396416e+16, "train_loss": 1.939443588256836, "train_runtime": 15380.8623, "train_samples_per_second": 42.392, "train_steps_per_second": 2.65 } ], "logging_steps": 500, "max_steps": 40752, "num_train_epochs": 3, "save_steps": 4000, "total_flos": 3.4728164062396416e+16, "trial_name": null, "trial_params": null }