{ "best_metric": 19.9878, "best_model_checkpoint": "ckpt_mt5/google/mt5-large/iwslt2017_de_en/lr5e-05_e49/checkpoint-64410", "epoch": 7.0, "global_step": 90174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 5e-05, "loss": 2.9083, "step": 500 }, { "epoch": 0.08, "learning_rate": 5e-05, "loss": 1.7423, "step": 1000 }, { "epoch": 0.12, "learning_rate": 5e-05, "loss": 1.6399, "step": 1500 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 1.5877, "step": 2000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 1.5372, "step": 2500 }, { "epoch": 0.23, "learning_rate": 5e-05, "loss": 1.5221, "step": 3000 }, { "epoch": 0.27, "learning_rate": 5e-05, "loss": 1.5048, "step": 3500 }, { "epoch": 0.31, "learning_rate": 5e-05, "loss": 1.4684, "step": 4000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 1.461, "step": 4500 }, { "epoch": 0.39, "learning_rate": 5e-05, "loss": 1.4498, "step": 5000 }, { "epoch": 0.43, "learning_rate": 5e-05, "loss": 1.4377, "step": 5500 }, { "epoch": 0.47, "learning_rate": 5e-05, "loss": 1.4265, "step": 6000 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 1.4181, "step": 6500 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 1.4023, "step": 7000 }, { "epoch": 0.58, "learning_rate": 5e-05, "loss": 1.4015, "step": 7500 }, { "epoch": 0.62, "learning_rate": 5e-05, "loss": 1.3927, "step": 8000 }, { "epoch": 0.66, "learning_rate": 5e-05, "loss": 1.3676, "step": 8500 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 1.3643, "step": 9000 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 1.3544, "step": 9500 }, { "epoch": 0.78, "learning_rate": 5e-05, "loss": 1.3459, "step": 10000 }, { "epoch": 0.82, "learning_rate": 5e-05, "loss": 1.3652, "step": 10500 }, { "epoch": 0.85, "learning_rate": 5e-05, "loss": 1.3385, "step": 11000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 1.3395, "step": 11500 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 1.3303, "step": 12000 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 1.3309, "step": 12500 }, { "epoch": 1.0, "eval_bleu": 19.0434, "eval_gen_len": 16.8277, "eval_loss": 1.0267103910446167, "eval_runtime": 54.7318, "eval_samples_per_second": 16.225, "eval_steps_per_second": 1.023, "step": 12882 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 1.3097, "step": 13000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 1.2515, "step": 13500 }, { "epoch": 1.09, "learning_rate": 5e-05, "loss": 1.2635, "step": 14000 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 1.2475, "step": 14500 }, { "epoch": 1.16, "learning_rate": 5e-05, "loss": 1.2404, "step": 15000 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 1.2447, "step": 15500 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 1.2333, "step": 16000 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 1.2481, "step": 16500 }, { "epoch": 1.32, "learning_rate": 5e-05, "loss": 1.2257, "step": 17000 }, { "epoch": 1.36, "learning_rate": 5e-05, "loss": 1.2382, "step": 17500 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 1.228, "step": 18000 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 1.2217, "step": 18500 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 1.221, "step": 19000 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 1.2139, "step": 19500 }, { "epoch": 1.55, "learning_rate": 5e-05, "loss": 1.2143, "step": 20000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 1.2305, "step": 20500 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 1.2265, "step": 21000 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 1.2177, "step": 21500 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 1.2248, "step": 22000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 1.2105, "step": 22500 }, { "epoch": 1.79, "learning_rate": 5e-05, "loss": 1.198, "step": 23000 }, { "epoch": 1.82, "learning_rate": 5e-05, "loss": 1.2073, "step": 23500 }, { "epoch": 1.86, "learning_rate": 5e-05, "loss": 1.2087, "step": 24000 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 1.1997, "step": 24500 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 1.1992, "step": 25000 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 1.1944, "step": 25500 }, { "epoch": 2.0, "eval_bleu": 19.4468, "eval_gen_len": 16.8266, "eval_loss": 0.9961099624633789, "eval_runtime": 55.2432, "eval_samples_per_second": 16.074, "eval_steps_per_second": 1.014, "step": 25764 }, { "epoch": 2.02, "learning_rate": 5e-05, "loss": 1.1576, "step": 26000 }, { "epoch": 2.06, "learning_rate": 5e-05, "loss": 1.1239, "step": 26500 }, { "epoch": 2.1, "learning_rate": 5e-05, "loss": 1.1292, "step": 27000 }, { "epoch": 2.13, "learning_rate": 5e-05, "loss": 1.1238, "step": 27500 }, { "epoch": 2.17, "learning_rate": 5e-05, "loss": 1.1194, "step": 28000 }, { "epoch": 2.21, "learning_rate": 5e-05, "loss": 1.1326, "step": 28500 }, { "epoch": 2.25, "learning_rate": 5e-05, "loss": 1.1292, "step": 29000 }, { "epoch": 2.29, "learning_rate": 5e-05, "loss": 1.1089, "step": 29500 }, { "epoch": 2.33, "learning_rate": 5e-05, "loss": 1.1271, "step": 30000 }, { "epoch": 2.37, "learning_rate": 5e-05, "loss": 1.1199, "step": 30500 }, { "epoch": 2.41, "learning_rate": 5e-05, "loss": 1.1279, "step": 31000 }, { "epoch": 2.45, "learning_rate": 5e-05, "loss": 1.124, "step": 31500 }, { "epoch": 2.48, "learning_rate": 5e-05, "loss": 1.1188, "step": 32000 }, { "epoch": 2.52, "learning_rate": 5e-05, "loss": 1.1251, "step": 32500 }, { "epoch": 2.56, "learning_rate": 5e-05, "loss": 1.1231, "step": 33000 }, { "epoch": 2.6, "learning_rate": 5e-05, "loss": 1.1223, "step": 33500 }, { "epoch": 2.64, "learning_rate": 5e-05, "loss": 1.1198, "step": 34000 }, { "epoch": 2.68, "learning_rate": 5e-05, "loss": 1.1248, "step": 34500 }, { "epoch": 2.72, "learning_rate": 5e-05, "loss": 1.1106, "step": 35000 }, { "epoch": 2.76, "learning_rate": 5e-05, "loss": 1.1171, "step": 35500 }, { "epoch": 2.79, "learning_rate": 5e-05, "loss": 1.1133, "step": 36000 }, { "epoch": 2.83, "learning_rate": 5e-05, "loss": 1.1254, "step": 36500 }, { "epoch": 2.87, "learning_rate": 5e-05, "loss": 1.1141, "step": 37000 }, { "epoch": 2.91, "learning_rate": 5e-05, "loss": 1.1238, "step": 37500 }, { "epoch": 2.95, "learning_rate": 5e-05, "loss": 1.1058, "step": 38000 }, { "epoch": 2.99, "learning_rate": 5e-05, "loss": 1.1119, "step": 38500 }, { "epoch": 3.0, "eval_bleu": 19.9654, "eval_gen_len": 16.8525, "eval_loss": 0.9805649518966675, "eval_runtime": 55.4985, "eval_samples_per_second": 16.0, "eval_steps_per_second": 1.009, "step": 38646 }, { "epoch": 3.03, "learning_rate": 5e-05, "loss": 1.0585, "step": 39000 }, { "epoch": 3.07, "learning_rate": 5e-05, "loss": 1.0476, "step": 39500 }, { "epoch": 3.11, "learning_rate": 5e-05, "loss": 1.0405, "step": 40000 }, { "epoch": 3.14, "learning_rate": 5e-05, "loss": 1.0549, "step": 40500 }, { "epoch": 3.18, "learning_rate": 5e-05, "loss": 1.0379, "step": 41000 }, { "epoch": 3.22, "learning_rate": 5e-05, "loss": 1.049, "step": 41500 }, { "epoch": 3.26, "learning_rate": 5e-05, "loss": 1.0367, "step": 42000 }, { "epoch": 3.3, "learning_rate": 5e-05, "loss": 1.0553, "step": 42500 }, { "epoch": 3.34, "learning_rate": 5e-05, "loss": 1.0408, "step": 43000 }, { "epoch": 3.38, "learning_rate": 5e-05, "loss": 1.041, "step": 43500 }, { "epoch": 3.42, "learning_rate": 5e-05, "loss": 1.0516, "step": 44000 }, { "epoch": 3.45, "learning_rate": 5e-05, "loss": 1.0448, "step": 44500 }, { "epoch": 3.49, "learning_rate": 5e-05, "loss": 1.0512, "step": 45000 }, { "epoch": 3.53, "learning_rate": 5e-05, "loss": 1.0487, "step": 45500 }, { "epoch": 3.57, "learning_rate": 5e-05, "loss": 1.0504, "step": 46000 }, { "epoch": 3.61, "learning_rate": 5e-05, "loss": 1.0436, "step": 46500 }, { "epoch": 3.65, "learning_rate": 5e-05, "loss": 1.0439, "step": 47000 }, { "epoch": 3.69, "learning_rate": 5e-05, "loss": 1.0386, "step": 47500 }, { "epoch": 3.73, "learning_rate": 5e-05, "loss": 1.0413, "step": 48000 }, { "epoch": 3.76, "learning_rate": 5e-05, "loss": 1.0512, "step": 48500 }, { "epoch": 3.8, "learning_rate": 5e-05, "loss": 1.0544, "step": 49000 }, { "epoch": 3.84, "learning_rate": 5e-05, "loss": 1.042, "step": 49500 }, { "epoch": 3.88, "learning_rate": 5e-05, "loss": 1.0411, "step": 50000 }, { "epoch": 3.92, "learning_rate": 5e-05, "loss": 1.0501, "step": 50500 }, { "epoch": 3.96, "learning_rate": 5e-05, "loss": 1.0397, "step": 51000 }, { "epoch": 4.0, "learning_rate": 5e-05, "loss": 1.0495, "step": 51500 }, { "epoch": 4.0, "eval_bleu": 19.9788, "eval_gen_len": 16.8604, "eval_loss": 0.9743499159812927, "eval_runtime": 56.6985, "eval_samples_per_second": 15.662, "eval_steps_per_second": 0.988, "step": 51528 }, { "epoch": 4.04, "learning_rate": 5e-05, "loss": 0.9885, "step": 52000 }, { "epoch": 4.08, "learning_rate": 5e-05, "loss": 0.9636, "step": 52500 }, { "epoch": 4.11, "learning_rate": 5e-05, "loss": 0.9842, "step": 53000 }, { "epoch": 4.15, "learning_rate": 5e-05, "loss": 0.9882, "step": 53500 }, { "epoch": 4.19, "learning_rate": 5e-05, "loss": 0.9787, "step": 54000 }, { "epoch": 4.23, "learning_rate": 5e-05, "loss": 0.9809, "step": 54500 }, { "epoch": 4.27, "learning_rate": 5e-05, "loss": 0.9807, "step": 55000 }, { "epoch": 4.31, "learning_rate": 5e-05, "loss": 0.9824, "step": 55500 }, { "epoch": 4.35, "learning_rate": 5e-05, "loss": 0.9815, "step": 56000 }, { "epoch": 4.39, "learning_rate": 5e-05, "loss": 0.9763, "step": 56500 }, { "epoch": 4.42, "learning_rate": 5e-05, "loss": 0.9809, "step": 57000 }, { "epoch": 4.46, "learning_rate": 5e-05, "loss": 0.9885, "step": 57500 }, { "epoch": 4.5, "learning_rate": 5e-05, "loss": 0.9825, "step": 58000 }, { "epoch": 4.54, "learning_rate": 5e-05, "loss": 0.9856, "step": 58500 }, { "epoch": 4.58, "learning_rate": 5e-05, "loss": 0.9848, "step": 59000 }, { "epoch": 4.62, "learning_rate": 5e-05, "loss": 0.9876, "step": 59500 }, { "epoch": 4.66, "learning_rate": 5e-05, "loss": 0.9834, "step": 60000 }, { "epoch": 4.7, "learning_rate": 5e-05, "loss": 0.9939, "step": 60500 }, { "epoch": 4.74, "learning_rate": 5e-05, "loss": 0.9845, "step": 61000 }, { "epoch": 4.77, "learning_rate": 5e-05, "loss": 0.9858, "step": 61500 }, { "epoch": 4.81, "learning_rate": 5e-05, "loss": 0.9885, "step": 62000 }, { "epoch": 4.85, "learning_rate": 5e-05, "loss": 0.9971, "step": 62500 }, { "epoch": 4.89, "learning_rate": 5e-05, "loss": 0.9867, "step": 63000 }, { "epoch": 4.93, "learning_rate": 5e-05, "loss": 0.9941, "step": 63500 }, { "epoch": 4.97, "learning_rate": 5e-05, "loss": 0.9849, "step": 64000 }, { "epoch": 5.0, "eval_bleu": 19.9878, "eval_gen_len": 16.8468, "eval_loss": 0.9771544337272644, "eval_runtime": 56.8362, "eval_samples_per_second": 15.624, "eval_steps_per_second": 0.985, "step": 64410 }, { "epoch": 5.01, "learning_rate": 5e-05, "loss": 0.9758, "step": 64500 }, { "epoch": 5.05, "learning_rate": 5e-05, "loss": 0.9183, "step": 65000 }, { "epoch": 5.08, "learning_rate": 5e-05, "loss": 0.9121, "step": 65500 }, { "epoch": 5.12, "learning_rate": 5e-05, "loss": 0.9197, "step": 66000 }, { "epoch": 5.16, "learning_rate": 5e-05, "loss": 0.926, "step": 66500 }, { "epoch": 5.2, "learning_rate": 5e-05, "loss": 0.9233, "step": 67000 }, { "epoch": 5.24, "learning_rate": 5e-05, "loss": 0.9263, "step": 67500 }, { "epoch": 5.28, "learning_rate": 5e-05, "loss": 0.9316, "step": 68000 }, { "epoch": 5.32, "learning_rate": 5e-05, "loss": 0.9251, "step": 68500 }, { "epoch": 5.36, "learning_rate": 5e-05, "loss": 0.9309, "step": 69000 }, { "epoch": 5.4, "learning_rate": 5e-05, "loss": 0.9261, "step": 69500 }, { "epoch": 5.43, "learning_rate": 5e-05, "loss": 0.931, "step": 70000 }, { "epoch": 5.47, "learning_rate": 5e-05, "loss": 0.9367, "step": 70500 }, { "epoch": 5.51, "learning_rate": 5e-05, "loss": 0.9347, "step": 71000 }, { "epoch": 5.55, "learning_rate": 5e-05, "loss": 0.9384, "step": 71500 }, { "epoch": 5.59, "learning_rate": 5e-05, "loss": 0.9334, "step": 72000 }, { "epoch": 5.63, "learning_rate": 5e-05, "loss": 0.9353, "step": 72500 }, { "epoch": 5.67, "learning_rate": 5e-05, "loss": 0.929, "step": 73000 }, { "epoch": 5.71, "learning_rate": 5e-05, "loss": 0.9419, "step": 73500 }, { "epoch": 5.74, "learning_rate": 5e-05, "loss": 0.9353, "step": 74000 }, { "epoch": 5.78, "learning_rate": 5e-05, "loss": 0.9302, "step": 74500 }, { "epoch": 5.82, "learning_rate": 5e-05, "loss": 0.9448, "step": 75000 }, { "epoch": 5.86, "learning_rate": 5e-05, "loss": 0.9277, "step": 75500 }, { "epoch": 5.9, "learning_rate": 5e-05, "loss": 0.9325, "step": 76000 }, { "epoch": 5.94, "learning_rate": 5e-05, "loss": 0.9451, "step": 76500 }, { "epoch": 5.98, "learning_rate": 5e-05, "loss": 0.9336, "step": 77000 }, { "epoch": 6.0, "eval_bleu": 19.7392, "eval_gen_len": 16.7646, "eval_loss": 0.9833778738975525, "eval_runtime": 58.8288, "eval_samples_per_second": 15.095, "eval_steps_per_second": 0.952, "step": 77292 }, { "epoch": 6.02, "learning_rate": 5e-05, "loss": 0.9089, "step": 77500 }, { "epoch": 6.05, "learning_rate": 5e-05, "loss": 0.8559, "step": 78000 }, { "epoch": 6.09, "learning_rate": 5e-05, "loss": 0.8688, "step": 78500 }, { "epoch": 6.13, "learning_rate": 5e-05, "loss": 0.872, "step": 79000 }, { "epoch": 6.17, "learning_rate": 5e-05, "loss": 0.8789, "step": 79500 }, { "epoch": 6.21, "learning_rate": 5e-05, "loss": 0.8814, "step": 80000 }, { "epoch": 6.25, "learning_rate": 5e-05, "loss": 0.8721, "step": 80500 }, { "epoch": 6.29, "learning_rate": 5e-05, "loss": 0.8864, "step": 81000 }, { "epoch": 6.33, "learning_rate": 5e-05, "loss": 0.8875, "step": 81500 }, { "epoch": 6.37, "learning_rate": 5e-05, "loss": 0.8751, "step": 82000 }, { "epoch": 6.4, "learning_rate": 5e-05, "loss": 0.8772, "step": 82500 }, { "epoch": 6.44, "learning_rate": 5e-05, "loss": 0.8742, "step": 83000 }, { "epoch": 6.48, "learning_rate": 5e-05, "loss": 0.8773, "step": 83500 }, { "epoch": 6.52, "learning_rate": 5e-05, "loss": 0.8783, "step": 84000 }, { "epoch": 6.56, "learning_rate": 5e-05, "loss": 0.8845, "step": 84500 }, { "epoch": 6.6, "learning_rate": 5e-05, "loss": 0.8718, "step": 85000 }, { "epoch": 6.64, "learning_rate": 5e-05, "loss": 0.8871, "step": 85500 }, { "epoch": 6.68, "learning_rate": 5e-05, "loss": 0.8834, "step": 86000 }, { "epoch": 6.71, "learning_rate": 5e-05, "loss": 0.8979, "step": 86500 }, { "epoch": 6.75, "learning_rate": 5e-05, "loss": 0.8875, "step": 87000 }, { "epoch": 6.79, "learning_rate": 5e-05, "loss": 0.8959, "step": 87500 }, { "epoch": 6.83, "learning_rate": 5e-05, "loss": 0.8874, "step": 88000 }, { "epoch": 6.87, "learning_rate": 5e-05, "loss": 0.8817, "step": 88500 }, { "epoch": 6.91, "learning_rate": 5e-05, "loss": 0.8894, "step": 89000 }, { "epoch": 6.95, "learning_rate": 5e-05, "loss": 0.8925, "step": 89500 }, { "epoch": 6.99, "learning_rate": 5e-05, "loss": 0.8903, "step": 90000 }, { "epoch": 7.0, "eval_bleu": 19.9464, "eval_gen_len": 16.8491, "eval_loss": 0.9870715737342834, "eval_runtime": 56.1043, "eval_samples_per_second": 15.828, "eval_steps_per_second": 0.998, "step": 90174 } ], "max_steps": 631218, "num_train_epochs": 49, "total_flos": 6.276153481836626e+17, "trial_name": null, "trial_params": null }