{ "best_metric": null, "best_model_checkpoint": null, "epoch": 25.31645569620253, "eval_steps": 500, "global_step": 42000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3, "learning_rate": 4.969861362266426e-05, "loss": 3.7006, "step": 500 }, { "epoch": 0.6, "learning_rate": 4.939722724532851e-05, "loss": 3.5872, "step": 1000 }, { "epoch": 0.9, "learning_rate": 4.909584086799277e-05, "loss": 3.4617, "step": 1500 }, { "epoch": 1.0, "eval_bleu": 12.3227, "eval_gen_len": 54.6075, "eval_loss": 3.1113245487213135, "eval_runtime": 122.2496, "eval_samples_per_second": 3.272, "eval_steps_per_second": 0.409, "step": 1659 }, { "epoch": 1.21, "learning_rate": 4.8794454490657024e-05, "loss": 3.298, "step": 2000 }, { "epoch": 1.51, "learning_rate": 4.849306811332128e-05, "loss": 3.2018, "step": 2500 }, { "epoch": 1.81, "learning_rate": 4.8191681735985535e-05, "loss": 3.1014, "step": 3000 }, { "epoch": 2.0, "eval_bleu": 15.8487, "eval_gen_len": 50.1125, "eval_loss": 2.8111488819122314, "eval_runtime": 92.4044, "eval_samples_per_second": 4.329, "eval_steps_per_second": 0.541, "step": 3318 }, { "epoch": 2.11, "learning_rate": 4.789029535864979e-05, "loss": 2.9998, "step": 3500 }, { "epoch": 2.41, "learning_rate": 4.7588908981314046e-05, "loss": 2.883, "step": 4000 }, { "epoch": 2.71, "learning_rate": 4.7287522603978304e-05, "loss": 2.8409, "step": 4500 }, { "epoch": 3.0, "eval_bleu": 20.5509, "eval_gen_len": 43.98, "eval_loss": 2.617112398147583, "eval_runtime": 70.0674, "eval_samples_per_second": 5.709, "eval_steps_per_second": 0.714, "step": 4977 }, { "epoch": 3.01, "learning_rate": 4.6986136226642556e-05, "loss": 2.8043, "step": 5000 }, { "epoch": 3.32, "learning_rate": 4.6684749849306815e-05, "loss": 2.6486, "step": 5500 }, { "epoch": 3.62, "learning_rate": 4.638336347197107e-05, "loss": 2.6127, "step": 6000 }, { "epoch": 3.92, "learning_rate": 4.6081977094635326e-05, "loss": 2.5718, "step": 6500 }, { "epoch": 4.0, "eval_bleu": 21.5273, "eval_gen_len": 40.8575, "eval_loss": 2.4335193634033203, "eval_runtime": 62.0368, "eval_samples_per_second": 6.448, "eval_steps_per_second": 0.806, "step": 6636 }, { "epoch": 4.22, "learning_rate": 4.5780590717299585e-05, "loss": 2.4535, "step": 7000 }, { "epoch": 4.52, "learning_rate": 4.547920433996384e-05, "loss": 2.4269, "step": 7500 }, { "epoch": 4.82, "learning_rate": 4.5177817962628096e-05, "loss": 2.3852, "step": 8000 }, { "epoch": 5.0, "eval_bleu": 24.0185, "eval_gen_len": 38.945, "eval_loss": 2.2908990383148193, "eval_runtime": 53.5509, "eval_samples_per_second": 7.47, "eval_steps_per_second": 0.934, "step": 8295 }, { "epoch": 5.12, "learning_rate": 4.487643158529235e-05, "loss": 2.3305, "step": 8500 }, { "epoch": 5.42, "learning_rate": 4.45750452079566e-05, "loss": 2.2361, "step": 9000 }, { "epoch": 5.73, "learning_rate": 4.427365883062086e-05, "loss": 2.2201, "step": 9500 }, { "epoch": 6.0, "eval_bleu": 25.0722, "eval_gen_len": 38.4525, "eval_loss": 2.2150681018829346, "eval_runtime": 47.2306, "eval_samples_per_second": 8.469, "eval_steps_per_second": 1.059, "step": 9954 }, { "epoch": 6.03, "learning_rate": 4.397227245328511e-05, "loss": 2.1955, "step": 10000 }, { "epoch": 6.33, "learning_rate": 4.367088607594937e-05, "loss": 2.0928, "step": 10500 }, { "epoch": 6.63, "learning_rate": 4.336949969861363e-05, "loss": 2.0947, "step": 11000 }, { "epoch": 6.93, "learning_rate": 4.306811332127788e-05, "loss": 2.0583, "step": 11500 }, { "epoch": 7.0, "eval_bleu": 26.051, "eval_gen_len": 40.0775, "eval_loss": 2.1219234466552734, "eval_runtime": 55.6386, "eval_samples_per_second": 7.189, "eval_steps_per_second": 0.899, "step": 11613 }, { "epoch": 7.23, "learning_rate": 4.276672694394214e-05, "loss": 1.9657, "step": 12000 }, { "epoch": 7.53, "learning_rate": 4.246534056660639e-05, "loss": 1.9594, "step": 12500 }, { "epoch": 7.84, "learning_rate": 4.216395418927065e-05, "loss": 1.9464, "step": 13000 }, { "epoch": 8.0, "eval_bleu": 27.8486, "eval_gen_len": 39.54, "eval_loss": 2.0415802001953125, "eval_runtime": 50.0785, "eval_samples_per_second": 7.987, "eval_steps_per_second": 0.998, "step": 13272 }, { "epoch": 8.14, "learning_rate": 4.186256781193491e-05, "loss": 1.8901, "step": 13500 }, { "epoch": 8.44, "learning_rate": 4.1561181434599153e-05, "loss": 1.8331, "step": 14000 }, { "epoch": 8.74, "learning_rate": 4.125979505726341e-05, "loss": 1.8273, "step": 14500 }, { "epoch": 9.0, "eval_bleu": 28.6882, "eval_gen_len": 38.97, "eval_loss": 1.9714975357055664, "eval_runtime": 47.8353, "eval_samples_per_second": 8.362, "eval_steps_per_second": 1.045, "step": 14931 }, { "epoch": 9.04, "learning_rate": 4.095840867992767e-05, "loss": 1.8071, "step": 15000 }, { "epoch": 9.34, "learning_rate": 4.065702230259192e-05, "loss": 1.724, "step": 15500 }, { "epoch": 9.64, "learning_rate": 4.035563592525618e-05, "loss": 1.7173, "step": 16000 }, { "epoch": 9.95, "learning_rate": 4.0054249547920434e-05, "loss": 1.7341, "step": 16500 }, { "epoch": 10.0, "eval_bleu": 29.4158, "eval_gen_len": 39.27, "eval_loss": 1.922670602798462, "eval_runtime": 48.3901, "eval_samples_per_second": 8.266, "eval_steps_per_second": 1.033, "step": 16590 }, { "epoch": 10.25, "learning_rate": 3.975286317058469e-05, "loss": 1.6432, "step": 17000 }, { "epoch": 10.55, "learning_rate": 3.945147679324895e-05, "loss": 1.6414, "step": 17500 }, { "epoch": 10.85, "learning_rate": 3.9150090415913203e-05, "loss": 1.6285, "step": 18000 }, { "epoch": 11.0, "eval_bleu": 29.6336, "eval_gen_len": 39.7025, "eval_loss": 1.8723887205123901, "eval_runtime": 49.1746, "eval_samples_per_second": 8.134, "eval_steps_per_second": 1.017, "step": 18249 }, { "epoch": 11.15, "learning_rate": 3.884870403857746e-05, "loss": 1.5753, "step": 18500 }, { "epoch": 11.45, "learning_rate": 3.8547317661241714e-05, "loss": 1.5525, "step": 19000 }, { "epoch": 11.75, "learning_rate": 3.8245931283905966e-05, "loss": 1.5466, "step": 19500 }, { "epoch": 12.0, "eval_bleu": 31.3296, "eval_gen_len": 39.8675, "eval_loss": 1.816349744796753, "eval_runtime": 49.6256, "eval_samples_per_second": 8.06, "eval_steps_per_second": 1.008, "step": 19908 }, { "epoch": 12.06, "learning_rate": 3.7944544906570225e-05, "loss": 1.5254, "step": 20000 }, { "epoch": 12.36, "learning_rate": 3.764315852923448e-05, "loss": 1.4676, "step": 20500 }, { "epoch": 12.66, "learning_rate": 3.7341772151898736e-05, "loss": 1.4678, "step": 21000 }, { "epoch": 12.96, "learning_rate": 3.7040385774562995e-05, "loss": 1.4607, "step": 21500 }, { "epoch": 13.0, "eval_bleu": 31.7515, "eval_gen_len": 38.405, "eval_loss": 1.7929939031600952, "eval_runtime": 44.5172, "eval_samples_per_second": 8.985, "eval_steps_per_second": 1.123, "step": 21567 }, { "epoch": 13.26, "learning_rate": 3.6738999397227247e-05, "loss": 1.3787, "step": 22000 }, { "epoch": 13.56, "learning_rate": 3.6437613019891505e-05, "loss": 1.4049, "step": 22500 }, { "epoch": 13.86, "learning_rate": 3.613622664255576e-05, "loss": 1.385, "step": 23000 }, { "epoch": 14.0, "eval_bleu": 32.458, "eval_gen_len": 39.4675, "eval_loss": 1.7518789768218994, "eval_runtime": 49.1331, "eval_samples_per_second": 8.141, "eval_steps_per_second": 1.018, "step": 23226 }, { "epoch": 14.17, "learning_rate": 3.5834840265220016e-05, "loss": 1.3403, "step": 23500 }, { "epoch": 14.47, "learning_rate": 3.553345388788427e-05, "loss": 1.3166, "step": 24000 }, { "epoch": 14.77, "learning_rate": 3.523206751054853e-05, "loss": 1.321, "step": 24500 }, { "epoch": 15.0, "eval_bleu": 32.9411, "eval_gen_len": 38.8025, "eval_loss": 1.7194263935089111, "eval_runtime": 45.6686, "eval_samples_per_second": 8.759, "eval_steps_per_second": 1.095, "step": 24885 }, { "epoch": 15.07, "learning_rate": 3.493068113321278e-05, "loss": 1.2976, "step": 25000 }, { "epoch": 15.37, "learning_rate": 3.462929475587703e-05, "loss": 1.2358, "step": 25500 }, { "epoch": 15.67, "learning_rate": 3.432790837854129e-05, "loss": 1.2592, "step": 26000 }, { "epoch": 15.97, "learning_rate": 3.402652200120555e-05, "loss": 1.2662, "step": 26500 }, { "epoch": 16.0, "eval_bleu": 33.8478, "eval_gen_len": 39.1275, "eval_loss": 1.6950603723526, "eval_runtime": 49.9911, "eval_samples_per_second": 8.001, "eval_steps_per_second": 1.0, "step": 26544 }, { "epoch": 16.27, "learning_rate": 3.37251356238698e-05, "loss": 1.1963, "step": 27000 }, { "epoch": 16.58, "learning_rate": 3.342374924653406e-05, "loss": 1.2002, "step": 27500 }, { "epoch": 16.88, "learning_rate": 3.312236286919831e-05, "loss": 1.1939, "step": 28000 }, { "epoch": 17.0, "eval_bleu": 34.5277, "eval_gen_len": 39.0225, "eval_loss": 1.685713529586792, "eval_runtime": 49.4943, "eval_samples_per_second": 8.082, "eval_steps_per_second": 1.01, "step": 28203 }, { "epoch": 17.18, "learning_rate": 3.282097649186257e-05, "loss": 1.1459, "step": 28500 }, { "epoch": 17.48, "learning_rate": 3.251959011452683e-05, "loss": 1.1326, "step": 29000 }, { "epoch": 17.78, "learning_rate": 3.221820373719108e-05, "loss": 1.1406, "step": 29500 }, { "epoch": 18.0, "eval_bleu": 35.8691, "eval_gen_len": 38.76, "eval_loss": 1.6470690965652466, "eval_runtime": 45.2962, "eval_samples_per_second": 8.831, "eval_steps_per_second": 1.104, "step": 29862 }, { "epoch": 18.08, "learning_rate": 3.191681735985534e-05, "loss": 1.1292, "step": 30000 }, { "epoch": 18.38, "learning_rate": 3.161543098251959e-05, "loss": 1.071, "step": 30500 }, { "epoch": 18.69, "learning_rate": 3.1314044605183844e-05, "loss": 1.0918, "step": 31000 }, { "epoch": 18.99, "learning_rate": 3.10126582278481e-05, "loss": 1.0759, "step": 31500 }, { "epoch": 19.0, "eval_bleu": 36.4448, "eval_gen_len": 38.6925, "eval_loss": 1.6456927061080933, "eval_runtime": 46.4772, "eval_samples_per_second": 8.606, "eval_steps_per_second": 1.076, "step": 31521 }, { "epoch": 19.29, "learning_rate": 3.0711271850512355e-05, "loss": 1.0193, "step": 32000 }, { "epoch": 19.59, "learning_rate": 3.0409885473176613e-05, "loss": 1.0248, "step": 32500 }, { "epoch": 19.89, "learning_rate": 3.010849909584087e-05, "loss": 1.0378, "step": 33000 }, { "epoch": 20.0, "eval_bleu": 37.2905, "eval_gen_len": 38.945, "eval_loss": 1.6285927295684814, "eval_runtime": 49.741, "eval_samples_per_second": 8.042, "eval_steps_per_second": 1.005, "step": 33180 }, { "epoch": 20.19, "learning_rate": 2.9807112718505124e-05, "loss": 0.9915, "step": 33500 }, { "epoch": 20.49, "learning_rate": 2.9505726341169383e-05, "loss": 0.9848, "step": 34000 }, { "epoch": 20.8, "learning_rate": 2.9204339963833638e-05, "loss": 0.9851, "step": 34500 }, { "epoch": 21.0, "eval_bleu": 38.4264, "eval_gen_len": 38.7175, "eval_loss": 1.5997543334960938, "eval_runtime": 44.5032, "eval_samples_per_second": 8.988, "eval_steps_per_second": 1.124, "step": 34839 }, { "epoch": 21.1, "learning_rate": 2.8902953586497894e-05, "loss": 0.97, "step": 35000 }, { "epoch": 21.4, "learning_rate": 2.8601567209162146e-05, "loss": 0.9436, "step": 35500 }, { "epoch": 21.7, "learning_rate": 2.83001808318264e-05, "loss": 0.9372, "step": 36000 }, { "epoch": 22.0, "eval_bleu": 37.9614, "eval_gen_len": 38.9425, "eval_loss": 1.607030987739563, "eval_runtime": 47.0014, "eval_samples_per_second": 8.51, "eval_steps_per_second": 1.064, "step": 36498 }, { "epoch": 22.0, "learning_rate": 2.7998794454490656e-05, "loss": 0.9437, "step": 36500 }, { "epoch": 22.3, "learning_rate": 2.7697408077154912e-05, "loss": 0.8917, "step": 37000 }, { "epoch": 22.6, "learning_rate": 2.7396021699819167e-05, "loss": 0.8692, "step": 37500 }, { "epoch": 22.91, "learning_rate": 2.7094635322483426e-05, "loss": 0.9191, "step": 38000 }, { "epoch": 23.0, "eval_bleu": 38.8655, "eval_gen_len": 38.8825, "eval_loss": 1.5746939182281494, "eval_runtime": 50.4993, "eval_samples_per_second": 7.921, "eval_steps_per_second": 0.99, "step": 38157 }, { "epoch": 23.21, "learning_rate": 2.679324894514768e-05, "loss": 0.8555, "step": 38500 }, { "epoch": 23.51, "learning_rate": 2.6491862567811937e-05, "loss": 0.8533, "step": 39000 }, { "epoch": 23.81, "learning_rate": 2.6190476190476192e-05, "loss": 0.8673, "step": 39500 }, { "epoch": 24.0, "eval_bleu": 39.4605, "eval_gen_len": 39.0175, "eval_loss": 1.5650146007537842, "eval_runtime": 50.8092, "eval_samples_per_second": 7.873, "eval_steps_per_second": 0.984, "step": 39816 }, { "epoch": 24.11, "learning_rate": 2.5889089813140448e-05, "loss": 0.841, "step": 40000 }, { "epoch": 24.41, "learning_rate": 2.5587703435804706e-05, "loss": 0.8155, "step": 40500 }, { "epoch": 24.71, "learning_rate": 2.5286317058468955e-05, "loss": 0.811, "step": 41000 }, { "epoch": 25.0, "eval_bleu": 39.6804, "eval_gen_len": 38.77, "eval_loss": 1.5603779554367065, "eval_runtime": 45.7389, "eval_samples_per_second": 8.745, "eval_steps_per_second": 1.093, "step": 41475 }, { "epoch": 25.02, "learning_rate": 2.4984930681133214e-05, "loss": 0.8335, "step": 41500 }, { "epoch": 25.32, "learning_rate": 2.468354430379747e-05, "loss": 0.7688, "step": 42000 } ], "logging_steps": 500, "max_steps": 82950, "num_train_epochs": 50, "save_steps": 500, "total_flos": 1.0127259403812864e+16, "trial_name": null, "trial_params": null }