{ "best_metric": 23.6596, "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14/checkpoint-100000", "epoch": 2.7777777777777777, "eval_steps": 10000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1388888888888889, "grad_norm": 1.066943645477295, "learning_rate": 0.000475, "loss": 1.9627, "step": 5000 }, { "epoch": 0.2777777777777778, "grad_norm": 0.9774492383003235, "learning_rate": 0.00045000000000000004, "loss": 1.7738, "step": 10000 }, { "epoch": 0.2777777777777778, "eval_bleu": 20.1598, "eval_gen_len": 28.1563, "eval_loss": 1.914583444595337, "eval_runtime": 241.8013, "eval_samples_per_second": 12.407, "eval_steps_per_second": 1.551, "step": 10000 }, { "epoch": 0.4166666666666667, "grad_norm": 1.4306731224060059, "learning_rate": 0.000425, "loss": 1.6951, "step": 15000 }, { "epoch": 0.5555555555555556, "grad_norm": 1.1782424449920654, "learning_rate": 0.0004, "loss": 1.6498, "step": 20000 }, { "epoch": 0.5555555555555556, "eval_bleu": 21.4167, "eval_gen_len": 27.853, "eval_loss": 1.855008840560913, "eval_runtime": 242.3949, "eval_samples_per_second": 12.376, "eval_steps_per_second": 1.547, "step": 20000 }, { "epoch": 0.6944444444444444, "grad_norm": 1.219376802444458, "learning_rate": 0.000375, "loss": 1.6172, "step": 25000 }, { "epoch": 0.8333333333333334, "grad_norm": 1.2735612392425537, "learning_rate": 0.00035, "loss": 1.5903, "step": 30000 }, { "epoch": 0.8333333333333334, "eval_bleu": 22.604, "eval_gen_len": 27.7613, "eval_loss": 1.8276705741882324, "eval_runtime": 240.5149, "eval_samples_per_second": 12.473, "eval_steps_per_second": 1.559, "step": 30000 }, { "epoch": 0.9722222222222222, "grad_norm": 1.0282609462738037, "learning_rate": 0.00032500000000000004, "loss": 1.5633, "step": 35000 }, { "epoch": 1.1111111111111112, "grad_norm": 1.406827688217163, "learning_rate": 0.0003, "loss": 1.5151, "step": 40000 }, { "epoch": 1.1111111111111112, "eval_bleu": 22.1273, "eval_gen_len": 27.3187, "eval_loss": 1.8127936124801636, "eval_runtime": 234.7049, "eval_samples_per_second": 12.782, "eval_steps_per_second": 1.598, "step": 40000 }, { "epoch": 1.25, "grad_norm": 1.174306035041809, "learning_rate": 0.000275, "loss": 1.5004, "step": 45000 }, { "epoch": 1.3888888888888888, "grad_norm": 1.5665515661239624, "learning_rate": 0.00025, "loss": 1.4866, "step": 50000 }, { "epoch": 1.3888888888888888, "eval_bleu": 22.8295, "eval_gen_len": 27.419, "eval_loss": 1.7999275922775269, "eval_runtime": 233.8115, "eval_samples_per_second": 12.831, "eval_steps_per_second": 1.604, "step": 50000 }, { "epoch": 1.5277777777777777, "grad_norm": 1.1425319910049438, "learning_rate": 0.00022500000000000002, "loss": 1.4799, "step": 55000 }, { "epoch": 1.6666666666666665, "grad_norm": 1.123904824256897, "learning_rate": 0.0002, "loss": 1.4696, "step": 60000 }, { "epoch": 1.6666666666666665, "eval_bleu": 22.9923, "eval_gen_len": 27.7387, "eval_loss": 1.780959963798523, "eval_runtime": 240.0938, "eval_samples_per_second": 12.495, "eval_steps_per_second": 1.562, "step": 60000 }, { "epoch": 1.8055555555555556, "grad_norm": 1.4292243719100952, "learning_rate": 0.000175, "loss": 1.4613, "step": 65000 }, { "epoch": 1.9444444444444444, "grad_norm": 1.1662226915359497, "learning_rate": 0.00015, "loss": 1.4508, "step": 70000 }, { "epoch": 1.9444444444444444, "eval_bleu": 23.1046, "eval_gen_len": 27.7057, "eval_loss": 1.7654317617416382, "eval_runtime": 236.6367, "eval_samples_per_second": 12.678, "eval_steps_per_second": 1.585, "step": 70000 }, { "epoch": 2.0833333333333335, "grad_norm": 0.9245423674583435, "learning_rate": 0.000125, "loss": 1.4235, "step": 75000 }, { "epoch": 2.2222222222222223, "grad_norm": 1.2502944469451904, "learning_rate": 0.0001, "loss": 1.4053, "step": 80000 }, { "epoch": 2.2222222222222223, "eval_bleu": 23.5079, "eval_gen_len": 27.643, "eval_loss": 1.758699655532837, "eval_runtime": 237.5663, "eval_samples_per_second": 12.628, "eval_steps_per_second": 1.579, "step": 80000 }, { "epoch": 2.361111111111111, "grad_norm": 0.9593023061752319, "learning_rate": 7.5e-05, "loss": 1.408, "step": 85000 }, { "epoch": 2.5, "grad_norm": 1.440004825592041, "learning_rate": 5e-05, "loss": 1.3956, "step": 90000 }, { "epoch": 2.5, "eval_bleu": 23.3848, "eval_gen_len": 27.6637, "eval_loss": 1.752461552619934, "eval_runtime": 237.0184, "eval_samples_per_second": 12.657, "eval_steps_per_second": 1.582, "step": 90000 }, { "epoch": 2.638888888888889, "grad_norm": 1.1929932832717896, "learning_rate": 2.5e-05, "loss": 1.3938, "step": 95000 }, { "epoch": 2.7777777777777777, "grad_norm": 1.0216492414474487, "learning_rate": 0.0, "loss": 1.3903, "step": 100000 }, { "epoch": 2.7777777777777777, "eval_bleu": 23.6596, "eval_gen_len": 27.526, "eval_loss": 1.7469114065170288, "eval_runtime": 235.9542, "eval_samples_per_second": 12.714, "eval_steps_per_second": 1.589, "step": 100000 }, { "epoch": 2.7777777777777777, "step": 100000, "total_flos": 3.803274433029734e+16, "train_loss": 1.5316169482421875, "train_runtime": 15895.0874, "train_samples_per_second": 100.66, "train_steps_per_second": 6.291 } ], "logging_steps": 5000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "total_flos": 3.803274433029734e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }