{
  "best_metric": 23.6596,
  "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14/checkpoint-100000",
  "epoch": 2.7777777777777777,
  "eval_steps": 10000,
  "global_step": 100000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 1.066943645477295,
      "learning_rate": 0.000475,
      "loss": 1.9627,
      "step": 5000
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 0.9774492383003235,
      "learning_rate": 0.00045000000000000004,
      "loss": 1.7738,
      "step": 10000
    },
    {
      "epoch": 0.2777777777777778,
      "eval_bleu": 20.1598,
      "eval_gen_len": 28.1563,
      "eval_loss": 1.914583444595337,
      "eval_runtime": 241.8013,
      "eval_samples_per_second": 12.407,
      "eval_steps_per_second": 1.551,
      "step": 10000
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 1.4306731224060059,
      "learning_rate": 0.000425,
      "loss": 1.6951,
      "step": 15000
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 1.1782424449920654,
      "learning_rate": 0.0004,
      "loss": 1.6498,
      "step": 20000
    },
    {
      "epoch": 0.5555555555555556,
      "eval_bleu": 21.4167,
      "eval_gen_len": 27.853,
      "eval_loss": 1.855008840560913,
      "eval_runtime": 242.3949,
      "eval_samples_per_second": 12.376,
      "eval_steps_per_second": 1.547,
      "step": 20000
    },
    {
      "epoch": 0.6944444444444444,
      "grad_norm": 1.219376802444458,
      "learning_rate": 0.000375,
      "loss": 1.6172,
      "step": 25000
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 1.2735612392425537,
      "learning_rate": 0.00035,
      "loss": 1.5903,
      "step": 30000
    },
    {
      "epoch": 0.8333333333333334,
      "eval_bleu": 22.604,
      "eval_gen_len": 27.7613,
      "eval_loss": 1.8276705741882324,
      "eval_runtime": 240.5149,
      "eval_samples_per_second": 12.473,
      "eval_steps_per_second": 1.559,
      "step": 30000
    },
    {
      "epoch": 0.9722222222222222,
      "grad_norm": 1.0282609462738037,
      "learning_rate": 0.00032500000000000004,
      "loss": 1.5633,
      "step": 35000
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.406827688217163,
      "learning_rate": 0.0003,
      "loss": 1.5151,
      "step": 40000
    },
    {
      "epoch": 1.1111111111111112,
      "eval_bleu": 22.1273,
      "eval_gen_len": 27.3187,
      "eval_loss": 1.8127936124801636,
      "eval_runtime": 234.7049,
      "eval_samples_per_second": 12.782,
      "eval_steps_per_second": 1.598,
      "step": 40000
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.174306035041809,
      "learning_rate": 0.000275,
      "loss": 1.5004,
      "step": 45000
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.5665515661239624,
      "learning_rate": 0.00025,
      "loss": 1.4866,
      "step": 50000
    },
    {
      "epoch": 1.3888888888888888,
      "eval_bleu": 22.8295,
      "eval_gen_len": 27.419,
      "eval_loss": 1.7999275922775269,
      "eval_runtime": 233.8115,
      "eval_samples_per_second": 12.831,
      "eval_steps_per_second": 1.604,
      "step": 50000
    },
    {
      "epoch": 1.5277777777777777,
      "grad_norm": 1.1425319910049438,
      "learning_rate": 0.00022500000000000002,
      "loss": 1.4799,
      "step": 55000
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.123904824256897,
      "learning_rate": 0.0002,
      "loss": 1.4696,
      "step": 60000
    },
    {
      "epoch": 1.6666666666666665,
      "eval_bleu": 22.9923,
      "eval_gen_len": 27.7387,
      "eval_loss": 1.780959963798523,
      "eval_runtime": 240.0938,
      "eval_samples_per_second": 12.495,
      "eval_steps_per_second": 1.562,
      "step": 60000
    },
    {
      "epoch": 1.8055555555555556,
      "grad_norm": 1.4292243719100952,
      "learning_rate": 0.000175,
      "loss": 1.4613,
      "step": 65000
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.1662226915359497,
      "learning_rate": 0.00015,
      "loss": 1.4508,
      "step": 70000
    },
    {
      "epoch": 1.9444444444444444,
      "eval_bleu": 23.1046,
      "eval_gen_len": 27.7057,
      "eval_loss": 1.7654317617416382,
      "eval_runtime": 236.6367,
      "eval_samples_per_second": 12.678,
      "eval_steps_per_second": 1.585,
      "step": 70000
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 0.9245423674583435,
      "learning_rate": 0.000125,
      "loss": 1.4235,
      "step": 75000
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.2502944469451904,
      "learning_rate": 0.0001,
      "loss": 1.4053,
      "step": 80000
    },
    {
      "epoch": 2.2222222222222223,
      "eval_bleu": 23.5079,
      "eval_gen_len": 27.643,
      "eval_loss": 1.758699655532837,
      "eval_runtime": 237.5663,
      "eval_samples_per_second": 12.628,
      "eval_steps_per_second": 1.579,
      "step": 80000
    },
    {
      "epoch": 2.361111111111111,
      "grad_norm": 0.9593023061752319,
      "learning_rate": 7.5e-05,
      "loss": 1.408,
      "step": 85000
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.440004825592041,
      "learning_rate": 5e-05,
      "loss": 1.3956,
      "step": 90000
    },
    {
      "epoch": 2.5,
      "eval_bleu": 23.3848,
      "eval_gen_len": 27.6637,
      "eval_loss": 1.752461552619934,
      "eval_runtime": 237.0184,
      "eval_samples_per_second": 12.657,
      "eval_steps_per_second": 1.582,
      "step": 90000
    },
    {
      "epoch": 2.638888888888889,
      "grad_norm": 1.1929932832717896,
      "learning_rate": 2.5e-05,
      "loss": 1.3938,
      "step": 95000
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 1.0216492414474487,
      "learning_rate": 0.0,
      "loss": 1.3903,
      "step": 100000
    },
    {
      "epoch": 2.7777777777777777,
      "eval_bleu": 23.6596,
      "eval_gen_len": 27.526,
      "eval_loss": 1.7469114065170288,
      "eval_runtime": 235.9542,
      "eval_samples_per_second": 12.714,
      "eval_steps_per_second": 1.589,
      "step": 100000
    },
    {
      "epoch": 2.7777777777777777,
      "step": 100000,
      "total_flos": 3.803274433029734e+16,
      "train_loss": 1.5316169482421875,
      "train_runtime": 15895.0874,
      "train_samples_per_second": 100.66,
      "train_steps_per_second": 6.291
    }
  ],
  "logging_steps": 5000,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000,
  "total_flos": 3.803274433029734e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}