|
{ |
|
"best_metric": 50.2114, |
|
"best_model_checkpoint": "./jako_13p_tokenie_run1/checkpoint-19200", |
|
"epoch": 9.997403271877435, |
|
"eval_steps": 1600, |
|
"global_step": 19250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 4.9212598425196856e-05, |
|
"loss": 1.7571, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 4.79002624671916e-05, |
|
"loss": 1.3294, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 4.6587926509186354e-05, |
|
"loss": 1.2125, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_bleu": 44.2732, |
|
"eval_gen_len": 18.9394, |
|
"eval_loss": 1.1356315612792969, |
|
"eval_runtime": 557.6241, |
|
"eval_samples_per_second": 13.81, |
|
"eval_steps_per_second": 0.864, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 4.52755905511811e-05, |
|
"loss": 1.1386, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 4.396325459317586e-05, |
|
"loss": 0.9283, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 4.2650918635170604e-05, |
|
"loss": 0.8519, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_bleu": 47.1622, |
|
"eval_gen_len": 18.3936, |
|
"eval_loss": 1.061800241470337, |
|
"eval_runtime": 524.2089, |
|
"eval_samples_per_second": 14.691, |
|
"eval_steps_per_second": 0.919, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 4.133858267716536e-05, |
|
"loss": 0.8109, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"learning_rate": 4.00262467191601e-05, |
|
"loss": 0.7727, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 3.871391076115486e-05, |
|
"loss": 0.6394, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"eval_bleu": 47.7818, |
|
"eval_gen_len": 18.3397, |
|
"eval_loss": 1.0923182964324951, |
|
"eval_runtime": 516.7936, |
|
"eval_samples_per_second": 14.902, |
|
"eval_steps_per_second": 0.933, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 3.740157480314961e-05, |
|
"loss": 0.5875, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"learning_rate": 3.608923884514436e-05, |
|
"loss": 0.5625, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 3.4776902887139105e-05, |
|
"loss": 0.532, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"eval_bleu": 48.4283, |
|
"eval_gen_len": 18.3375, |
|
"eval_loss": 1.1293830871582031, |
|
"eval_runtime": 519.3842, |
|
"eval_samples_per_second": 14.827, |
|
"eval_steps_per_second": 0.928, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"learning_rate": 3.3464566929133864e-05, |
|
"loss": 0.4299, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 3.215223097112861e-05, |
|
"loss": 0.3984, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 3.083989501312336e-05, |
|
"loss": 0.3857, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"learning_rate": 2.952755905511811e-05, |
|
"loss": 0.3543, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"eval_bleu": 47.7916, |
|
"eval_gen_len": 18.4422, |
|
"eval_loss": 1.176469087600708, |
|
"eval_runtime": 519.0077, |
|
"eval_samples_per_second": 14.838, |
|
"eval_steps_per_second": 0.929, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 2.8215223097112863e-05, |
|
"loss": 0.2836, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"learning_rate": 2.6902887139107612e-05, |
|
"loss": 0.2648, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"learning_rate": 2.5590551181102364e-05, |
|
"loss": 0.2569, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"eval_bleu": 48.1268, |
|
"eval_gen_len": 18.5385, |
|
"eval_loss": 1.2102879285812378, |
|
"eval_runtime": 526.7602, |
|
"eval_samples_per_second": 14.62, |
|
"eval_steps_per_second": 0.915, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"learning_rate": 2.4278215223097113e-05, |
|
"loss": 0.2268, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"learning_rate": 2.2965879265091865e-05, |
|
"loss": 0.1854, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"learning_rate": 2.1653543307086614e-05, |
|
"loss": 0.1732, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"eval_bleu": 48.9329, |
|
"eval_gen_len": 18.2085, |
|
"eval_loss": 1.25494384765625, |
|
"eval_runtime": 505.0437, |
|
"eval_samples_per_second": 15.248, |
|
"eval_steps_per_second": 0.954, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 2.0341207349081366e-05, |
|
"loss": 0.1693, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"learning_rate": 1.9028871391076115e-05, |
|
"loss": 0.1453, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"learning_rate": 1.7716535433070868e-05, |
|
"loss": 0.1228, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"eval_bleu": 49.0248, |
|
"eval_gen_len": 18.2133, |
|
"eval_loss": 1.3022269010543823, |
|
"eval_runtime": 504.7977, |
|
"eval_samples_per_second": 15.256, |
|
"eval_steps_per_second": 0.955, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"learning_rate": 1.6404199475065617e-05, |
|
"loss": 0.1158, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"learning_rate": 1.5091863517060367e-05, |
|
"loss": 0.1144, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"learning_rate": 1.377952755905512e-05, |
|
"loss": 0.0937, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"eval_bleu": 49.3503, |
|
"eval_gen_len": 18.1673, |
|
"eval_loss": 1.317897081375122, |
|
"eval_runtime": 503.3739, |
|
"eval_samples_per_second": 15.299, |
|
"eval_steps_per_second": 0.958, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 1.246719160104987e-05, |
|
"loss": 0.0829, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"learning_rate": 1.115485564304462e-05, |
|
"loss": 0.0783, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"learning_rate": 9.842519685039371e-06, |
|
"loss": 0.0779, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"learning_rate": 8.530183727034122e-06, |
|
"loss": 0.0627, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"eval_bleu": 49.5551, |
|
"eval_gen_len": 18.2672, |
|
"eval_loss": 1.3408894538879395, |
|
"eval_runtime": 506.5726, |
|
"eval_samples_per_second": 15.202, |
|
"eval_steps_per_second": 0.951, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"learning_rate": 7.2178477690288725e-06, |
|
"loss": 0.0579, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"learning_rate": 5.905511811023622e-06, |
|
"loss": 0.0551, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 4.593175853018373e-06, |
|
"loss": 0.0558, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"eval_bleu": 49.7808, |
|
"eval_gen_len": 18.2815, |
|
"eval_loss": 1.3544921875, |
|
"eval_runtime": 505.3645, |
|
"eval_samples_per_second": 15.239, |
|
"eval_steps_per_second": 0.954, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"learning_rate": 3.2808398950131235e-06, |
|
"loss": 0.0456, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"learning_rate": 1.968503937007874e-06, |
|
"loss": 0.0433, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"learning_rate": 6.561679790026247e-07, |
|
"loss": 0.0442, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 9.97, |
|
"eval_bleu": 50.2114, |
|
"eval_gen_len": 18.2159, |
|
"eval_loss": 1.3559678792953491, |
|
"eval_runtime": 503.7841, |
|
"eval_samples_per_second": 15.286, |
|
"eval_steps_per_second": 0.957, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 19250, |
|
"total_flos": 1.334951937048576e+18, |
|
"train_loss": 0.4018082245665711, |
|
"train_runtime": 39148.2976, |
|
"train_samples_per_second": 15.739, |
|
"train_steps_per_second": 0.492 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 19250, |
|
"num_train_epochs": 10, |
|
"save_steps": 1600, |
|
"total_flos": 1.334951937048576e+18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|