|
{ |
|
"best_metric": 78.92909907885071, |
|
"best_model_checkpoint": "/root/turkic_qa/tr_uzn_models/tr_uzn_xlm_roberta_base_model/checkpoint-7308", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 8120, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"step": 812, |
|
"train_exact_match": 28.97102897102897, |
|
"train_f1": 47.43836868671638, |
|
"train_runtime": 18.4368, |
|
"train_samples_per_second": 90.037, |
|
"train_steps_per_second": 3.254 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 62.82646942138672, |
|
"learning_rate": 5e-06, |
|
"loss": 4.1982, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_exact_match": 27.4375, |
|
"eval_f1": 46.68812257417232, |
|
"eval_runtime": 56.5084, |
|
"eval_samples_per_second": 90.447, |
|
"eval_steps_per_second": 3.238, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 1624, |
|
"train_exact_match": 59.54045954045954, |
|
"train_f1": 75.73675798511488, |
|
"train_runtime": 18.1425, |
|
"train_samples_per_second": 89.679, |
|
"train_steps_per_second": 3.252 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 65.56416320800781, |
|
"learning_rate": 1e-05, |
|
"loss": 1.581, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_exact_match": 52.5625, |
|
"eval_f1": 71.03718374348476, |
|
"eval_runtime": 56.6314, |
|
"eval_samples_per_second": 90.25, |
|
"eval_steps_per_second": 3.231, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2436, |
|
"train_exact_match": 66.03396603396604, |
|
"train_f1": 80.90860196742766, |
|
"train_runtime": 17.5378, |
|
"train_samples_per_second": 90.034, |
|
"train_steps_per_second": 3.25 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 56.48591613769531, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 1.0352, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_exact_match": 59.1875, |
|
"eval_f1": 75.75121931796592, |
|
"eval_runtime": 56.5131, |
|
"eval_samples_per_second": 90.439, |
|
"eval_steps_per_second": 3.238, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 3248, |
|
"train_exact_match": 70.92907092907093, |
|
"train_f1": 85.47419759277868, |
|
"train_runtime": 18.0502, |
|
"train_samples_per_second": 88.254, |
|
"train_steps_per_second": 3.158 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 23.379392623901367, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.8114, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_exact_match": 61.375, |
|
"eval_f1": 77.2668497754134, |
|
"eval_runtime": 57.6273, |
|
"eval_samples_per_second": 88.691, |
|
"eval_steps_per_second": 3.176, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 4060, |
|
"train_exact_match": 76.42357642357642, |
|
"train_f1": 87.93119243163716, |
|
"train_runtime": 18.0592, |
|
"train_samples_per_second": 88.376, |
|
"train_steps_per_second": 3.156 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 36.056884765625, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.6739, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_exact_match": 62.0, |
|
"eval_f1": 77.40449528954417, |
|
"eval_runtime": 58.1956, |
|
"eval_samples_per_second": 87.824, |
|
"eval_steps_per_second": 3.145, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"step": 4872, |
|
"train_exact_match": 76.72327672327673, |
|
"train_f1": 89.14222582781746, |
|
"train_runtime": 18.7035, |
|
"train_samples_per_second": 87.203, |
|
"train_steps_per_second": 3.154 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 14.642334938049316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.567, |
|
"step": 4872 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_exact_match": 62.71875, |
|
"eval_f1": 78.11173049836967, |
|
"eval_runtime": 57.9474, |
|
"eval_samples_per_second": 88.201, |
|
"eval_steps_per_second": 3.158, |
|
"step": 4872 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"step": 5684, |
|
"train_exact_match": 78.52147852147853, |
|
"train_f1": 89.51285838266682, |
|
"train_runtime": 18.2594, |
|
"train_samples_per_second": 87.297, |
|
"train_steps_per_second": 3.122 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 17.107421875, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.4875, |
|
"step": 5684 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_exact_match": 62.875, |
|
"eval_f1": 78.08839073173891, |
|
"eval_runtime": 58.6231, |
|
"eval_samples_per_second": 87.184, |
|
"eval_steps_per_second": 3.122, |
|
"step": 5684 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"step": 6496, |
|
"train_exact_match": 82.31768231768231, |
|
"train_f1": 91.06806991799819, |
|
"train_runtime": 17.4317, |
|
"train_samples_per_second": 88.23, |
|
"train_steps_per_second": 3.155 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 37.56465148925781, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.4315, |
|
"step": 6496 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_exact_match": 63.4375, |
|
"eval_f1": 78.70384115394582, |
|
"eval_runtime": 58.0199, |
|
"eval_samples_per_second": 88.09, |
|
"eval_steps_per_second": 3.154, |
|
"step": 6496 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"step": 7308, |
|
"train_exact_match": 83.31668331668331, |
|
"train_f1": 92.69987725307878, |
|
"train_runtime": 18.674, |
|
"train_samples_per_second": 87.394, |
|
"train_steps_per_second": 3.159 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 52.06901168823242, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.3884, |
|
"step": 7308 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_exact_match": 63.78125, |
|
"eval_f1": 78.92909907885071, |
|
"eval_runtime": 58.0352, |
|
"eval_samples_per_second": 88.067, |
|
"eval_steps_per_second": 3.153, |
|
"step": 7308 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 8120, |
|
"train_exact_match": 82.61738261738262, |
|
"train_f1": 91.90869875798059, |
|
"train_runtime": 17.1824, |
|
"train_samples_per_second": 87.764, |
|
"train_steps_per_second": 3.143 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 24.732524871826172, |
|
"learning_rate": 0.0, |
|
"loss": 0.3607, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_exact_match": 63.53125, |
|
"eval_f1": 78.82926814591693, |
|
"eval_runtime": 58.3589, |
|
"eval_samples_per_second": 87.579, |
|
"eval_steps_per_second": 3.136, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 8120, |
|
"total_flos": 4.451320899376128e+16, |
|
"train_loss": 1.0534881366297528, |
|
"train_runtime": 5226.7321, |
|
"train_samples_per_second": 43.457, |
|
"train_steps_per_second": 1.554 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 8120, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.451320899376128e+16, |
|
"train_batch_size": 28, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|