{ "best_metric": 76.09384754933019, "best_model_checkpoint": "/root/turkic_qa/ru_uzn_models/ru_uzn_xlm_roberta_base_squad_model/checkpoint-5022", "epoch": 10.0, "eval_steps": 500, "global_step": 5580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "step": 558, "train_exact_match": 54.345654345654346, "train_f1": 72.79270519378026, "train_runtime": 11.7013, "train_samples_per_second": 90.247, "train_steps_per_second": 3.248 }, { "epoch": 1.0, "grad_norm": 80.72039031982422, "learning_rate": 5e-06, "loss": 1.7207, "step": 558 }, { "epoch": 1.0, "eval_exact_match": 52.21875, "eval_f1": 71.55991959566774, "eval_runtime": 37.0088, "eval_samples_per_second": 90.762, "eval_steps_per_second": 3.242, "step": 558 }, { "epoch": 2.0, "step": 1116, "train_exact_match": 59.54045954045954, "train_f1": 78.95661212870544, "train_runtime": 11.6455, "train_samples_per_second": 89.39, "train_steps_per_second": 3.263 }, { "epoch": 2.0, "grad_norm": 73.76553344726562, "learning_rate": 1e-05, "loss": 1.352, "step": 1116 }, { "epoch": 2.0, "eval_exact_match": 55.46875, "eval_f1": 74.01581668435861, "eval_runtime": 37.1575, "eval_samples_per_second": 90.399, "eval_steps_per_second": 3.229, "step": 1116 }, { "epoch": 3.0, "step": 1674, "train_exact_match": 66.13386613386614, "train_f1": 83.54611322142382, "train_runtime": 11.6678, "train_samples_per_second": 90.163, "train_steps_per_second": 3.257 }, { "epoch": 3.0, "grad_norm": 73.22183227539062, "learning_rate": 8.750000000000001e-06, "loss": 1.1278, "step": 1674 }, { "epoch": 3.0, "eval_exact_match": 57.40625, "eval_f1": 75.65492929099143, "eval_runtime": 37.0792, "eval_samples_per_second": 90.59, "eval_steps_per_second": 3.236, "step": 1674 }, { "epoch": 4.0, "step": 2232, "train_exact_match": 67.23276723276723, "train_f1": 85.47766787697041, "train_runtime": 11.6568, "train_samples_per_second": 89.733, "train_steps_per_second": 3.26 }, { "epoch": 4.0, "grad_norm": 12.79330825805664, "learning_rate": 7.500000000000001e-06, "loss": 0.9505, "step": 2232 }, { "epoch": 4.0, "eval_exact_match": 57.9375, "eval_f1": 75.99186608404226, "eval_runtime": 37.0588, "eval_samples_per_second": 90.64, "eval_steps_per_second": 3.238, "step": 2232 }, { "epoch": 5.0, "step": 2790, "train_exact_match": 73.62637362637362, "train_f1": 88.41775048247749, "train_runtime": 11.6282, "train_samples_per_second": 89.61, "train_steps_per_second": 3.268 }, { "epoch": 5.0, "grad_norm": 57.348087310791016, "learning_rate": 6.25e-06, "loss": 0.8175, "step": 2790 }, { "epoch": 5.0, "eval_exact_match": 58.34375, "eval_f1": 76.03000703182897, "eval_runtime": 37.1357, "eval_samples_per_second": 90.452, "eval_steps_per_second": 3.231, "step": 2790 }, { "epoch": 6.0, "step": 3348, "train_exact_match": 78.62137862137862, "train_f1": 90.6016744099524, "train_runtime": 11.6628, "train_samples_per_second": 89.515, "train_steps_per_second": 3.258 }, { "epoch": 6.0, "grad_norm": 90.37775421142578, "learning_rate": 5e-06, "loss": 0.7139, "step": 3348 }, { "epoch": 6.0, "eval_exact_match": 58.8125, "eval_f1": 75.74834491209981, "eval_runtime": 37.1026, "eval_samples_per_second": 90.533, "eval_steps_per_second": 3.234, "step": 3348 }, { "epoch": 7.0, "step": 3906, "train_exact_match": 78.82117882117882, "train_f1": 92.66952734441121, "train_runtime": 11.5936, "train_samples_per_second": 89.705, "train_steps_per_second": 3.278 }, { "epoch": 7.0, "grad_norm": 47.05348587036133, "learning_rate": 3.7500000000000005e-06, "loss": 0.6345, "step": 3906 }, { "epoch": 7.0, "eval_exact_match": 58.90625, "eval_f1": 76.05687183756119, "eval_runtime": 37.0467, "eval_samples_per_second": 90.669, "eval_steps_per_second": 3.239, "step": 3906 }, { "epoch": 8.0, "step": 4464, "train_exact_match": 80.21978021978022, "train_f1": 92.08409867123048, "train_runtime": 11.6489, "train_samples_per_second": 89.708, "train_steps_per_second": 3.262 }, { "epoch": 8.0, "grad_norm": 45.41628646850586, "learning_rate": 2.5e-06, "loss": 0.5799, "step": 4464 }, { "epoch": 8.0, "eval_exact_match": 58.9375, "eval_f1": 75.91207274041706, "eval_runtime": 37.0443, "eval_samples_per_second": 90.675, "eval_steps_per_second": 3.239, "step": 4464 }, { "epoch": 9.0, "step": 5022, "train_exact_match": 83.11688311688312, "train_f1": 93.78525979268544, "train_runtime": 11.634, "train_samples_per_second": 89.737, "train_steps_per_second": 3.266 }, { "epoch": 9.0, "grad_norm": 66.75804901123047, "learning_rate": 1.25e-06, "loss": 0.5267, "step": 5022 }, { "epoch": 9.0, "eval_exact_match": 58.96875, "eval_f1": 76.09384754933019, "eval_runtime": 37.1002, "eval_samples_per_second": 90.538, "eval_steps_per_second": 3.234, "step": 5022 }, { "epoch": 10.0, "step": 5580, "train_exact_match": 81.71828171828172, "train_f1": 93.01859252339878, "train_runtime": 11.6538, "train_samples_per_second": 89.842, "train_steps_per_second": 3.261 }, { "epoch": 10.0, "grad_norm": 77.05963897705078, "learning_rate": 0.0, "loss": 0.4995, "step": 5580 }, { "epoch": 10.0, "eval_exact_match": 58.71875, "eval_f1": 76.0502313865696, "eval_runtime": 37.0653, "eval_samples_per_second": 90.624, "eval_steps_per_second": 3.238, "step": 5580 }, { "epoch": 10.0, "step": 5580, "total_flos": 3.056976081243648e+16, "train_loss": 0.8922852929775006, "train_runtime": 3581.679, "train_samples_per_second": 43.552, "train_steps_per_second": 1.558 } ], "logging_steps": 500, "max_steps": 5580, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.056976081243648e+16, "train_batch_size": 28, "trial_name": null, "trial_params": null }