{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.68,
  "eval_steps": 1,
  "global_step": 30,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.256,
      "grad_norm": 160.99264526367188,
      "learning_rate": 2.5e-05,
      "loss": 4.2069,
      "step": 1
    },
    {
      "epoch": 0.256,
      "eval_accuracy": 0.468,
      "eval_loss": 4.29254674911499,
      "eval_runtime": 32.123,
      "eval_samples_per_second": 7.783,
      "eval_steps_per_second": 0.996,
      "step": 1
    },
    {
      "epoch": 0.512,
      "grad_norm": 175.32472229003906,
      "learning_rate": 5e-05,
      "loss": 4.0384,
      "step": 2
    },
    {
      "epoch": 0.512,
      "eval_accuracy": 0.536,
      "eval_loss": 2.9174447059631348,
      "eval_runtime": 32.2226,
      "eval_samples_per_second": 7.759,
      "eval_steps_per_second": 0.993,
      "step": 2
    },
    {
      "epoch": 0.768,
      "grad_norm": 46.10775375366211,
      "learning_rate": 4.8214285714285716e-05,
      "loss": 2.4382,
      "step": 3
    },
    {
      "epoch": 0.768,
      "eval_accuracy": 0.532,
      "eval_loss": 4.273119926452637,
      "eval_runtime": 32.426,
      "eval_samples_per_second": 7.71,
      "eval_steps_per_second": 0.987,
      "step": 3
    },
    {
      "epoch": 1.024,
      "grad_norm": 194.090087890625,
      "learning_rate": 4.642857142857143e-05,
      "loss": 4.3124,
      "step": 4
    },
    {
      "epoch": 1.024,
      "eval_accuracy": 0.616,
      "eval_loss": 2.8796591758728027,
      "eval_runtime": 32.2442,
      "eval_samples_per_second": 7.753,
      "eval_steps_per_second": 0.992,
      "step": 4
    },
    {
      "epoch": 1.28,
      "grad_norm": 135.1618194580078,
      "learning_rate": 4.464285714285715e-05,
      "loss": 2.3322,
      "step": 5
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.704,
      "eval_loss": 1.7112903594970703,
      "eval_runtime": 32.2755,
      "eval_samples_per_second": 7.746,
      "eval_steps_per_second": 0.991,
      "step": 5
    },
    {
      "epoch": 1.536,
      "grad_norm": 41.44600296020508,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 1.0083,
      "step": 6
    },
    {
      "epoch": 1.536,
      "eval_accuracy": 0.74,
      "eval_loss": 1.8849633932113647,
      "eval_runtime": 32.2473,
      "eval_samples_per_second": 7.753,
      "eval_steps_per_second": 0.992,
      "step": 6
    },
    {
      "epoch": 1.792,
      "grad_norm": 97.29663848876953,
      "learning_rate": 4.107142857142857e-05,
      "loss": 1.7488,
      "step": 7
    },
    {
      "epoch": 1.792,
      "eval_accuracy": 0.756,
      "eval_loss": 1.5529009103775024,
      "eval_runtime": 32.2389,
      "eval_samples_per_second": 7.755,
      "eval_steps_per_second": 0.993,
      "step": 7
    },
    {
      "epoch": 2.048,
      "grad_norm": 55.8738899230957,
      "learning_rate": 3.928571428571429e-05,
      "loss": 0.9772,
      "step": 8
    },
    {
      "epoch": 2.048,
      "eval_accuracy": 0.748,
      "eval_loss": 1.2250745296478271,
      "eval_runtime": 32.2277,
      "eval_samples_per_second": 7.757,
      "eval_steps_per_second": 0.993,
      "step": 8
    },
    {
      "epoch": 2.304,
      "grad_norm": 31.146005630493164,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.6818,
      "step": 9
    },
    {
      "epoch": 2.304,
      "eval_accuracy": 0.772,
      "eval_loss": 1.144472360610962,
      "eval_runtime": 32.2009,
      "eval_samples_per_second": 7.764,
      "eval_steps_per_second": 0.994,
      "step": 9
    },
    {
      "epoch": 2.56,
      "grad_norm": 24.017282485961914,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.424,
      "step": 10
    },
    {
      "epoch": 2.56,
      "eval_accuracy": 0.768,
      "eval_loss": 1.0808576345443726,
      "eval_runtime": 32.236,
      "eval_samples_per_second": 7.755,
      "eval_steps_per_second": 0.993,
      "step": 10
    },
    {
      "epoch": 2.816,
      "grad_norm": 28.137168884277344,
      "learning_rate": 3.392857142857143e-05,
      "loss": 0.2724,
      "step": 11
    },
    {
      "epoch": 2.816,
      "eval_accuracy": 0.8,
      "eval_loss": 1.0712668895721436,
      "eval_runtime": 32.2185,
      "eval_samples_per_second": 7.76,
      "eval_steps_per_second": 0.993,
      "step": 11
    },
    {
      "epoch": 3.072,
      "grad_norm": 22.6978816986084,
      "learning_rate": 3.2142857142857144e-05,
      "loss": 0.374,
      "step": 12
    },
    {
      "epoch": 3.072,
      "eval_accuracy": 0.78,
      "eval_loss": 1.1186448335647583,
      "eval_runtime": 32.2247,
      "eval_samples_per_second": 7.758,
      "eval_steps_per_second": 0.993,
      "step": 12
    },
    {
      "epoch": 3.328,
      "grad_norm": 31.165828704833984,
      "learning_rate": 3.0357142857142857e-05,
      "loss": 0.2233,
      "step": 13
    },
    {
      "epoch": 3.328,
      "eval_accuracy": 0.8,
      "eval_loss": 1.0234947204589844,
      "eval_runtime": 32.2239,
      "eval_samples_per_second": 7.758,
      "eval_steps_per_second": 0.993,
      "step": 13
    },
    {
      "epoch": 3.584,
      "grad_norm": 15.037803649902344,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.0988,
      "step": 14
    },
    {
      "epoch": 3.584,
      "eval_accuracy": 0.78,
      "eval_loss": 1.006734848022461,
      "eval_runtime": 32.2408,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.993,
      "step": 14
    },
    {
      "epoch": 3.84,
      "grad_norm": 15.06645679473877,
      "learning_rate": 2.6785714285714288e-05,
      "loss": 0.0703,
      "step": 15
    },
    {
      "epoch": 3.84,
      "eval_accuracy": 0.768,
      "eval_loss": 1.0643097162246704,
      "eval_runtime": 32.2507,
      "eval_samples_per_second": 7.752,
      "eval_steps_per_second": 0.992,
      "step": 15
    },
    {
      "epoch": 4.096,
      "grad_norm": 8.253623962402344,
      "learning_rate": 2.5e-05,
      "loss": 0.0497,
      "step": 16
    },
    {
      "epoch": 4.096,
      "eval_accuracy": 0.76,
      "eval_loss": 1.0730018615722656,
      "eval_runtime": 32.242,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.992,
      "step": 16
    },
    {
      "epoch": 4.352,
      "grad_norm": 3.153515338897705,
      "learning_rate": 2.3214285714285715e-05,
      "loss": 0.0117,
      "step": 17
    },
    {
      "epoch": 4.352,
      "eval_accuracy": 0.788,
      "eval_loss": 0.9917842149734497,
      "eval_runtime": 32.2425,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.992,
      "step": 17
    },
    {
      "epoch": 4.608,
      "grad_norm": 2.2511026859283447,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.0087,
      "step": 18
    },
    {
      "epoch": 4.608,
      "eval_accuracy": 0.8,
      "eval_loss": 1.0100549459457397,
      "eval_runtime": 32.2466,
      "eval_samples_per_second": 7.753,
      "eval_steps_per_second": 0.992,
      "step": 18
    },
    {
      "epoch": 4.864,
      "grad_norm": 1.4641634225845337,
      "learning_rate": 1.9642857142857145e-05,
      "loss": 0.0055,
      "step": 19
    },
    {
      "epoch": 4.864,
      "eval_accuracy": 0.792,
      "eval_loss": 1.0336966514587402,
      "eval_runtime": 32.2417,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.993,
      "step": 19
    },
    {
      "epoch": 5.12,
      "grad_norm": 5.901674270629883,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.0245,
      "step": 20
    },
    {
      "epoch": 5.12,
      "eval_accuracy": 0.816,
      "eval_loss": 0.9779098033905029,
      "eval_runtime": 32.246,
      "eval_samples_per_second": 7.753,
      "eval_steps_per_second": 0.992,
      "step": 20
    },
    {
      "epoch": 5.376,
      "grad_norm": 1.2532176971435547,
      "learning_rate": 1.6071428571428572e-05,
      "loss": 0.0027,
      "step": 21
    },
    {
      "epoch": 5.376,
      "eval_accuracy": 0.824,
      "eval_loss": 0.8769565224647522,
      "eval_runtime": 32.285,
      "eval_samples_per_second": 7.744,
      "eval_steps_per_second": 0.991,
      "step": 21
    },
    {
      "epoch": 5.632,
      "grad_norm": 2.1642556190490723,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0074,
      "step": 22
    },
    {
      "epoch": 5.632,
      "eval_accuracy": 0.828,
      "eval_loss": 0.8246294260025024,
      "eval_runtime": 32.2699,
      "eval_samples_per_second": 7.747,
      "eval_steps_per_second": 0.992,
      "step": 22
    },
    {
      "epoch": 5.888,
      "grad_norm": 0.6380490064620972,
      "learning_rate": 1.25e-05,
      "loss": 0.002,
      "step": 23
    },
    {
      "epoch": 5.888,
      "eval_accuracy": 0.824,
      "eval_loss": 0.8333019614219666,
      "eval_runtime": 32.2596,
      "eval_samples_per_second": 7.75,
      "eval_steps_per_second": 0.992,
      "step": 23
    },
    {
      "epoch": 6.144,
      "grad_norm": 0.19555674493312836,
      "learning_rate": 1.0714285714285714e-05,
      "loss": 0.0008,
      "step": 24
    },
    {
      "epoch": 6.144,
      "eval_accuracy": 0.832,
      "eval_loss": 0.8673015236854553,
      "eval_runtime": 32.2609,
      "eval_samples_per_second": 7.749,
      "eval_steps_per_second": 0.992,
      "step": 24
    },
    {
      "epoch": 6.4,
      "grad_norm": 0.6775424480438232,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.0014,
      "step": 25
    },
    {
      "epoch": 6.4,
      "eval_accuracy": 0.824,
      "eval_loss": 0.8875783681869507,
      "eval_runtime": 32.2411,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.993,
      "step": 25
    },
    {
      "epoch": 6.656,
      "grad_norm": 2.227677583694458,
      "learning_rate": 7.142857142857143e-06,
      "loss": 0.0039,
      "step": 26
    },
    {
      "epoch": 6.656,
      "eval_accuracy": 0.844,
      "eval_loss": 0.8781009912490845,
      "eval_runtime": 32.2526,
      "eval_samples_per_second": 7.751,
      "eval_steps_per_second": 0.992,
      "step": 26
    },
    {
      "epoch": 6.912,
      "grad_norm": 1.2826372385025024,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.0022,
      "step": 27
    },
    {
      "epoch": 6.912,
      "eval_accuracy": 0.82,
      "eval_loss": 0.8575786352157593,
      "eval_runtime": 32.2408,
      "eval_samples_per_second": 7.754,
      "eval_steps_per_second": 0.993,
      "step": 27
    },
    {
      "epoch": 7.168,
      "grad_norm": 0.16007262468338013,
      "learning_rate": 3.5714285714285714e-06,
      "loss": 0.0004,
      "step": 28
    },
    {
      "epoch": 7.168,
      "eval_accuracy": 0.832,
      "eval_loss": 0.844285249710083,
      "eval_runtime": 32.2539,
      "eval_samples_per_second": 7.751,
      "eval_steps_per_second": 0.992,
      "step": 28
    },
    {
      "epoch": 7.424,
      "grad_norm": 0.10789692401885986,
      "learning_rate": 1.7857142857142857e-06,
      "loss": 0.0004,
      "step": 29
    },
    {
      "epoch": 7.424,
      "eval_accuracy": 0.828,
      "eval_loss": 0.8423502445220947,
      "eval_runtime": 32.2676,
      "eval_samples_per_second": 7.748,
      "eval_steps_per_second": 0.992,
      "step": 29
    },
    {
      "epoch": 7.68,
      "grad_norm": 0.06230718642473221,
      "learning_rate": 0.0,
      "loss": 0.0002,
      "step": 30
    },
    {
      "epoch": 7.68,
      "eval_accuracy": 0.82,
      "eval_loss": 0.8364017605781555,
      "eval_runtime": 32.2513,
      "eval_samples_per_second": 7.752,
      "eval_steps_per_second": 0.992,
      "step": 30
    },
    {
      "epoch": 7.68,
      "step": 30,
      "total_flos": 5.4187482488832e+16,
      "train_loss": 0.7776199621614068,
      "train_runtime": 4368.1472,
      "train_samples_per_second": 2.289,
      "train_steps_per_second": 0.007
    }
  ],
  "logging_steps": 1,
  "max_steps": 30,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 5.4187482488832e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}