adapters-llama2-bnb8-QLORA-super_glue-axg/trainer_state-llama2-bnb8-QLORA-super_glue-axg-sequence_classification.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 1,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
{ | |
"epoch": 0.1111111111111111, | |
      "grad_norm": null,
"learning_rate": 0.0, | |
"loss": 0.8046, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.1111111111111111, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4964, | |
"eval_samples_per_second": 48.117, | |
"eval_steps_per_second": 3.341, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
      "grad_norm": null,
"learning_rate": 0.0, | |
"loss": 0.834, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4798, | |
"eval_samples_per_second": 48.657, | |
"eval_steps_per_second": 3.379, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
      "grad_norm": null,
"learning_rate": 0.0, | |
"loss": 0.864, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4711, | |
"eval_samples_per_second": 48.945, | |
"eval_steps_per_second": 3.399, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
      "grad_norm": null,
"learning_rate": 0.0, | |
"loss": 0.6596, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4781, | |
"eval_samples_per_second": 48.711, | |
"eval_steps_per_second": 3.383, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"grad_norm": 15.292896270751953, | |
"learning_rate": 2.5e-05, | |
"loss": 0.9107, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4681, | |
"eval_samples_per_second": 49.042, | |
"eval_steps_per_second": 3.406, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
      "grad_norm": null,
"learning_rate": 2.5e-05, | |
"loss": 0.8285, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9468807578086853, | |
"eval_runtime": 1.4656, | |
"eval_samples_per_second": 49.125, | |
"eval_steps_per_second": 3.411, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"grad_norm": 38.5906867980957, | |
"learning_rate": 5e-05, | |
"loss": 0.8436, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"eval_accuracy": 0.5416666666666666, | |
"eval_loss": 0.8725653886795044, | |
"eval_runtime": 1.466, | |
"eval_samples_per_second": 49.113, | |
"eval_steps_per_second": 3.411, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"grad_norm": 11.768951416015625, | |
"learning_rate": 4.943181818181818e-05, | |
"loss": 0.8397, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"eval_accuracy": 0.5138888888888888, | |
"eval_loss": 0.89170241355896, | |
"eval_runtime": 1.4681, | |
"eval_samples_per_second": 49.044, | |
"eval_steps_per_second": 3.406, | |
"step": 8 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 38.63240432739258, | |
"learning_rate": 4.886363636363637e-05, | |
"loss": 0.7961, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.5833333333333334, | |
"eval_loss": 0.9097018837928772, | |
"eval_runtime": 1.4715, | |
"eval_samples_per_second": 48.93, | |
"eval_steps_per_second": 3.398, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"grad_norm": 25.327926635742188, | |
"learning_rate": 4.829545454545455e-05, | |
"loss": 0.6728, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"eval_accuracy": 0.5416666666666666, | |
"eval_loss": 0.8724534511566162, | |
"eval_runtime": 1.4733, | |
"eval_samples_per_second": 48.871, | |
"eval_steps_per_second": 3.394, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"grad_norm": 41.665626525878906, | |
"learning_rate": 4.772727272727273e-05, | |
"loss": 0.8504, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.8109580278396606, | |
"eval_runtime": 1.4661, | |
"eval_samples_per_second": 49.109, | |
"eval_steps_per_second": 3.41, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"grad_norm": 28.891475677490234, | |
"learning_rate": 4.715909090909091e-05, | |
"loss": 0.6348, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7847561240196228, | |
"eval_runtime": 1.4756, | |
"eval_samples_per_second": 48.794, | |
"eval_steps_per_second": 3.388, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"grad_norm": 15.667322158813477, | |
"learning_rate": 4.659090909090909e-05, | |
"loss": 0.7075, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.8036965131759644, | |
"eval_runtime": 1.4698, | |
"eval_samples_per_second": 48.987, | |
"eval_steps_per_second": 3.402, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"grad_norm": 14.308302879333496, | |
"learning_rate": 4.602272727272727e-05, | |
"loss": 0.625, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.844111979007721, | |
"eval_runtime": 1.4649, | |
"eval_samples_per_second": 49.152, | |
"eval_steps_per_second": 3.413, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"grad_norm": 33.23239517211914, | |
"learning_rate": 4.545454545454546e-05, | |
"loss": 0.7687, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.8427666425704956, | |
"eval_runtime": 1.4735, | |
"eval_samples_per_second": 48.862, | |
"eval_steps_per_second": 3.393, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"grad_norm": 23.251686096191406, | |
"learning_rate": 4.488636363636364e-05, | |
"loss": 0.5379, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 0.7749015092849731, | |
"eval_runtime": 1.4807, | |
"eval_samples_per_second": 48.625, | |
"eval_steps_per_second": 3.377, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"grad_norm": 11.318711280822754, | |
"learning_rate": 4.431818181818182e-05, | |
"loss": 0.5477, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"eval_accuracy": 0.6805555555555556, | |
"eval_loss": 0.68642258644104, | |
"eval_runtime": 1.4816, | |
"eval_samples_per_second": 48.595, | |
"eval_steps_per_second": 3.375, | |
"step": 17 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 4.618575096130371, | |
"learning_rate": 4.375e-05, | |
"loss": 0.54, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.7083333333333334, | |
"eval_loss": 0.6404860019683838, | |
"eval_runtime": 1.4695, | |
"eval_samples_per_second": 48.996, | |
"eval_steps_per_second": 3.402, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"grad_norm": 4.5261077880859375, | |
"learning_rate": 4.318181818181819e-05, | |
"loss": 0.4569, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"eval_accuracy": 0.6527777777777778, | |
"eval_loss": 0.641287088394165, | |
"eval_runtime": 1.4917, | |
"eval_samples_per_second": 48.268, | |
"eval_steps_per_second": 3.352, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"grad_norm": 16.080116271972656, | |
"learning_rate": 4.261363636363637e-05, | |
"loss": 0.4211, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"eval_accuracy": 0.6805555555555556, | |
"eval_loss": 0.5904948115348816, | |
"eval_runtime": 1.6079, | |
"eval_samples_per_second": 44.778, | |
"eval_steps_per_second": 3.11, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"grad_norm": 10.328635215759277, | |
"learning_rate": 4.204545454545455e-05, | |
"loss": 0.4088, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"eval_accuracy": 0.6666666666666666, | |
"eval_loss": 0.5809131264686584, | |
"eval_runtime": 1.4918, | |
"eval_samples_per_second": 48.264, | |
"eval_steps_per_second": 3.352, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"grad_norm": 3.862455368041992, | |
"learning_rate": 4.1477272727272734e-05, | |
"loss": 0.3588, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"eval_accuracy": 0.6388888888888888, | |
"eval_loss": 0.5841391086578369, | |
"eval_runtime": 1.4927, | |
"eval_samples_per_second": 48.235, | |
"eval_steps_per_second": 3.35, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"grad_norm": 21.388599395751953, | |
"learning_rate": 4.0909090909090915e-05, | |
"loss": 0.3224, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"eval_accuracy": 0.6805555555555556, | |
"eval_loss": 0.5694243311882019, | |
"eval_runtime": 1.4741, | |
"eval_samples_per_second": 48.845, | |
"eval_steps_per_second": 3.392, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"grad_norm": 18.525726318359375, | |
"learning_rate": 4.034090909090909e-05, | |
"loss": 0.3886, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.5775553584098816, | |
"eval_runtime": 1.4808, | |
"eval_samples_per_second": 48.621, | |
"eval_steps_per_second": 3.376, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"grad_norm": 8.507179260253906, | |
"learning_rate": 3.9772727272727275e-05, | |
"loss": 0.4483, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"eval_accuracy": 0.7083333333333334, | |
"eval_loss": 0.5739086866378784, | |
"eval_runtime": 1.5173, | |
"eval_samples_per_second": 47.452, | |
"eval_steps_per_second": 3.295, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"grad_norm": 10.042552947998047, | |
"learning_rate": 3.9204545454545456e-05, | |
"loss": 0.2571, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.5965893864631653, | |
"eval_runtime": 1.513, | |
"eval_samples_per_second": 47.589, | |
"eval_steps_per_second": 3.305, | |
"step": 26 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 4.674591541290283, | |
"learning_rate": 3.8636363636363636e-05, | |
"loss": 0.2023, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.7222222222222222, | |
"eval_loss": 0.6090915203094482, | |
"eval_runtime": 1.4987, | |
"eval_samples_per_second": 48.041, | |
"eval_steps_per_second": 3.336, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"grad_norm": 4.555273532867432, | |
"learning_rate": 3.8068181818181816e-05, | |
"loss": 0.1835, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"eval_accuracy": 0.7222222222222222, | |
"eval_loss": 0.6348134279251099, | |
"eval_runtime": 1.5215, | |
"eval_samples_per_second": 47.323, | |
"eval_steps_per_second": 3.286, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"grad_norm": 21.892303466796875, | |
"learning_rate": 3.7500000000000003e-05, | |
"loss": 0.2978, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.6449768543243408, | |
"eval_runtime": 1.4942, | |
"eval_samples_per_second": 48.187, | |
"eval_steps_per_second": 3.346, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"grad_norm": 3.3101179599761963, | |
"learning_rate": 3.6931818181818184e-05, | |
"loss": 0.1375, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.6778775453567505, | |
"eval_runtime": 1.581, | |
"eval_samples_per_second": 45.542, | |
"eval_steps_per_second": 3.163, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"grad_norm": 5.253066062927246, | |
"learning_rate": 3.6363636363636364e-05, | |
"loss": 0.0773, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.6996697783470154, | |
"eval_runtime": 1.4905, | |
"eval_samples_per_second": 48.307, | |
"eval_steps_per_second": 3.355, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"grad_norm": 6.1245317459106445, | |
"learning_rate": 3.579545454545455e-05, | |
"loss": 0.1398, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.7371202111244202, | |
"eval_runtime": 1.5564, | |
"eval_samples_per_second": 46.261, | |
"eval_steps_per_second": 3.213, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"grad_norm": 8.124380111694336, | |
"learning_rate": 3.522727272727273e-05, | |
"loss": 0.1665, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.7838079929351807, | |
"eval_runtime": 1.4902, | |
"eval_samples_per_second": 48.315, | |
"eval_steps_per_second": 3.355, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"grad_norm": 5.12769079208374, | |
"learning_rate": 3.465909090909091e-05, | |
"loss": 0.0566, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"eval_accuracy": 0.7222222222222222, | |
"eval_loss": 0.823827862739563, | |
"eval_runtime": 1.4808, | |
"eval_samples_per_second": 48.623, | |
"eval_steps_per_second": 3.377, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"grad_norm": 4.054475784301758, | |
"learning_rate": 3.409090909090909e-05, | |
"loss": 0.0416, | |
"step": 35 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"eval_accuracy": 0.7083333333333334, | |
"eval_loss": 0.8069868087768555, | |
"eval_runtime": 1.4641, | |
"eval_samples_per_second": 49.176, | |
"eval_steps_per_second": 3.415, | |
"step": 35 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 2.9879519939422607, | |
"learning_rate": 3.352272727272727e-05, | |
"loss": 0.082, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 0.9339464902877808, | |
"eval_runtime": 1.475, | |
"eval_samples_per_second": 48.815, | |
"eval_steps_per_second": 3.39, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"grad_norm": 5.574306488037109, | |
"learning_rate": 3.295454545454545e-05, | |
"loss": 0.0587, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.898453414440155, | |
"eval_runtime": 1.4751, | |
"eval_samples_per_second": 48.809, | |
"eval_steps_per_second": 3.39, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"grad_norm": 4.23170280456543, | |
"learning_rate": 3.238636363636364e-05, | |
"loss": 0.1462, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.8970211744308472, | |
"eval_runtime": 1.496, | |
"eval_samples_per_second": 48.127, | |
"eval_steps_per_second": 3.342, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"grad_norm": 4.057853698730469, | |
"learning_rate": 3.181818181818182e-05, | |
"loss": 0.0474, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"eval_accuracy": 0.7083333333333334, | |
"eval_loss": 0.9456102848052979, | |
"eval_runtime": 1.4756, | |
"eval_samples_per_second": 48.794, | |
"eval_steps_per_second": 3.388, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"grad_norm": 1.9089412689208984, | |
"learning_rate": 3.125e-05, | |
"loss": 0.0247, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 1.0376629829406738, | |
"eval_runtime": 1.4767, | |
"eval_samples_per_second": 48.757, | |
"eval_steps_per_second": 3.386, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"grad_norm": 8.005538940429688, | |
"learning_rate": 3.068181818181818e-05, | |
"loss": 0.0492, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 1.09256112575531, | |
"eval_runtime": 1.4742, | |
"eval_samples_per_second": 48.839, | |
"eval_steps_per_second": 3.392, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"grad_norm": 7.321335792541504, | |
"learning_rate": 3.0113636363636365e-05, | |
"loss": 0.0541, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"eval_accuracy": 0.75, | |
"eval_loss": 1.060943365097046, | |
"eval_runtime": 1.4853, | |
"eval_samples_per_second": 48.474, | |
"eval_steps_per_second": 3.366, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"grad_norm": 3.932873487472534, | |
"learning_rate": 2.954545454545455e-05, | |
"loss": 0.0222, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 1.047785997390747, | |
"eval_runtime": 1.49, | |
"eval_samples_per_second": 48.321, | |
"eval_steps_per_second": 3.356, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"grad_norm": 0.5964356660842896, | |
"learning_rate": 2.8977272727272732e-05, | |
"loss": 0.007, | |
"step": 44 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"eval_accuracy": 0.7222222222222222, | |
"eval_loss": 0.9296979904174805, | |
"eval_runtime": 1.476, | |
"eval_samples_per_second": 48.781, | |
"eval_steps_per_second": 3.388, | |
"step": 44 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 5.5752363204956055, | |
"learning_rate": 2.8409090909090912e-05, | |
"loss": 0.0461, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8986176252365112, | |
"eval_runtime": 1.5427, | |
"eval_samples_per_second": 46.672, | |
"eval_steps_per_second": 3.241, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"grad_norm": 0.39924749732017517, | |
"learning_rate": 2.784090909090909e-05, | |
"loss": 0.004, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 0.8829209804534912, | |
"eval_runtime": 1.5029, | |
"eval_samples_per_second": 47.908, | |
"eval_steps_per_second": 3.327, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"grad_norm": 1.3180583715438843, | |
"learning_rate": 2.7272727272727273e-05, | |
"loss": 0.0085, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 0.9565764665603638, | |
"eval_runtime": 1.4989, | |
"eval_samples_per_second": 48.036, | |
"eval_steps_per_second": 3.336, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"grad_norm": 0.30337390303611755, | |
"learning_rate": 2.6704545454545453e-05, | |
"loss": 0.0025, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 0.9712399244308472, | |
"eval_runtime": 1.4897, | |
"eval_samples_per_second": 48.33, | |
"eval_steps_per_second": 3.356, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"grad_norm": 1.2145888805389404, | |
"learning_rate": 2.6136363636363637e-05, | |
"loss": 0.0115, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"eval_accuracy": 0.75, | |
"eval_loss": 1.0557817220687866, | |
"eval_runtime": 1.5821, | |
"eval_samples_per_second": 45.51, | |
"eval_steps_per_second": 3.16, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"grad_norm": 0.5874105095863342, | |
"learning_rate": 2.5568181818181817e-05, | |
"loss": 0.0063, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.0096588134765625, | |
"eval_runtime": 1.5483, | |
"eval_samples_per_second": 46.502, | |
"eval_steps_per_second": 3.229, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"grad_norm": 1.8710988759994507, | |
"learning_rate": 2.5e-05, | |
"loss": 0.0082, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 1.1002558469772339, | |
"eval_runtime": 1.5485, | |
"eval_samples_per_second": 46.496, | |
"eval_steps_per_second": 3.229, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"grad_norm": 0.08881735056638718, | |
"learning_rate": 2.4431818181818185e-05, | |
"loss": 0.0005, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 1.111068844795227, | |
"eval_runtime": 1.4791, | |
"eval_samples_per_second": 48.677, | |
"eval_steps_per_second": 3.38, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"grad_norm": 0.055561624467372894, | |
"learning_rate": 2.3863636363636365e-05, | |
"loss": 0.0004, | |
"step": 53 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"eval_accuracy": 0.75, | |
"eval_loss": 1.1046401262283325, | |
"eval_runtime": 1.5013, | |
"eval_samples_per_second": 47.957, | |
"eval_steps_per_second": 3.33, | |
"step": 53 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 0.051147185266017914, | |
"learning_rate": 2.3295454545454546e-05, | |
"loss": 0.0004, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.1201319694519043, | |
"eval_runtime": 1.4825, | |
"eval_samples_per_second": 48.566, | |
"eval_steps_per_second": 3.373, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"grad_norm": 0.03255375474691391, | |
"learning_rate": 2.272727272727273e-05, | |
"loss": 0.0003, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.1359165906906128, | |
"eval_runtime": 1.4977, | |
"eval_samples_per_second": 48.075, | |
"eval_steps_per_second": 3.339, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"grad_norm": 0.0207963977009058, | |
"learning_rate": 2.215909090909091e-05, | |
"loss": 0.0001, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.2502617835998535, | |
"eval_runtime": 1.4856, | |
"eval_samples_per_second": 48.464, | |
"eval_steps_per_second": 3.366, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"grad_norm": 0.14410509169101715, | |
"learning_rate": 2.1590909090909093e-05, | |
"loss": 0.0006, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.1601958274841309, | |
"eval_runtime": 1.4859, | |
"eval_samples_per_second": 48.455, | |
"eval_steps_per_second": 3.365, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"grad_norm": 0.054281365126371384, | |
"learning_rate": 2.1022727272727274e-05, | |
"loss": 0.0003, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.2148370742797852, | |
"eval_runtime": 1.4936, | |
"eval_samples_per_second": 48.206, | |
"eval_steps_per_second": 3.348, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"grad_norm": 0.02057286538183689, | |
"learning_rate": 2.0454545454545457e-05, | |
"loss": 0.0002, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3522080183029175, | |
"eval_runtime": 1.4763, | |
"eval_samples_per_second": 48.769, | |
"eval_steps_per_second": 3.387, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"grad_norm": 0.0014641749439761043, | |
"learning_rate": 1.9886363636363638e-05, | |
"loss": 0.0, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.2812823057174683, | |
"eval_runtime": 1.4521, | |
"eval_samples_per_second": 49.584, | |
"eval_steps_per_second": 3.443, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"grad_norm": 0.018109608441591263, | |
"learning_rate": 1.9318181818181818e-05, | |
"loss": 0.0001, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.253478765487671, | |
"eval_runtime": 1.4921, | |
"eval_samples_per_second": 48.254, | |
"eval_steps_per_second": 3.351, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"grad_norm": 0.05381924659013748, | |
"learning_rate": 1.8750000000000002e-05, | |
"loss": 0.0002, | |
"step": 62 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3584959506988525, | |
"eval_runtime": 1.4846, | |
"eval_samples_per_second": 48.499, | |
"eval_steps_per_second": 3.368, | |
"step": 62 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 0.002476187190040946, | |
"learning_rate": 1.8181818181818182e-05, | |
"loss": 0.0, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.364111304283142, | |
"eval_runtime": 1.4702, | |
"eval_samples_per_second": 48.973, | |
"eval_steps_per_second": 3.401, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"grad_norm": 0.014680763706564903, | |
"learning_rate": 1.7613636363636366e-05, | |
"loss": 0.0001, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3435033559799194, | |
"eval_runtime": 1.4658, | |
"eval_samples_per_second": 49.118, | |
"eval_steps_per_second": 3.411, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"grad_norm": 0.010304883122444153, | |
"learning_rate": 1.7045454545454546e-05, | |
"loss": 0.0, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3150172233581543, | |
"eval_runtime": 1.4673, | |
"eval_samples_per_second": 49.071, | |
"eval_steps_per_second": 3.408, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"grad_norm": 0.014999941922724247, | |
"learning_rate": 1.6477272727272726e-05, | |
"loss": 0.0001, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.396562099456787, | |
"eval_runtime": 1.4754, | |
"eval_samples_per_second": 48.799, | |
"eval_steps_per_second": 3.389, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"grad_norm": 0.021909315139055252, | |
"learning_rate": 1.590909090909091e-05, | |
"loss": 0.0001, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3331305980682373, | |
"eval_runtime": 1.4793, | |
"eval_samples_per_second": 48.672, | |
"eval_steps_per_second": 3.38, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"grad_norm": 0.0006957874284125865, | |
"learning_rate": 1.534090909090909e-05, | |
"loss": 0.0, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3853394985198975, | |
"eval_runtime": 1.4743, | |
"eval_samples_per_second": 48.838, | |
"eval_steps_per_second": 3.392, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"grad_norm": 0.016373885795474052, | |
"learning_rate": 1.4772727272727274e-05, | |
"loss": 0.0001, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.342584490776062, | |
"eval_runtime": 1.5274, | |
"eval_samples_per_second": 47.14, | |
"eval_steps_per_second": 3.274, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"grad_norm": 0.09788516908884048, | |
"learning_rate": 1.4204545454545456e-05, | |
"loss": 0.0002, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3891209363937378, | |
"eval_runtime": 1.4849, | |
"eval_samples_per_second": 48.488, | |
"eval_steps_per_second": 3.367, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"grad_norm": 0.004993033595383167, | |
"learning_rate": 1.3636363636363637e-05, | |
"loss": 0.0, | |
"step": 71 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.461037039756775, | |
"eval_runtime": 1.489, | |
"eval_samples_per_second": 48.354, | |
"eval_steps_per_second": 3.358, | |
"step": 71 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 0.00147097313310951, | |
"learning_rate": 1.3068181818181819e-05, | |
"loss": 0.0, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.8194444444444444, | |
"eval_loss": 1.3761588335037231, | |
"eval_runtime": 1.4965, | |
"eval_samples_per_second": 48.111, | |
"eval_steps_per_second": 3.341, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"grad_norm": 0.011906428262591362, | |
"learning_rate": 1.25e-05, | |
"loss": 0.0001, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.4502863883972168, | |
"eval_runtime": 1.5145, | |
"eval_samples_per_second": 47.539, | |
"eval_steps_per_second": 3.301, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"grad_norm": 0.0011542687425389886, | |
"learning_rate": 1.1931818181818183e-05, | |
"loss": 0.0, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4473508596420288, | |
"eval_runtime": 1.4978, | |
"eval_samples_per_second": 48.069, | |
"eval_steps_per_second": 3.338, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"grad_norm": 0.0013727850746363401, | |
"learning_rate": 1.1363636363636365e-05, | |
"loss": 0.0, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.4718106985092163, | |
"eval_runtime": 1.4975, | |
"eval_samples_per_second": 48.08, | |
"eval_steps_per_second": 3.339, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"grad_norm": 0.005262918770313263, | |
"learning_rate": 1.0795454545454547e-05, | |
"loss": 0.0, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3964359760284424, | |
"eval_runtime": 1.4988, | |
"eval_samples_per_second": 48.037, | |
"eval_steps_per_second": 3.336, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"grad_norm": 0.011616203002631664, | |
"learning_rate": 1.0227272727272729e-05, | |
"loss": 0.0, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.365304708480835, | |
"eval_runtime": 1.5036, | |
"eval_samples_per_second": 47.884, | |
"eval_steps_per_second": 3.325, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"grad_norm": 0.0016829015221446753, | |
"learning_rate": 9.659090909090909e-06, | |
"loss": 0.0, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3907890319824219, | |
"eval_runtime": 1.5386, | |
"eval_samples_per_second": 46.797, | |
"eval_steps_per_second": 3.25, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"grad_norm": 0.007697275839745998, | |
"learning_rate": 9.090909090909091e-06, | |
"loss": 0.0, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.2962956428527832, | |
"eval_runtime": 1.5653, | |
"eval_samples_per_second": 45.997, | |
"eval_steps_per_second": 3.194, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"grad_norm": 0.0010299325222149491, | |
"learning_rate": 8.522727272727273e-06, | |
"loss": 0.0, | |
"step": 80 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.383674144744873, | |
"eval_runtime": 1.5762, | |
"eval_samples_per_second": 45.681, | |
"eval_steps_per_second": 3.172, | |
"step": 80 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 0.005102647002786398, | |
"learning_rate": 7.954545454545455e-06, | |
"loss": 0.0, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3699817657470703, | |
"eval_runtime": 1.4837, | |
"eval_samples_per_second": 48.527, | |
"eval_steps_per_second": 3.37, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"grad_norm": 0.0035217173863202333, | |
"learning_rate": 7.386363636363637e-06, | |
"loss": 0.0, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.4067063331604004, | |
"eval_runtime": 1.5387, | |
"eval_samples_per_second": 46.792, | |
"eval_steps_per_second": 3.249, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"grad_norm": 0.0014651613309979439, | |
"learning_rate": 6.818181818181818e-06, | |
"loss": 0.0, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3422017097473145, | |
"eval_runtime": 1.4993, | |
"eval_samples_per_second": 48.023, | |
"eval_steps_per_second": 3.335, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"grad_norm": 0.002838965505361557, | |
"learning_rate": 6.25e-06, | |
"loss": 0.0, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4219313859939575, | |
"eval_runtime": 1.5163, | |
"eval_samples_per_second": 47.485, | |
"eval_steps_per_second": 3.298, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"grad_norm": 0.010845585726201534, | |
"learning_rate": 5.681818181818182e-06, | |
"loss": 0.0, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.5043329000473022, | |
"eval_runtime": 1.5102, | |
"eval_samples_per_second": 47.675, | |
"eval_steps_per_second": 3.311, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"grad_norm": 0.0013982527889311314, | |
"learning_rate": 5.113636363636364e-06, | |
"loss": 0.0, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4191205501556396, | |
"eval_runtime": 1.537, | |
"eval_samples_per_second": 46.846, | |
"eval_steps_per_second": 3.253, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"grad_norm": 0.0016395855927839875, | |
"learning_rate": 4.5454545454545455e-06, | |
"loss": 0.0, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.4259694814682007, | |
"eval_runtime": 1.4834, | |
"eval_samples_per_second": 48.536, | |
"eval_steps_per_second": 3.371, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"grad_norm": 0.0018825188744813204, | |
"learning_rate": 3.9772727272727275e-06, | |
"loss": 0.0, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.4664974212646484, | |
"eval_runtime": 1.4871, | |
"eval_samples_per_second": 48.416, | |
"eval_steps_per_second": 3.362, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"grad_norm": 0.00835789367556572, | |
"learning_rate": 3.409090909090909e-06, | |
"loss": 0.0, | |
"step": 89 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.4490399360656738, | |
"eval_runtime": 1.5036, | |
"eval_samples_per_second": 47.886, | |
"eval_steps_per_second": 3.325, | |
"step": 89 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 0.001027365098707378, | |
"learning_rate": 2.840909090909091e-06, | |
"loss": 0.0, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3632524013519287, | |
"eval_runtime": 1.484, | |
"eval_samples_per_second": 48.518, | |
"eval_steps_per_second": 3.369, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 90, | |
"total_flos": 5016736558481408.0, | |
"train_loss": 0.20236469821797476, | |
"train_runtime": 343.5318, | |
"train_samples_per_second": 8.267, | |
"train_steps_per_second": 0.262 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 90, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 5016736558481408.0, | |
"train_batch_size": 4, | |
"trial_name": null, | |
"trial_params": null | |
} | |