adapters-llama2-bf16-QLORA-super_glue-rte
/
trainer_state-llama2-bf16-QLORA-super_glue-rte-sequence_classification.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 10.0, | |
"eval_steps": 1, | |
"global_step": 160, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.0625, | |
"grad_norm": 9.625, | |
"learning_rate": 2.5e-05, | |
"loss": 1.0231, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.0625, | |
"eval_accuracy": 0.496, | |
"eval_loss": 1.0858867168426514, | |
"eval_runtime": 4.9172, | |
"eval_samples_per_second": 50.841, | |
"eval_steps_per_second": 1.627, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.125, | |
"grad_norm": 10.5625, | |
"learning_rate": 5e-05, | |
"loss": 0.9224, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.125, | |
"eval_accuracy": 0.488, | |
"eval_loss": 1.0843095779418945, | |
"eval_runtime": 4.9094, | |
"eval_samples_per_second": 50.923, | |
"eval_steps_per_second": 1.63, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.1875, | |
"grad_norm": 11.875, | |
"learning_rate": 4.968354430379747e-05, | |
"loss": 1.2584, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.1875, | |
"eval_accuracy": 0.488, | |
"eval_loss": 1.0734808444976807, | |
"eval_runtime": 4.9054, | |
"eval_samples_per_second": 50.964, | |
"eval_steps_per_second": 1.631, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 13.3125, | |
"learning_rate": 4.936708860759494e-05, | |
"loss": 1.0442, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.25, | |
"eval_accuracy": 0.492, | |
"eval_loss": 1.0604006052017212, | |
"eval_runtime": 4.9069, | |
"eval_samples_per_second": 50.948, | |
"eval_steps_per_second": 1.63, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.3125, | |
"grad_norm": 22.625, | |
"learning_rate": 4.905063291139241e-05, | |
"loss": 1.3363, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.3125, | |
"eval_accuracy": 0.496, | |
"eval_loss": 1.0435388088226318, | |
"eval_runtime": 4.9047, | |
"eval_samples_per_second": 50.972, | |
"eval_steps_per_second": 1.631, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.375, | |
"grad_norm": 10.625, | |
"learning_rate": 4.8734177215189874e-05, | |
"loss": 1.0358, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.375, | |
"eval_accuracy": 0.504, | |
"eval_loss": 1.0265413522720337, | |
"eval_runtime": 4.9042, | |
"eval_samples_per_second": 50.977, | |
"eval_steps_per_second": 1.631, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.4375, | |
"grad_norm": 7.5625, | |
"learning_rate": 4.8417721518987346e-05, | |
"loss": 0.9276, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.4375, | |
"eval_accuracy": 0.508, | |
"eval_loss": 1.0061290264129639, | |
"eval_runtime": 4.9077, | |
"eval_samples_per_second": 50.941, | |
"eval_steps_per_second": 1.63, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 16.75, | |
"learning_rate": 4.810126582278481e-05, | |
"loss": 0.9751, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.9898163080215454, | |
"eval_runtime": 4.9054, | |
"eval_samples_per_second": 50.964, | |
"eval_steps_per_second": 1.631, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5625, | |
"grad_norm": 10.875, | |
"learning_rate": 4.778481012658228e-05, | |
"loss": 1.1136, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.5625, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.9671441912651062, | |
"eval_runtime": 4.8588, | |
"eval_samples_per_second": 51.453, | |
"eval_steps_per_second": 1.646, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.625, | |
"grad_norm": 11.0, | |
"learning_rate": 4.7468354430379746e-05, | |
"loss": 0.8243, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.625, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.9475510120391846, | |
"eval_runtime": 4.9061, | |
"eval_samples_per_second": 50.957, | |
"eval_steps_per_second": 1.631, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.6875, | |
"grad_norm": 10.875, | |
"learning_rate": 4.715189873417722e-05, | |
"loss": 0.9469, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.6875, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.9229685068130493, | |
"eval_runtime": 4.9037, | |
"eval_samples_per_second": 50.982, | |
"eval_steps_per_second": 1.631, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 10.125, | |
"learning_rate": 4.683544303797468e-05, | |
"loss": 0.9348, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.75, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.9007159471511841, | |
"eval_runtime": 4.9072, | |
"eval_samples_per_second": 50.945, | |
"eval_steps_per_second": 1.63, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.8125, | |
"grad_norm": 9.6875, | |
"learning_rate": 4.6518987341772154e-05, | |
"loss": 0.939, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.8125, | |
"eval_accuracy": 0.548, | |
"eval_loss": 0.8757832050323486, | |
"eval_runtime": 4.9116, | |
"eval_samples_per_second": 50.9, | |
"eval_steps_per_second": 1.629, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.875, | |
"grad_norm": 16.375, | |
"learning_rate": 4.6202531645569625e-05, | |
"loss": 0.8007, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.875, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.8450800776481628, | |
"eval_runtime": 4.9085, | |
"eval_samples_per_second": 50.932, | |
"eval_steps_per_second": 1.63, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.9375, | |
"grad_norm": 13.625, | |
"learning_rate": 4.588607594936709e-05, | |
"loss": 0.8642, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.9375, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.8132224082946777, | |
"eval_runtime": 4.9071, | |
"eval_samples_per_second": 50.946, | |
"eval_steps_per_second": 1.63, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 16.5, | |
"learning_rate": 4.556962025316456e-05, | |
"loss": 0.6672, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.7814155220985413, | |
"eval_runtime": 4.8576, | |
"eval_samples_per_second": 51.465, | |
"eval_steps_per_second": 1.647, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0625, | |
"grad_norm": 9.1875, | |
"learning_rate": 4.525316455696203e-05, | |
"loss": 0.7072, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.0625, | |
"eval_accuracy": 0.612, | |
"eval_loss": 0.7591574788093567, | |
"eval_runtime": 4.9059, | |
"eval_samples_per_second": 50.959, | |
"eval_steps_per_second": 1.631, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.125, | |
"grad_norm": 8.3125, | |
"learning_rate": 4.49367088607595e-05, | |
"loss": 0.7815, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.125, | |
"eval_accuracy": 0.644, | |
"eval_loss": 0.7339711785316467, | |
"eval_runtime": 4.9112, | |
"eval_samples_per_second": 50.904, | |
"eval_steps_per_second": 1.629, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.1875, | |
"grad_norm": 7.0, | |
"learning_rate": 4.462025316455696e-05, | |
"loss": 0.6226, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.1875, | |
"eval_accuracy": 0.66, | |
"eval_loss": 0.7131918668746948, | |
"eval_runtime": 4.9082, | |
"eval_samples_per_second": 50.935, | |
"eval_steps_per_second": 1.63, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 11.25, | |
"learning_rate": 4.430379746835443e-05, | |
"loss": 0.6456, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.25, | |
"eval_accuracy": 0.664, | |
"eval_loss": 0.6907175183296204, | |
"eval_runtime": 4.9075, | |
"eval_samples_per_second": 50.943, | |
"eval_steps_per_second": 1.63, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.3125, | |
"grad_norm": 14.0625, | |
"learning_rate": 4.3987341772151904e-05, | |
"loss": 0.8005, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.3125, | |
"eval_accuracy": 0.668, | |
"eval_loss": 0.6769924163818359, | |
"eval_runtime": 4.9073, | |
"eval_samples_per_second": 50.945, | |
"eval_steps_per_second": 1.63, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.375, | |
"grad_norm": 6.09375, | |
"learning_rate": 4.367088607594937e-05, | |
"loss": 0.4491, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.375, | |
"eval_accuracy": 0.688, | |
"eval_loss": 0.6586074233055115, | |
"eval_runtime": 4.906, | |
"eval_samples_per_second": 50.958, | |
"eval_steps_per_second": 1.631, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.4375, | |
"grad_norm": 6.0625, | |
"learning_rate": 4.3354430379746834e-05, | |
"loss": 0.474, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.4375, | |
"eval_accuracy": 0.692, | |
"eval_loss": 0.6458801031112671, | |
"eval_runtime": 4.8557, | |
"eval_samples_per_second": 51.486, | |
"eval_steps_per_second": 1.648, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 7.3125, | |
"learning_rate": 4.3037974683544305e-05, | |
"loss": 0.5048, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5, | |
"eval_accuracy": 0.704, | |
"eval_loss": 0.6440668702125549, | |
"eval_runtime": 4.9064, | |
"eval_samples_per_second": 50.954, | |
"eval_steps_per_second": 1.631, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5625, | |
"grad_norm": 9.75, | |
"learning_rate": 4.2721518987341776e-05, | |
"loss": 0.443, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.5625, | |
"eval_accuracy": 0.712, | |
"eval_loss": 0.639043927192688, | |
"eval_runtime": 4.9058, | |
"eval_samples_per_second": 50.96, | |
"eval_steps_per_second": 1.631, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.625, | |
"grad_norm": 6.40625, | |
"learning_rate": 4.240506329113924e-05, | |
"loss": 0.4949, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.625, | |
"eval_accuracy": 0.716, | |
"eval_loss": 0.6155992150306702, | |
"eval_runtime": 4.9046, | |
"eval_samples_per_second": 50.972, | |
"eval_steps_per_second": 1.631, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.6875, | |
"grad_norm": 16.0, | |
"learning_rate": 4.208860759493671e-05, | |
"loss": 0.5333, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.6875, | |
"eval_accuracy": 0.708, | |
"eval_loss": 0.6270785927772522, | |
"eval_runtime": 4.9087, | |
"eval_samples_per_second": 50.93, | |
"eval_steps_per_second": 1.63, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 8.25, | |
"learning_rate": 4.177215189873418e-05, | |
"loss": 0.5316, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.75, | |
"eval_accuracy": 0.696, | |
"eval_loss": 0.6393130421638489, | |
"eval_runtime": 4.9075, | |
"eval_samples_per_second": 50.943, | |
"eval_steps_per_second": 1.63, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.8125, | |
"grad_norm": 26.875, | |
"learning_rate": 4.145569620253165e-05, | |
"loss": 0.5872, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.8125, | |
"eval_accuracy": 0.708, | |
"eval_loss": 0.6233423948287964, | |
"eval_runtime": 4.9097, | |
"eval_samples_per_second": 50.919, | |
"eval_steps_per_second": 1.629, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.875, | |
"grad_norm": 14.3125, | |
"learning_rate": 4.113924050632912e-05, | |
"loss": 0.532, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.875, | |
"eval_accuracy": 0.728, | |
"eval_loss": 0.5802625417709351, | |
"eval_runtime": 4.9064, | |
"eval_samples_per_second": 50.954, | |
"eval_steps_per_second": 1.631, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.9375, | |
"grad_norm": 5.8125, | |
"learning_rate": 4.0822784810126584e-05, | |
"loss": 0.5531, | |
"step": 31 | |
}, | |
{ | |
"epoch": 1.9375, | |
"eval_accuracy": 0.748, | |
"eval_loss": 0.5553041100502014, | |
"eval_runtime": 4.9066, | |
"eval_samples_per_second": 50.952, | |
"eval_steps_per_second": 1.63, | |
"step": 31 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 19.375, | |
"learning_rate": 4.050632911392405e-05, | |
"loss": 0.4206, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.74, | |
"eval_loss": 0.5429959893226624, | |
"eval_runtime": 4.9071, | |
"eval_samples_per_second": 50.947, | |
"eval_steps_per_second": 1.63, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0625, | |
"grad_norm": 11.4375, | |
"learning_rate": 4.018987341772152e-05, | |
"loss": 0.4597, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.0625, | |
"eval_accuracy": 0.72, | |
"eval_loss": 0.5543990731239319, | |
"eval_runtime": 4.9053, | |
"eval_samples_per_second": 50.965, | |
"eval_steps_per_second": 1.631, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.125, | |
"grad_norm": 12.0, | |
"learning_rate": 3.987341772151899e-05, | |
"loss": 0.4009, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.125, | |
"eval_accuracy": 0.716, | |
"eval_loss": 0.5574648976325989, | |
"eval_runtime": 4.9099, | |
"eval_samples_per_second": 50.917, | |
"eval_steps_per_second": 1.629, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.1875, | |
"grad_norm": 13.0625, | |
"learning_rate": 3.9556962025316456e-05, | |
"loss": 0.5113, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.1875, | |
"eval_accuracy": 0.732, | |
"eval_loss": 0.5373346209526062, | |
"eval_runtime": 4.9081, | |
"eval_samples_per_second": 50.936, | |
"eval_steps_per_second": 1.63, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 18.25, | |
"learning_rate": 3.924050632911392e-05, | |
"loss": 0.3224, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.25, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.5083271265029907, | |
"eval_runtime": 4.9092, | |
"eval_samples_per_second": 50.925, | |
"eval_steps_per_second": 1.63, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.3125, | |
"grad_norm": 13.8125, | |
"learning_rate": 3.89240506329114e-05, | |
"loss": 0.2552, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.3125, | |
"eval_accuracy": 0.756, | |
"eval_loss": 0.48166918754577637, | |
"eval_runtime": 4.9082, | |
"eval_samples_per_second": 50.935, | |
"eval_steps_per_second": 1.63, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.375, | |
"grad_norm": 13.0625, | |
"learning_rate": 3.8607594936708864e-05, | |
"loss": 0.3693, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.375, | |
"eval_accuracy": 0.76, | |
"eval_loss": 0.4727539122104645, | |
"eval_runtime": 4.8582, | |
"eval_samples_per_second": 51.459, | |
"eval_steps_per_second": 1.647, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.4375, | |
"grad_norm": 5.71875, | |
"learning_rate": 3.829113924050633e-05, | |
"loss": 0.3886, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.4375, | |
"eval_accuracy": 0.76, | |
"eval_loss": 0.4700586497783661, | |
"eval_runtime": 4.9039, | |
"eval_samples_per_second": 50.979, | |
"eval_steps_per_second": 1.631, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 12.6875, | |
"learning_rate": 3.79746835443038e-05, | |
"loss": 0.3666, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.46244245767593384, | |
"eval_runtime": 4.9122, | |
"eval_samples_per_second": 50.894, | |
"eval_steps_per_second": 1.629, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5625, | |
"grad_norm": 5.21875, | |
"learning_rate": 3.765822784810127e-05, | |
"loss": 0.3614, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.5625, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.45654377341270447, | |
"eval_runtime": 4.8573, | |
"eval_samples_per_second": 51.469, | |
"eval_steps_per_second": 1.647, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.625, | |
"grad_norm": 3.375, | |
"learning_rate": 3.7341772151898736e-05, | |
"loss": 0.2228, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.625, | |
"eval_accuracy": 0.768, | |
"eval_loss": 0.45055416226387024, | |
"eval_runtime": 4.9079, | |
"eval_samples_per_second": 50.938, | |
"eval_steps_per_second": 1.63, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.6875, | |
"grad_norm": 7.34375, | |
"learning_rate": 3.70253164556962e-05, | |
"loss": 0.2365, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.6875, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.4469849467277527, | |
"eval_runtime": 4.9083, | |
"eval_samples_per_second": 50.935, | |
"eval_steps_per_second": 1.63, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 7.5625, | |
"learning_rate": 3.670886075949367e-05, | |
"loss": 0.3274, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.75, | |
"eval_accuracy": 0.784, | |
"eval_loss": 0.44335371255874634, | |
"eval_runtime": 4.9063, | |
"eval_samples_per_second": 50.955, | |
"eval_steps_per_second": 1.631, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.8125, | |
"grad_norm": 11.125, | |
"learning_rate": 3.639240506329114e-05, | |
"loss": 0.2592, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.8125, | |
"eval_accuracy": 0.788, | |
"eval_loss": 0.44204553961753845, | |
"eval_runtime": 4.9034, | |
"eval_samples_per_second": 50.985, | |
"eval_steps_per_second": 1.632, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.875, | |
"grad_norm": 3.75, | |
"learning_rate": 3.607594936708861e-05, | |
"loss": 0.2343, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.875, | |
"eval_accuracy": 0.78, | |
"eval_loss": 0.4448190927505493, | |
"eval_runtime": 4.9077, | |
"eval_samples_per_second": 50.94, | |
"eval_steps_per_second": 1.63, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.9375, | |
"grad_norm": 9.375, | |
"learning_rate": 3.575949367088608e-05, | |
"loss": 0.219, | |
"step": 47 | |
}, | |
{ | |
"epoch": 2.9375, | |
"eval_accuracy": 0.788, | |
"eval_loss": 0.44333717226982117, | |
"eval_runtime": 4.9081, | |
"eval_samples_per_second": 50.936, | |
"eval_steps_per_second": 1.63, | |
"step": 47 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 5.25, | |
"learning_rate": 3.5443037974683544e-05, | |
"loss": 0.2329, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.796, | |
"eval_loss": 0.44067704677581787, | |
"eval_runtime": 4.9046, | |
"eval_samples_per_second": 50.973, | |
"eval_steps_per_second": 1.631, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0625, | |
"grad_norm": 6.28125, | |
"learning_rate": 3.5126582278481015e-05, | |
"loss": 0.2697, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.0625, | |
"eval_accuracy": 0.796, | |
"eval_loss": 0.4345742166042328, | |
"eval_runtime": 4.905, | |
"eval_samples_per_second": 50.969, | |
"eval_steps_per_second": 1.631, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.125, | |
"grad_norm": 5.71875, | |
"learning_rate": 3.4810126582278487e-05, | |
"loss": 0.1796, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.125, | |
"eval_accuracy": 0.796, | |
"eval_loss": 0.42894840240478516, | |
"eval_runtime": 4.9069, | |
"eval_samples_per_second": 50.949, | |
"eval_steps_per_second": 1.63, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.1875, | |
"grad_norm": 9.125, | |
"learning_rate": 3.449367088607595e-05, | |
"loss": 0.1754, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.1875, | |
"eval_accuracy": 0.792, | |
"eval_loss": 0.43848368525505066, | |
"eval_runtime": 4.9038, | |
"eval_samples_per_second": 50.981, | |
"eval_steps_per_second": 1.631, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.25, | |
"grad_norm": 3.28125, | |
"learning_rate": 3.4177215189873416e-05, | |
"loss": 0.1621, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.25, | |
"eval_accuracy": 0.78, | |
"eval_loss": 0.4773699939250946, | |
"eval_runtime": 4.9079, | |
"eval_samples_per_second": 50.938, | |
"eval_steps_per_second": 1.63, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.3125, | |
"grad_norm": 12.0625, | |
"learning_rate": 3.386075949367089e-05, | |
"loss": 0.2976, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.3125, | |
"eval_accuracy": 0.776, | |
"eval_loss": 0.5157197117805481, | |
"eval_runtime": 4.9079, | |
"eval_samples_per_second": 50.939, | |
"eval_steps_per_second": 1.63, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.375, | |
"grad_norm": 9.0625, | |
"learning_rate": 3.354430379746836e-05, | |
"loss": 0.2612, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.375, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.5219131112098694, | |
"eval_runtime": 4.9158, | |
"eval_samples_per_second": 50.856, | |
"eval_steps_per_second": 1.627, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.4375, | |
"grad_norm": 15.6875, | |
"learning_rate": 3.322784810126582e-05, | |
"loss": 0.2856, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.4375, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.5048539638519287, | |
"eval_runtime": 4.9093, | |
"eval_samples_per_second": 50.924, | |
"eval_steps_per_second": 1.63, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 18.75, | |
"learning_rate": 3.291139240506329e-05, | |
"loss": 0.2473, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5, | |
"eval_accuracy": 0.784, | |
"eval_loss": 0.4671543538570404, | |
"eval_runtime": 5.0157, | |
"eval_samples_per_second": 49.844, | |
"eval_steps_per_second": 1.595, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5625, | |
"grad_norm": 5.90625, | |
"learning_rate": 3.2594936708860766e-05, | |
"loss": 0.1565, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.5625, | |
"eval_accuracy": 0.808, | |
"eval_loss": 0.4280901551246643, | |
"eval_runtime": 5.2697, | |
"eval_samples_per_second": 47.441, | |
"eval_steps_per_second": 1.518, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.625, | |
"grad_norm": 4.9375, | |
"learning_rate": 3.227848101265823e-05, | |
"loss": 0.1862, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.625, | |
"eval_accuracy": 0.792, | |
"eval_loss": 0.417385071516037, | |
"eval_runtime": 4.936, | |
"eval_samples_per_second": 50.648, | |
"eval_steps_per_second": 1.621, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.6875, | |
"grad_norm": 4.71875, | |
"learning_rate": 3.1962025316455695e-05, | |
"loss": 0.1997, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.6875, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.43094775080680847, | |
"eval_runtime": 5.3564, | |
"eval_samples_per_second": 46.673, | |
"eval_steps_per_second": 1.494, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.75, | |
"grad_norm": 4.21875, | |
"learning_rate": 3.1645569620253167e-05, | |
"loss": 0.2196, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.75, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.45009762048721313, | |
"eval_runtime": 5.0887, | |
"eval_samples_per_second": 49.129, | |
"eval_steps_per_second": 1.572, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.8125, | |
"grad_norm": 7.4375, | |
"learning_rate": 3.132911392405064e-05, | |
"loss": 0.1863, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.8125, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.4615320563316345, | |
"eval_runtime": 4.9972, | |
"eval_samples_per_second": 50.028, | |
"eval_steps_per_second": 1.601, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.875, | |
"grad_norm": 5.84375, | |
"learning_rate": 3.10126582278481e-05, | |
"loss": 0.1621, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.875, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.4588499367237091, | |
"eval_runtime": 4.9486, | |
"eval_samples_per_second": 50.519, | |
"eval_steps_per_second": 1.617, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.9375, | |
"grad_norm": 14.9375, | |
"learning_rate": 3.0696202531645574e-05, | |
"loss": 0.2533, | |
"step": 63 | |
}, | |
{ | |
"epoch": 3.9375, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.44347089529037476, | |
"eval_runtime": 4.9235, | |
"eval_samples_per_second": 50.777, | |
"eval_steps_per_second": 1.625, | |
"step": 63 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 4.15625, | |
"learning_rate": 3.0379746835443042e-05, | |
"loss": 0.1085, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.4236921966075897, | |
"eval_runtime": 5.0108, | |
"eval_samples_per_second": 49.892, | |
"eval_steps_per_second": 1.597, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0625, | |
"grad_norm": 10.0625, | |
"learning_rate": 3.0063291139240506e-05, | |
"loss": 0.1236, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.0625, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.40546613931655884, | |
"eval_runtime": 4.9996, | |
"eval_samples_per_second": 50.004, | |
"eval_steps_per_second": 1.6, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.125, | |
"grad_norm": 2.421875, | |
"learning_rate": 2.9746835443037974e-05, | |
"loss": 0.0979, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.125, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.3985447883605957, | |
"eval_runtime": 4.969, | |
"eval_samples_per_second": 50.312, | |
"eval_steps_per_second": 1.61, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.1875, | |
"grad_norm": 2.703125, | |
"learning_rate": 2.9430379746835446e-05, | |
"loss": 0.096, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.1875, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.40320640802383423, | |
"eval_runtime": 5.0458, | |
"eval_samples_per_second": 49.546, | |
"eval_steps_per_second": 1.585, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.25, | |
"grad_norm": 4.1875, | |
"learning_rate": 2.9113924050632914e-05, | |
"loss": 0.0776, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.25, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.40163105726242065, | |
"eval_runtime": 4.9683, | |
"eval_samples_per_second": 50.319, | |
"eval_steps_per_second": 1.61, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.3125, | |
"grad_norm": 3.734375, | |
"learning_rate": 2.879746835443038e-05, | |
"loss": 0.1406, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.3125, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.40083351731300354, | |
"eval_runtime": 5.0742, | |
"eval_samples_per_second": 49.269, | |
"eval_steps_per_second": 1.577, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.375, | |
"grad_norm": 3.0, | |
"learning_rate": 2.848101265822785e-05, | |
"loss": 0.1276, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.375, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.4019404351711273, | |
"eval_runtime": 5.0319, | |
"eval_samples_per_second": 49.683, | |
"eval_steps_per_second": 1.59, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.4375, | |
"grad_norm": 3.40625, | |
"learning_rate": 2.8164556962025318e-05, | |
"loss": 0.1169, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.4375, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.40479913353919983, | |
"eval_runtime": 5.0124, | |
"eval_samples_per_second": 49.876, | |
"eval_steps_per_second": 1.596, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.5, | |
"grad_norm": 10.875, | |
"learning_rate": 2.7848101265822786e-05, | |
"loss": 0.2681, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.40435412526130676, | |
"eval_runtime": 5.1364, | |
"eval_samples_per_second": 48.673, | |
"eval_steps_per_second": 1.558, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5625, | |
"grad_norm": 5.59375, | |
"learning_rate": 2.7531645569620257e-05, | |
"loss": 0.1596, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.5625, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4038302004337311, | |
"eval_runtime": 5.0213, | |
"eval_samples_per_second": 49.788, | |
"eval_steps_per_second": 1.593, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.625, | |
"grad_norm": 3.84375, | |
"learning_rate": 2.7215189873417722e-05, | |
"loss": 0.1036, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.625, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.40612995624542236, | |
"eval_runtime": 5.014, | |
"eval_samples_per_second": 49.861, | |
"eval_steps_per_second": 1.596, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.6875, | |
"grad_norm": 4.6875, | |
"learning_rate": 2.689873417721519e-05, | |
"loss": 0.1256, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.6875, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.4083655774593353, | |
"eval_runtime": 5.0762, | |
"eval_samples_per_second": 49.249, | |
"eval_steps_per_second": 1.576, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.75, | |
"grad_norm": 2.046875, | |
"learning_rate": 2.6582278481012658e-05, | |
"loss": 0.049, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.75, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.4126991629600525, | |
"eval_runtime": 5.1818, | |
"eval_samples_per_second": 48.246, | |
"eval_steps_per_second": 1.544, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.8125, | |
"grad_norm": 2.953125, | |
"learning_rate": 2.626582278481013e-05, | |
"loss": 0.1125, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.8125, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.41637206077575684, | |
"eval_runtime": 5.0132, | |
"eval_samples_per_second": 49.869, | |
"eval_steps_per_second": 1.596, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.875, | |
"grad_norm": 2.21875, | |
"learning_rate": 2.5949367088607597e-05, | |
"loss": 0.081, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.875, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.41693058609962463, | |
"eval_runtime": 5.1308, | |
"eval_samples_per_second": 48.725, | |
"eval_steps_per_second": 1.559, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.9375, | |
"grad_norm": 2.953125, | |
"learning_rate": 2.5632911392405062e-05, | |
"loss": 0.1308, | |
"step": 79 | |
}, | |
{ | |
"epoch": 4.9375, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4119318425655365, | |
"eval_runtime": 5.3506, | |
"eval_samples_per_second": 46.723, | |
"eval_steps_per_second": 1.495, | |
"step": 79 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 3.078125, | |
"learning_rate": 2.5316455696202533e-05, | |
"loss": 0.0556, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4106213450431824, | |
"eval_runtime": 5.0414, | |
"eval_samples_per_second": 49.589, | |
"eval_steps_per_second": 1.587, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0625, | |
"grad_norm": 5.4375, | |
"learning_rate": 2.5e-05, | |
"loss": 0.1657, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.0625, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.40592437982559204, | |
"eval_runtime": 4.991, | |
"eval_samples_per_second": 50.09, | |
"eval_steps_per_second": 1.603, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.125, | |
"grad_norm": 3.125, | |
"learning_rate": 2.468354430379747e-05, | |
"loss": 0.1037, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.125, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.40300390124320984, | |
"eval_runtime": 4.9485, | |
"eval_samples_per_second": 50.521, | |
"eval_steps_per_second": 1.617, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.1875, | |
"grad_norm": 2.78125, | |
"learning_rate": 2.4367088607594937e-05, | |
"loss": 0.0676, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.1875, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.39928579330444336, | |
"eval_runtime": 5.1773, | |
"eval_samples_per_second": 48.287, | |
"eval_steps_per_second": 1.545, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.25, | |
"grad_norm": 3.625, | |
"learning_rate": 2.4050632911392405e-05, | |
"loss": 0.0513, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.25, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.3965282738208771, | |
"eval_runtime": 4.9096, | |
"eval_samples_per_second": 50.921, | |
"eval_steps_per_second": 1.629, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.3125, | |
"grad_norm": 2.84375, | |
"learning_rate": 2.3734177215189873e-05, | |
"loss": 0.0539, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.3125, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.3946448862552643, | |
"eval_runtime": 4.9127, | |
"eval_samples_per_second": 50.889, | |
"eval_steps_per_second": 1.628, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.375, | |
"grad_norm": 2.40625, | |
"learning_rate": 2.341772151898734e-05, | |
"loss": 0.0757, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.375, | |
"eval_accuracy": 0.816, | |
"eval_loss": 0.3956676721572876, | |
"eval_runtime": 4.9093, | |
"eval_samples_per_second": 50.924, | |
"eval_steps_per_second": 1.63, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.4375, | |
"grad_norm": 1.4296875, | |
"learning_rate": 2.3101265822784813e-05, | |
"loss": 0.0394, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.4375, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.4012235105037689, | |
"eval_runtime": 4.9087, | |
"eval_samples_per_second": 50.93, | |
"eval_steps_per_second": 1.63, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.5, | |
"grad_norm": 1.2578125, | |
"learning_rate": 2.278481012658228e-05, | |
"loss": 0.034, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5, | |
"eval_accuracy": 0.804, | |
"eval_loss": 0.40770140290260315, | |
"eval_runtime": 4.9931, | |
"eval_samples_per_second": 50.069, | |
"eval_steps_per_second": 1.602, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5625, | |
"grad_norm": 3.9375, | |
"learning_rate": 2.246835443037975e-05, | |
"loss": 0.0601, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.5625, | |
"eval_accuracy": 0.804, | |
"eval_loss": 0.4079342782497406, | |
"eval_runtime": 5.4428, | |
"eval_samples_per_second": 45.932, | |
"eval_steps_per_second": 1.47, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.625, | |
"grad_norm": 2.609375, | |
"learning_rate": 2.2151898734177217e-05, | |
"loss": 0.0402, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.625, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.4079940915107727, | |
"eval_runtime": 5.2545, | |
"eval_samples_per_second": 47.578, | |
"eval_steps_per_second": 1.523, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.6875, | |
"grad_norm": 2.171875, | |
"learning_rate": 2.1835443037974685e-05, | |
"loss": 0.0443, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.6875, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.4047956168651581, | |
"eval_runtime": 5.1747, | |
"eval_samples_per_second": 48.312, | |
"eval_steps_per_second": 1.546, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.75, | |
"grad_norm": 1.0, | |
"learning_rate": 2.1518987341772153e-05, | |
"loss": 0.0213, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.75, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.4049394130706787, | |
"eval_runtime": 5.3443, | |
"eval_samples_per_second": 46.779, | |
"eval_steps_per_second": 1.497, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.8125, | |
"grad_norm": 3.09375, | |
"learning_rate": 2.120253164556962e-05, | |
"loss": 0.05, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.8125, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.40513497591018677, | |
"eval_runtime": 4.9931, | |
"eval_samples_per_second": 50.069, | |
"eval_steps_per_second": 1.602, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.875, | |
"grad_norm": 1.171875, | |
"learning_rate": 2.088607594936709e-05, | |
"loss": 0.0321, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.875, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.4072630703449249, | |
"eval_runtime": 4.9859, | |
"eval_samples_per_second": 50.141, | |
"eval_steps_per_second": 1.605, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.9375, | |
"grad_norm": 1.453125, | |
"learning_rate": 2.056962025316456e-05, | |
"loss": 0.0315, | |
"step": 95 | |
}, | |
{ | |
"epoch": 5.9375, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.40790075063705444, | |
"eval_runtime": 4.9401, | |
"eval_samples_per_second": 50.606, | |
"eval_steps_per_second": 1.619, | |
"step": 95 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 1.265625, | |
"learning_rate": 2.0253164556962025e-05, | |
"loss": 0.0207, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.4082264304161072, | |
"eval_runtime": 4.9371, | |
"eval_samples_per_second": 50.637, | |
"eval_steps_per_second": 1.62, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0625, | |
"grad_norm": 1.828125, | |
"learning_rate": 1.9936708860759496e-05, | |
"loss": 0.0235, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.0625, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.4142768681049347, | |
"eval_runtime": 4.9339, | |
"eval_samples_per_second": 50.67, | |
"eval_steps_per_second": 1.621, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.125, | |
"grad_norm": 2.0625, | |
"learning_rate": 1.962025316455696e-05, | |
"loss": 0.0247, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.125, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.41069871187210083, | |
"eval_runtime": 4.9302, | |
"eval_samples_per_second": 50.708, | |
"eval_steps_per_second": 1.623, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.1875, | |
"grad_norm": 1.2265625, | |
"learning_rate": 1.9303797468354432e-05, | |
"loss": 0.0148, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.1875, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.4168325662612915, | |
"eval_runtime": 4.9333, | |
"eval_samples_per_second": 50.676, | |
"eval_steps_per_second": 1.622, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.25, | |
"grad_norm": 3.078125, | |
"learning_rate": 1.89873417721519e-05, | |
"loss": 0.0365, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.25, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4236372709274292, | |
"eval_runtime": 4.9348, | |
"eval_samples_per_second": 50.661, | |
"eval_steps_per_second": 1.621, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.3125, | |
"grad_norm": 1.46875, | |
"learning_rate": 1.8670886075949368e-05, | |
"loss": 0.0211, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.3125, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4290027320384979, | |
"eval_runtime": 4.9336, | |
"eval_samples_per_second": 50.673, | |
"eval_steps_per_second": 1.622, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.375, | |
"grad_norm": 1.40625, | |
"learning_rate": 1.8354430379746836e-05, | |
"loss": 0.0183, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.375, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4270361661911011, | |
"eval_runtime": 4.9305, | |
"eval_samples_per_second": 50.704, | |
"eval_steps_per_second": 1.623, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.4375, | |
"grad_norm": 1.4375, | |
"learning_rate": 1.8037974683544304e-05, | |
"loss": 0.0224, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.4375, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4209546446800232, | |
"eval_runtime": 5.0135, | |
"eval_samples_per_second": 49.865, | |
"eval_steps_per_second": 1.596, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.5, | |
"grad_norm": 3.296875, | |
"learning_rate": 1.7721518987341772e-05, | |
"loss": 0.0437, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4190281331539154, | |
"eval_runtime": 5.1552, | |
"eval_samples_per_second": 48.495, | |
"eval_steps_per_second": 1.552, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5625, | |
"grad_norm": 1.0859375, | |
"learning_rate": 1.7405063291139243e-05, | |
"loss": 0.0113, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.5625, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.41917330026626587, | |
"eval_runtime": 5.2448, | |
"eval_samples_per_second": 47.666, | |
"eval_steps_per_second": 1.525, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.625, | |
"grad_norm": 2.96875, | |
"learning_rate": 1.7088607594936708e-05, | |
"loss": 0.0404, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.625, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4207764267921448, | |
"eval_runtime": 5.3617, | |
"eval_samples_per_second": 46.627, | |
"eval_steps_per_second": 1.492, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.6875, | |
"grad_norm": 1.1484375, | |
"learning_rate": 1.677215189873418e-05, | |
"loss": 0.0127, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.6875, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.42192724347114563, | |
"eval_runtime": 5.0116, | |
"eval_samples_per_second": 49.884, | |
"eval_steps_per_second": 1.596, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.75, | |
"grad_norm": 0.62109375, | |
"learning_rate": 1.6455696202531644e-05, | |
"loss": 0.011, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.75, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.4295770227909088, | |
"eval_runtime": 5.1327, | |
"eval_samples_per_second": 48.708, | |
"eval_steps_per_second": 1.559, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.8125, | |
"grad_norm": 0.80078125, | |
"learning_rate": 1.6139240506329115e-05, | |
"loss": 0.0088, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.8125, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.4343023896217346, | |
"eval_runtime": 5.1895, | |
"eval_samples_per_second": 48.174, | |
"eval_steps_per_second": 1.542, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.875, | |
"grad_norm": 2.734375, | |
"learning_rate": 1.5822784810126583e-05, | |
"loss": 0.0285, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.875, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4316946268081665, | |
"eval_runtime": 4.9676, | |
"eval_samples_per_second": 50.326, | |
"eval_steps_per_second": 1.61, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.9375, | |
"grad_norm": 1.984375, | |
"learning_rate": 1.550632911392405e-05, | |
"loss": 0.0179, | |
"step": 111 | |
}, | |
{ | |
"epoch": 6.9375, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.43008503317832947, | |
"eval_runtime": 4.9342, | |
"eval_samples_per_second": 50.666, | |
"eval_steps_per_second": 1.621, | |
"step": 111 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 2.15625, | |
"learning_rate": 1.5189873417721521e-05, | |
"loss": 0.0207, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4265301525592804, | |
"eval_runtime": 4.9341, | |
"eval_samples_per_second": 50.668, | |
"eval_steps_per_second": 1.621, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0625, | |
"grad_norm": 0.3671875, | |
"learning_rate": 1.4873417721518987e-05, | |
"loss": 0.005, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.0625, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.42337584495544434, | |
"eval_runtime": 4.9377, | |
"eval_samples_per_second": 50.631, | |
"eval_steps_per_second": 1.62, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.125, | |
"grad_norm": 0.408203125, | |
"learning_rate": 1.4556962025316457e-05, | |
"loss": 0.0072, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.125, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.42673492431640625, | |
"eval_runtime": 4.9384, | |
"eval_samples_per_second": 50.624, | |
"eval_steps_per_second": 1.62, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.1875, | |
"grad_norm": 0.6796875, | |
"learning_rate": 1.4240506329113925e-05, | |
"loss": 0.0105, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.1875, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4281560778617859, | |
"eval_runtime": 4.9383, | |
"eval_samples_per_second": 50.624, | |
"eval_steps_per_second": 1.62, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.25, | |
"grad_norm": 0.4375, | |
"learning_rate": 1.3924050632911393e-05, | |
"loss": 0.0062, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.25, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.430206298828125, | |
"eval_runtime": 4.9311, | |
"eval_samples_per_second": 50.698, | |
"eval_steps_per_second": 1.622, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.3125, | |
"grad_norm": 1.4453125, | |
"learning_rate": 1.3607594936708861e-05, | |
"loss": 0.015, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.3125, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.43175867199897766, | |
"eval_runtime": 4.9439, | |
"eval_samples_per_second": 50.568, | |
"eval_steps_per_second": 1.618, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.375, | |
"grad_norm": 0.86328125, | |
"learning_rate": 1.3291139240506329e-05, | |
"loss": 0.0094, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.375, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4308719336986542, | |
"eval_runtime": 4.9314, | |
"eval_samples_per_second": 50.696, | |
"eval_steps_per_second": 1.622, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.4375, | |
"grad_norm": 1.03125, | |
"learning_rate": 1.2974683544303799e-05, | |
"loss": 0.0086, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.4375, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4366334080696106, | |
"eval_runtime": 4.9431, | |
"eval_samples_per_second": 50.576, | |
"eval_steps_per_second": 1.618, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.5, | |
"grad_norm": 0.2021484375, | |
"learning_rate": 1.2658227848101267e-05, | |
"loss": 0.0035, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.44147494435310364, | |
"eval_runtime": 4.9422, | |
"eval_samples_per_second": 50.585, | |
"eval_steps_per_second": 1.619, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5625, | |
"grad_norm": 1.015625, | |
"learning_rate": 1.2341772151898735e-05, | |
"loss": 0.0088, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.5625, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.4474000036716461, | |
"eval_runtime": 4.943, | |
"eval_samples_per_second": 50.577, | |
"eval_steps_per_second": 1.618, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.625, | |
"grad_norm": 0.3359375, | |
"learning_rate": 1.2025316455696203e-05, | |
"loss": 0.0062, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.625, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.44968822598457336, | |
"eval_runtime": 4.9362, | |
"eval_samples_per_second": 50.646, | |
"eval_steps_per_second": 1.621, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.6875, | |
"grad_norm": 1.1953125, | |
"learning_rate": 1.170886075949367e-05, | |
"loss": 0.01, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.6875, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.45175397396087646, | |
"eval_runtime": 4.885, | |
"eval_samples_per_second": 51.177, | |
"eval_steps_per_second": 1.638, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.75, | |
"grad_norm": 0.6328125, | |
"learning_rate": 1.139240506329114e-05, | |
"loss": 0.008, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.75, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.45391473174095154, | |
"eval_runtime": 4.9403, | |
"eval_samples_per_second": 50.604, | |
"eval_steps_per_second": 1.619, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.8125, | |
"grad_norm": 0.431640625, | |
"learning_rate": 1.1075949367088608e-05, | |
"loss": 0.0071, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.8125, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4616769552230835, | |
"eval_runtime": 4.9324, | |
"eval_samples_per_second": 50.685, | |
"eval_steps_per_second": 1.622, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.875, | |
"grad_norm": 0.72265625, | |
"learning_rate": 1.0759493670886076e-05, | |
"loss": 0.0065, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.875, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.4597095251083374, | |
"eval_runtime": 4.9365, | |
"eval_samples_per_second": 50.643, | |
"eval_steps_per_second": 1.621, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.9375, | |
"grad_norm": 0.361328125, | |
"learning_rate": 1.0443037974683544e-05, | |
"loss": 0.004, | |
"step": 127 | |
}, | |
{ | |
"epoch": 7.9375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.4604904055595398, | |
"eval_runtime": 4.9367, | |
"eval_samples_per_second": 50.641, | |
"eval_steps_per_second": 1.621, | |
"step": 127 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 0.2099609375, | |
"learning_rate": 1.0126582278481012e-05, | |
"loss": 0.0027, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.46479332447052, | |
"eval_runtime": 4.9361, | |
"eval_samples_per_second": 50.648, | |
"eval_steps_per_second": 1.621, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0625, | |
"grad_norm": 0.224609375, | |
"learning_rate": 9.81012658227848e-06, | |
"loss": 0.0029, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.0625, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.47178688645362854, | |
"eval_runtime": 4.9337, | |
"eval_samples_per_second": 50.672, | |
"eval_steps_per_second": 1.622, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.125, | |
"grad_norm": 0.404296875, | |
"learning_rate": 9.49367088607595e-06, | |
"loss": 0.0058, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.125, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4701623022556305, | |
"eval_runtime": 4.9934, | |
"eval_samples_per_second": 50.066, | |
"eval_steps_per_second": 1.602, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.1875, | |
"grad_norm": 0.39453125, | |
"learning_rate": 9.177215189873418e-06, | |
"loss": 0.0036, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.1875, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4724094867706299, | |
"eval_runtime": 5.0541, | |
"eval_samples_per_second": 49.464, | |
"eval_steps_per_second": 1.583, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.25, | |
"grad_norm": 0.734375, | |
"learning_rate": 8.860759493670886e-06, | |
"loss": 0.0061, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.25, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.47215989232063293, | |
"eval_runtime": 5.2055, | |
"eval_samples_per_second": 48.026, | |
"eval_steps_per_second": 1.537, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.3125, | |
"grad_norm": 0.345703125, | |
"learning_rate": 8.544303797468354e-06, | |
"loss": 0.0044, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.3125, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.47440290451049805, | |
"eval_runtime": 4.9325, | |
"eval_samples_per_second": 50.684, | |
"eval_steps_per_second": 1.622, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.375, | |
"grad_norm": 0.43359375, | |
"learning_rate": 8.227848101265822e-06, | |
"loss": 0.0052, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.375, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4775030016899109, | |
"eval_runtime": 5.0459, | |
"eval_samples_per_second": 49.545, | |
"eval_steps_per_second": 1.585, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.4375, | |
"grad_norm": 0.291015625, | |
"learning_rate": 7.911392405063292e-06, | |
"loss": 0.0058, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.4375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.47711512446403503, | |
"eval_runtime": 4.9135, | |
"eval_samples_per_second": 50.88, | |
"eval_steps_per_second": 1.628, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.5, | |
"grad_norm": 0.1650390625, | |
"learning_rate": 7.5949367088607605e-06, | |
"loss": 0.0023, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4801058769226074, | |
"eval_runtime": 4.9192, | |
"eval_samples_per_second": 50.821, | |
"eval_steps_per_second": 1.626, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5625, | |
"grad_norm": 0.5703125, | |
"learning_rate": 7.2784810126582285e-06, | |
"loss": 0.0065, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.5625, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.47905686497688293, | |
"eval_runtime": 5.0724, | |
"eval_samples_per_second": 49.287, | |
"eval_steps_per_second": 1.577, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.625, | |
"grad_norm": 0.291015625, | |
"learning_rate": 6.9620253164556965e-06, | |
"loss": 0.0026, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.625, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.48366859555244446, | |
"eval_runtime": 4.9162, | |
"eval_samples_per_second": 50.852, | |
"eval_steps_per_second": 1.627, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.6875, | |
"grad_norm": 0.4140625, | |
"learning_rate": 6.6455696202531645e-06, | |
"loss": 0.0044, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.6875, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.48236754536628723, | |
"eval_runtime": 4.9088, | |
"eval_samples_per_second": 50.929, | |
"eval_steps_per_second": 1.63, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.75, | |
"grad_norm": 0.154296875, | |
"learning_rate": 6.329113924050633e-06, | |
"loss": 0.0024, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.75, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4821201264858246, | |
"eval_runtime": 4.9092, | |
"eval_samples_per_second": 50.925, | |
"eval_steps_per_second": 1.63, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.8125, | |
"grad_norm": 0.19140625, | |
"learning_rate": 6.012658227848101e-06, | |
"loss": 0.003, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.8125, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4807308316230774, | |
"eval_runtime": 4.9123, | |
"eval_samples_per_second": 50.892, | |
"eval_steps_per_second": 1.629, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.875, | |
"grad_norm": 0.337890625, | |
"learning_rate": 5.69620253164557e-06, | |
"loss": 0.0036, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.875, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4836767613887787, | |
"eval_runtime": 4.915, | |
"eval_samples_per_second": 50.864, | |
"eval_steps_per_second": 1.628, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.9375, | |
"grad_norm": 0.251953125, | |
"learning_rate": 5.379746835443038e-06, | |
"loss": 0.0025, | |
"step": 143 | |
}, | |
{ | |
"epoch": 8.9375, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4841257631778717, | |
"eval_runtime": 4.9119, | |
"eval_samples_per_second": 50.897, | |
"eval_steps_per_second": 1.629, | |
"step": 143 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 0.53515625, | |
"learning_rate": 5.063291139240506e-06, | |
"loss": 0.0052, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.48218366503715515, | |
"eval_runtime": 4.9085, | |
"eval_samples_per_second": 50.932, | |
"eval_steps_per_second": 1.63, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0625, | |
"grad_norm": 0.1767578125, | |
"learning_rate": 4.746835443037975e-06, | |
"loss": 0.0022, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.0625, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4810963273048401, | |
"eval_runtime": 4.9111, | |
"eval_samples_per_second": 50.905, | |
"eval_steps_per_second": 1.629, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.125, | |
"grad_norm": 0.44140625, | |
"learning_rate": 4.430379746835443e-06, | |
"loss": 0.007, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.125, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.482108473777771, | |
"eval_runtime": 4.9117, | |
"eval_samples_per_second": 50.899, | |
"eval_steps_per_second": 1.629, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.1875, | |
"grad_norm": 0.17578125, | |
"learning_rate": 4.113924050632911e-06, | |
"loss": 0.0033, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.1875, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.48591578006744385, | |
"eval_runtime": 4.9102, | |
"eval_samples_per_second": 50.914, | |
"eval_steps_per_second": 1.629, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.25, | |
"grad_norm": 0.1552734375, | |
"learning_rate": 3.7974683544303802e-06, | |
"loss": 0.0025, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.25, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4883337914943695, | |
"eval_runtime": 4.9105, | |
"eval_samples_per_second": 50.912, | |
"eval_steps_per_second": 1.629, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.3125, | |
"grad_norm": 0.302734375, | |
"learning_rate": 3.4810126582278482e-06, | |
"loss": 0.0049, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.3125, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4839572012424469, | |
"eval_runtime": 4.9139, | |
"eval_samples_per_second": 50.876, | |
"eval_steps_per_second": 1.628, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.375, | |
"grad_norm": 0.1611328125, | |
"learning_rate": 3.1645569620253167e-06, | |
"loss": 0.0026, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.4814904034137726, | |
"eval_runtime": 4.9134, | |
"eval_samples_per_second": 50.881, | |
"eval_steps_per_second": 1.628, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.4375, | |
"grad_norm": 0.2177734375, | |
"learning_rate": 2.848101265822785e-06, | |
"loss": 0.0021, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.4375, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4863939881324768, | |
"eval_runtime": 4.914, | |
"eval_samples_per_second": 50.875, | |
"eval_steps_per_second": 1.628, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.5, | |
"grad_norm": 0.1484375, | |
"learning_rate": 2.531645569620253e-06, | |
"loss": 0.0023, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4879688024520874, | |
"eval_runtime": 4.9108, | |
"eval_samples_per_second": 50.909, | |
"eval_steps_per_second": 1.629, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5625, | |
"grad_norm": 0.263671875, | |
"learning_rate": 2.2151898734177215e-06, | |
"loss": 0.0039, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.5625, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4850555956363678, | |
"eval_runtime": 4.9088, | |
"eval_samples_per_second": 50.929, | |
"eval_steps_per_second": 1.63, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.625, | |
"grad_norm": 0.208984375, | |
"learning_rate": 1.8987341772151901e-06, | |
"loss": 0.0025, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.625, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.48572131991386414, | |
"eval_runtime": 4.9101, | |
"eval_samples_per_second": 50.916, | |
"eval_steps_per_second": 1.629, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.6875, | |
"grad_norm": 0.21484375, | |
"learning_rate": 1.5822784810126583e-06, | |
"loss": 0.004, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.6875, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4816167652606964, | |
"eval_runtime": 4.9101, | |
"eval_samples_per_second": 50.915, | |
"eval_steps_per_second": 1.629, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.75, | |
"grad_norm": 0.80078125, | |
"learning_rate": 1.2658227848101265e-06, | |
"loss": 0.0088, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.75, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.4846898913383484, | |
"eval_runtime": 5.0723, | |
"eval_samples_per_second": 49.288, | |
"eval_steps_per_second": 1.577, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.8125, | |
"grad_norm": 0.2138671875, | |
"learning_rate": 9.493670886075951e-07, | |
"loss": 0.0035, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.8125, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.48494383692741394, | |
"eval_runtime": 4.9092, | |
"eval_samples_per_second": 50.925, | |
"eval_steps_per_second": 1.63, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.875, | |
"grad_norm": 0.1708984375, | |
"learning_rate": 6.329113924050633e-07, | |
"loss": 0.0027, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.875, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4813316762447357, | |
"eval_runtime": 4.9095, | |
"eval_samples_per_second": 50.922, | |
"eval_steps_per_second": 1.629, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.9375, | |
"grad_norm": 0.2099609375, | |
"learning_rate": 3.1645569620253163e-07, | |
"loss": 0.0022, | |
"step": 159 | |
}, | |
{ | |
"epoch": 9.9375, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.48336902260780334, | |
"eval_runtime": 4.911, | |
"eval_samples_per_second": 50.906, | |
"eval_steps_per_second": 1.629, | |
"step": 159 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 0.154296875, | |
"learning_rate": 0.0, | |
"loss": 0.0022, | |
"step": 160 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.483336478471756, | |
"eval_runtime": 4.9132, | |
"eval_samples_per_second": 50.883, | |
"eval_steps_per_second": 1.628, | |
"step": 160 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 160, | |
"total_flos": 6.923398955820646e+16, | |
"train_loss": 0.22855629230616614, | |
"train_runtime": 1357.293, | |
"train_samples_per_second": 7.368, | |
"train_steps_per_second": 0.118 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 160, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": false, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 6.923398955820646e+16, | |
"train_batch_size": 8, | |
"trial_name": null, | |
"trial_params": null | |
} | |