{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 9.625, "learning_rate": 2.5e-05, "loss": 1.0231, "step": 1 }, { "epoch": 0.0625, "eval_accuracy": 0.496, "eval_loss": 1.0858867168426514, "eval_runtime": 4.9172, "eval_samples_per_second": 50.841, "eval_steps_per_second": 1.627, "step": 1 }, { "epoch": 0.125, "grad_norm": 10.5625, "learning_rate": 5e-05, "loss": 0.9224, "step": 2 }, { "epoch": 0.125, "eval_accuracy": 0.488, "eval_loss": 1.0843095779418945, "eval_runtime": 4.9094, "eval_samples_per_second": 50.923, "eval_steps_per_second": 1.63, "step": 2 }, { "epoch": 0.1875, "grad_norm": 11.875, "learning_rate": 4.968354430379747e-05, "loss": 1.2584, "step": 3 }, { "epoch": 0.1875, "eval_accuracy": 0.488, "eval_loss": 1.0734808444976807, "eval_runtime": 4.9054, "eval_samples_per_second": 50.964, "eval_steps_per_second": 1.631, "step": 3 }, { "epoch": 0.25, "grad_norm": 13.3125, "learning_rate": 4.936708860759494e-05, "loss": 1.0442, "step": 4 }, { "epoch": 0.25, "eval_accuracy": 0.492, "eval_loss": 1.0604006052017212, "eval_runtime": 4.9069, "eval_samples_per_second": 50.948, "eval_steps_per_second": 1.63, "step": 4 }, { "epoch": 0.3125, "grad_norm": 22.625, "learning_rate": 4.905063291139241e-05, "loss": 1.3363, "step": 5 }, { "epoch": 0.3125, "eval_accuracy": 0.496, "eval_loss": 1.0435388088226318, "eval_runtime": 4.9047, "eval_samples_per_second": 50.972, "eval_steps_per_second": 1.631, "step": 5 }, { "epoch": 0.375, "grad_norm": 10.625, "learning_rate": 4.8734177215189874e-05, "loss": 1.0358, "step": 6 }, { "epoch": 0.375, "eval_accuracy": 0.504, "eval_loss": 1.0265413522720337, "eval_runtime": 4.9042, "eval_samples_per_second": 50.977, "eval_steps_per_second": 1.631, "step": 6 }, { "epoch": 0.4375, "grad_norm": 7.5625, "learning_rate": 4.8417721518987346e-05, "loss": 0.9276, "step": 7 }, { "epoch": 0.4375, "eval_accuracy": 0.508, "eval_loss": 1.0061290264129639, "eval_runtime": 4.9077, "eval_samples_per_second": 50.941, "eval_steps_per_second": 1.63, "step": 7 }, { "epoch": 0.5, "grad_norm": 16.75, "learning_rate": 4.810126582278481e-05, "loss": 0.9751, "step": 8 }, { "epoch": 0.5, "eval_accuracy": 0.512, "eval_loss": 0.9898163080215454, "eval_runtime": 4.9054, "eval_samples_per_second": 50.964, "eval_steps_per_second": 1.631, "step": 8 }, { "epoch": 0.5625, "grad_norm": 10.875, "learning_rate": 4.778481012658228e-05, "loss": 1.1136, "step": 9 }, { "epoch": 0.5625, "eval_accuracy": 0.524, "eval_loss": 0.9671441912651062, "eval_runtime": 4.8588, "eval_samples_per_second": 51.453, "eval_steps_per_second": 1.646, "step": 9 }, { "epoch": 0.625, "grad_norm": 11.0, "learning_rate": 4.7468354430379746e-05, "loss": 0.8243, "step": 10 }, { "epoch": 0.625, "eval_accuracy": 0.528, "eval_loss": 0.9475510120391846, "eval_runtime": 4.9061, "eval_samples_per_second": 50.957, "eval_steps_per_second": 1.631, "step": 10 }, { "epoch": 0.6875, "grad_norm": 10.875, "learning_rate": 4.715189873417722e-05, "loss": 0.9469, "step": 11 }, { "epoch": 0.6875, "eval_accuracy": 0.536, "eval_loss": 0.9229685068130493, "eval_runtime": 4.9037, "eval_samples_per_second": 50.982, "eval_steps_per_second": 1.631, "step": 11 }, { "epoch": 0.75, "grad_norm": 10.125, "learning_rate": 4.683544303797468e-05, "loss": 0.9348, "step": 12 }, { "epoch": 0.75, "eval_accuracy": 0.532, "eval_loss": 0.9007159471511841, "eval_runtime": 4.9072, "eval_samples_per_second": 50.945, "eval_steps_per_second": 1.63, "step": 12 }, { "epoch": 0.8125, "grad_norm": 9.6875, "learning_rate": 4.6518987341772154e-05, "loss": 0.939, "step": 13 }, { "epoch": 0.8125, "eval_accuracy": 0.548, "eval_loss": 0.8757832050323486, "eval_runtime": 4.9116, "eval_samples_per_second": 50.9, "eval_steps_per_second": 1.629, "step": 13 }, { "epoch": 0.875, "grad_norm": 16.375, "learning_rate": 4.6202531645569625e-05, "loss": 0.8007, "step": 14 }, { "epoch": 0.875, "eval_accuracy": 0.556, "eval_loss": 0.8450800776481628, "eval_runtime": 4.9085, "eval_samples_per_second": 50.932, "eval_steps_per_second": 1.63, "step": 14 }, { "epoch": 0.9375, "grad_norm": 13.625, "learning_rate": 4.588607594936709e-05, "loss": 0.8642, "step": 15 }, { "epoch": 0.9375, "eval_accuracy": 0.584, "eval_loss": 0.8132224082946777, "eval_runtime": 4.9071, "eval_samples_per_second": 50.946, "eval_steps_per_second": 1.63, "step": 15 }, { "epoch": 1.0, "grad_norm": 16.5, "learning_rate": 4.556962025316456e-05, "loss": 0.6672, "step": 16 }, { "epoch": 1.0, "eval_accuracy": 0.604, "eval_loss": 0.7814155220985413, "eval_runtime": 4.8576, "eval_samples_per_second": 51.465, "eval_steps_per_second": 1.647, "step": 16 }, { "epoch": 1.0625, "grad_norm": 9.1875, "learning_rate": 4.525316455696203e-05, "loss": 0.7072, "step": 17 }, { "epoch": 1.0625, "eval_accuracy": 0.612, "eval_loss": 0.7591574788093567, "eval_runtime": 4.9059, "eval_samples_per_second": 50.959, "eval_steps_per_second": 1.631, "step": 17 }, { "epoch": 1.125, "grad_norm": 8.3125, "learning_rate": 4.49367088607595e-05, "loss": 0.7815, "step": 18 }, { "epoch": 1.125, "eval_accuracy": 0.644, "eval_loss": 0.7339711785316467, "eval_runtime": 4.9112, "eval_samples_per_second": 50.904, "eval_steps_per_second": 1.629, "step": 18 }, { "epoch": 1.1875, "grad_norm": 7.0, "learning_rate": 4.462025316455696e-05, "loss": 0.6226, "step": 19 }, { "epoch": 1.1875, "eval_accuracy": 0.66, "eval_loss": 0.7131918668746948, "eval_runtime": 4.9082, "eval_samples_per_second": 50.935, "eval_steps_per_second": 1.63, "step": 19 }, { "epoch": 1.25, "grad_norm": 11.25, "learning_rate": 4.430379746835443e-05, "loss": 0.6456, "step": 20 }, { "epoch": 1.25, "eval_accuracy": 0.664, "eval_loss": 0.6907175183296204, "eval_runtime": 4.9075, "eval_samples_per_second": 50.943, "eval_steps_per_second": 1.63, "step": 20 }, { "epoch": 1.3125, "grad_norm": 14.0625, "learning_rate": 4.3987341772151904e-05, "loss": 0.8005, "step": 21 }, { "epoch": 1.3125, "eval_accuracy": 0.668, "eval_loss": 0.6769924163818359, "eval_runtime": 4.9073, "eval_samples_per_second": 50.945, "eval_steps_per_second": 1.63, "step": 21 }, { "epoch": 1.375, "grad_norm": 6.09375, "learning_rate": 4.367088607594937e-05, "loss": 0.4491, "step": 22 }, { "epoch": 1.375, "eval_accuracy": 0.688, "eval_loss": 0.6586074233055115, "eval_runtime": 4.906, "eval_samples_per_second": 50.958, "eval_steps_per_second": 1.631, "step": 22 }, { "epoch": 1.4375, "grad_norm": 6.0625, "learning_rate": 4.3354430379746834e-05, "loss": 0.474, "step": 23 }, { "epoch": 1.4375, "eval_accuracy": 0.692, "eval_loss": 0.6458801031112671, "eval_runtime": 4.8557, "eval_samples_per_second": 51.486, "eval_steps_per_second": 1.648, "step": 23 }, { "epoch": 1.5, "grad_norm": 7.3125, "learning_rate": 4.3037974683544305e-05, "loss": 0.5048, "step": 24 }, { "epoch": 1.5, "eval_accuracy": 0.704, "eval_loss": 0.6440668702125549, "eval_runtime": 4.9064, "eval_samples_per_second": 50.954, "eval_steps_per_second": 1.631, "step": 24 }, { "epoch": 1.5625, "grad_norm": 9.75, "learning_rate": 4.2721518987341776e-05, "loss": 0.443, "step": 25 }, { "epoch": 1.5625, "eval_accuracy": 0.712, "eval_loss": 0.639043927192688, "eval_runtime": 4.9058, "eval_samples_per_second": 50.96, "eval_steps_per_second": 1.631, "step": 25 }, { "epoch": 1.625, "grad_norm": 6.40625, "learning_rate": 4.240506329113924e-05, "loss": 0.4949, "step": 26 }, { "epoch": 1.625, "eval_accuracy": 0.716, "eval_loss": 0.6155992150306702, "eval_runtime": 4.9046, "eval_samples_per_second": 50.972, "eval_steps_per_second": 1.631, "step": 26 }, { "epoch": 1.6875, "grad_norm": 16.0, "learning_rate": 4.208860759493671e-05, "loss": 0.5333, "step": 27 }, { "epoch": 1.6875, "eval_accuracy": 0.708, "eval_loss": 0.6270785927772522, "eval_runtime": 4.9087, "eval_samples_per_second": 50.93, "eval_steps_per_second": 1.63, "step": 27 }, { "epoch": 1.75, "grad_norm": 8.25, "learning_rate": 4.177215189873418e-05, "loss": 0.5316, "step": 28 }, { "epoch": 1.75, "eval_accuracy": 0.696, "eval_loss": 0.6393130421638489, "eval_runtime": 4.9075, "eval_samples_per_second": 50.943, "eval_steps_per_second": 1.63, "step": 28 }, { "epoch": 1.8125, "grad_norm": 26.875, "learning_rate": 4.145569620253165e-05, "loss": 0.5872, "step": 29 }, { "epoch": 1.8125, "eval_accuracy": 0.708, "eval_loss": 0.6233423948287964, "eval_runtime": 4.9097, "eval_samples_per_second": 50.919, "eval_steps_per_second": 1.629, "step": 29 }, { "epoch": 1.875, "grad_norm": 14.3125, "learning_rate": 4.113924050632912e-05, "loss": 0.532, "step": 30 }, { "epoch": 1.875, "eval_accuracy": 0.728, "eval_loss": 0.5802625417709351, "eval_runtime": 4.9064, "eval_samples_per_second": 50.954, "eval_steps_per_second": 1.631, "step": 30 }, { "epoch": 1.9375, "grad_norm": 5.8125, "learning_rate": 4.0822784810126584e-05, "loss": 0.5531, "step": 31 }, { "epoch": 1.9375, "eval_accuracy": 0.748, "eval_loss": 0.5553041100502014, "eval_runtime": 4.9066, "eval_samples_per_second": 50.952, "eval_steps_per_second": 1.63, "step": 31 }, { "epoch": 2.0, "grad_norm": 19.375, "learning_rate": 4.050632911392405e-05, "loss": 0.4206, "step": 32 }, { "epoch": 2.0, "eval_accuracy": 0.74, "eval_loss": 0.5429959893226624, "eval_runtime": 4.9071, "eval_samples_per_second": 50.947, "eval_steps_per_second": 1.63, "step": 32 }, { "epoch": 2.0625, "grad_norm": 11.4375, "learning_rate": 4.018987341772152e-05, "loss": 0.4597, "step": 33 }, { "epoch": 2.0625, "eval_accuracy": 0.72, "eval_loss": 0.5543990731239319, "eval_runtime": 4.9053, "eval_samples_per_second": 50.965, "eval_steps_per_second": 1.631, "step": 33 }, { "epoch": 2.125, "grad_norm": 12.0, "learning_rate": 3.987341772151899e-05, "loss": 0.4009, "step": 34 }, { "epoch": 2.125, "eval_accuracy": 0.716, "eval_loss": 0.5574648976325989, "eval_runtime": 4.9099, "eval_samples_per_second": 50.917, "eval_steps_per_second": 1.629, "step": 34 }, { "epoch": 2.1875, "grad_norm": 13.0625, "learning_rate": 3.9556962025316456e-05, "loss": 0.5113, "step": 35 }, { "epoch": 2.1875, "eval_accuracy": 0.732, "eval_loss": 0.5373346209526062, "eval_runtime": 4.9081, "eval_samples_per_second": 50.936, "eval_steps_per_second": 1.63, "step": 35 }, { "epoch": 2.25, "grad_norm": 18.25, "learning_rate": 3.924050632911392e-05, "loss": 0.3224, "step": 36 }, { "epoch": 2.25, "eval_accuracy": 0.744, "eval_loss": 0.5083271265029907, "eval_runtime": 4.9092, "eval_samples_per_second": 50.925, "eval_steps_per_second": 1.63, "step": 36 }, { "epoch": 2.3125, "grad_norm": 13.8125, "learning_rate": 3.89240506329114e-05, "loss": 0.2552, "step": 37 }, { "epoch": 2.3125, "eval_accuracy": 0.756, "eval_loss": 0.48166918754577637, "eval_runtime": 4.9082, "eval_samples_per_second": 50.935, "eval_steps_per_second": 1.63, "step": 37 }, { "epoch": 2.375, "grad_norm": 13.0625, "learning_rate": 3.8607594936708864e-05, "loss": 0.3693, "step": 38 }, { "epoch": 2.375, "eval_accuracy": 0.76, "eval_loss": 0.4727539122104645, "eval_runtime": 4.8582, "eval_samples_per_second": 51.459, "eval_steps_per_second": 1.647, "step": 38 }, { "epoch": 2.4375, "grad_norm": 5.71875, "learning_rate": 3.829113924050633e-05, "loss": 0.3886, "step": 39 }, { "epoch": 2.4375, "eval_accuracy": 0.76, "eval_loss": 0.4700586497783661, "eval_runtime": 4.9039, "eval_samples_per_second": 50.979, "eval_steps_per_second": 1.631, "step": 39 }, { "epoch": 2.5, "grad_norm": 12.6875, "learning_rate": 3.79746835443038e-05, "loss": 0.3666, "step": 40 }, { "epoch": 2.5, "eval_accuracy": 0.772, "eval_loss": 0.46244245767593384, "eval_runtime": 4.9122, "eval_samples_per_second": 50.894, "eval_steps_per_second": 1.629, "step": 40 }, { "epoch": 2.5625, "grad_norm": 5.21875, "learning_rate": 3.765822784810127e-05, "loss": 0.3614, "step": 41 }, { "epoch": 2.5625, "eval_accuracy": 0.772, "eval_loss": 0.45654377341270447, "eval_runtime": 4.8573, "eval_samples_per_second": 51.469, "eval_steps_per_second": 1.647, "step": 41 }, { "epoch": 2.625, "grad_norm": 3.375, "learning_rate": 3.7341772151898736e-05, "loss": 0.2228, "step": 42 }, { "epoch": 2.625, "eval_accuracy": 0.768, "eval_loss": 0.45055416226387024, "eval_runtime": 4.9079, "eval_samples_per_second": 50.938, "eval_steps_per_second": 1.63, "step": 42 }, { "epoch": 2.6875, "grad_norm": 7.34375, "learning_rate": 3.70253164556962e-05, "loss": 0.2365, "step": 43 }, { "epoch": 2.6875, "eval_accuracy": 0.772, "eval_loss": 0.4469849467277527, "eval_runtime": 4.9083, "eval_samples_per_second": 50.935, "eval_steps_per_second": 1.63, "step": 43 }, { "epoch": 2.75, "grad_norm": 7.5625, "learning_rate": 3.670886075949367e-05, "loss": 0.3274, "step": 44 }, { "epoch": 2.75, "eval_accuracy": 0.784, "eval_loss": 0.44335371255874634, "eval_runtime": 4.9063, "eval_samples_per_second": 50.955, "eval_steps_per_second": 1.631, "step": 44 }, { "epoch": 2.8125, "grad_norm": 11.125, "learning_rate": 3.639240506329114e-05, "loss": 0.2592, "step": 45 }, { "epoch": 2.8125, "eval_accuracy": 0.788, "eval_loss": 0.44204553961753845, "eval_runtime": 4.9034, "eval_samples_per_second": 50.985, "eval_steps_per_second": 1.632, "step": 45 }, { "epoch": 2.875, "grad_norm": 3.75, "learning_rate": 3.607594936708861e-05, "loss": 0.2343, "step": 46 }, { "epoch": 2.875, "eval_accuracy": 0.78, "eval_loss": 0.4448190927505493, "eval_runtime": 4.9077, "eval_samples_per_second": 50.94, "eval_steps_per_second": 1.63, "step": 46 }, { "epoch": 2.9375, "grad_norm": 9.375, "learning_rate": 3.575949367088608e-05, "loss": 0.219, "step": 47 }, { "epoch": 2.9375, "eval_accuracy": 0.788, "eval_loss": 0.44333717226982117, "eval_runtime": 4.9081, "eval_samples_per_second": 50.936, "eval_steps_per_second": 1.63, "step": 47 }, { "epoch": 3.0, "grad_norm": 5.25, "learning_rate": 3.5443037974683544e-05, "loss": 0.2329, "step": 48 }, { "epoch": 3.0, "eval_accuracy": 0.796, "eval_loss": 0.44067704677581787, "eval_runtime": 4.9046, "eval_samples_per_second": 50.973, "eval_steps_per_second": 1.631, "step": 48 }, { "epoch": 3.0625, "grad_norm": 6.28125, "learning_rate": 3.5126582278481015e-05, "loss": 0.2697, "step": 49 }, { "epoch": 3.0625, "eval_accuracy": 0.796, "eval_loss": 0.4345742166042328, "eval_runtime": 4.905, "eval_samples_per_second": 50.969, "eval_steps_per_second": 1.631, "step": 49 }, { "epoch": 3.125, "grad_norm": 5.71875, "learning_rate": 3.4810126582278487e-05, "loss": 0.1796, "step": 50 }, { "epoch": 3.125, "eval_accuracy": 0.796, "eval_loss": 0.42894840240478516, "eval_runtime": 4.9069, "eval_samples_per_second": 50.949, "eval_steps_per_second": 1.63, "step": 50 }, { "epoch": 3.1875, "grad_norm": 9.125, "learning_rate": 3.449367088607595e-05, "loss": 0.1754, "step": 51 }, { "epoch": 3.1875, "eval_accuracy": 0.792, "eval_loss": 0.43848368525505066, "eval_runtime": 4.9038, "eval_samples_per_second": 50.981, "eval_steps_per_second": 1.631, "step": 51 }, { "epoch": 3.25, "grad_norm": 3.28125, "learning_rate": 3.4177215189873416e-05, "loss": 0.1621, "step": 52 }, { "epoch": 3.25, "eval_accuracy": 0.78, "eval_loss": 0.4773699939250946, "eval_runtime": 4.9079, "eval_samples_per_second": 50.938, "eval_steps_per_second": 1.63, "step": 52 }, { "epoch": 3.3125, "grad_norm": 12.0625, "learning_rate": 3.386075949367089e-05, "loss": 0.2976, "step": 53 }, { "epoch": 3.3125, "eval_accuracy": 0.776, "eval_loss": 0.5157197117805481, "eval_runtime": 4.9079, "eval_samples_per_second": 50.939, "eval_steps_per_second": 1.63, "step": 53 }, { "epoch": 3.375, "grad_norm": 9.0625, "learning_rate": 3.354430379746836e-05, "loss": 0.2612, "step": 54 }, { "epoch": 3.375, "eval_accuracy": 0.772, "eval_loss": 0.5219131112098694, "eval_runtime": 4.9158, "eval_samples_per_second": 50.856, "eval_steps_per_second": 1.627, "step": 54 }, { "epoch": 3.4375, "grad_norm": 15.6875, "learning_rate": 3.322784810126582e-05, "loss": 0.2856, "step": 55 }, { "epoch": 3.4375, "eval_accuracy": 0.772, "eval_loss": 0.5048539638519287, "eval_runtime": 4.9093, "eval_samples_per_second": 50.924, "eval_steps_per_second": 1.63, "step": 55 }, { "epoch": 3.5, "grad_norm": 18.75, "learning_rate": 3.291139240506329e-05, "loss": 0.2473, "step": 56 }, { "epoch": 3.5, "eval_accuracy": 0.784, "eval_loss": 0.4671543538570404, "eval_runtime": 5.0157, "eval_samples_per_second": 49.844, "eval_steps_per_second": 1.595, "step": 56 }, { "epoch": 3.5625, "grad_norm": 5.90625, "learning_rate": 3.2594936708860766e-05, "loss": 0.1565, "step": 57 }, { "epoch": 3.5625, "eval_accuracy": 0.808, "eval_loss": 0.4280901551246643, "eval_runtime": 5.2697, "eval_samples_per_second": 47.441, "eval_steps_per_second": 1.518, "step": 57 }, { "epoch": 3.625, "grad_norm": 4.9375, "learning_rate": 3.227848101265823e-05, "loss": 0.1862, "step": 58 }, { "epoch": 3.625, "eval_accuracy": 0.792, "eval_loss": 0.417385071516037, "eval_runtime": 4.936, "eval_samples_per_second": 50.648, "eval_steps_per_second": 1.621, "step": 58 }, { "epoch": 3.6875, "grad_norm": 4.71875, "learning_rate": 3.1962025316455695e-05, "loss": 0.1997, "step": 59 }, { "epoch": 3.6875, "eval_accuracy": 0.812, "eval_loss": 0.43094775080680847, "eval_runtime": 5.3564, "eval_samples_per_second": 46.673, "eval_steps_per_second": 1.494, "step": 59 }, { "epoch": 3.75, "grad_norm": 4.21875, "learning_rate": 3.1645569620253167e-05, "loss": 0.2196, "step": 60 }, { "epoch": 3.75, "eval_accuracy": 0.816, "eval_loss": 0.45009762048721313, "eval_runtime": 5.0887, "eval_samples_per_second": 49.129, "eval_steps_per_second": 1.572, "step": 60 }, { "epoch": 3.8125, "grad_norm": 7.4375, "learning_rate": 3.132911392405064e-05, "loss": 0.1863, "step": 61 }, { "epoch": 3.8125, "eval_accuracy": 0.816, "eval_loss": 0.4615320563316345, "eval_runtime": 4.9972, "eval_samples_per_second": 50.028, "eval_steps_per_second": 1.601, "step": 61 }, { "epoch": 3.875, "grad_norm": 5.84375, "learning_rate": 3.10126582278481e-05, "loss": 0.1621, "step": 62 }, { "epoch": 3.875, "eval_accuracy": 0.816, "eval_loss": 0.4588499367237091, "eval_runtime": 4.9486, "eval_samples_per_second": 50.519, "eval_steps_per_second": 1.617, "step": 62 }, { "epoch": 3.9375, "grad_norm": 14.9375, "learning_rate": 3.0696202531645574e-05, "loss": 0.2533, "step": 63 }, { "epoch": 3.9375, "eval_accuracy": 0.82, "eval_loss": 0.44347089529037476, "eval_runtime": 4.9235, "eval_samples_per_second": 50.777, "eval_steps_per_second": 1.625, "step": 63 }, { "epoch": 4.0, "grad_norm": 4.15625, "learning_rate": 3.0379746835443042e-05, "loss": 0.1085, "step": 64 }, { "epoch": 4.0, "eval_accuracy": 0.832, "eval_loss": 0.4236921966075897, "eval_runtime": 5.0108, "eval_samples_per_second": 49.892, "eval_steps_per_second": 1.597, "step": 64 }, { "epoch": 4.0625, "grad_norm": 10.0625, "learning_rate": 3.0063291139240506e-05, "loss": 0.1236, "step": 65 }, { "epoch": 4.0625, "eval_accuracy": 0.824, "eval_loss": 0.40546613931655884, "eval_runtime": 4.9996, "eval_samples_per_second": 50.004, "eval_steps_per_second": 1.6, "step": 65 }, { "epoch": 4.125, "grad_norm": 2.421875, "learning_rate": 2.9746835443037974e-05, "loss": 0.0979, "step": 66 }, { "epoch": 4.125, "eval_accuracy": 0.816, "eval_loss": 0.3985447883605957, "eval_runtime": 4.969, "eval_samples_per_second": 50.312, "eval_steps_per_second": 1.61, "step": 66 }, { "epoch": 4.1875, "grad_norm": 2.703125, "learning_rate": 2.9430379746835446e-05, "loss": 0.096, "step": 67 }, { "epoch": 4.1875, "eval_accuracy": 0.812, "eval_loss": 0.40320640802383423, "eval_runtime": 5.0458, "eval_samples_per_second": 49.546, "eval_steps_per_second": 1.585, "step": 67 }, { "epoch": 4.25, "grad_norm": 4.1875, "learning_rate": 2.9113924050632914e-05, "loss": 0.0776, "step": 68 }, { "epoch": 4.25, "eval_accuracy": 0.816, "eval_loss": 0.40163105726242065, "eval_runtime": 4.9683, "eval_samples_per_second": 50.319, "eval_steps_per_second": 1.61, "step": 68 }, { "epoch": 4.3125, "grad_norm": 3.734375, "learning_rate": 2.879746835443038e-05, "loss": 0.1406, "step": 69 }, { "epoch": 4.3125, "eval_accuracy": 0.812, "eval_loss": 0.40083351731300354, "eval_runtime": 5.0742, "eval_samples_per_second": 49.269, "eval_steps_per_second": 1.577, "step": 69 }, { "epoch": 4.375, "grad_norm": 3.0, "learning_rate": 2.848101265822785e-05, "loss": 0.1276, "step": 70 }, { "epoch": 4.375, "eval_accuracy": 0.816, "eval_loss": 0.4019404351711273, "eval_runtime": 5.0319, "eval_samples_per_second": 49.683, "eval_steps_per_second": 1.59, "step": 70 }, { "epoch": 4.4375, "grad_norm": 3.40625, "learning_rate": 2.8164556962025318e-05, "loss": 0.1169, "step": 71 }, { "epoch": 4.4375, "eval_accuracy": 0.82, "eval_loss": 0.40479913353919983, "eval_runtime": 5.0124, "eval_samples_per_second": 49.876, "eval_steps_per_second": 1.596, "step": 71 }, { "epoch": 4.5, "grad_norm": 10.875, "learning_rate": 2.7848101265822786e-05, "loss": 0.2681, "step": 72 }, { "epoch": 4.5, "eval_accuracy": 0.824, "eval_loss": 0.40435412526130676, "eval_runtime": 5.1364, "eval_samples_per_second": 48.673, "eval_steps_per_second": 1.558, "step": 72 }, { "epoch": 4.5625, "grad_norm": 5.59375, "learning_rate": 2.7531645569620257e-05, "loss": 0.1596, "step": 73 }, { "epoch": 4.5625, "eval_accuracy": 0.82, "eval_loss": 0.4038302004337311, "eval_runtime": 5.0213, "eval_samples_per_second": 49.788, "eval_steps_per_second": 1.593, "step": 73 }, { "epoch": 4.625, "grad_norm": 3.84375, "learning_rate": 2.7215189873417722e-05, "loss": 0.1036, "step": 74 }, { "epoch": 4.625, "eval_accuracy": 0.82, "eval_loss": 0.40612995624542236, "eval_runtime": 5.014, "eval_samples_per_second": 49.861, "eval_steps_per_second": 1.596, "step": 74 }, { "epoch": 4.6875, "grad_norm": 4.6875, "learning_rate": 2.689873417721519e-05, "loss": 0.1256, "step": 75 }, { "epoch": 4.6875, "eval_accuracy": 0.824, "eval_loss": 0.4083655774593353, "eval_runtime": 5.0762, "eval_samples_per_second": 49.249, "eval_steps_per_second": 1.576, "step": 75 }, { "epoch": 4.75, "grad_norm": 2.046875, "learning_rate": 2.6582278481012658e-05, "loss": 0.049, "step": 76 }, { "epoch": 4.75, "eval_accuracy": 0.828, "eval_loss": 0.4126991629600525, "eval_runtime": 5.1818, "eval_samples_per_second": 48.246, "eval_steps_per_second": 1.544, "step": 76 }, { "epoch": 4.8125, "grad_norm": 2.953125, "learning_rate": 2.626582278481013e-05, "loss": 0.1125, "step": 77 }, { "epoch": 4.8125, "eval_accuracy": 0.828, "eval_loss": 0.41637206077575684, "eval_runtime": 5.0132, "eval_samples_per_second": 49.869, "eval_steps_per_second": 1.596, "step": 77 }, { "epoch": 4.875, "grad_norm": 2.21875, "learning_rate": 2.5949367088607597e-05, "loss": 0.081, "step": 78 }, { "epoch": 4.875, "eval_accuracy": 0.832, "eval_loss": 0.41693058609962463, "eval_runtime": 5.1308, "eval_samples_per_second": 48.725, "eval_steps_per_second": 1.559, "step": 78 }, { "epoch": 4.9375, "grad_norm": 2.953125, "learning_rate": 2.5632911392405062e-05, "loss": 0.1308, "step": 79 }, { "epoch": 4.9375, "eval_accuracy": 0.82, "eval_loss": 0.4119318425655365, "eval_runtime": 5.3506, "eval_samples_per_second": 46.723, "eval_steps_per_second": 1.495, "step": 79 }, { "epoch": 5.0, "grad_norm": 3.078125, "learning_rate": 2.5316455696202533e-05, "loss": 0.0556, "step": 80 }, { "epoch": 5.0, "eval_accuracy": 0.82, "eval_loss": 0.4106213450431824, "eval_runtime": 5.0414, "eval_samples_per_second": 49.589, "eval_steps_per_second": 1.587, "step": 80 }, { "epoch": 5.0625, "grad_norm": 5.4375, "learning_rate": 2.5e-05, "loss": 0.1657, "step": 81 }, { "epoch": 5.0625, "eval_accuracy": 0.816, "eval_loss": 0.40592437982559204, "eval_runtime": 4.991, "eval_samples_per_second": 50.09, "eval_steps_per_second": 1.603, "step": 81 }, { "epoch": 5.125, "grad_norm": 3.125, "learning_rate": 2.468354430379747e-05, "loss": 0.1037, "step": 82 }, { "epoch": 5.125, "eval_accuracy": 0.816, "eval_loss": 0.40300390124320984, "eval_runtime": 4.9485, "eval_samples_per_second": 50.521, "eval_steps_per_second": 1.617, "step": 82 }, { "epoch": 5.1875, "grad_norm": 2.78125, "learning_rate": 2.4367088607594937e-05, "loss": 0.0676, "step": 83 }, { "epoch": 5.1875, "eval_accuracy": 0.824, "eval_loss": 0.39928579330444336, "eval_runtime": 5.1773, "eval_samples_per_second": 48.287, "eval_steps_per_second": 1.545, "step": 83 }, { "epoch": 5.25, "grad_norm": 3.625, "learning_rate": 2.4050632911392405e-05, "loss": 0.0513, "step": 84 }, { "epoch": 5.25, "eval_accuracy": 0.812, "eval_loss": 0.3965282738208771, "eval_runtime": 4.9096, "eval_samples_per_second": 50.921, "eval_steps_per_second": 1.629, "step": 84 }, { "epoch": 5.3125, "grad_norm": 2.84375, "learning_rate": 2.3734177215189873e-05, "loss": 0.0539, "step": 85 }, { "epoch": 5.3125, "eval_accuracy": 0.816, "eval_loss": 0.3946448862552643, "eval_runtime": 4.9127, "eval_samples_per_second": 50.889, "eval_steps_per_second": 1.628, "step": 85 }, { "epoch": 5.375, "grad_norm": 2.40625, "learning_rate": 2.341772151898734e-05, "loss": 0.0757, "step": 86 }, { "epoch": 5.375, "eval_accuracy": 0.816, "eval_loss": 0.3956676721572876, "eval_runtime": 4.9093, "eval_samples_per_second": 50.924, "eval_steps_per_second": 1.63, "step": 86 }, { "epoch": 5.4375, "grad_norm": 1.4296875, "learning_rate": 2.3101265822784813e-05, "loss": 0.0394, "step": 87 }, { "epoch": 5.4375, "eval_accuracy": 0.812, "eval_loss": 0.4012235105037689, "eval_runtime": 4.9087, "eval_samples_per_second": 50.93, "eval_steps_per_second": 1.63, "step": 87 }, { "epoch": 5.5, "grad_norm": 1.2578125, "learning_rate": 2.278481012658228e-05, "loss": 0.034, "step": 88 }, { "epoch": 5.5, "eval_accuracy": 0.804, "eval_loss": 0.40770140290260315, "eval_runtime": 4.9931, "eval_samples_per_second": 50.069, "eval_steps_per_second": 1.602, "step": 88 }, { "epoch": 5.5625, "grad_norm": 3.9375, "learning_rate": 2.246835443037975e-05, "loss": 0.0601, "step": 89 }, { "epoch": 5.5625, "eval_accuracy": 0.804, "eval_loss": 0.4079342782497406, "eval_runtime": 5.4428, "eval_samples_per_second": 45.932, "eval_steps_per_second": 1.47, "step": 89 }, { "epoch": 5.625, "grad_norm": 2.609375, "learning_rate": 2.2151898734177217e-05, "loss": 0.0402, "step": 90 }, { "epoch": 5.625, "eval_accuracy": 0.812, "eval_loss": 0.4079940915107727, "eval_runtime": 5.2545, "eval_samples_per_second": 47.578, "eval_steps_per_second": 1.523, "step": 90 }, { "epoch": 5.6875, "grad_norm": 2.171875, "learning_rate": 2.1835443037974685e-05, "loss": 0.0443, "step": 91 }, { "epoch": 5.6875, "eval_accuracy": 0.824, "eval_loss": 0.4047956168651581, "eval_runtime": 5.1747, "eval_samples_per_second": 48.312, "eval_steps_per_second": 1.546, "step": 91 }, { "epoch": 5.75, "grad_norm": 1.0, "learning_rate": 2.1518987341772153e-05, "loss": 0.0213, "step": 92 }, { "epoch": 5.75, "eval_accuracy": 0.824, "eval_loss": 0.4049394130706787, "eval_runtime": 5.3443, "eval_samples_per_second": 46.779, "eval_steps_per_second": 1.497, "step": 92 }, { "epoch": 5.8125, "grad_norm": 3.09375, "learning_rate": 2.120253164556962e-05, "loss": 0.05, "step": 93 }, { "epoch": 5.8125, "eval_accuracy": 0.824, "eval_loss": 0.40513497591018677, "eval_runtime": 4.9931, "eval_samples_per_second": 50.069, "eval_steps_per_second": 1.602, "step": 93 }, { "epoch": 5.875, "grad_norm": 1.171875, "learning_rate": 2.088607594936709e-05, "loss": 0.0321, "step": 94 }, { "epoch": 5.875, "eval_accuracy": 0.832, "eval_loss": 0.4072630703449249, "eval_runtime": 4.9859, "eval_samples_per_second": 50.141, "eval_steps_per_second": 1.605, "step": 94 }, { "epoch": 5.9375, "grad_norm": 1.453125, "learning_rate": 2.056962025316456e-05, "loss": 0.0315, "step": 95 }, { "epoch": 5.9375, "eval_accuracy": 0.832, "eval_loss": 0.40790075063705444, "eval_runtime": 4.9401, "eval_samples_per_second": 50.606, "eval_steps_per_second": 1.619, "step": 95 }, { "epoch": 6.0, "grad_norm": 1.265625, "learning_rate": 2.0253164556962025e-05, "loss": 0.0207, "step": 96 }, { "epoch": 6.0, "eval_accuracy": 0.832, "eval_loss": 0.4082264304161072, "eval_runtime": 4.9371, "eval_samples_per_second": 50.637, "eval_steps_per_second": 1.62, "step": 96 }, { "epoch": 6.0625, "grad_norm": 1.828125, "learning_rate": 1.9936708860759496e-05, "loss": 0.0235, "step": 97 }, { "epoch": 6.0625, "eval_accuracy": 0.828, "eval_loss": 0.4142768681049347, "eval_runtime": 4.9339, "eval_samples_per_second": 50.67, "eval_steps_per_second": 1.621, "step": 97 }, { "epoch": 6.125, "grad_norm": 2.0625, "learning_rate": 1.962025316455696e-05, "loss": 0.0247, "step": 98 }, { "epoch": 6.125, "eval_accuracy": 0.832, "eval_loss": 0.41069871187210083, "eval_runtime": 4.9302, "eval_samples_per_second": 50.708, "eval_steps_per_second": 1.623, "step": 98 }, { "epoch": 6.1875, "grad_norm": 1.2265625, "learning_rate": 1.9303797468354432e-05, "loss": 0.0148, "step": 99 }, { "epoch": 6.1875, "eval_accuracy": 0.836, "eval_loss": 0.4168325662612915, "eval_runtime": 4.9333, "eval_samples_per_second": 50.676, "eval_steps_per_second": 1.622, "step": 99 }, { "epoch": 6.25, "grad_norm": 3.078125, "learning_rate": 1.89873417721519e-05, "loss": 0.0365, "step": 100 }, { "epoch": 6.25, "eval_accuracy": 0.84, "eval_loss": 0.4236372709274292, "eval_runtime": 4.9348, "eval_samples_per_second": 50.661, "eval_steps_per_second": 1.621, "step": 100 }, { "epoch": 6.3125, "grad_norm": 1.46875, "learning_rate": 1.8670886075949368e-05, "loss": 0.0211, "step": 101 }, { "epoch": 6.3125, "eval_accuracy": 0.844, "eval_loss": 0.4290027320384979, "eval_runtime": 4.9336, "eval_samples_per_second": 50.673, "eval_steps_per_second": 1.622, "step": 101 }, { "epoch": 6.375, "grad_norm": 1.40625, "learning_rate": 1.8354430379746836e-05, "loss": 0.0183, "step": 102 }, { "epoch": 6.375, "eval_accuracy": 0.844, "eval_loss": 0.4270361661911011, "eval_runtime": 4.9305, "eval_samples_per_second": 50.704, "eval_steps_per_second": 1.623, "step": 102 }, { "epoch": 6.4375, "grad_norm": 1.4375, "learning_rate": 1.8037974683544304e-05, "loss": 0.0224, "step": 103 }, { "epoch": 6.4375, "eval_accuracy": 0.844, "eval_loss": 0.4209546446800232, "eval_runtime": 5.0135, "eval_samples_per_second": 49.865, "eval_steps_per_second": 1.596, "step": 103 }, { "epoch": 6.5, "grad_norm": 3.296875, "learning_rate": 1.7721518987341772e-05, "loss": 0.0437, "step": 104 }, { "epoch": 6.5, "eval_accuracy": 0.844, "eval_loss": 0.4190281331539154, "eval_runtime": 5.1552, "eval_samples_per_second": 48.495, "eval_steps_per_second": 1.552, "step": 104 }, { "epoch": 6.5625, "grad_norm": 1.0859375, "learning_rate": 1.7405063291139243e-05, "loss": 0.0113, "step": 105 }, { "epoch": 6.5625, "eval_accuracy": 0.844, "eval_loss": 0.41917330026626587, "eval_runtime": 5.2448, "eval_samples_per_second": 47.666, "eval_steps_per_second": 1.525, "step": 105 }, { "epoch": 6.625, "grad_norm": 2.96875, "learning_rate": 1.7088607594936708e-05, "loss": 0.0404, "step": 106 }, { "epoch": 6.625, "eval_accuracy": 0.84, "eval_loss": 0.4207764267921448, "eval_runtime": 5.3617, "eval_samples_per_second": 46.627, "eval_steps_per_second": 1.492, "step": 106 }, { "epoch": 6.6875, "grad_norm": 1.1484375, "learning_rate": 1.677215189873418e-05, "loss": 0.0127, "step": 107 }, { "epoch": 6.6875, "eval_accuracy": 0.848, "eval_loss": 0.42192724347114563, "eval_runtime": 5.0116, "eval_samples_per_second": 49.884, "eval_steps_per_second": 1.596, "step": 107 }, { "epoch": 6.75, "grad_norm": 0.62109375, "learning_rate": 1.6455696202531644e-05, "loss": 0.011, "step": 108 }, { "epoch": 6.75, "eval_accuracy": 0.848, "eval_loss": 0.4295770227909088, "eval_runtime": 5.1327, "eval_samples_per_second": 48.708, "eval_steps_per_second": 1.559, "step": 108 }, { "epoch": 6.8125, "grad_norm": 0.80078125, "learning_rate": 1.6139240506329115e-05, "loss": 0.0088, "step": 109 }, { "epoch": 6.8125, "eval_accuracy": 0.848, "eval_loss": 0.4343023896217346, "eval_runtime": 5.1895, "eval_samples_per_second": 48.174, "eval_steps_per_second": 1.542, "step": 109 }, { "epoch": 6.875, "grad_norm": 2.734375, "learning_rate": 1.5822784810126583e-05, "loss": 0.0285, "step": 110 }, { "epoch": 6.875, "eval_accuracy": 0.852, "eval_loss": 0.4316946268081665, "eval_runtime": 4.9676, "eval_samples_per_second": 50.326, "eval_steps_per_second": 1.61, "step": 110 }, { "epoch": 6.9375, "grad_norm": 1.984375, "learning_rate": 1.550632911392405e-05, "loss": 0.0179, "step": 111 }, { "epoch": 6.9375, "eval_accuracy": 0.852, "eval_loss": 0.43008503317832947, "eval_runtime": 4.9342, "eval_samples_per_second": 50.666, "eval_steps_per_second": 1.621, "step": 111 }, { "epoch": 7.0, "grad_norm": 2.15625, "learning_rate": 1.5189873417721521e-05, "loss": 0.0207, "step": 112 }, { "epoch": 7.0, "eval_accuracy": 0.852, "eval_loss": 0.4265301525592804, "eval_runtime": 4.9341, "eval_samples_per_second": 50.668, "eval_steps_per_second": 1.621, "step": 112 }, { "epoch": 7.0625, "grad_norm": 0.3671875, "learning_rate": 1.4873417721518987e-05, "loss": 0.005, "step": 113 }, { "epoch": 7.0625, "eval_accuracy": 0.84, "eval_loss": 0.42337584495544434, "eval_runtime": 4.9377, "eval_samples_per_second": 50.631, "eval_steps_per_second": 1.62, "step": 113 }, { "epoch": 7.125, "grad_norm": 0.408203125, "learning_rate": 1.4556962025316457e-05, "loss": 0.0072, "step": 114 }, { "epoch": 7.125, "eval_accuracy": 0.84, "eval_loss": 0.42673492431640625, "eval_runtime": 4.9384, "eval_samples_per_second": 50.624, "eval_steps_per_second": 1.62, "step": 114 }, { "epoch": 7.1875, "grad_norm": 0.6796875, "learning_rate": 1.4240506329113925e-05, "loss": 0.0105, "step": 115 }, { "epoch": 7.1875, "eval_accuracy": 0.84, "eval_loss": 0.4281560778617859, "eval_runtime": 4.9383, "eval_samples_per_second": 50.624, "eval_steps_per_second": 1.62, "step": 115 }, { "epoch": 7.25, "grad_norm": 0.4375, "learning_rate": 1.3924050632911393e-05, "loss": 0.0062, "step": 116 }, { "epoch": 7.25, "eval_accuracy": 0.836, "eval_loss": 0.430206298828125, "eval_runtime": 4.9311, "eval_samples_per_second": 50.698, "eval_steps_per_second": 1.622, "step": 116 }, { "epoch": 7.3125, "grad_norm": 1.4453125, "learning_rate": 1.3607594936708861e-05, "loss": 0.015, "step": 117 }, { "epoch": 7.3125, "eval_accuracy": 0.844, "eval_loss": 0.43175867199897766, "eval_runtime": 4.9439, "eval_samples_per_second": 50.568, "eval_steps_per_second": 1.618, "step": 117 }, { "epoch": 7.375, "grad_norm": 0.86328125, "learning_rate": 1.3291139240506329e-05, "loss": 0.0094, "step": 118 }, { "epoch": 7.375, "eval_accuracy": 0.84, "eval_loss": 0.4308719336986542, "eval_runtime": 4.9314, "eval_samples_per_second": 50.696, "eval_steps_per_second": 1.622, "step": 118 }, { "epoch": 7.4375, "grad_norm": 1.03125, "learning_rate": 1.2974683544303799e-05, "loss": 0.0086, "step": 119 }, { "epoch": 7.4375, "eval_accuracy": 0.84, "eval_loss": 0.4366334080696106, "eval_runtime": 4.9431, "eval_samples_per_second": 50.576, "eval_steps_per_second": 1.618, "step": 119 }, { "epoch": 7.5, "grad_norm": 0.2021484375, "learning_rate": 1.2658227848101267e-05, "loss": 0.0035, "step": 120 }, { "epoch": 7.5, "eval_accuracy": 0.848, "eval_loss": 0.44147494435310364, "eval_runtime": 4.9422, "eval_samples_per_second": 50.585, "eval_steps_per_second": 1.619, "step": 120 }, { "epoch": 7.5625, "grad_norm": 1.015625, "learning_rate": 1.2341772151898735e-05, "loss": 0.0088, "step": 121 }, { "epoch": 7.5625, "eval_accuracy": 0.848, "eval_loss": 0.4474000036716461, "eval_runtime": 4.943, "eval_samples_per_second": 50.577, "eval_steps_per_second": 1.618, "step": 121 }, { "epoch": 7.625, "grad_norm": 0.3359375, "learning_rate": 1.2025316455696203e-05, "loss": 0.0062, "step": 122 }, { "epoch": 7.625, "eval_accuracy": 0.848, "eval_loss": 0.44968822598457336, "eval_runtime": 4.9362, "eval_samples_per_second": 50.646, "eval_steps_per_second": 1.621, "step": 122 }, { "epoch": 7.6875, "grad_norm": 1.1953125, "learning_rate": 1.170886075949367e-05, "loss": 0.01, "step": 123 }, { "epoch": 7.6875, "eval_accuracy": 0.848, "eval_loss": 0.45175397396087646, "eval_runtime": 4.885, "eval_samples_per_second": 51.177, "eval_steps_per_second": 1.638, "step": 123 }, { "epoch": 7.75, "grad_norm": 0.6328125, "learning_rate": 1.139240506329114e-05, "loss": 0.008, "step": 124 }, { "epoch": 7.75, "eval_accuracy": 0.848, "eval_loss": 0.45391473174095154, "eval_runtime": 4.9403, "eval_samples_per_second": 50.604, "eval_steps_per_second": 1.619, "step": 124 }, { "epoch": 7.8125, "grad_norm": 0.431640625, "learning_rate": 1.1075949367088608e-05, "loss": 0.0071, "step": 125 }, { "epoch": 7.8125, "eval_accuracy": 0.852, "eval_loss": 0.4616769552230835, "eval_runtime": 4.9324, "eval_samples_per_second": 50.685, "eval_steps_per_second": 1.622, "step": 125 }, { "epoch": 7.875, "grad_norm": 0.72265625, "learning_rate": 1.0759493670886076e-05, "loss": 0.0065, "step": 126 }, { "epoch": 7.875, "eval_accuracy": 0.856, "eval_loss": 0.4597095251083374, "eval_runtime": 4.9365, "eval_samples_per_second": 50.643, "eval_steps_per_second": 1.621, "step": 126 }, { "epoch": 7.9375, "grad_norm": 0.361328125, "learning_rate": 1.0443037974683544e-05, "loss": 0.004, "step": 127 }, { "epoch": 7.9375, "eval_accuracy": 0.856, "eval_loss": 0.4604904055595398, "eval_runtime": 4.9367, "eval_samples_per_second": 50.641, "eval_steps_per_second": 1.621, "step": 127 }, { "epoch": 8.0, "grad_norm": 0.2099609375, "learning_rate": 1.0126582278481012e-05, "loss": 0.0027, "step": 128 }, { "epoch": 8.0, "eval_accuracy": 0.856, "eval_loss": 0.46479332447052, "eval_runtime": 4.9361, "eval_samples_per_second": 50.648, "eval_steps_per_second": 1.621, "step": 128 }, { "epoch": 8.0625, "grad_norm": 0.224609375, "learning_rate": 9.81012658227848e-06, "loss": 0.0029, "step": 129 }, { "epoch": 8.0625, "eval_accuracy": 0.852, "eval_loss": 0.47178688645362854, "eval_runtime": 4.9337, "eval_samples_per_second": 50.672, "eval_steps_per_second": 1.622, "step": 129 }, { "epoch": 8.125, "grad_norm": 0.404296875, "learning_rate": 9.49367088607595e-06, "loss": 0.0058, "step": 130 }, { "epoch": 8.125, "eval_accuracy": 0.852, "eval_loss": 0.4701623022556305, "eval_runtime": 4.9934, "eval_samples_per_second": 50.066, "eval_steps_per_second": 1.602, "step": 130 }, { "epoch": 8.1875, "grad_norm": 0.39453125, "learning_rate": 9.177215189873418e-06, "loss": 0.0036, "step": 131 }, { "epoch": 8.1875, "eval_accuracy": 0.852, "eval_loss": 0.4724094867706299, "eval_runtime": 5.0541, "eval_samples_per_second": 49.464, "eval_steps_per_second": 1.583, "step": 131 }, { "epoch": 8.25, "grad_norm": 0.734375, "learning_rate": 8.860759493670886e-06, "loss": 0.0061, "step": 132 }, { "epoch": 8.25, "eval_accuracy": 0.852, "eval_loss": 0.47215989232063293, "eval_runtime": 5.2055, "eval_samples_per_second": 48.026, "eval_steps_per_second": 1.537, "step": 132 }, { "epoch": 8.3125, "grad_norm": 0.345703125, "learning_rate": 8.544303797468354e-06, "loss": 0.0044, "step": 133 }, { "epoch": 8.3125, "eval_accuracy": 0.852, "eval_loss": 0.47440290451049805, "eval_runtime": 4.9325, "eval_samples_per_second": 50.684, "eval_steps_per_second": 1.622, "step": 133 }, { "epoch": 8.375, "grad_norm": 0.43359375, "learning_rate": 8.227848101265822e-06, "loss": 0.0052, "step": 134 }, { "epoch": 8.375, "eval_accuracy": 0.852, "eval_loss": 0.4775030016899109, "eval_runtime": 5.0459, "eval_samples_per_second": 49.545, "eval_steps_per_second": 1.585, "step": 134 }, { "epoch": 8.4375, "grad_norm": 0.291015625, "learning_rate": 7.911392405063292e-06, "loss": 0.0058, "step": 135 }, { "epoch": 8.4375, "eval_accuracy": 0.856, "eval_loss": 0.47711512446403503, "eval_runtime": 4.9135, "eval_samples_per_second": 50.88, "eval_steps_per_second": 1.628, "step": 135 }, { "epoch": 8.5, "grad_norm": 0.1650390625, "learning_rate": 7.5949367088607605e-06, "loss": 0.0023, "step": 136 }, { "epoch": 8.5, "eval_accuracy": 0.86, "eval_loss": 0.4801058769226074, "eval_runtime": 4.9192, "eval_samples_per_second": 50.821, "eval_steps_per_second": 1.626, "step": 136 }, { "epoch": 8.5625, "grad_norm": 0.5703125, "learning_rate": 7.2784810126582285e-06, "loss": 0.0065, "step": 137 }, { "epoch": 8.5625, "eval_accuracy": 0.864, "eval_loss": 0.47905686497688293, "eval_runtime": 5.0724, "eval_samples_per_second": 49.287, "eval_steps_per_second": 1.577, "step": 137 }, { "epoch": 8.625, "grad_norm": 0.291015625, "learning_rate": 6.9620253164556965e-06, "loss": 0.0026, "step": 138 }, { "epoch": 8.625, "eval_accuracy": 0.856, "eval_loss": 0.48366859555244446, "eval_runtime": 4.9162, "eval_samples_per_second": 50.852, "eval_steps_per_second": 1.627, "step": 138 }, { "epoch": 8.6875, "grad_norm": 0.4140625, "learning_rate": 6.6455696202531645e-06, "loss": 0.0044, "step": 139 }, { "epoch": 8.6875, "eval_accuracy": 0.864, "eval_loss": 0.48236754536628723, "eval_runtime": 4.9088, "eval_samples_per_second": 50.929, "eval_steps_per_second": 1.63, "step": 139 }, { "epoch": 8.75, "grad_norm": 0.154296875, "learning_rate": 6.329113924050633e-06, "loss": 0.0024, "step": 140 }, { "epoch": 8.75, "eval_accuracy": 0.852, "eval_loss": 0.4821201264858246, "eval_runtime": 4.9092, "eval_samples_per_second": 50.925, "eval_steps_per_second": 1.63, "step": 140 }, { "epoch": 8.8125, "grad_norm": 0.19140625, "learning_rate": 6.012658227848101e-06, "loss": 0.003, "step": 141 }, { "epoch": 8.8125, "eval_accuracy": 0.86, "eval_loss": 0.4807308316230774, "eval_runtime": 4.9123, "eval_samples_per_second": 50.892, "eval_steps_per_second": 1.629, "step": 141 }, { "epoch": 8.875, "grad_norm": 0.337890625, "learning_rate": 5.69620253164557e-06, "loss": 0.0036, "step": 142 }, { "epoch": 8.875, "eval_accuracy": 0.864, "eval_loss": 0.4836767613887787, "eval_runtime": 4.915, "eval_samples_per_second": 50.864, "eval_steps_per_second": 1.628, "step": 142 }, { "epoch": 8.9375, "grad_norm": 0.251953125, "learning_rate": 5.379746835443038e-06, "loss": 0.0025, "step": 143 }, { "epoch": 8.9375, "eval_accuracy": 0.86, "eval_loss": 0.4841257631778717, "eval_runtime": 4.9119, "eval_samples_per_second": 50.897, "eval_steps_per_second": 1.629, "step": 143 }, { "epoch": 9.0, "grad_norm": 0.53515625, "learning_rate": 5.063291139240506e-06, "loss": 0.0052, "step": 144 }, { "epoch": 9.0, "eval_accuracy": 0.86, "eval_loss": 0.48218366503715515, "eval_runtime": 4.9085, "eval_samples_per_second": 50.932, "eval_steps_per_second": 1.63, "step": 144 }, { "epoch": 9.0625, "grad_norm": 0.1767578125, "learning_rate": 4.746835443037975e-06, "loss": 0.0022, "step": 145 }, { "epoch": 9.0625, "eval_accuracy": 0.86, "eval_loss": 0.4810963273048401, "eval_runtime": 4.9111, "eval_samples_per_second": 50.905, "eval_steps_per_second": 1.629, "step": 145 }, { "epoch": 9.125, "grad_norm": 0.44140625, "learning_rate": 4.430379746835443e-06, "loss": 0.007, "step": 146 }, { "epoch": 9.125, "eval_accuracy": 0.864, "eval_loss": 0.482108473777771, "eval_runtime": 4.9117, "eval_samples_per_second": 50.899, "eval_steps_per_second": 1.629, "step": 146 }, { "epoch": 9.1875, "grad_norm": 0.17578125, "learning_rate": 4.113924050632911e-06, "loss": 0.0033, "step": 147 }, { "epoch": 9.1875, "eval_accuracy": 0.856, "eval_loss": 0.48591578006744385, "eval_runtime": 4.9102, "eval_samples_per_second": 50.914, "eval_steps_per_second": 1.629, "step": 147 }, { "epoch": 9.25, "grad_norm": 0.1552734375, "learning_rate": 3.7974683544303802e-06, "loss": 0.0025, "step": 148 }, { "epoch": 9.25, "eval_accuracy": 0.86, "eval_loss": 0.4883337914943695, "eval_runtime": 4.9105, "eval_samples_per_second": 50.912, "eval_steps_per_second": 1.629, "step": 148 }, { "epoch": 9.3125, "grad_norm": 0.302734375, "learning_rate": 3.4810126582278482e-06, "loss": 0.0049, "step": 149 }, { "epoch": 9.3125, "eval_accuracy": 0.864, "eval_loss": 0.4839572012424469, "eval_runtime": 4.9139, "eval_samples_per_second": 50.876, "eval_steps_per_second": 1.628, "step": 149 }, { "epoch": 9.375, "grad_norm": 0.1611328125, "learning_rate": 3.1645569620253167e-06, "loss": 0.0026, "step": 150 }, { "epoch": 9.375, "eval_accuracy": 0.856, "eval_loss": 0.4814904034137726, "eval_runtime": 4.9134, "eval_samples_per_second": 50.881, "eval_steps_per_second": 1.628, "step": 150 }, { "epoch": 9.4375, "grad_norm": 0.2177734375, "learning_rate": 2.848101265822785e-06, "loss": 0.0021, "step": 151 }, { "epoch": 9.4375, "eval_accuracy": 0.86, "eval_loss": 0.4863939881324768, "eval_runtime": 4.914, "eval_samples_per_second": 50.875, "eval_steps_per_second": 1.628, "step": 151 }, { "epoch": 9.5, "grad_norm": 0.1484375, "learning_rate": 2.531645569620253e-06, "loss": 0.0023, "step": 152 }, { "epoch": 9.5, "eval_accuracy": 0.864, "eval_loss": 0.4879688024520874, "eval_runtime": 4.9108, "eval_samples_per_second": 50.909, "eval_steps_per_second": 1.629, "step": 152 }, { "epoch": 9.5625, "grad_norm": 0.263671875, "learning_rate": 2.2151898734177215e-06, "loss": 0.0039, "step": 153 }, { "epoch": 9.5625, "eval_accuracy": 0.86, "eval_loss": 0.4850555956363678, "eval_runtime": 4.9088, "eval_samples_per_second": 50.929, "eval_steps_per_second": 1.63, "step": 153 }, { "epoch": 9.625, "grad_norm": 0.208984375, "learning_rate": 1.8987341772151901e-06, "loss": 0.0025, "step": 154 }, { "epoch": 9.625, "eval_accuracy": 0.856, "eval_loss": 0.48572131991386414, "eval_runtime": 4.9101, "eval_samples_per_second": 50.916, "eval_steps_per_second": 1.629, "step": 154 }, { "epoch": 9.6875, "grad_norm": 0.21484375, "learning_rate": 1.5822784810126583e-06, "loss": 0.004, "step": 155 }, { "epoch": 9.6875, "eval_accuracy": 0.864, "eval_loss": 0.4816167652606964, "eval_runtime": 4.9101, "eval_samples_per_second": 50.915, "eval_steps_per_second": 1.629, "step": 155 }, { "epoch": 9.75, "grad_norm": 0.80078125, "learning_rate": 1.2658227848101265e-06, "loss": 0.0088, "step": 156 }, { "epoch": 9.75, "eval_accuracy": 0.86, "eval_loss": 0.4846898913383484, "eval_runtime": 5.0723, "eval_samples_per_second": 49.288, "eval_steps_per_second": 1.577, "step": 156 }, { "epoch": 9.8125, "grad_norm": 0.2138671875, "learning_rate": 9.493670886075951e-07, "loss": 0.0035, "step": 157 }, { "epoch": 9.8125, "eval_accuracy": 0.86, "eval_loss": 0.48494383692741394, "eval_runtime": 4.9092, "eval_samples_per_second": 50.925, "eval_steps_per_second": 1.63, "step": 157 }, { "epoch": 9.875, "grad_norm": 0.1708984375, "learning_rate": 6.329113924050633e-07, "loss": 0.0027, "step": 158 }, { "epoch": 9.875, "eval_accuracy": 0.852, "eval_loss": 0.4813316762447357, "eval_runtime": 4.9095, "eval_samples_per_second": 50.922, "eval_steps_per_second": 1.629, "step": 158 }, { "epoch": 9.9375, "grad_norm": 0.2099609375, "learning_rate": 3.1645569620253163e-07, "loss": 0.0022, "step": 159 }, { "epoch": 9.9375, "eval_accuracy": 0.86, "eval_loss": 0.48336902260780334, "eval_runtime": 4.911, "eval_samples_per_second": 50.906, "eval_steps_per_second": 1.629, "step": 159 }, { "epoch": 10.0, "grad_norm": 0.154296875, "learning_rate": 0.0, "loss": 0.0022, "step": 160 }, { "epoch": 10.0, "eval_accuracy": 0.86, "eval_loss": 0.483336478471756, "eval_runtime": 4.9132, "eval_samples_per_second": 50.883, "eval_steps_per_second": 1.628, "step": 160 }, { "epoch": 10.0, "step": 160, "total_flos": 6.923398955820646e+16, "train_loss": 0.22855629230616614, "train_runtime": 1357.293, "train_samples_per_second": 7.368, "train_steps_per_second": 0.118 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.923398955820646e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }