{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 17.75647735595703, "learning_rate": 2.5e-05, "loss": 0.8699, "step": 1 }, { "epoch": 0.0625, "eval_accuracy": 0.496, "eval_loss": 0.837658703327179, "eval_runtime": 4.5469, "eval_samples_per_second": 54.983, "eval_steps_per_second": 1.759, "step": 1 }, { "epoch": 0.125, "grad_norm": 10.630256652832031, "learning_rate": 5e-05, "loss": 0.8372, "step": 2 }, { "epoch": 0.125, "eval_accuracy": 0.496, "eval_loss": 0.8297348618507385, "eval_runtime": 4.4856, "eval_samples_per_second": 55.733, "eval_steps_per_second": 1.783, "step": 2 }, { "epoch": 0.1875, "grad_norm": 18.88325309753418, "learning_rate": 4.968354430379747e-05, "loss": 0.8813, "step": 3 }, { "epoch": 0.1875, "eval_accuracy": 0.496, "eval_loss": 0.8005664348602295, "eval_runtime": 4.5427, "eval_samples_per_second": 55.034, "eval_steps_per_second": 1.761, "step": 3 }, { "epoch": 0.25, "grad_norm": 14.382352828979492, "learning_rate": 4.936708860759494e-05, "loss": 0.8725, "step": 4 }, { "epoch": 0.25, "eval_accuracy": 0.5, "eval_loss": 0.7717475295066833, "eval_runtime": 4.495, "eval_samples_per_second": 55.618, "eval_steps_per_second": 1.78, "step": 4 }, { "epoch": 0.3125, "grad_norm": 18.139081954956055, "learning_rate": 4.905063291139241e-05, "loss": 0.8504, "step": 5 }, { "epoch": 0.3125, "eval_accuracy": 0.496, "eval_loss": 0.7444770336151123, "eval_runtime": 4.5407, "eval_samples_per_second": 55.057, "eval_steps_per_second": 1.762, "step": 5 }, { "epoch": 0.375, "grad_norm": 7.32247257232666, "learning_rate": 4.8734177215189874e-05, "loss": 0.8015, "step": 6 }, { "epoch": 0.375, "eval_accuracy": 0.5, "eval_loss": 0.7214609384536743, "eval_runtime": 4.5353, "eval_samples_per_second": 55.123, "eval_steps_per_second": 1.764, "step": 6 }, { "epoch": 0.4375, "grad_norm": 4.8535966873168945, "learning_rate": 4.8417721518987346e-05, "loss": 0.7165, "step": 7 }, { "epoch": 0.4375, "eval_accuracy": 0.512, "eval_loss": 0.7035966515541077, "eval_runtime": 4.5437, "eval_samples_per_second": 55.021, "eval_steps_per_second": 1.761, "step": 7 }, { "epoch": 0.5, "grad_norm": 7.805424690246582, "learning_rate": 4.810126582278481e-05, "loss": 0.766, "step": 8 }, { "epoch": 0.5, "eval_accuracy": 0.52, "eval_loss": 0.6936992406845093, "eval_runtime": 4.5422, "eval_samples_per_second": 55.04, "eval_steps_per_second": 1.761, "step": 8 }, { "epoch": 0.5625, "grad_norm": 2.8888654708862305, "learning_rate": 4.778481012658228e-05, "loss": 0.6915, "step": 9 }, { "epoch": 0.5625, "eval_accuracy": 0.524, "eval_loss": 0.6935371160507202, "eval_runtime": 4.4937, "eval_samples_per_second": 55.633, "eval_steps_per_second": 1.78, "step": 9 }, { "epoch": 0.625, "grad_norm": 7.086565971374512, "learning_rate": 4.7468354430379746e-05, "loss": 0.7093, "step": 10 }, { "epoch": 0.625, "eval_accuracy": 0.532, "eval_loss": 0.6983515620231628, "eval_runtime": 4.4948, "eval_samples_per_second": 55.619, "eval_steps_per_second": 1.78, "step": 10 }, { "epoch": 0.6875, "grad_norm": 2.0717809200286865, "learning_rate": 4.715189873417722e-05, "loss": 0.7107, "step": 11 }, { "epoch": 0.6875, "eval_accuracy": 0.524, "eval_loss": 0.7018242478370667, "eval_runtime": 4.5362, "eval_samples_per_second": 55.112, "eval_steps_per_second": 1.764, "step": 11 }, { "epoch": 0.75, "grad_norm": 4.748137474060059, "learning_rate": 4.683544303797468e-05, "loss": 0.729, "step": 12 }, { "epoch": 0.75, "eval_accuracy": 0.524, "eval_loss": 0.7031621336936951, "eval_runtime": 4.5441, "eval_samples_per_second": 55.017, "eval_steps_per_second": 1.761, "step": 12 }, { "epoch": 0.8125, "grad_norm": 5.206336975097656, "learning_rate": 4.6518987341772154e-05, "loss": 0.7819, "step": 13 }, { "epoch": 0.8125, "eval_accuracy": 0.524, "eval_loss": 0.7015683650970459, "eval_runtime": 4.5432, "eval_samples_per_second": 55.028, "eval_steps_per_second": 1.761, "step": 13 }, { "epoch": 0.875, "grad_norm": 5.746586322784424, "learning_rate": 4.6202531645569625e-05, "loss": 0.7343, "step": 14 }, { "epoch": 0.875, "eval_accuracy": 0.524, "eval_loss": 0.6985683441162109, "eval_runtime": 4.5466, "eval_samples_per_second": 54.986, "eval_steps_per_second": 1.76, "step": 14 }, { "epoch": 0.9375, "grad_norm": 2.441669464111328, "learning_rate": 4.588607594936709e-05, "loss": 0.7052, "step": 15 }, { "epoch": 0.9375, "eval_accuracy": 0.536, "eval_loss": 0.6962031126022339, "eval_runtime": 4.5479, "eval_samples_per_second": 54.97, "eval_steps_per_second": 1.759, "step": 15 }, { "epoch": 1.0, "grad_norm": 8.590319633483887, "learning_rate": 4.556962025316456e-05, "loss": 0.7406, "step": 16 }, { "epoch": 1.0, "eval_accuracy": 0.54, "eval_loss": 0.6932148337364197, "eval_runtime": 4.5474, "eval_samples_per_second": 54.977, "eval_steps_per_second": 1.759, "step": 16 }, { "epoch": 1.0625, "grad_norm": 3.343947172164917, "learning_rate": 4.525316455696203e-05, "loss": 0.6939, "step": 17 }, { "epoch": 1.0625, "eval_accuracy": 0.524, "eval_loss": 0.6910995841026306, "eval_runtime": 4.5426, "eval_samples_per_second": 55.035, "eval_steps_per_second": 1.761, "step": 17 }, { "epoch": 1.125, "grad_norm": 5.235752582550049, "learning_rate": 4.49367088607595e-05, "loss": 0.6857, "step": 18 }, { "epoch": 1.125, "eval_accuracy": 0.528, "eval_loss": 0.690261721611023, "eval_runtime": 4.5449, "eval_samples_per_second": 55.006, "eval_steps_per_second": 1.76, "step": 18 }, { "epoch": 1.1875, "grad_norm": 8.583759307861328, "learning_rate": 4.462025316455696e-05, "loss": 0.6932, "step": 19 }, { "epoch": 1.1875, "eval_accuracy": 0.52, "eval_loss": 0.6894707083702087, "eval_runtime": 4.5371, "eval_samples_per_second": 55.101, "eval_steps_per_second": 1.763, "step": 19 }, { "epoch": 1.25, "grad_norm": 9.518383979797363, "learning_rate": 4.430379746835443e-05, "loss": 0.6993, "step": 20 }, { "epoch": 1.25, "eval_accuracy": 0.516, "eval_loss": 0.6895566582679749, "eval_runtime": 4.5489, "eval_samples_per_second": 54.958, "eval_steps_per_second": 1.759, "step": 20 }, { "epoch": 1.3125, "grad_norm": 7.239161014556885, "learning_rate": 4.3987341772151904e-05, "loss": 0.763, "step": 21 }, { "epoch": 1.3125, "eval_accuracy": 0.52, "eval_loss": 0.690017580986023, "eval_runtime": 4.4884, "eval_samples_per_second": 55.699, "eval_steps_per_second": 1.782, "step": 21 }, { "epoch": 1.375, "grad_norm": 7.390464782714844, "learning_rate": 4.367088607594937e-05, "loss": 0.6954, "step": 22 }, { "epoch": 1.375, "eval_accuracy": 0.512, "eval_loss": 0.6905702948570251, "eval_runtime": 4.5375, "eval_samples_per_second": 55.097, "eval_steps_per_second": 1.763, "step": 22 }, { "epoch": 1.4375, "grad_norm": 3.728330135345459, "learning_rate": 4.3354430379746834e-05, "loss": 0.7282, "step": 23 }, { "epoch": 1.4375, "eval_accuracy": 0.524, "eval_loss": 0.690304696559906, "eval_runtime": 4.5427, "eval_samples_per_second": 55.033, "eval_steps_per_second": 1.761, "step": 23 }, { "epoch": 1.5, "grad_norm": 5.795580863952637, "learning_rate": 4.3037974683544305e-05, "loss": 0.7534, "step": 24 }, { "epoch": 1.5, "eval_accuracy": 0.52, "eval_loss": 0.6902949213981628, "eval_runtime": 4.5361, "eval_samples_per_second": 55.113, "eval_steps_per_second": 1.764, "step": 24 }, { "epoch": 1.5625, "grad_norm": 2.294114589691162, "learning_rate": 4.2721518987341776e-05, "loss": 0.7245, "step": 25 }, { "epoch": 1.5625, "eval_accuracy": 0.52, "eval_loss": 0.6901406049728394, "eval_runtime": 4.4936, "eval_samples_per_second": 55.634, "eval_steps_per_second": 1.78, "step": 25 }, { "epoch": 1.625, "grad_norm": 7.018092155456543, "learning_rate": 4.240506329113924e-05, "loss": 0.692, "step": 26 }, { "epoch": 1.625, "eval_accuracy": 0.508, "eval_loss": 0.6903828382492065, "eval_runtime": 4.5453, "eval_samples_per_second": 55.002, "eval_steps_per_second": 1.76, "step": 26 }, { "epoch": 1.6875, "grad_norm": 7.2787909507751465, "learning_rate": 4.208860759493671e-05, "loss": 0.754, "step": 27 }, { "epoch": 1.6875, "eval_accuracy": 0.512, "eval_loss": 0.6910136938095093, "eval_runtime": 4.5501, "eval_samples_per_second": 54.944, "eval_steps_per_second": 1.758, "step": 27 }, { "epoch": 1.75, "grad_norm": 6.70567512512207, "learning_rate": 4.177215189873418e-05, "loss": 0.7132, "step": 28 }, { "epoch": 1.75, "eval_accuracy": 0.508, "eval_loss": 0.691275417804718, "eval_runtime": 4.4942, "eval_samples_per_second": 55.627, "eval_steps_per_second": 1.78, "step": 28 }, { "epoch": 1.8125, "grad_norm": 7.861635208129883, "learning_rate": 4.145569620253165e-05, "loss": 0.7075, "step": 29 }, { "epoch": 1.8125, "eval_accuracy": 0.52, "eval_loss": 0.6910507678985596, "eval_runtime": 4.5518, "eval_samples_per_second": 54.924, "eval_steps_per_second": 1.758, "step": 29 }, { "epoch": 1.875, "grad_norm": 6.021241188049316, "learning_rate": 4.113924050632912e-05, "loss": 0.7013, "step": 30 }, { "epoch": 1.875, "eval_accuracy": 0.528, "eval_loss": 0.6917343735694885, "eval_runtime": 4.5466, "eval_samples_per_second": 54.986, "eval_steps_per_second": 1.76, "step": 30 }, { "epoch": 1.9375, "grad_norm": 4.954082012176514, "learning_rate": 4.0822784810126584e-05, "loss": 0.7059, "step": 31 }, { "epoch": 1.9375, "eval_accuracy": 0.508, "eval_loss": 0.692550778388977, "eval_runtime": 4.5373, "eval_samples_per_second": 55.099, "eval_steps_per_second": 1.763, "step": 31 }, { "epoch": 2.0, "grad_norm": 4.5776824951171875, "learning_rate": 4.050632911392405e-05, "loss": 0.6932, "step": 32 }, { "epoch": 2.0, "eval_accuracy": 0.52, "eval_loss": 0.692632794380188, "eval_runtime": 4.5448, "eval_samples_per_second": 55.008, "eval_steps_per_second": 1.76, "step": 32 }, { "epoch": 2.0625, "grad_norm": 8.209676742553711, "learning_rate": 4.018987341772152e-05, "loss": 0.7083, "step": 33 }, { "epoch": 2.0625, "eval_accuracy": 0.52, "eval_loss": 0.6923945546150208, "eval_runtime": 4.5461, "eval_samples_per_second": 54.993, "eval_steps_per_second": 1.76, "step": 33 }, { "epoch": 2.125, "grad_norm": 2.001976490020752, "learning_rate": 3.987341772151899e-05, "loss": 0.7423, "step": 34 }, { "epoch": 2.125, "eval_accuracy": 0.528, "eval_loss": 0.692144513130188, "eval_runtime": 4.5419, "eval_samples_per_second": 55.043, "eval_steps_per_second": 1.761, "step": 34 }, { "epoch": 2.1875, "grad_norm": 7.856252670288086, "learning_rate": 3.9556962025316456e-05, "loss": 0.6794, "step": 35 }, { "epoch": 2.1875, "eval_accuracy": 0.532, "eval_loss": 0.6922343969345093, "eval_runtime": 4.5401, "eval_samples_per_second": 55.065, "eval_steps_per_second": 1.762, "step": 35 }, { "epoch": 2.25, "grad_norm": 10.469124794006348, "learning_rate": 3.924050632911392e-05, "loss": 0.7089, "step": 36 }, { "epoch": 2.25, "eval_accuracy": 0.52, "eval_loss": 0.6925742030143738, "eval_runtime": 4.4947, "eval_samples_per_second": 55.621, "eval_steps_per_second": 1.78, "step": 36 }, { "epoch": 2.3125, "grad_norm": 12.528965950012207, "learning_rate": 3.89240506329114e-05, "loss": 0.738, "step": 37 }, { "epoch": 2.3125, "eval_accuracy": 0.512, "eval_loss": 0.6928867101669312, "eval_runtime": 4.4904, "eval_samples_per_second": 55.674, "eval_steps_per_second": 1.782, "step": 37 }, { "epoch": 2.375, "grad_norm": 10.900518417358398, "learning_rate": 3.8607594936708864e-05, "loss": 0.6796, "step": 38 }, { "epoch": 2.375, "eval_accuracy": 0.512, "eval_loss": 0.6924609541893005, "eval_runtime": 4.5411, "eval_samples_per_second": 55.052, "eval_steps_per_second": 1.762, "step": 38 }, { "epoch": 2.4375, "grad_norm": 1.5410585403442383, "learning_rate": 3.829113924050633e-05, "loss": 0.6729, "step": 39 }, { "epoch": 2.4375, "eval_accuracy": 0.512, "eval_loss": 0.6923437714576721, "eval_runtime": 4.5412, "eval_samples_per_second": 55.052, "eval_steps_per_second": 1.762, "step": 39 }, { "epoch": 2.5, "grad_norm": 5.861754894256592, "learning_rate": 3.79746835443038e-05, "loss": 0.6589, "step": 40 }, { "epoch": 2.5, "eval_accuracy": 0.512, "eval_loss": 0.6922851800918579, "eval_runtime": 4.5478, "eval_samples_per_second": 54.971, "eval_steps_per_second": 1.759, "step": 40 }, { "epoch": 2.5625, "grad_norm": 2.633316993713379, "learning_rate": 3.765822784810127e-05, "loss": 0.7336, "step": 41 }, { "epoch": 2.5625, "eval_accuracy": 0.512, "eval_loss": 0.6914882659912109, "eval_runtime": 4.4965, "eval_samples_per_second": 55.598, "eval_steps_per_second": 1.779, "step": 41 }, { "epoch": 2.625, "grad_norm": 4.3643693923950195, "learning_rate": 3.7341772151898736e-05, "loss": 0.7018, "step": 42 }, { "epoch": 2.625, "eval_accuracy": 0.524, "eval_loss": 0.690136730670929, "eval_runtime": 4.5418, "eval_samples_per_second": 55.045, "eval_steps_per_second": 1.761, "step": 42 }, { "epoch": 2.6875, "grad_norm": 4.561107158660889, "learning_rate": 3.70253164556962e-05, "loss": 0.7331, "step": 43 }, { "epoch": 2.6875, "eval_accuracy": 0.536, "eval_loss": 0.6878847479820251, "eval_runtime": 4.5431, "eval_samples_per_second": 55.028, "eval_steps_per_second": 1.761, "step": 43 }, { "epoch": 2.75, "grad_norm": 2.425762891769409, "learning_rate": 3.670886075949367e-05, "loss": 0.6961, "step": 44 }, { "epoch": 2.75, "eval_accuracy": 0.544, "eval_loss": 0.6869159936904907, "eval_runtime": 4.5385, "eval_samples_per_second": 55.084, "eval_steps_per_second": 1.763, "step": 44 }, { "epoch": 2.8125, "grad_norm": 7.950039863586426, "learning_rate": 3.639240506329114e-05, "loss": 0.7228, "step": 45 }, { "epoch": 2.8125, "eval_accuracy": 0.544, "eval_loss": 0.6861679553985596, "eval_runtime": 4.5403, "eval_samples_per_second": 55.063, "eval_steps_per_second": 1.762, "step": 45 }, { "epoch": 2.875, "grad_norm": 12.410717964172363, "learning_rate": 3.607594936708861e-05, "loss": 0.7031, "step": 46 }, { "epoch": 2.875, "eval_accuracy": 0.532, "eval_loss": 0.685476541519165, "eval_runtime": 4.5437, "eval_samples_per_second": 55.022, "eval_steps_per_second": 1.761, "step": 46 }, { "epoch": 2.9375, "grad_norm": 3.116471767425537, "learning_rate": 3.575949367088608e-05, "loss": 0.6885, "step": 47 }, { "epoch": 2.9375, "eval_accuracy": 0.544, "eval_loss": 0.6849316358566284, "eval_runtime": 4.5418, "eval_samples_per_second": 55.045, "eval_steps_per_second": 1.761, "step": 47 }, { "epoch": 3.0, "grad_norm": 6.724969387054443, "learning_rate": 3.5443037974683544e-05, "loss": 0.7062, "step": 48 }, { "epoch": 3.0, "eval_accuracy": 0.532, "eval_loss": 0.6846718788146973, "eval_runtime": 4.4458, "eval_samples_per_second": 56.233, "eval_steps_per_second": 1.799, "step": 48 }, { "epoch": 3.0625, "grad_norm": 2.1322343349456787, "learning_rate": 3.5126582278481015e-05, "loss": 0.6679, "step": 49 }, { "epoch": 3.0625, "eval_accuracy": 0.532, "eval_loss": 0.6838710904121399, "eval_runtime": 4.4926, "eval_samples_per_second": 55.648, "eval_steps_per_second": 1.781, "step": 49 }, { "epoch": 3.125, "grad_norm": 6.895395278930664, "learning_rate": 3.4810126582278487e-05, "loss": 0.6956, "step": 50 }, { "epoch": 3.125, "eval_accuracy": 0.532, "eval_loss": 0.6838496327400208, "eval_runtime": 4.5422, "eval_samples_per_second": 55.04, "eval_steps_per_second": 1.761, "step": 50 }, { "epoch": 3.1875, "grad_norm": 10.101134300231934, "learning_rate": 3.449367088607595e-05, "loss": 0.7449, "step": 51 }, { "epoch": 3.1875, "eval_accuracy": 0.528, "eval_loss": 0.6836503744125366, "eval_runtime": 4.5, "eval_samples_per_second": 55.556, "eval_steps_per_second": 1.778, "step": 51 }, { "epoch": 3.25, "grad_norm": 5.3039422035217285, "learning_rate": 3.4177215189873416e-05, "loss": 0.6853, "step": 52 }, { "epoch": 3.25, "eval_accuracy": 0.536, "eval_loss": 0.6831699013710022, "eval_runtime": 4.4953, "eval_samples_per_second": 55.613, "eval_steps_per_second": 1.78, "step": 52 }, { "epoch": 3.3125, "grad_norm": 2.962162733078003, "learning_rate": 3.386075949367089e-05, "loss": 0.7127, "step": 53 }, { "epoch": 3.3125, "eval_accuracy": 0.536, "eval_loss": 0.6828047037124634, "eval_runtime": 4.5467, "eval_samples_per_second": 54.985, "eval_steps_per_second": 1.76, "step": 53 }, { "epoch": 3.375, "grad_norm": 4.858814239501953, "learning_rate": 3.354430379746836e-05, "loss": 0.6544, "step": 54 }, { "epoch": 3.375, "eval_accuracy": 0.556, "eval_loss": 0.6824140548706055, "eval_runtime": 4.5474, "eval_samples_per_second": 54.976, "eval_steps_per_second": 1.759, "step": 54 }, { "epoch": 3.4375, "grad_norm": 5.237043380737305, "learning_rate": 3.322784810126582e-05, "loss": 0.6638, "step": 55 }, { "epoch": 3.4375, "eval_accuracy": 0.532, "eval_loss": 0.6816914081573486, "eval_runtime": 4.5453, "eval_samples_per_second": 55.002, "eval_steps_per_second": 1.76, "step": 55 }, { "epoch": 3.5, "grad_norm": 3.878478527069092, "learning_rate": 3.291139240506329e-05, "loss": 0.7148, "step": 56 }, { "epoch": 3.5, "eval_accuracy": 0.536, "eval_loss": 0.6814433336257935, "eval_runtime": 4.5412, "eval_samples_per_second": 55.052, "eval_steps_per_second": 1.762, "step": 56 }, { "epoch": 3.5625, "grad_norm": 4.188953399658203, "learning_rate": 3.2594936708860766e-05, "loss": 0.7003, "step": 57 }, { "epoch": 3.5625, "eval_accuracy": 0.528, "eval_loss": 0.6815546751022339, "eval_runtime": 4.5437, "eval_samples_per_second": 55.021, "eval_steps_per_second": 1.761, "step": 57 }, { "epoch": 3.625, "grad_norm": 12.408546447753906, "learning_rate": 3.227848101265823e-05, "loss": 0.771, "step": 58 }, { "epoch": 3.625, "eval_accuracy": 0.528, "eval_loss": 0.681021511554718, "eval_runtime": 4.5384, "eval_samples_per_second": 55.086, "eval_steps_per_second": 1.763, "step": 58 }, { "epoch": 3.6875, "grad_norm": 3.4157402515411377, "learning_rate": 3.1962025316455695e-05, "loss": 0.6973, "step": 59 }, { "epoch": 3.6875, "eval_accuracy": 0.512, "eval_loss": 0.6810234189033508, "eval_runtime": 4.5487, "eval_samples_per_second": 54.96, "eval_steps_per_second": 1.759, "step": 59 }, { "epoch": 3.75, "grad_norm": 7.873476028442383, "learning_rate": 3.1645569620253167e-05, "loss": 0.7426, "step": 60 }, { "epoch": 3.75, "eval_accuracy": 0.512, "eval_loss": 0.6811171770095825, "eval_runtime": 4.5472, "eval_samples_per_second": 54.978, "eval_steps_per_second": 1.759, "step": 60 }, { "epoch": 3.8125, "grad_norm": 5.3661322593688965, "learning_rate": 3.132911392405064e-05, "loss": 0.6969, "step": 61 }, { "epoch": 3.8125, "eval_accuracy": 0.528, "eval_loss": 0.6811054944992065, "eval_runtime": 4.5489, "eval_samples_per_second": 54.959, "eval_steps_per_second": 1.759, "step": 61 }, { "epoch": 3.875, "grad_norm": 2.467409372329712, "learning_rate": 3.10126582278481e-05, "loss": 0.7369, "step": 62 }, { "epoch": 3.875, "eval_accuracy": 0.54, "eval_loss": 0.6803652048110962, "eval_runtime": 4.5424, "eval_samples_per_second": 55.037, "eval_steps_per_second": 1.761, "step": 62 }, { "epoch": 3.9375, "grad_norm": 2.4884164333343506, "learning_rate": 3.0696202531645574e-05, "loss": 0.6572, "step": 63 }, { "epoch": 3.9375, "eval_accuracy": 0.56, "eval_loss": 0.6803945302963257, "eval_runtime": 4.5015, "eval_samples_per_second": 55.537, "eval_steps_per_second": 1.777, "step": 63 }, { "epoch": 4.0, "grad_norm": 1.9957572221755981, "learning_rate": 3.0379746835443042e-05, "loss": 0.758, "step": 64 }, { "epoch": 4.0, "eval_accuracy": 0.544, "eval_loss": 0.6798281073570251, "eval_runtime": 4.5427, "eval_samples_per_second": 55.033, "eval_steps_per_second": 1.761, "step": 64 }, { "epoch": 4.0625, "grad_norm": 11.552275657653809, "learning_rate": 3.0063291139240506e-05, "loss": 0.7428, "step": 65 }, { "epoch": 4.0625, "eval_accuracy": 0.524, "eval_loss": 0.679925799369812, "eval_runtime": 4.539, "eval_samples_per_second": 55.078, "eval_steps_per_second": 1.763, "step": 65 }, { "epoch": 4.125, "grad_norm": 2.6973438262939453, "learning_rate": 2.9746835443037974e-05, "loss": 0.6784, "step": 66 }, { "epoch": 4.125, "eval_accuracy": 0.536, "eval_loss": 0.6790605187416077, "eval_runtime": 4.5462, "eval_samples_per_second": 54.991, "eval_steps_per_second": 1.76, "step": 66 }, { "epoch": 4.1875, "grad_norm": 3.727440595626831, "learning_rate": 2.9430379746835446e-05, "loss": 0.7045, "step": 67 }, { "epoch": 4.1875, "eval_accuracy": 0.528, "eval_loss": 0.6793281435966492, "eval_runtime": 4.5469, "eval_samples_per_second": 54.983, "eval_steps_per_second": 1.759, "step": 67 }, { "epoch": 4.25, "grad_norm": 1.7801040410995483, "learning_rate": 2.9113924050632914e-05, "loss": 0.643, "step": 68 }, { "epoch": 4.25, "eval_accuracy": 0.512, "eval_loss": 0.6788183450698853, "eval_runtime": 4.5354, "eval_samples_per_second": 55.122, "eval_steps_per_second": 1.764, "step": 68 }, { "epoch": 4.3125, "grad_norm": 3.4789085388183594, "learning_rate": 2.879746835443038e-05, "loss": 0.675, "step": 69 }, { "epoch": 4.3125, "eval_accuracy": 0.52, "eval_loss": 0.6782128810882568, "eval_runtime": 4.5402, "eval_samples_per_second": 55.063, "eval_steps_per_second": 1.762, "step": 69 }, { "epoch": 4.375, "grad_norm": 4.243752956390381, "learning_rate": 2.848101265822785e-05, "loss": 0.6469, "step": 70 }, { "epoch": 4.375, "eval_accuracy": 0.508, "eval_loss": 0.6780292987823486, "eval_runtime": 4.543, "eval_samples_per_second": 55.029, "eval_steps_per_second": 1.761, "step": 70 }, { "epoch": 4.4375, "grad_norm": 6.593841552734375, "learning_rate": 2.8164556962025318e-05, "loss": 0.7455, "step": 71 }, { "epoch": 4.4375, "eval_accuracy": 0.516, "eval_loss": 0.6775800585746765, "eval_runtime": 4.545, "eval_samples_per_second": 55.005, "eval_steps_per_second": 1.76, "step": 71 }, { "epoch": 4.5, "grad_norm": 12.047831535339355, "learning_rate": 2.7848101265822786e-05, "loss": 0.6985, "step": 72 }, { "epoch": 4.5, "eval_accuracy": 0.516, "eval_loss": 0.6778261661529541, "eval_runtime": 4.5527, "eval_samples_per_second": 54.912, "eval_steps_per_second": 1.757, "step": 72 }, { "epoch": 4.5625, "grad_norm": 3.4566452503204346, "learning_rate": 2.7531645569620257e-05, "loss": 0.7616, "step": 73 }, { "epoch": 4.5625, "eval_accuracy": 0.52, "eval_loss": 0.6769921779632568, "eval_runtime": 4.545, "eval_samples_per_second": 55.005, "eval_steps_per_second": 1.76, "step": 73 }, { "epoch": 4.625, "grad_norm": 2.8978374004364014, "learning_rate": 2.7215189873417722e-05, "loss": 0.7135, "step": 74 }, { "epoch": 4.625, "eval_accuracy": 0.516, "eval_loss": 0.6770429611206055, "eval_runtime": 4.541, "eval_samples_per_second": 55.054, "eval_steps_per_second": 1.762, "step": 74 }, { "epoch": 4.6875, "grad_norm": 3.3244338035583496, "learning_rate": 2.689873417721519e-05, "loss": 0.7157, "step": 75 }, { "epoch": 4.6875, "eval_accuracy": 0.528, "eval_loss": 0.6766347885131836, "eval_runtime": 4.5403, "eval_samples_per_second": 55.062, "eval_steps_per_second": 1.762, "step": 75 }, { "epoch": 4.75, "grad_norm": 5.23004150390625, "learning_rate": 2.6582278481012658e-05, "loss": 0.7058, "step": 76 }, { "epoch": 4.75, "eval_accuracy": 0.528, "eval_loss": 0.6764668226242065, "eval_runtime": 4.54, "eval_samples_per_second": 55.066, "eval_steps_per_second": 1.762, "step": 76 }, { "epoch": 4.8125, "grad_norm": 8.803872108459473, "learning_rate": 2.626582278481013e-05, "loss": 0.7127, "step": 77 }, { "epoch": 4.8125, "eval_accuracy": 0.524, "eval_loss": 0.6759433746337891, "eval_runtime": 4.5454, "eval_samples_per_second": 55.0, "eval_steps_per_second": 1.76, "step": 77 }, { "epoch": 4.875, "grad_norm": 3.5992655754089355, "learning_rate": 2.5949367088607597e-05, "loss": 0.7004, "step": 78 }, { "epoch": 4.875, "eval_accuracy": 0.536, "eval_loss": 0.6762988567352295, "eval_runtime": 4.4993, "eval_samples_per_second": 55.565, "eval_steps_per_second": 1.778, "step": 78 }, { "epoch": 4.9375, "grad_norm": 3.1371684074401855, "learning_rate": 2.5632911392405062e-05, "loss": 0.6827, "step": 79 }, { "epoch": 4.9375, "eval_accuracy": 0.552, "eval_loss": 0.6757890582084656, "eval_runtime": 4.5476, "eval_samples_per_second": 54.974, "eval_steps_per_second": 1.759, "step": 79 }, { "epoch": 5.0, "grad_norm": 3.854306697845459, "learning_rate": 2.5316455696202533e-05, "loss": 0.7649, "step": 80 }, { "epoch": 5.0, "eval_accuracy": 0.54, "eval_loss": 0.6760488152503967, "eval_runtime": 4.539, "eval_samples_per_second": 55.079, "eval_steps_per_second": 1.763, "step": 80 }, { "epoch": 5.0625, "grad_norm": 4.356711387634277, "learning_rate": 2.5e-05, "loss": 0.7461, "step": 81 }, { "epoch": 5.0625, "eval_accuracy": 0.54, "eval_loss": 0.6765702962875366, "eval_runtime": 4.4883, "eval_samples_per_second": 55.701, "eval_steps_per_second": 1.782, "step": 81 }, { "epoch": 5.125, "grad_norm": 4.030115127563477, "learning_rate": 2.468354430379747e-05, "loss": 0.6346, "step": 82 }, { "epoch": 5.125, "eval_accuracy": 0.536, "eval_loss": 0.6769394278526306, "eval_runtime": 4.5387, "eval_samples_per_second": 55.082, "eval_steps_per_second": 1.763, "step": 82 }, { "epoch": 5.1875, "grad_norm": 3.892704486846924, "learning_rate": 2.4367088607594937e-05, "loss": 0.6245, "step": 83 }, { "epoch": 5.1875, "eval_accuracy": 0.548, "eval_loss": 0.6764355301856995, "eval_runtime": 4.548, "eval_samples_per_second": 54.969, "eval_steps_per_second": 1.759, "step": 83 }, { "epoch": 5.25, "grad_norm": 2.755213975906372, "learning_rate": 2.4050632911392405e-05, "loss": 0.6595, "step": 84 }, { "epoch": 5.25, "eval_accuracy": 0.548, "eval_loss": 0.6767304539680481, "eval_runtime": 4.5445, "eval_samples_per_second": 55.012, "eval_steps_per_second": 1.76, "step": 84 }, { "epoch": 5.3125, "grad_norm": 9.109251976013184, "learning_rate": 2.3734177215189873e-05, "loss": 0.6507, "step": 85 }, { "epoch": 5.3125, "eval_accuracy": 0.552, "eval_loss": 0.6769980192184448, "eval_runtime": 4.4987, "eval_samples_per_second": 55.572, "eval_steps_per_second": 1.778, "step": 85 }, { "epoch": 5.375, "grad_norm": 4.487890720367432, "learning_rate": 2.341772151898734e-05, "loss": 0.6528, "step": 86 }, { "epoch": 5.375, "eval_accuracy": 0.552, "eval_loss": 0.6765019297599792, "eval_runtime": 4.544, "eval_samples_per_second": 55.017, "eval_steps_per_second": 1.761, "step": 86 }, { "epoch": 5.4375, "grad_norm": 2.2593257427215576, "learning_rate": 2.3101265822784813e-05, "loss": 0.687, "step": 87 }, { "epoch": 5.4375, "eval_accuracy": 0.564, "eval_loss": 0.6773359179496765, "eval_runtime": 4.5397, "eval_samples_per_second": 55.07, "eval_steps_per_second": 1.762, "step": 87 }, { "epoch": 5.5, "grad_norm": 9.76685905456543, "learning_rate": 2.278481012658228e-05, "loss": 0.6913, "step": 88 }, { "epoch": 5.5, "eval_accuracy": 0.56, "eval_loss": 0.6779413819313049, "eval_runtime": 4.5446, "eval_samples_per_second": 55.01, "eval_steps_per_second": 1.76, "step": 88 }, { "epoch": 5.5625, "grad_norm": 1.9855612516403198, "learning_rate": 2.246835443037975e-05, "loss": 0.6799, "step": 89 }, { "epoch": 5.5625, "eval_accuracy": 0.56, "eval_loss": 0.6777753829956055, "eval_runtime": 4.5453, "eval_samples_per_second": 55.002, "eval_steps_per_second": 1.76, "step": 89 }, { "epoch": 5.625, "grad_norm": 6.978314399719238, "learning_rate": 2.2151898734177217e-05, "loss": 0.6616, "step": 90 }, { "epoch": 5.625, "eval_accuracy": 0.568, "eval_loss": 0.6782050728797913, "eval_runtime": 4.5005, "eval_samples_per_second": 55.549, "eval_steps_per_second": 1.778, "step": 90 }, { "epoch": 5.6875, "grad_norm": 2.3891565799713135, "learning_rate": 2.1835443037974685e-05, "loss": 0.6577, "step": 91 }, { "epoch": 5.6875, "eval_accuracy": 0.552, "eval_loss": 0.6784765720367432, "eval_runtime": 4.5406, "eval_samples_per_second": 55.059, "eval_steps_per_second": 1.762, "step": 91 }, { "epoch": 5.75, "grad_norm": 4.9778313636779785, "learning_rate": 2.1518987341772153e-05, "loss": 0.6248, "step": 92 }, { "epoch": 5.75, "eval_accuracy": 0.556, "eval_loss": 0.678955078125, "eval_runtime": 4.5404, "eval_samples_per_second": 55.062, "eval_steps_per_second": 1.762, "step": 92 }, { "epoch": 5.8125, "grad_norm": 1.9475889205932617, "learning_rate": 2.120253164556962e-05, "loss": 0.7026, "step": 93 }, { "epoch": 5.8125, "eval_accuracy": 0.552, "eval_loss": 0.6784570217132568, "eval_runtime": 4.5434, "eval_samples_per_second": 55.025, "eval_steps_per_second": 1.761, "step": 93 }, { "epoch": 5.875, "grad_norm": 6.539444923400879, "learning_rate": 2.088607594936709e-05, "loss": 0.6816, "step": 94 }, { "epoch": 5.875, "eval_accuracy": 0.536, "eval_loss": 0.6789179444313049, "eval_runtime": 4.5418, "eval_samples_per_second": 55.044, "eval_steps_per_second": 1.761, "step": 94 }, { "epoch": 5.9375, "grad_norm": 1.8745115995407104, "learning_rate": 2.056962025316456e-05, "loss": 0.6476, "step": 95 }, { "epoch": 5.9375, "eval_accuracy": 0.532, "eval_loss": 0.6787148714065552, "eval_runtime": 4.5397, "eval_samples_per_second": 55.069, "eval_steps_per_second": 1.762, "step": 95 }, { "epoch": 6.0, "grad_norm": 7.960897922515869, "learning_rate": 2.0253164556962025e-05, "loss": 0.6797, "step": 96 }, { "epoch": 6.0, "eval_accuracy": 0.54, "eval_loss": 0.6785527467727661, "eval_runtime": 4.5395, "eval_samples_per_second": 55.072, "eval_steps_per_second": 1.762, "step": 96 }, { "epoch": 6.0625, "grad_norm": 6.119703769683838, "learning_rate": 1.9936708860759496e-05, "loss": 0.6603, "step": 97 }, { "epoch": 6.0625, "eval_accuracy": 0.532, "eval_loss": 0.6781836152076721, "eval_runtime": 4.5394, "eval_samples_per_second": 55.073, "eval_steps_per_second": 1.762, "step": 97 }, { "epoch": 6.125, "grad_norm": 2.6292548179626465, "learning_rate": 1.962025316455696e-05, "loss": 0.6892, "step": 98 }, { "epoch": 6.125, "eval_accuracy": 0.54, "eval_loss": 0.6773242354393005, "eval_runtime": 4.5436, "eval_samples_per_second": 55.022, "eval_steps_per_second": 1.761, "step": 98 }, { "epoch": 6.1875, "grad_norm": 5.301840305328369, "learning_rate": 1.9303797468354432e-05, "loss": 0.677, "step": 99 }, { "epoch": 6.1875, "eval_accuracy": 0.548, "eval_loss": 0.6762461066246033, "eval_runtime": 4.5432, "eval_samples_per_second": 55.027, "eval_steps_per_second": 1.761, "step": 99 }, { "epoch": 6.25, "grad_norm": 3.4270968437194824, "learning_rate": 1.89873417721519e-05, "loss": 0.6696, "step": 100 }, { "epoch": 6.25, "eval_accuracy": 0.544, "eval_loss": 0.6752324104309082, "eval_runtime": 4.5411, "eval_samples_per_second": 55.052, "eval_steps_per_second": 1.762, "step": 100 }, { "epoch": 6.3125, "grad_norm": 2.9809482097625732, "learning_rate": 1.8670886075949368e-05, "loss": 0.666, "step": 101 }, { "epoch": 6.3125, "eval_accuracy": 0.56, "eval_loss": 0.6741093993186951, "eval_runtime": 4.5435, "eval_samples_per_second": 55.024, "eval_steps_per_second": 1.761, "step": 101 }, { "epoch": 6.375, "grad_norm": 3.612354278564453, "learning_rate": 1.8354430379746836e-05, "loss": 0.6552, "step": 102 }, { "epoch": 6.375, "eval_accuracy": 0.564, "eval_loss": 0.6736387014389038, "eval_runtime": 4.5443, "eval_samples_per_second": 55.014, "eval_steps_per_second": 1.76, "step": 102 }, { "epoch": 6.4375, "grad_norm": 13.848094940185547, "learning_rate": 1.8037974683544304e-05, "loss": 0.6958, "step": 103 }, { "epoch": 6.4375, "eval_accuracy": 0.564, "eval_loss": 0.6730585694313049, "eval_runtime": 4.537, "eval_samples_per_second": 55.102, "eval_steps_per_second": 1.763, "step": 103 }, { "epoch": 6.5, "grad_norm": 2.657895565032959, "learning_rate": 1.7721518987341772e-05, "loss": 0.6779, "step": 104 }, { "epoch": 6.5, "eval_accuracy": 0.576, "eval_loss": 0.6721835732460022, "eval_runtime": 4.5416, "eval_samples_per_second": 55.047, "eval_steps_per_second": 1.762, "step": 104 }, { "epoch": 6.5625, "grad_norm": 3.6230475902557373, "learning_rate": 1.7405063291139243e-05, "loss": 0.662, "step": 105 }, { "epoch": 6.5625, "eval_accuracy": 0.576, "eval_loss": 0.6725234389305115, "eval_runtime": 4.4966, "eval_samples_per_second": 55.598, "eval_steps_per_second": 1.779, "step": 105 }, { "epoch": 6.625, "grad_norm": 2.817807674407959, "learning_rate": 1.7088607594936708e-05, "loss": 0.639, "step": 106 }, { "epoch": 6.625, "eval_accuracy": 0.58, "eval_loss": 0.6714980602264404, "eval_runtime": 4.4976, "eval_samples_per_second": 55.585, "eval_steps_per_second": 1.779, "step": 106 }, { "epoch": 6.6875, "grad_norm": 2.2491910457611084, "learning_rate": 1.677215189873418e-05, "loss": 0.6469, "step": 107 }, { "epoch": 6.6875, "eval_accuracy": 0.564, "eval_loss": 0.6703847646713257, "eval_runtime": 4.5015, "eval_samples_per_second": 55.537, "eval_steps_per_second": 1.777, "step": 107 }, { "epoch": 6.75, "grad_norm": 6.607123851776123, "learning_rate": 1.6455696202531644e-05, "loss": 0.6494, "step": 108 }, { "epoch": 6.75, "eval_accuracy": 0.544, "eval_loss": 0.6705585718154907, "eval_runtime": 4.5512, "eval_samples_per_second": 54.931, "eval_steps_per_second": 1.758, "step": 108 }, { "epoch": 6.8125, "grad_norm": 3.7436728477478027, "learning_rate": 1.6139240506329115e-05, "loss": 0.6428, "step": 109 }, { "epoch": 6.8125, "eval_accuracy": 0.556, "eval_loss": 0.6696659922599792, "eval_runtime": 4.5466, "eval_samples_per_second": 54.986, "eval_steps_per_second": 1.76, "step": 109 }, { "epoch": 6.875, "grad_norm": 10.663908004760742, "learning_rate": 1.5822784810126583e-05, "loss": 0.6949, "step": 110 }, { "epoch": 6.875, "eval_accuracy": 0.552, "eval_loss": 0.6699844002723694, "eval_runtime": 4.5417, "eval_samples_per_second": 55.046, "eval_steps_per_second": 1.761, "step": 110 }, { "epoch": 6.9375, "grad_norm": 2.8781378269195557, "learning_rate": 1.550632911392405e-05, "loss": 0.6557, "step": 111 }, { "epoch": 6.9375, "eval_accuracy": 0.556, "eval_loss": 0.6697744131088257, "eval_runtime": 4.5421, "eval_samples_per_second": 55.041, "eval_steps_per_second": 1.761, "step": 111 }, { "epoch": 7.0, "grad_norm": 9.62548828125, "learning_rate": 1.5189873417721521e-05, "loss": 0.625, "step": 112 }, { "epoch": 7.0, "eval_accuracy": 0.56, "eval_loss": 0.6696327924728394, "eval_runtime": 4.5434, "eval_samples_per_second": 55.025, "eval_steps_per_second": 1.761, "step": 112 }, { "epoch": 7.0625, "grad_norm": 3.4376492500305176, "learning_rate": 1.4873417721518987e-05, "loss": 0.6648, "step": 113 }, { "epoch": 7.0625, "eval_accuracy": 0.556, "eval_loss": 0.6688730716705322, "eval_runtime": 4.5489, "eval_samples_per_second": 54.958, "eval_steps_per_second": 1.759, "step": 113 }, { "epoch": 7.125, "grad_norm": 11.591545104980469, "learning_rate": 1.4556962025316457e-05, "loss": 0.6909, "step": 114 }, { "epoch": 7.125, "eval_accuracy": 0.54, "eval_loss": 0.6680371165275574, "eval_runtime": 4.5477, "eval_samples_per_second": 54.973, "eval_steps_per_second": 1.759, "step": 114 }, { "epoch": 7.1875, "grad_norm": 3.0911552906036377, "learning_rate": 1.4240506329113925e-05, "loss": 0.6548, "step": 115 }, { "epoch": 7.1875, "eval_accuracy": 0.552, "eval_loss": 0.667611300945282, "eval_runtime": 4.5404, "eval_samples_per_second": 55.062, "eval_steps_per_second": 1.762, "step": 115 }, { "epoch": 7.25, "grad_norm": 5.890276908874512, "learning_rate": 1.3924050632911393e-05, "loss": 0.6278, "step": 116 }, { "epoch": 7.25, "eval_accuracy": 0.58, "eval_loss": 0.6670957207679749, "eval_runtime": 4.5443, "eval_samples_per_second": 55.014, "eval_steps_per_second": 1.76, "step": 116 }, { "epoch": 7.3125, "grad_norm": 2.038860321044922, "learning_rate": 1.3607594936708861e-05, "loss": 0.6899, "step": 117 }, { "epoch": 7.3125, "eval_accuracy": 0.596, "eval_loss": 0.6669042706489563, "eval_runtime": 4.5442, "eval_samples_per_second": 55.015, "eval_steps_per_second": 1.76, "step": 117 }, { "epoch": 7.375, "grad_norm": 7.413594722747803, "learning_rate": 1.3291139240506329e-05, "loss": 0.6197, "step": 118 }, { "epoch": 7.375, "eval_accuracy": 0.588, "eval_loss": 0.6667382717132568, "eval_runtime": 4.5391, "eval_samples_per_second": 55.077, "eval_steps_per_second": 1.762, "step": 118 }, { "epoch": 7.4375, "grad_norm": 3.1535215377807617, "learning_rate": 1.2974683544303799e-05, "loss": 0.653, "step": 119 }, { "epoch": 7.4375, "eval_accuracy": 0.588, "eval_loss": 0.666509747505188, "eval_runtime": 4.5431, "eval_samples_per_second": 55.028, "eval_steps_per_second": 1.761, "step": 119 }, { "epoch": 7.5, "grad_norm": 5.736833095550537, "learning_rate": 1.2658227848101267e-05, "loss": 0.6531, "step": 120 }, { "epoch": 7.5, "eval_accuracy": 0.592, "eval_loss": 0.6669736504554749, "eval_runtime": 4.5365, "eval_samples_per_second": 55.108, "eval_steps_per_second": 1.763, "step": 120 }, { "epoch": 7.5625, "grad_norm": 3.403089761734009, "learning_rate": 1.2341772151898735e-05, "loss": 0.6494, "step": 121 }, { "epoch": 7.5625, "eval_accuracy": 0.584, "eval_loss": 0.6666631102561951, "eval_runtime": 4.5001, "eval_samples_per_second": 55.555, "eval_steps_per_second": 1.778, "step": 121 }, { "epoch": 7.625, "grad_norm": 2.2943952083587646, "learning_rate": 1.2025316455696203e-05, "loss": 0.6914, "step": 122 }, { "epoch": 7.625, "eval_accuracy": 0.588, "eval_loss": 0.6671044826507568, "eval_runtime": 4.5495, "eval_samples_per_second": 54.952, "eval_steps_per_second": 1.758, "step": 122 }, { "epoch": 7.6875, "grad_norm": 1.8052605390548706, "learning_rate": 1.170886075949367e-05, "loss": 0.6506, "step": 123 }, { "epoch": 7.6875, "eval_accuracy": 0.592, "eval_loss": 0.6672109365463257, "eval_runtime": 4.5468, "eval_samples_per_second": 54.984, "eval_steps_per_second": 1.759, "step": 123 }, { "epoch": 7.75, "grad_norm": 2.0512139797210693, "learning_rate": 1.139240506329114e-05, "loss": 0.6647, "step": 124 }, { "epoch": 7.75, "eval_accuracy": 0.592, "eval_loss": 0.6669785380363464, "eval_runtime": 4.5423, "eval_samples_per_second": 55.039, "eval_steps_per_second": 1.761, "step": 124 }, { "epoch": 7.8125, "grad_norm": 9.648463249206543, "learning_rate": 1.1075949367088608e-05, "loss": 0.6476, "step": 125 }, { "epoch": 7.8125, "eval_accuracy": 0.592, "eval_loss": 0.6669345498085022, "eval_runtime": 4.5447, "eval_samples_per_second": 55.009, "eval_steps_per_second": 1.76, "step": 125 }, { "epoch": 7.875, "grad_norm": 3.750437021255493, "learning_rate": 1.0759493670886076e-05, "loss": 0.6609, "step": 126 }, { "epoch": 7.875, "eval_accuracy": 0.592, "eval_loss": 0.6669287085533142, "eval_runtime": 4.5377, "eval_samples_per_second": 55.094, "eval_steps_per_second": 1.763, "step": 126 }, { "epoch": 7.9375, "grad_norm": 2.9882094860076904, "learning_rate": 1.0443037974683544e-05, "loss": 0.6497, "step": 127 }, { "epoch": 7.9375, "eval_accuracy": 0.596, "eval_loss": 0.6663134694099426, "eval_runtime": 4.5412, "eval_samples_per_second": 55.052, "eval_steps_per_second": 1.762, "step": 127 }, { "epoch": 8.0, "grad_norm": 5.13292932510376, "learning_rate": 1.0126582278481012e-05, "loss": 0.6773, "step": 128 }, { "epoch": 8.0, "eval_accuracy": 0.588, "eval_loss": 0.6660781502723694, "eval_runtime": 4.4907, "eval_samples_per_second": 55.671, "eval_steps_per_second": 1.781, "step": 128 }, { "epoch": 8.0625, "grad_norm": 4.037117958068848, "learning_rate": 9.81012658227848e-06, "loss": 0.6841, "step": 129 }, { "epoch": 8.0625, "eval_accuracy": 0.596, "eval_loss": 0.6660195589065552, "eval_runtime": 4.4945, "eval_samples_per_second": 55.623, "eval_steps_per_second": 1.78, "step": 129 }, { "epoch": 8.125, "grad_norm": 9.9661865234375, "learning_rate": 9.49367088607595e-06, "loss": 0.657, "step": 130 }, { "epoch": 8.125, "eval_accuracy": 0.592, "eval_loss": 0.6656200885772705, "eval_runtime": 4.5383, "eval_samples_per_second": 55.086, "eval_steps_per_second": 1.763, "step": 130 }, { "epoch": 8.1875, "grad_norm": 8.460039138793945, "learning_rate": 9.177215189873418e-06, "loss": 0.6622, "step": 131 }, { "epoch": 8.1875, "eval_accuracy": 0.6, "eval_loss": 0.6657363176345825, "eval_runtime": 4.549, "eval_samples_per_second": 54.957, "eval_steps_per_second": 1.759, "step": 131 }, { "epoch": 8.25, "grad_norm": 2.761270046234131, "learning_rate": 8.860759493670886e-06, "loss": 0.667, "step": 132 }, { "epoch": 8.25, "eval_accuracy": 0.592, "eval_loss": 0.665112316608429, "eval_runtime": 4.5498, "eval_samples_per_second": 54.947, "eval_steps_per_second": 1.758, "step": 132 }, { "epoch": 8.3125, "grad_norm": 4.367539405822754, "learning_rate": 8.544303797468354e-06, "loss": 0.6662, "step": 133 }, { "epoch": 8.3125, "eval_accuracy": 0.596, "eval_loss": 0.6654492020606995, "eval_runtime": 4.5418, "eval_samples_per_second": 55.044, "eval_steps_per_second": 1.761, "step": 133 }, { "epoch": 8.375, "grad_norm": 3.8258039951324463, "learning_rate": 8.227848101265822e-06, "loss": 0.615, "step": 134 }, { "epoch": 8.375, "eval_accuracy": 0.588, "eval_loss": 0.6657968759536743, "eval_runtime": 4.5412, "eval_samples_per_second": 55.051, "eval_steps_per_second": 1.762, "step": 134 }, { "epoch": 8.4375, "grad_norm": 2.691741466522217, "learning_rate": 7.911392405063292e-06, "loss": 0.6961, "step": 135 }, { "epoch": 8.4375, "eval_accuracy": 0.596, "eval_loss": 0.6653828024864197, "eval_runtime": 4.5434, "eval_samples_per_second": 55.025, "eval_steps_per_second": 1.761, "step": 135 }, { "epoch": 8.5, "grad_norm": 5.671183109283447, "learning_rate": 7.5949367088607605e-06, "loss": 0.6134, "step": 136 }, { "epoch": 8.5, "eval_accuracy": 0.6, "eval_loss": 0.6660419702529907, "eval_runtime": 4.5413, "eval_samples_per_second": 55.05, "eval_steps_per_second": 1.762, "step": 136 }, { "epoch": 8.5625, "grad_norm": 7.398742198944092, "learning_rate": 7.2784810126582285e-06, "loss": 0.6839, "step": 137 }, { "epoch": 8.5625, "eval_accuracy": 0.592, "eval_loss": 0.6657724380493164, "eval_runtime": 4.5404, "eval_samples_per_second": 55.061, "eval_steps_per_second": 1.762, "step": 137 }, { "epoch": 8.625, "grad_norm": 4.798144340515137, "learning_rate": 6.9620253164556965e-06, "loss": 0.6482, "step": 138 }, { "epoch": 8.625, "eval_accuracy": 0.596, "eval_loss": 0.666140615940094, "eval_runtime": 4.5422, "eval_samples_per_second": 55.039, "eval_steps_per_second": 1.761, "step": 138 }, { "epoch": 8.6875, "grad_norm": 8.25437068939209, "learning_rate": 6.6455696202531645e-06, "loss": 0.6635, "step": 139 }, { "epoch": 8.6875, "eval_accuracy": 0.604, "eval_loss": 0.6665273308753967, "eval_runtime": 4.5468, "eval_samples_per_second": 54.984, "eval_steps_per_second": 1.759, "step": 139 }, { "epoch": 8.75, "grad_norm": 2.5796449184417725, "learning_rate": 6.329113924050633e-06, "loss": 0.6229, "step": 140 }, { "epoch": 8.75, "eval_accuracy": 0.608, "eval_loss": 0.6665956974029541, "eval_runtime": 4.5394, "eval_samples_per_second": 55.073, "eval_steps_per_second": 1.762, "step": 140 }, { "epoch": 8.8125, "grad_norm": 2.3988282680511475, "learning_rate": 6.012658227848101e-06, "loss": 0.6205, "step": 141 }, { "epoch": 8.8125, "eval_accuracy": 0.604, "eval_loss": 0.6665576100349426, "eval_runtime": 4.5397, "eval_samples_per_second": 55.07, "eval_steps_per_second": 1.762, "step": 141 }, { "epoch": 8.875, "grad_norm": 3.2234578132629395, "learning_rate": 5.69620253164557e-06, "loss": 0.6347, "step": 142 }, { "epoch": 8.875, "eval_accuracy": 0.596, "eval_loss": 0.6664531230926514, "eval_runtime": 4.4932, "eval_samples_per_second": 55.64, "eval_steps_per_second": 1.78, "step": 142 }, { "epoch": 8.9375, "grad_norm": 3.1038153171539307, "learning_rate": 5.379746835443038e-06, "loss": 0.6868, "step": 143 }, { "epoch": 8.9375, "eval_accuracy": 0.608, "eval_loss": 0.6668280959129333, "eval_runtime": 4.5461, "eval_samples_per_second": 54.993, "eval_steps_per_second": 1.76, "step": 143 }, { "epoch": 9.0, "grad_norm": 5.682613849639893, "learning_rate": 5.063291139240506e-06, "loss": 0.6447, "step": 144 }, { "epoch": 9.0, "eval_accuracy": 0.604, "eval_loss": 0.6665273308753967, "eval_runtime": 4.5397, "eval_samples_per_second": 55.069, "eval_steps_per_second": 1.762, "step": 144 }, { "epoch": 9.0625, "grad_norm": 8.149535179138184, "learning_rate": 4.746835443037975e-06, "loss": 0.6755, "step": 145 }, { "epoch": 9.0625, "eval_accuracy": 0.596, "eval_loss": 0.6669501662254333, "eval_runtime": 4.497, "eval_samples_per_second": 55.592, "eval_steps_per_second": 1.779, "step": 145 }, { "epoch": 9.125, "grad_norm": 3.2166755199432373, "learning_rate": 4.430379746835443e-06, "loss": 0.6749, "step": 146 }, { "epoch": 9.125, "eval_accuracy": 0.604, "eval_loss": 0.667477548122406, "eval_runtime": 4.549, "eval_samples_per_second": 54.957, "eval_steps_per_second": 1.759, "step": 146 }, { "epoch": 9.1875, "grad_norm": 2.9138267040252686, "learning_rate": 4.113924050632911e-06, "loss": 0.6681, "step": 147 }, { "epoch": 9.1875, "eval_accuracy": 0.596, "eval_loss": 0.6679531335830688, "eval_runtime": 4.5432, "eval_samples_per_second": 55.027, "eval_steps_per_second": 1.761, "step": 147 }, { "epoch": 9.25, "grad_norm": 8.955977439880371, "learning_rate": 3.7974683544303802e-06, "loss": 0.6768, "step": 148 }, { "epoch": 9.25, "eval_accuracy": 0.6, "eval_loss": 0.667892575263977, "eval_runtime": 4.5414, "eval_samples_per_second": 55.049, "eval_steps_per_second": 1.762, "step": 148 }, { "epoch": 9.3125, "grad_norm": 4.039650917053223, "learning_rate": 3.4810126582278482e-06, "loss": 0.6291, "step": 149 }, { "epoch": 9.3125, "eval_accuracy": 0.596, "eval_loss": 0.6680244207382202, "eval_runtime": 4.5436, "eval_samples_per_second": 55.022, "eval_steps_per_second": 1.761, "step": 149 }, { "epoch": 9.375, "grad_norm": 3.648364543914795, "learning_rate": 3.1645569620253167e-06, "loss": 0.6857, "step": 150 }, { "epoch": 9.375, "eval_accuracy": 0.596, "eval_loss": 0.6680644750595093, "eval_runtime": 4.5475, "eval_samples_per_second": 54.975, "eval_steps_per_second": 1.759, "step": 150 }, { "epoch": 9.4375, "grad_norm": 2.3928475379943848, "learning_rate": 2.848101265822785e-06, "loss": 0.6454, "step": 151 }, { "epoch": 9.4375, "eval_accuracy": 0.604, "eval_loss": 0.6678046584129333, "eval_runtime": 4.5472, "eval_samples_per_second": 54.979, "eval_steps_per_second": 1.759, "step": 151 }, { "epoch": 9.5, "grad_norm": 1.8685684204101562, "learning_rate": 2.531645569620253e-06, "loss": 0.648, "step": 152 }, { "epoch": 9.5, "eval_accuracy": 0.592, "eval_loss": 0.6684179902076721, "eval_runtime": 4.5422, "eval_samples_per_second": 55.04, "eval_steps_per_second": 1.761, "step": 152 }, { "epoch": 9.5625, "grad_norm": 6.94075345993042, "learning_rate": 2.2151898734177215e-06, "loss": 0.5989, "step": 153 }, { "epoch": 9.5625, "eval_accuracy": 0.604, "eval_loss": 0.6686621308326721, "eval_runtime": 4.549, "eval_samples_per_second": 54.957, "eval_steps_per_second": 1.759, "step": 153 }, { "epoch": 9.625, "grad_norm": 2.2033395767211914, "learning_rate": 1.8987341772151901e-06, "loss": 0.6334, "step": 154 }, { "epoch": 9.625, "eval_accuracy": 0.588, "eval_loss": 0.6692119240760803, "eval_runtime": 4.5415, "eval_samples_per_second": 55.048, "eval_steps_per_second": 1.762, "step": 154 }, { "epoch": 9.6875, "grad_norm": 4.624488353729248, "learning_rate": 1.5822784810126583e-06, "loss": 0.6086, "step": 155 }, { "epoch": 9.6875, "eval_accuracy": 0.592, "eval_loss": 0.6682060360908508, "eval_runtime": 4.5376, "eval_samples_per_second": 55.095, "eval_steps_per_second": 1.763, "step": 155 }, { "epoch": 9.75, "grad_norm": 8.24832820892334, "learning_rate": 1.2658227848101265e-06, "loss": 0.6355, "step": 156 }, { "epoch": 9.75, "eval_accuracy": 0.6, "eval_loss": 0.6689130663871765, "eval_runtime": 4.5365, "eval_samples_per_second": 55.109, "eval_steps_per_second": 1.763, "step": 156 }, { "epoch": 9.8125, "grad_norm": 1.9968777894973755, "learning_rate": 9.493670886075951e-07, "loss": 0.618, "step": 157 }, { "epoch": 9.8125, "eval_accuracy": 0.596, "eval_loss": 0.6689111590385437, "eval_runtime": 4.547, "eval_samples_per_second": 54.981, "eval_steps_per_second": 1.759, "step": 157 }, { "epoch": 9.875, "grad_norm": 2.4490880966186523, "learning_rate": 6.329113924050633e-07, "loss": 0.6603, "step": 158 }, { "epoch": 9.875, "eval_accuracy": 0.596, "eval_loss": 0.6684619188308716, "eval_runtime": 4.5445, "eval_samples_per_second": 55.012, "eval_steps_per_second": 1.76, "step": 158 }, { "epoch": 9.9375, "grad_norm": 5.009583950042725, "learning_rate": 3.1645569620253163e-07, "loss": 0.6585, "step": 159 }, { "epoch": 9.9375, "eval_accuracy": 0.6, "eval_loss": 0.6681816577911377, "eval_runtime": 4.5393, "eval_samples_per_second": 55.075, "eval_steps_per_second": 1.762, "step": 159 }, { "epoch": 10.0, "grad_norm": 3.7535176277160645, "learning_rate": 0.0, "loss": 0.6705, "step": 160 }, { "epoch": 10.0, "eval_accuracy": 0.6, "eval_loss": 0.6681679487228394, "eval_runtime": 4.4878, "eval_samples_per_second": 55.706, "eval_steps_per_second": 1.783, "step": 160 }, { "epoch": 10.0, "step": 160, "total_flos": 174253428178944.0, "train_loss": 0.6897118806838989, "train_runtime": 1349.6743, "train_samples_per_second": 7.409, "train_steps_per_second": 0.119 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 174253428178944.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }