{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.025573961057832027, "eval_steps": 1, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.265329845975008e-05, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 1 }, { "epoch": 7.265329845975008e-05, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 267.5371, "eval_samples_per_second": 126.214, "eval_steps_per_second": 2.631, "step": 1 }, { "epoch": 0.00014530659691950015, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 2 }, { "epoch": 0.00014530659691950015, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 267.9301, "eval_samples_per_second": 126.029, "eval_steps_per_second": 2.628, "step": 2 }, { "epoch": 0.00021795989537925023, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 3 }, { "epoch": 0.00021795989537925023, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 267.74, "eval_samples_per_second": 126.119, "eval_steps_per_second": 2.629, "step": 3 }, { "epoch": 0.0002906131938390003, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 4 }, { "epoch": 0.0002906131938390003, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 267.3271, "eval_samples_per_second": 126.313, "eval_steps_per_second": 2.633, "step": 4 }, { "epoch": 0.0003632664922987504, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 5 }, { "epoch": 0.0003632664922987504, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 267.207, "eval_samples_per_second": 126.37, "eval_steps_per_second": 2.635, "step": 5 }, { "epoch": 0.00043591979075850045, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 6 }, { "epoch": 0.00043591979075850045, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 268.9045, "eval_samples_per_second": 125.572, "eval_steps_per_second": 2.618, "step": 6 }, { "epoch": 0.0005085730892182505, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 7 }, { "epoch": 0.0005085730892182505, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 269.4441, "eval_samples_per_second": 125.321, "eval_steps_per_second": 2.613, "step": 7 }, { "epoch": 0.0005812263876780006, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 8 }, { "epoch": 0.0005812263876780006, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.7225, "eval_samples_per_second": 127.076, "eval_steps_per_second": 2.649, "step": 8 }, { "epoch": 0.0006538796861377507, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 9 }, { "epoch": 0.0006538796861377507, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.6793, "eval_samples_per_second": 127.097, "eval_steps_per_second": 2.65, "step": 9 }, { "epoch": 0.0007265329845975008, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 10 }, { "epoch": 0.0007265329845975008, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 266.2481, "eval_samples_per_second": 126.825, "eval_steps_per_second": 2.644, "step": 10 }, { "epoch": 0.0007991862830572508, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 11 }, { "epoch": 0.0007991862830572508, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.3262, "eval_samples_per_second": 127.266, "eval_steps_per_second": 2.653, "step": 11 }, { "epoch": 0.0008718395815170009, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 12 }, { "epoch": 0.0008718395815170009, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.9213, "eval_samples_per_second": 126.981, "eval_steps_per_second": 2.647, "step": 12 }, { "epoch": 0.000944492879976751, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 13 }, { "epoch": 0.000944492879976751, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 266.6935, "eval_samples_per_second": 126.614, "eval_steps_per_second": 2.64, "step": 13 }, { "epoch": 0.001017146178436501, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 14 }, { "epoch": 0.001017146178436501, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.5702, "eval_samples_per_second": 127.149, "eval_steps_per_second": 2.651, "step": 14 }, { "epoch": 0.001089799476896251, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 15 }, { "epoch": 0.001089799476896251, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 266.1801, "eval_samples_per_second": 126.858, "eval_steps_per_second": 2.645, "step": 15 }, { "epoch": 0.0011624527753560012, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8438, "step": 16 }, { "epoch": 0.0011624527753560012, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.2069, "eval_samples_per_second": 127.323, "eval_steps_per_second": 2.655, "step": 16 }, { "epoch": 0.0012351060738157512, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8359, "step": 17 }, { "epoch": 0.0012351060738157512, "eval_accuracy": 0.010348185357762373, "eval_loss": 10.84375, "eval_runtime": 265.4707, "eval_samples_per_second": 127.197, "eval_steps_per_second": 2.652, "step": 17 }, { "epoch": 0.0013077593722755014, "grad_norm": 2.8743269443511963, "learning_rate": 9.999992734670155e-06, "loss": 10.8438, "step": 18 }, { "epoch": 0.0013077593722755014, "eval_accuracy": 0.011272349663430095, "eval_loss": 10.828125, "eval_runtime": 265.1122, "eval_samples_per_second": 127.369, "eval_steps_per_second": 2.655, "step": 18 }, { "epoch": 0.0013804126707352513, "grad_norm": 2.8402953147888184, "learning_rate": 9.999985469340309e-06, "loss": 10.8203, "step": 19 }, { "epoch": 0.0013804126707352513, "eval_accuracy": 0.011634905538764718, "eval_loss": 10.8125, "eval_runtime": 265.8546, "eval_samples_per_second": 127.013, "eval_steps_per_second": 2.648, "step": 19 }, { "epoch": 0.0014530659691950015, "grad_norm": 2.8661510944366455, "learning_rate": 9.999978204010463e-06, "loss": 10.8203, "step": 20 }, { "epoch": 0.0014530659691950015, "eval_accuracy": 0.01173857150727105, "eval_loss": 10.8046875, "eval_runtime": 265.3217, "eval_samples_per_second": 127.268, "eval_steps_per_second": 2.653, "step": 20 }, { "epoch": 0.0015257192676547515, "grad_norm": 2.8541078567504883, "learning_rate": 9.999970938680617e-06, "loss": 10.8047, "step": 21 }, { "epoch": 0.0015257192676547515, "eval_accuracy": 0.011759009422313067, "eval_loss": 10.7890625, "eval_runtime": 265.864, "eval_samples_per_second": 127.009, "eval_steps_per_second": 2.648, "step": 21 }, { "epoch": 0.0015983725661145017, "grad_norm": 2.8900887966156006, "learning_rate": 9.99996367335077e-06, "loss": 10.7969, "step": 22 }, { "epoch": 0.0015983725661145017, "eval_accuracy": 0.011769199430945915, "eval_loss": 10.7734375, "eval_runtime": 266.281, "eval_samples_per_second": 126.81, "eval_steps_per_second": 2.644, "step": 22 }, { "epoch": 0.0016710258645742516, "grad_norm": 2.884963035583496, "learning_rate": 9.999956408020926e-06, "loss": 10.7812, "step": 23 }, { "epoch": 0.0016710258645742516, "eval_accuracy": 0.01179959576351549, "eval_loss": 10.765625, "eval_runtime": 266.3138, "eval_samples_per_second": 126.794, "eval_steps_per_second": 2.643, "step": 23 }, { "epoch": 0.0017436791630340018, "grad_norm": 2.8954319953918457, "learning_rate": 9.999949142691078e-06, "loss": 10.7656, "step": 24 }, { "epoch": 0.0017436791630340018, "eval_accuracy": 0.011878857819301676, "eval_loss": 10.75, "eval_runtime": 265.8429, "eval_samples_per_second": 127.019, "eval_steps_per_second": 2.648, "step": 24 }, { "epoch": 0.0018163324614937518, "grad_norm": 2.713453769683838, "learning_rate": 9.999941877361234e-06, "loss": 10.7578, "step": 25 }, { "epoch": 0.0018163324614937518, "eval_accuracy": 0.012068501985647663, "eval_loss": 10.734375, "eval_runtime": 266.534, "eval_samples_per_second": 126.689, "eval_steps_per_second": 2.641, "step": 25 }, { "epoch": 0.001888985759953502, "grad_norm": 2.663592576980591, "learning_rate": 9.999934612031386e-06, "loss": 10.75, "step": 26 }, { "epoch": 0.001888985759953502, "eval_accuracy": 0.012414296454736778, "eval_loss": 10.7265625, "eval_runtime": 265.503, "eval_samples_per_second": 127.181, "eval_steps_per_second": 2.652, "step": 26 }, { "epoch": 0.001961639058413252, "grad_norm": 2.4643020629882812, "learning_rate": 9.999927346701542e-06, "loss": 10.7344, "step": 27 }, { "epoch": 0.001961639058413252, "eval_accuracy": 0.013084492164563661, "eval_loss": 10.71875, "eval_runtime": 264.7117, "eval_samples_per_second": 127.561, "eval_steps_per_second": 2.659, "step": 27 }, { "epoch": 0.002034292356873002, "grad_norm": 2.2399826049804688, "learning_rate": 9.999920081371694e-06, "loss": 10.7266, "step": 28 }, { "epoch": 0.002034292356873002, "eval_accuracy": 0.014443063485982847, "eval_loss": 10.703125, "eval_runtime": 264.8704, "eval_samples_per_second": 127.485, "eval_steps_per_second": 2.658, "step": 28 }, { "epoch": 0.0021069456553327523, "grad_norm": 2.138185977935791, "learning_rate": 9.99991281604185e-06, "loss": 10.7109, "step": 29 }, { "epoch": 0.0021069456553327523, "eval_accuracy": 0.016544752766507735, "eval_loss": 10.6953125, "eval_runtime": 264.0295, "eval_samples_per_second": 127.891, "eval_steps_per_second": 2.666, "step": 29 }, { "epoch": 0.002179598953792502, "grad_norm": 1.8671512603759766, "learning_rate": 9.999905550712004e-06, "loss": 10.7031, "step": 30 }, { "epoch": 0.002179598953792502, "eval_accuracy": 0.01964894204406536, "eval_loss": 10.6875, "eval_runtime": 264.1126, "eval_samples_per_second": 127.851, "eval_steps_per_second": 2.666, "step": 30 }, { "epoch": 0.0022522522522522522, "grad_norm": 1.72816002368927, "learning_rate": 9.999898285382156e-06, "loss": 10.7031, "step": 31 }, { "epoch": 0.0022522522522522522, "eval_accuracy": 0.023590159473924593, "eval_loss": 10.6796875, "eval_runtime": 265.2939, "eval_samples_per_second": 127.281, "eval_steps_per_second": 2.654, "step": 31 }, { "epoch": 0.0023249055507120024, "grad_norm": 1.6541900634765625, "learning_rate": 9.999891020052312e-06, "loss": 10.6875, "step": 32 }, { "epoch": 0.0023249055507120024, "eval_accuracy": 0.028234313806121365, "eval_loss": 10.671875, "eval_runtime": 264.9943, "eval_samples_per_second": 127.425, "eval_steps_per_second": 2.657, "step": 32 }, { "epoch": 0.0023975588491717526, "grad_norm": 1.4378719329833984, "learning_rate": 9.999883754722464e-06, "loss": 10.6797, "step": 33 }, { "epoch": 0.0023975588491717526, "eval_accuracy": 0.03299875076862917, "eval_loss": 10.6640625, "eval_runtime": 264.4617, "eval_samples_per_second": 127.682, "eval_steps_per_second": 2.662, "step": 33 }, { "epoch": 0.0024702121476315024, "grad_norm": 1.3948858976364136, "learning_rate": 9.99987648939262e-06, "loss": 10.6719, "step": 34 }, { "epoch": 0.0024702121476315024, "eval_accuracy": 0.03746090344095459, "eval_loss": 10.6640625, "eval_runtime": 263.8541, "eval_samples_per_second": 127.976, "eval_steps_per_second": 2.668, "step": 34 }, { "epoch": 0.0025428654460912525, "grad_norm": 1.2194068431854248, "learning_rate": 9.999869224062774e-06, "loss": 10.6719, "step": 35 }, { "epoch": 0.0025428654460912525, "eval_accuracy": 0.04094886812886922, "eval_loss": 10.65625, "eval_runtime": 263.4632, "eval_samples_per_second": 128.166, "eval_steps_per_second": 2.672, "step": 35 }, { "epoch": 0.0026155187445510027, "grad_norm": 1.2569856643676758, "learning_rate": 9.999861958732927e-06, "loss": 10.6719, "step": 36 }, { "epoch": 0.0026155187445510027, "eval_accuracy": 0.04373679080326246, "eval_loss": 10.6484375, "eval_runtime": 265.8278, "eval_samples_per_second": 127.026, "eval_steps_per_second": 2.648, "step": 36 }, { "epoch": 0.002688172043010753, "grad_norm": 1.16013503074646, "learning_rate": 9.999854693403081e-06, "loss": 10.6484, "step": 37 }, { "epoch": 0.002688172043010753, "eval_accuracy": 0.046063847178124624, "eval_loss": 10.6484375, "eval_runtime": 266.8235, "eval_samples_per_second": 126.552, "eval_steps_per_second": 2.638, "step": 37 }, { "epoch": 0.0027608253414705027, "grad_norm": 1.1432477235794067, "learning_rate": 9.999847428073235e-06, "loss": 10.6562, "step": 38 }, { "epoch": 0.0027608253414705027, "eval_accuracy": 0.04811105465112957, "eval_loss": 10.640625, "eval_runtime": 266.3014, "eval_samples_per_second": 126.8, "eval_steps_per_second": 2.644, "step": 38 }, { "epoch": 0.002833478639930253, "grad_norm": 1.071315050125122, "learning_rate": 9.99984016274339e-06, "loss": 10.6484, "step": 39 }, { "epoch": 0.002833478639930253, "eval_accuracy": 0.04980491199523524, "eval_loss": 10.640625, "eval_runtime": 266.126, "eval_samples_per_second": 126.883, "eval_steps_per_second": 2.645, "step": 39 }, { "epoch": 0.002906131938390003, "grad_norm": 1.0130771398544312, "learning_rate": 9.999832897413543e-06, "loss": 10.6484, "step": 40 }, { "epoch": 0.002906131938390003, "eval_accuracy": 0.05107903940988734, "eval_loss": 10.6328125, "eval_runtime": 265.8854, "eval_samples_per_second": 126.998, "eval_steps_per_second": 2.648, "step": 40 }, { "epoch": 0.0029787852368497528, "grad_norm": 1.014347791671753, "learning_rate": 9.999825632083697e-06, "loss": 10.6406, "step": 41 }, { "epoch": 0.0029787852368497528, "eval_accuracy": 0.052067614991714396, "eval_loss": 10.6328125, "eval_runtime": 266.8406, "eval_samples_per_second": 126.544, "eval_steps_per_second": 2.638, "step": 41 }, { "epoch": 0.003051438535309503, "grad_norm": 1.0095568895339966, "learning_rate": 9.999818366753851e-06, "loss": 10.6406, "step": 42 }, { "epoch": 0.003051438535309503, "eval_accuracy": 0.05287193090039351, "eval_loss": 10.625, "eval_runtime": 266.7685, "eval_samples_per_second": 126.578, "eval_steps_per_second": 2.639, "step": 42 }, { "epoch": 0.003124091833769253, "grad_norm": 0.9412463307380676, "learning_rate": 9.999811101424005e-06, "loss": 10.6406, "step": 43 }, { "epoch": 0.003124091833769253, "eval_accuracy": 0.053470536009796996, "eval_loss": 10.625, "eval_runtime": 267.0069, "eval_samples_per_second": 126.465, "eval_steps_per_second": 2.637, "step": 43 }, { "epoch": 0.0031967451322290033, "grad_norm": 0.952081561088562, "learning_rate": 9.999803836094159e-06, "loss": 10.625, "step": 44 }, { "epoch": 0.0031967451322290033, "eval_accuracy": 0.05392726261832098, "eval_loss": 10.6171875, "eval_runtime": 269.2581, "eval_samples_per_second": 125.408, "eval_steps_per_second": 2.615, "step": 44 }, { "epoch": 0.003269398430688753, "grad_norm": 0.9194355607032776, "learning_rate": 9.999796570764313e-06, "loss": 10.625, "step": 45 }, { "epoch": 0.003269398430688753, "eval_accuracy": 0.05422120962871285, "eval_loss": 10.6171875, "eval_runtime": 269.3844, "eval_samples_per_second": 125.349, "eval_steps_per_second": 2.613, "step": 45 }, { "epoch": 0.0033420517291485033, "grad_norm": 0.9257526993751526, "learning_rate": 9.999789305434467e-06, "loss": 10.625, "step": 46 }, { "epoch": 0.0033420517291485033, "eval_accuracy": 0.054314019764158616, "eval_loss": 10.6171875, "eval_runtime": 269.7126, "eval_samples_per_second": 125.196, "eval_steps_per_second": 2.61, "step": 46 }, { "epoch": 0.0034147050276082534, "grad_norm": 0.9701704382896423, "learning_rate": 9.999782040104623e-06, "loss": 10.6172, "step": 47 }, { "epoch": 0.0034147050276082534, "eval_accuracy": 0.05444489768753676, "eval_loss": 10.609375, "eval_runtime": 269.4275, "eval_samples_per_second": 125.329, "eval_steps_per_second": 2.613, "step": 47 }, { "epoch": 0.0034873583260680036, "grad_norm": 0.8972945809364319, "learning_rate": 9.999774774774775e-06, "loss": 10.625, "step": 48 }, { "epoch": 0.0034873583260680036, "eval_accuracy": 0.05449599247514181, "eval_loss": 10.609375, "eval_runtime": 268.4057, "eval_samples_per_second": 125.806, "eval_steps_per_second": 2.623, "step": 48 }, { "epoch": 0.0035600116245277534, "grad_norm": 0.9347382187843323, "learning_rate": 9.99976750944493e-06, "loss": 10.6172, "step": 49 }, { "epoch": 0.0035600116245277534, "eval_accuracy": 0.05453342338753463, "eval_loss": 10.6015625, "eval_runtime": 269.5777, "eval_samples_per_second": 125.259, "eval_steps_per_second": 2.611, "step": 49 }, { "epoch": 0.0036326649229875036, "grad_norm": 0.9273884892463684, "learning_rate": 9.999760244115083e-06, "loss": 10.6016, "step": 50 }, { "epoch": 0.0036326649229875036, "eval_accuracy": 0.05452213332115164, "eval_loss": 10.6015625, "eval_runtime": 268.6914, "eval_samples_per_second": 125.672, "eval_steps_per_second": 2.62, "step": 50 }, { "epoch": 0.0037053182214472537, "grad_norm": 0.9508588910102844, "learning_rate": 9.999752978785238e-06, "loss": 10.6016, "step": 51 }, { "epoch": 0.0037053182214472537, "eval_accuracy": 0.054549547918240585, "eval_loss": 10.6015625, "eval_runtime": 269.6867, "eval_samples_per_second": 125.208, "eval_steps_per_second": 2.61, "step": 51 }, { "epoch": 0.003777971519907004, "grad_norm": 0.97487872838974, "learning_rate": 9.999745713455392e-06, "loss": 10.6016, "step": 52 }, { "epoch": 0.003777971519907004, "eval_accuracy": 0.05455154539152372, "eval_loss": 10.59375, "eval_runtime": 269.8783, "eval_samples_per_second": 125.119, "eval_steps_per_second": 2.609, "step": 52 }, { "epoch": 0.0038506248183667537, "grad_norm": 1.050345540046692, "learning_rate": 9.999738448125546e-06, "loss": 10.6016, "step": 53 }, { "epoch": 0.0038506248183667537, "eval_accuracy": 0.054549403173799776, "eval_loss": 10.59375, "eval_runtime": 269.476, "eval_samples_per_second": 125.306, "eval_steps_per_second": 2.612, "step": 53 }, { "epoch": 0.003923278116826504, "grad_norm": 0.9317484498023987, "learning_rate": 9.9997311827957e-06, "loss": 10.5938, "step": 54 }, { "epoch": 0.003923278116826504, "eval_accuracy": 0.05454671092720075, "eval_loss": 10.59375, "eval_runtime": 268.2865, "eval_samples_per_second": 125.862, "eval_steps_per_second": 2.624, "step": 54 }, { "epoch": 0.003995931415286254, "grad_norm": 0.9053019285202026, "learning_rate": 9.999723917465854e-06, "loss": 10.6016, "step": 55 }, { "epoch": 0.003995931415286254, "eval_accuracy": 0.054510206379229105, "eval_loss": 10.5859375, "eval_runtime": 268.6755, "eval_samples_per_second": 125.68, "eval_steps_per_second": 2.62, "step": 55 }, { "epoch": 0.004068584713746004, "grad_norm": 1.051640272140503, "learning_rate": 9.999716652136008e-06, "loss": 10.5859, "step": 56 }, { "epoch": 0.004068584713746004, "eval_accuracy": 0.05450499557936003, "eval_loss": 10.5859375, "eval_runtime": 268.9754, "eval_samples_per_second": 125.539, "eval_steps_per_second": 2.617, "step": 56 }, { "epoch": 0.004141238012205754, "grad_norm": 0.8980646729469299, "learning_rate": 9.999709386806162e-06, "loss": 10.6016, "step": 57 }, { "epoch": 0.004141238012205754, "eval_accuracy": 0.05452688093881013, "eval_loss": 10.5859375, "eval_runtime": 269.557, "eval_samples_per_second": 125.269, "eval_steps_per_second": 2.612, "step": 57 }, { "epoch": 0.004213891310665505, "grad_norm": 0.9363867044448853, "learning_rate": 9.999702121476316e-06, "loss": 10.5859, "step": 58 }, { "epoch": 0.004213891310665505, "eval_accuracy": 0.054599542648095495, "eval_loss": 10.5859375, "eval_runtime": 268.413, "eval_samples_per_second": 125.802, "eval_steps_per_second": 2.623, "step": 58 }, { "epoch": 0.004286544609125254, "grad_norm": 0.9355424642562866, "learning_rate": 9.99969485614647e-06, "loss": 10.5859, "step": 59 }, { "epoch": 0.004286544609125254, "eval_accuracy": 0.05472584664714412, "eval_loss": 10.578125, "eval_runtime": 267.1942, "eval_samples_per_second": 126.376, "eval_steps_per_second": 2.635, "step": 59 }, { "epoch": 0.004359197907585004, "grad_norm": 0.9955667853355408, "learning_rate": 9.999687590816624e-06, "loss": 10.5781, "step": 60 }, { "epoch": 0.004359197907585004, "eval_accuracy": 0.05484039739759917, "eval_loss": 10.578125, "eval_runtime": 268.4725, "eval_samples_per_second": 125.774, "eval_steps_per_second": 2.622, "step": 60 }, { "epoch": 0.004431851206044755, "grad_norm": 0.9198755025863647, "learning_rate": 9.999680325486778e-06, "loss": 10.5781, "step": 61 }, { "epoch": 0.004431851206044755, "eval_accuracy": 0.054993913351519604, "eval_loss": 10.578125, "eval_runtime": 268.5476, "eval_samples_per_second": 125.739, "eval_steps_per_second": 2.622, "step": 61 }, { "epoch": 0.0045045045045045045, "grad_norm": 0.9875515699386597, "learning_rate": 9.999673060156932e-06, "loss": 10.5781, "step": 62 }, { "epoch": 0.0045045045045045045, "eval_accuracy": 0.055311424756874936, "eval_loss": 10.5703125, "eval_runtime": 267.2035, "eval_samples_per_second": 126.372, "eval_steps_per_second": 2.635, "step": 62 }, { "epoch": 0.004577157802964254, "grad_norm": 0.9037775993347168, "learning_rate": 9.999665794827086e-06, "loss": 10.5781, "step": 63 }, { "epoch": 0.004577157802964254, "eval_accuracy": 0.05571670919113593, "eval_loss": 10.5703125, "eval_runtime": 267.7809, "eval_samples_per_second": 126.099, "eval_steps_per_second": 2.629, "step": 63 }, { "epoch": 0.004649811101424005, "grad_norm": 0.9087035655975342, "learning_rate": 9.99965852949724e-06, "loss": 10.5703, "step": 64 }, { "epoch": 0.004649811101424005, "eval_accuracy": 0.056142518387103435, "eval_loss": 10.5703125, "eval_runtime": 267.3757, "eval_samples_per_second": 126.29, "eval_steps_per_second": 2.633, "step": 64 }, { "epoch": 0.004722464399883755, "grad_norm": 0.8892097473144531, "learning_rate": 9.999651264167394e-06, "loss": 10.5781, "step": 65 }, { "epoch": 0.004722464399883755, "eval_accuracy": 0.05656430368761649, "eval_loss": 10.5625, "eval_runtime": 266.879, "eval_samples_per_second": 126.525, "eval_steps_per_second": 2.638, "step": 65 }, { "epoch": 0.004795117698343505, "grad_norm": 0.9172134399414062, "learning_rate": 9.999643998837548e-06, "loss": 10.5625, "step": 66 }, { "epoch": 0.004795117698343505, "eval_accuracy": 0.0569930367212883, "eval_loss": 10.5625, "eval_runtime": 266.0917, "eval_samples_per_second": 126.9, "eval_steps_per_second": 2.646, "step": 66 }, { "epoch": 0.004867770996803255, "grad_norm": 0.9037718176841736, "learning_rate": 9.999636733507701e-06, "loss": 10.5781, "step": 67 }, { "epoch": 0.004867770996803255, "eval_accuracy": 0.05732160660192132, "eval_loss": 10.5625, "eval_runtime": 267.3756, "eval_samples_per_second": 126.291, "eval_steps_per_second": 2.633, "step": 67 }, { "epoch": 0.004940424295263005, "grad_norm": 0.8923665881156921, "learning_rate": 9.999629468177855e-06, "loss": 10.5703, "step": 68 }, { "epoch": 0.004940424295263005, "eval_accuracy": 0.05754022860531697, "eval_loss": 10.5546875, "eval_runtime": 267.5062, "eval_samples_per_second": 126.229, "eval_steps_per_second": 2.632, "step": 68 }, { "epoch": 0.005013077593722755, "grad_norm": 0.9167753458023071, "learning_rate": 9.999622202848011e-06, "loss": 10.5625, "step": 69 }, { "epoch": 0.005013077593722755, "eval_accuracy": 0.0576851756883416, "eval_loss": 10.5546875, "eval_runtime": 269.0923, "eval_samples_per_second": 125.485, "eval_steps_per_second": 2.616, "step": 69 }, { "epoch": 0.005085730892182505, "grad_norm": 0.9031029343605042, "learning_rate": 9.999614937518163e-06, "loss": 10.5625, "step": 70 }, { "epoch": 0.005085730892182505, "eval_accuracy": 0.05778192287257733, "eval_loss": 10.5546875, "eval_runtime": 269.1242, "eval_samples_per_second": 125.47, "eval_steps_per_second": 2.616, "step": 70 }, { "epoch": 0.005158384190642255, "grad_norm": 0.8912838101387024, "learning_rate": 9.999607672188319e-06, "loss": 10.5625, "step": 71 }, { "epoch": 0.005158384190642255, "eval_accuracy": 0.05789719734523642, "eval_loss": 10.5546875, "eval_runtime": 269.7373, "eval_samples_per_second": 125.185, "eval_steps_per_second": 2.61, "step": 71 }, { "epoch": 0.0052310374891020054, "grad_norm": 0.8998405933380127, "learning_rate": 9.999600406858471e-06, "loss": 10.5547, "step": 72 }, { "epoch": 0.0052310374891020054, "eval_accuracy": 0.057985607249681645, "eval_loss": 10.546875, "eval_runtime": 269.5868, "eval_samples_per_second": 125.255, "eval_steps_per_second": 2.611, "step": 72 }, { "epoch": 0.005303690787561755, "grad_norm": 0.9078417420387268, "learning_rate": 9.999593141528627e-06, "loss": 10.5469, "step": 73 }, { "epoch": 0.005303690787561755, "eval_accuracy": 0.05800511880030249, "eval_loss": 10.546875, "eval_runtime": 271.0227, "eval_samples_per_second": 124.591, "eval_steps_per_second": 2.598, "step": 73 }, { "epoch": 0.005376344086021506, "grad_norm": 0.8995553851127625, "learning_rate": 9.99958587619878e-06, "loss": 10.5469, "step": 74 }, { "epoch": 0.005376344086021506, "eval_accuracy": 0.05802399347538379, "eval_loss": 10.546875, "eval_runtime": 270.6941, "eval_samples_per_second": 124.742, "eval_steps_per_second": 2.601, "step": 74 }, { "epoch": 0.0054489973844812556, "grad_norm": 0.8786413073539734, "learning_rate": 9.999578610868935e-06, "loss": 10.5547, "step": 75 }, { "epoch": 0.0054489973844812556, "eval_accuracy": 0.05797920954539795, "eval_loss": 10.5390625, "eval_runtime": 268.937, "eval_samples_per_second": 125.557, "eval_steps_per_second": 2.618, "step": 75 }, { "epoch": 0.005521650682941005, "grad_norm": 0.9166957139968872, "learning_rate": 9.999571345539089e-06, "loss": 10.5547, "step": 76 }, { "epoch": 0.005521650682941005, "eval_accuracy": 0.05800387399811155, "eval_loss": 10.5390625, "eval_runtime": 270.6425, "eval_samples_per_second": 124.766, "eval_steps_per_second": 2.601, "step": 76 }, { "epoch": 0.005594303981400756, "grad_norm": 0.9106067419052124, "learning_rate": 9.999564080209243e-06, "loss": 10.5469, "step": 77 }, { "epoch": 0.005594303981400756, "eval_accuracy": 0.058152873925478785, "eval_loss": 10.5390625, "eval_runtime": 268.7053, "eval_samples_per_second": 125.666, "eval_steps_per_second": 2.62, "step": 77 }, { "epoch": 0.005666957279860506, "grad_norm": 0.9021939039230347, "learning_rate": 9.999556814879397e-06, "loss": 10.5469, "step": 78 }, { "epoch": 0.005666957279860506, "eval_accuracy": 0.05821331920396, "eval_loss": 10.5390625, "eval_runtime": 268.8573, "eval_samples_per_second": 125.594, "eval_steps_per_second": 2.618, "step": 78 }, { "epoch": 0.005739610578320255, "grad_norm": 0.935400664806366, "learning_rate": 9.99954954954955e-06, "loss": 10.5312, "step": 79 }, { "epoch": 0.005739610578320255, "eval_accuracy": 0.05838906790398846, "eval_loss": 10.53125, "eval_runtime": 270.5471, "eval_samples_per_second": 124.81, "eval_steps_per_second": 2.602, "step": 79 }, { "epoch": 0.005812263876780006, "grad_norm": 0.9361926317214966, "learning_rate": 9.999542284219704e-06, "loss": 10.5312, "step": 80 }, { "epoch": 0.005812263876780006, "eval_accuracy": 0.05863825993328266, "eval_loss": 10.53125, "eval_runtime": 270.2854, "eval_samples_per_second": 124.931, "eval_steps_per_second": 2.605, "step": 80 }, { "epoch": 0.005884917175239756, "grad_norm": 0.9991462826728821, "learning_rate": 9.999535018889858e-06, "loss": 10.5312, "step": 81 }, { "epoch": 0.005884917175239756, "eval_accuracy": 0.058984343891253385, "eval_loss": 10.53125, "eval_runtime": 269.5128, "eval_samples_per_second": 125.289, "eval_steps_per_second": 2.612, "step": 81 }, { "epoch": 0.0059575704736995055, "grad_norm": 0.8942323327064514, "learning_rate": 9.999527753560012e-06, "loss": 10.5312, "step": 82 }, { "epoch": 0.0059575704736995055, "eval_accuracy": 0.05927577234837521, "eval_loss": 10.53125, "eval_runtime": 267.9968, "eval_samples_per_second": 125.998, "eval_steps_per_second": 2.627, "step": 82 }, { "epoch": 0.006030223772159256, "grad_norm": 0.9410443902015686, "learning_rate": 9.999520488230166e-06, "loss": 10.5312, "step": 83 }, { "epoch": 0.006030223772159256, "eval_accuracy": 0.05966313742086423, "eval_loss": 10.5234375, "eval_runtime": 269.3505, "eval_samples_per_second": 125.365, "eval_steps_per_second": 2.614, "step": 83 }, { "epoch": 0.006102877070619006, "grad_norm": 0.9418770670890808, "learning_rate": 9.99951322290032e-06, "loss": 10.5234, "step": 84 }, { "epoch": 0.006102877070619006, "eval_accuracy": 0.06000398163007773, "eval_loss": 10.5234375, "eval_runtime": 268.3417, "eval_samples_per_second": 125.836, "eval_steps_per_second": 2.624, "step": 84 }, { "epoch": 0.0061755303690787565, "grad_norm": 0.8822703957557678, "learning_rate": 9.999505957570474e-06, "loss": 10.5312, "step": 85 }, { "epoch": 0.0061755303690787565, "eval_accuracy": 0.060200255091812704, "eval_loss": 10.5234375, "eval_runtime": 269.365, "eval_samples_per_second": 125.358, "eval_steps_per_second": 2.614, "step": 85 }, { "epoch": 0.006248183667538506, "grad_norm": 0.8689332604408264, "learning_rate": 9.999498692240628e-06, "loss": 10.5312, "step": 86 }, { "epoch": 0.006248183667538506, "eval_accuracy": 0.06028594380077074, "eval_loss": 10.5234375, "eval_runtime": 267.6304, "eval_samples_per_second": 126.17, "eval_steps_per_second": 2.63, "step": 86 }, { "epoch": 0.006320836965998256, "grad_norm": 0.8931795954704285, "learning_rate": 9.999491426910782e-06, "loss": 10.5234, "step": 87 }, { "epoch": 0.006320836965998256, "eval_accuracy": 0.060404489497792084, "eval_loss": 10.515625, "eval_runtime": 267.5381, "eval_samples_per_second": 126.214, "eval_steps_per_second": 2.631, "step": 87 }, { "epoch": 0.006393490264458007, "grad_norm": 0.8975218534469604, "learning_rate": 9.999484161580936e-06, "loss": 10.5156, "step": 88 }, { "epoch": 0.006393490264458007, "eval_accuracy": 0.06048001714700543, "eval_loss": 10.515625, "eval_runtime": 266.4763, "eval_samples_per_second": 126.717, "eval_steps_per_second": 2.642, "step": 88 }, { "epoch": 0.006466143562917756, "grad_norm": 0.8878839015960693, "learning_rate": 9.99947689625109e-06, "loss": 10.5234, "step": 89 }, { "epoch": 0.006466143562917756, "eval_accuracy": 0.060569556058088954, "eval_loss": 10.515625, "eval_runtime": 265.4792, "eval_samples_per_second": 127.193, "eval_steps_per_second": 2.652, "step": 89 }, { "epoch": 0.006538796861377506, "grad_norm": 0.8937884569168091, "learning_rate": 9.999469630921244e-06, "loss": 10.5156, "step": 90 }, { "epoch": 0.006538796861377506, "eval_accuracy": 0.06063920708300553, "eval_loss": 10.515625, "eval_runtime": 264.9514, "eval_samples_per_second": 127.446, "eval_steps_per_second": 2.657, "step": 90 }, { "epoch": 0.006611450159837257, "grad_norm": 0.9820625185966492, "learning_rate": 9.9994623655914e-06, "loss": 10.5156, "step": 91 }, { "epoch": 0.006611450159837257, "eval_accuracy": 0.06055398155625807, "eval_loss": 10.5078125, "eval_runtime": 265.7628, "eval_samples_per_second": 127.057, "eval_steps_per_second": 2.649, "step": 91 }, { "epoch": 0.0066841034582970065, "grad_norm": 0.8808642029762268, "learning_rate": 9.999455100261552e-06, "loss": 10.5156, "step": 92 }, { "epoch": 0.0066841034582970065, "eval_accuracy": 0.060472056202761026, "eval_loss": 10.5078125, "eval_runtime": 266.9758, "eval_samples_per_second": 126.48, "eval_steps_per_second": 2.637, "step": 92 }, { "epoch": 0.006756756756756757, "grad_norm": 0.8804787993431091, "learning_rate": 9.999447834931707e-06, "loss": 10.5156, "step": 93 }, { "epoch": 0.006756756756756757, "eval_accuracy": 0.06028976505400806, "eval_loss": 10.5078125, "eval_runtime": 267.6988, "eval_samples_per_second": 126.138, "eval_steps_per_second": 2.63, "step": 93 }, { "epoch": 0.006829410055216507, "grad_norm": 0.8913342356681824, "learning_rate": 9.99944056960186e-06, "loss": 10.5156, "step": 94 }, { "epoch": 0.006829410055216507, "eval_accuracy": 0.06017234836362501, "eval_loss": 10.5078125, "eval_runtime": 268.8716, "eval_samples_per_second": 125.588, "eval_steps_per_second": 2.618, "step": 94 }, { "epoch": 0.006902063353676257, "grad_norm": 0.8503950834274292, "learning_rate": 9.999433304272015e-06, "loss": 10.5234, "step": 95 }, { "epoch": 0.006902063353676257, "eval_accuracy": 0.0601403308933184, "eval_loss": 10.5, "eval_runtime": 267.2705, "eval_samples_per_second": 126.34, "eval_steps_per_second": 2.634, "step": 95 }, { "epoch": 0.006974716652136007, "grad_norm": 0.912339985370636, "learning_rate": 9.99942603894217e-06, "loss": 10.5156, "step": 96 }, { "epoch": 0.006974716652136007, "eval_accuracy": 0.06016896134371012, "eval_loss": 10.5, "eval_runtime": 268.9999, "eval_samples_per_second": 125.528, "eval_steps_per_second": 2.617, "step": 96 }, { "epoch": 0.007047369950595757, "grad_norm": 0.8949794769287109, "learning_rate": 9.999418773612323e-06, "loss": 10.5078, "step": 97 }, { "epoch": 0.007047369950595757, "eval_accuracy": 0.06026235045691912, "eval_loss": 10.5, "eval_runtime": 267.9858, "eval_samples_per_second": 126.003, "eval_steps_per_second": 2.627, "step": 97 }, { "epoch": 0.007120023249055507, "grad_norm": 0.8988801836967468, "learning_rate": 9.999411508282477e-06, "loss": 10.5, "step": 98 }, { "epoch": 0.007120023249055507, "eval_accuracy": 0.06031005822460927, "eval_loss": 10.5, "eval_runtime": 268.4028, "eval_samples_per_second": 125.807, "eval_steps_per_second": 2.623, "step": 98 }, { "epoch": 0.007192676547515257, "grad_norm": 0.8954498767852783, "learning_rate": 9.999404242952631e-06, "loss": 10.5078, "step": 99 }, { "epoch": 0.007192676547515257, "eval_accuracy": 0.06044918658111344, "eval_loss": 10.5, "eval_runtime": 266.438, "eval_samples_per_second": 126.735, "eval_steps_per_second": 2.642, "step": 99 }, { "epoch": 0.007265329845975007, "grad_norm": 0.8816587328910828, "learning_rate": 9.999396977622785e-06, "loss": 10.5078, "step": 100 }, { "epoch": 0.007265329845975007, "eval_accuracy": 0.060579746066721805, "eval_loss": 10.4921875, "eval_runtime": 268.404, "eval_samples_per_second": 125.807, "eval_steps_per_second": 2.623, "step": 100 }, { "epoch": 0.007337983144434758, "grad_norm": 0.9182707071304321, "learning_rate": 9.999389712292939e-06, "loss": 10.5, "step": 101 }, { "epoch": 0.007337983144434758, "eval_accuracy": 0.0607209587431736, "eval_loss": 10.4921875, "eval_runtime": 269.5331, "eval_samples_per_second": 125.28, "eval_steps_per_second": 2.612, "step": 101 }, { "epoch": 0.0074106364428945075, "grad_norm": 0.9346348643302917, "learning_rate": 9.999382446963093e-06, "loss": 10.4922, "step": 102 }, { "epoch": 0.0074106364428945075, "eval_accuracy": 0.06087233247937008, "eval_loss": 10.4921875, "eval_runtime": 268.6739, "eval_samples_per_second": 125.68, "eval_steps_per_second": 2.62, "step": 102 }, { "epoch": 0.007483289741354257, "grad_norm": 1.0120168924331665, "learning_rate": 9.999375181633247e-06, "loss": 10.4922, "step": 103 }, { "epoch": 0.007483289741354257, "eval_accuracy": 0.06115695794777395, "eval_loss": 10.4921875, "eval_runtime": 269.4069, "eval_samples_per_second": 125.338, "eval_steps_per_second": 2.613, "step": 103 }, { "epoch": 0.007555943039814008, "grad_norm": 0.9036211967468262, "learning_rate": 9.999367916303401e-06, "loss": 10.4844, "step": 104 }, { "epoch": 0.007555943039814008, "eval_accuracy": 0.06140849483700922, "eval_loss": 10.484375, "eval_runtime": 267.3403, "eval_samples_per_second": 126.307, "eval_steps_per_second": 2.633, "step": 104 }, { "epoch": 0.007628596338273758, "grad_norm": 0.895473837852478, "learning_rate": 9.999360650973555e-06, "loss": 10.4922, "step": 105 }, { "epoch": 0.007628596338273758, "eval_accuracy": 0.0616540103575069, "eval_loss": 10.484375, "eval_runtime": 268.6172, "eval_samples_per_second": 125.707, "eval_steps_per_second": 2.621, "step": 105 }, { "epoch": 0.007701249636733507, "grad_norm": 0.908990204334259, "learning_rate": 9.999353385643709e-06, "loss": 10.4922, "step": 106 }, { "epoch": 0.007701249636733507, "eval_accuracy": 0.06192879320393586, "eval_loss": 10.484375, "eval_runtime": 269.3754, "eval_samples_per_second": 125.353, "eval_steps_per_second": 2.613, "step": 106 }, { "epoch": 0.007773902935193258, "grad_norm": 0.929440975189209, "learning_rate": 9.999346120313863e-06, "loss": 10.4844, "step": 107 }, { "epoch": 0.007773902935193258, "eval_accuracy": 0.062228703685288995, "eval_loss": 10.484375, "eval_runtime": 268.7245, "eval_samples_per_second": 125.657, "eval_steps_per_second": 2.62, "step": 107 }, { "epoch": 0.007846556233653008, "grad_norm": 0.9397541880607605, "learning_rate": 9.999338854984018e-06, "loss": 10.4922, "step": 108 }, { "epoch": 0.007846556233653008, "eval_accuracy": 0.06252207171791763, "eval_loss": 10.4765625, "eval_runtime": 267.6862, "eval_samples_per_second": 126.144, "eval_steps_per_second": 2.63, "step": 108 }, { "epoch": 0.007919209532112758, "grad_norm": 0.9697725772857666, "learning_rate": 9.99933158965417e-06, "loss": 10.4844, "step": 109 }, { "epoch": 0.007919209532112758, "eval_accuracy": 0.06282742459024514, "eval_loss": 10.4765625, "eval_runtime": 266.1334, "eval_samples_per_second": 126.88, "eval_steps_per_second": 2.645, "step": 109 }, { "epoch": 0.007991862830572507, "grad_norm": 0.8942638039588928, "learning_rate": 9.999324324324326e-06, "loss": 10.4766, "step": 110 }, { "epoch": 0.007991862830572507, "eval_accuracy": 0.06300488127467513, "eval_loss": 10.4765625, "eval_runtime": 267.3455, "eval_samples_per_second": 126.305, "eval_steps_per_second": 2.633, "step": 110 }, { "epoch": 0.008064516129032258, "grad_norm": 0.8714835047721863, "learning_rate": 9.999317058994478e-06, "loss": 10.4844, "step": 111 }, { "epoch": 0.008064516129032258, "eval_accuracy": 0.06321102630727317, "eval_loss": 10.4765625, "eval_runtime": 266.4815, "eval_samples_per_second": 126.714, "eval_steps_per_second": 2.642, "step": 111 }, { "epoch": 0.008137169427492008, "grad_norm": 0.8855974078178406, "learning_rate": 9.999309793664634e-06, "loss": 10.4766, "step": 112 }, { "epoch": 0.008137169427492008, "eval_accuracy": 0.06342374273748387, "eval_loss": 10.4765625, "eval_runtime": 265.4409, "eval_samples_per_second": 127.211, "eval_steps_per_second": 2.652, "step": 112 }, { "epoch": 0.008209822725951759, "grad_norm": 0.8999938368797302, "learning_rate": 9.999302528334788e-06, "loss": 10.4844, "step": 113 }, { "epoch": 0.008209822725951759, "eval_accuracy": 0.06359850717531484, "eval_loss": 10.46875, "eval_runtime": 264.9726, "eval_samples_per_second": 127.436, "eval_steps_per_second": 2.657, "step": 113 }, { "epoch": 0.008282476024411508, "grad_norm": 0.8735718727111816, "learning_rate": 9.99929526300494e-06, "loss": 10.4766, "step": 114 }, { "epoch": 0.008282476024411508, "eval_accuracy": 0.0637932173970891, "eval_loss": 10.46875, "eval_runtime": 264.7038, "eval_samples_per_second": 127.565, "eval_steps_per_second": 2.66, "step": 114 }, { "epoch": 0.008355129322871259, "grad_norm": 0.9054996371269226, "learning_rate": 9.999287997675096e-06, "loss": 10.4766, "step": 115 }, { "epoch": 0.008355129322871259, "eval_accuracy": 0.06404524641742311, "eval_loss": 10.46875, "eval_runtime": 265.6768, "eval_samples_per_second": 127.098, "eval_steps_per_second": 2.65, "step": 115 }, { "epoch": 0.00842778262133101, "grad_norm": 0.9062832593917847, "learning_rate": 9.999280732345248e-06, "loss": 10.4844, "step": 116 }, { "epoch": 0.00842778262133101, "eval_accuracy": 0.06427657698272166, "eval_loss": 10.46875, "eval_runtime": 264.6617, "eval_samples_per_second": 127.586, "eval_steps_per_second": 2.66, "step": 116 }, { "epoch": 0.008500435919790758, "grad_norm": 0.9465892910957336, "learning_rate": 9.999273467015404e-06, "loss": 10.4531, "step": 117 }, { "epoch": 0.008500435919790758, "eval_accuracy": 0.06444393050518328, "eval_loss": 10.4609375, "eval_runtime": 265.0601, "eval_samples_per_second": 127.394, "eval_steps_per_second": 2.656, "step": 117 }, { "epoch": 0.008573089218250509, "grad_norm": 0.9980528354644775, "learning_rate": 9.999266201685556e-06, "loss": 10.4609, "step": 118 }, { "epoch": 0.008573089218250509, "eval_accuracy": 0.06469538054775407, "eval_loss": 10.4609375, "eval_runtime": 265.9103, "eval_samples_per_second": 126.986, "eval_steps_per_second": 2.648, "step": 118 }, { "epoch": 0.00864574251671026, "grad_norm": 0.9110475778579712, "learning_rate": 9.999258936355712e-06, "loss": 10.4609, "step": 119 }, { "epoch": 0.00864574251671026, "eval_accuracy": 0.06482475312894781, "eval_loss": 10.4609375, "eval_runtime": 268.5579, "eval_samples_per_second": 125.735, "eval_steps_per_second": 2.621, "step": 119 }, { "epoch": 0.008718395815170008, "grad_norm": 0.8688368797302246, "learning_rate": 9.999251671025866e-06, "loss": 10.4688, "step": 120 }, { "epoch": 0.008718395815170008, "eval_accuracy": 0.06493464310840887, "eval_loss": 10.4609375, "eval_runtime": 268.4968, "eval_samples_per_second": 125.763, "eval_steps_per_second": 2.622, "step": 120 }, { "epoch": 0.008791049113629759, "grad_norm": 0.8964656591415405, "learning_rate": 9.99924440569602e-06, "loss": 10.4609, "step": 121 }, { "epoch": 0.008791049113629759, "eval_accuracy": 0.06505721270088466, "eval_loss": 10.4609375, "eval_runtime": 268.2266, "eval_samples_per_second": 125.89, "eval_steps_per_second": 2.625, "step": 121 }, { "epoch": 0.00886370241208951, "grad_norm": 0.917405903339386, "learning_rate": 9.999237140366174e-06, "loss": 10.4609, "step": 122 }, { "epoch": 0.00886370241208951, "eval_accuracy": 0.06532562679191808, "eval_loss": 10.453125, "eval_runtime": 267.3486, "eval_samples_per_second": 126.303, "eval_steps_per_second": 2.633, "step": 122 }, { "epoch": 0.008936355710549258, "grad_norm": 0.9321388602256775, "learning_rate": 9.999229875036328e-06, "loss": 10.4531, "step": 123 }, { "epoch": 0.008936355710549258, "eval_accuracy": 0.06563454037748945, "eval_loss": 10.453125, "eval_runtime": 268.6664, "eval_samples_per_second": 125.684, "eval_steps_per_second": 2.62, "step": 123 }, { "epoch": 0.009009009009009009, "grad_norm": 1.0306340456008911, "learning_rate": 9.999222609706481e-06, "loss": 10.4531, "step": 124 }, { "epoch": 0.009009009009009009, "eval_accuracy": 0.06589062224216607, "eval_loss": 10.453125, "eval_runtime": 269.4306, "eval_samples_per_second": 125.327, "eval_steps_per_second": 2.613, "step": 124 }, { "epoch": 0.00908166230746876, "grad_norm": 0.90235435962677, "learning_rate": 9.999215344376635e-06, "loss": 10.4531, "step": 125 }, { "epoch": 0.00908166230746876, "eval_accuracy": 0.06600340711044328, "eval_loss": 10.453125, "eval_runtime": 268.5113, "eval_samples_per_second": 125.756, "eval_steps_per_second": 2.622, "step": 125 }, { "epoch": 0.009154315605928508, "grad_norm": 0.8829610347747803, "learning_rate": 9.99920807904679e-06, "loss": 10.4531, "step": 126 }, { "epoch": 0.009154315605928508, "eval_accuracy": 0.06620975478525845, "eval_loss": 10.4453125, "eval_runtime": 268.5597, "eval_samples_per_second": 125.734, "eval_steps_per_second": 2.621, "step": 126 }, { "epoch": 0.009226968904388259, "grad_norm": 0.9231570959091187, "learning_rate": 9.999200813716943e-06, "loss": 10.4531, "step": 127 }, { "epoch": 0.009226968904388259, "eval_accuracy": 0.06642637931537096, "eval_loss": 10.4453125, "eval_runtime": 266.401, "eval_samples_per_second": 126.753, "eval_steps_per_second": 2.643, "step": 127 }, { "epoch": 0.00929962220284801, "grad_norm": 0.9046792984008789, "learning_rate": 9.999193548387097e-06, "loss": 10.4453, "step": 128 }, { "epoch": 0.00929962220284801, "eval_accuracy": 0.06673063212994831, "eval_loss": 10.4453125, "eval_runtime": 268.2718, "eval_samples_per_second": 125.869, "eval_steps_per_second": 2.624, "step": 128 }, { "epoch": 0.009372275501307759, "grad_norm": 0.9026487469673157, "learning_rate": 9.999186283057251e-06, "loss": 10.4531, "step": 129 }, { "epoch": 0.009372275501307759, "eval_accuracy": 0.06701204427176626, "eval_loss": 10.4453125, "eval_runtime": 266.0718, "eval_samples_per_second": 126.909, "eval_steps_per_second": 2.646, "step": 129 }, { "epoch": 0.00944492879976751, "grad_norm": 0.918516218662262, "learning_rate": 9.999179017727407e-06, "loss": 10.4375, "step": 130 }, { "epoch": 0.00944492879976751, "eval_accuracy": 0.0673045727866382, "eval_loss": 10.4453125, "eval_runtime": 267.2937, "eval_samples_per_second": 126.329, "eval_steps_per_second": 2.634, "step": 130 }, { "epoch": 0.00951758209822726, "grad_norm": 0.9067806601524353, "learning_rate": 9.999171752397559e-06, "loss": 10.4453, "step": 131 }, { "epoch": 0.00951758209822726, "eval_accuracy": 0.06758094782191605, "eval_loss": 10.4375, "eval_runtime": 266.3825, "eval_samples_per_second": 126.761, "eval_steps_per_second": 2.643, "step": 131 }, { "epoch": 0.00959023539668701, "grad_norm": 0.9461184740066528, "learning_rate": 9.999164487067715e-06, "loss": 10.4375, "step": 132 }, { "epoch": 0.00959023539668701, "eval_accuracy": 0.06776827607720912, "eval_loss": 10.4375, "eval_runtime": 266.9747, "eval_samples_per_second": 126.48, "eval_steps_per_second": 2.637, "step": 132 }, { "epoch": 0.00966288869514676, "grad_norm": 0.9130184054374695, "learning_rate": 9.999157221737867e-06, "loss": 10.4375, "step": 133 }, { "epoch": 0.00966288869514676, "eval_accuracy": 0.06790248312272583, "eval_loss": 10.4375, "eval_runtime": 267.3131, "eval_samples_per_second": 126.32, "eval_steps_per_second": 2.634, "step": 133 }, { "epoch": 0.00973554199360651, "grad_norm": 0.9464238882064819, "learning_rate": 9.999149956408023e-06, "loss": 10.4297, "step": 134 }, { "epoch": 0.00973554199360651, "eval_accuracy": 0.06788108989437448, "eval_loss": 10.4375, "eval_runtime": 267.1185, "eval_samples_per_second": 126.412, "eval_steps_per_second": 2.636, "step": 134 }, { "epoch": 0.00980819529206626, "grad_norm": 0.8799238204956055, "learning_rate": 9.999142691078175e-06, "loss": 10.4453, "step": 135 }, { "epoch": 0.00980819529206626, "eval_accuracy": 0.06779870135866685, "eval_loss": 10.4296875, "eval_runtime": 266.9212, "eval_samples_per_second": 126.505, "eval_steps_per_second": 2.637, "step": 135 }, { "epoch": 0.00988084859052601, "grad_norm": 0.8914628624916077, "learning_rate": 9.99913542574833e-06, "loss": 10.4375, "step": 136 }, { "epoch": 0.00988084859052601, "eval_accuracy": 0.06771660231184085, "eval_loss": 10.4296875, "eval_runtime": 265.5018, "eval_samples_per_second": 127.182, "eval_steps_per_second": 2.652, "step": 136 }, { "epoch": 0.00995350188898576, "grad_norm": 0.9166758060455322, "learning_rate": 9.999128160418484e-06, "loss": 10.4375, "step": 137 }, { "epoch": 0.00995350188898576, "eval_accuracy": 0.0677075992076226, "eval_loss": 10.4296875, "eval_runtime": 266.035, "eval_samples_per_second": 126.927, "eval_steps_per_second": 2.646, "step": 137 }, { "epoch": 0.01002615518744551, "grad_norm": 0.9371738433837891, "learning_rate": 9.999120895088638e-06, "loss": 10.4219, "step": 138 }, { "epoch": 0.01002615518744551, "eval_accuracy": 0.0677291950781911, "eval_loss": 10.4296875, "eval_runtime": 266.3525, "eval_samples_per_second": 126.776, "eval_steps_per_second": 2.643, "step": 138 }, { "epoch": 0.01009880848590526, "grad_norm": 0.8814043998718262, "learning_rate": 9.999113629758792e-06, "loss": 10.4375, "step": 139 }, { "epoch": 0.01009880848590526, "eval_accuracy": 0.06779661703871923, "eval_loss": 10.421875, "eval_runtime": 266.1042, "eval_samples_per_second": 126.894, "eval_steps_per_second": 2.646, "step": 139 }, { "epoch": 0.01017146178436501, "grad_norm": 0.9055945873260498, "learning_rate": 9.999106364428946e-06, "loss": 10.4297, "step": 140 }, { "epoch": 0.01017146178436501, "eval_accuracy": 0.06796996298103028, "eval_loss": 10.421875, "eval_runtime": 267.0593, "eval_samples_per_second": 126.44, "eval_steps_per_second": 2.636, "step": 140 }, { "epoch": 0.01024411508282476, "grad_norm": 0.8938325643539429, "learning_rate": 9.9990990990991e-06, "loss": 10.4297, "step": 141 }, { "epoch": 0.01024411508282476, "eval_accuracy": 0.06815485952971778, "eval_loss": 10.421875, "eval_runtime": 267.7346, "eval_samples_per_second": 126.121, "eval_steps_per_second": 2.629, "step": 141 }, { "epoch": 0.01031676838128451, "grad_norm": 0.9098795056343079, "learning_rate": 9.999091833769254e-06, "loss": 10.4219, "step": 142 }, { "epoch": 0.01031676838128451, "eval_accuracy": 0.06841157826993396, "eval_loss": 10.421875, "eval_runtime": 268.1424, "eval_samples_per_second": 125.929, "eval_steps_per_second": 2.625, "step": 142 }, { "epoch": 0.01038942167974426, "grad_norm": 0.907673716545105, "learning_rate": 9.999084568439408e-06, "loss": 10.4219, "step": 143 }, { "epoch": 0.01038942167974426, "eval_accuracy": 0.06867860281433565, "eval_loss": 10.421875, "eval_runtime": 268.3436, "eval_samples_per_second": 125.835, "eval_steps_per_second": 2.624, "step": 143 }, { "epoch": 0.010462074978204011, "grad_norm": 0.9119425415992737, "learning_rate": 9.999077303109562e-06, "loss": 10.4219, "step": 144 }, { "epoch": 0.010462074978204011, "eval_accuracy": 0.06893219507463037, "eval_loss": 10.4140625, "eval_runtime": 267.8804, "eval_samples_per_second": 126.053, "eval_steps_per_second": 2.628, "step": 144 }, { "epoch": 0.01053472827666376, "grad_norm": 0.8991184830665588, "learning_rate": 9.999070037779716e-06, "loss": 10.4219, "step": 145 }, { "epoch": 0.01053472827666376, "eval_accuracy": 0.0691721234597129, "eval_loss": 10.4140625, "eval_runtime": 268.0954, "eval_samples_per_second": 125.951, "eval_steps_per_second": 2.626, "step": 145 }, { "epoch": 0.01060738157512351, "grad_norm": 1.0204856395721436, "learning_rate": 9.99906277244987e-06, "loss": 10.4141, "step": 146 }, { "epoch": 0.01060738157512351, "eval_accuracy": 0.06934083757991812, "eval_loss": 10.4140625, "eval_runtime": 267.5128, "eval_samples_per_second": 126.226, "eval_steps_per_second": 2.632, "step": 146 }, { "epoch": 0.010680034873583261, "grad_norm": 0.9581719040870667, "learning_rate": 9.999055507120024e-06, "loss": 10.4062, "step": 147 }, { "epoch": 0.010680034873583261, "eval_accuracy": 0.06947478408544137, "eval_loss": 10.4140625, "eval_runtime": 267.7655, "eval_samples_per_second": 126.107, "eval_steps_per_second": 2.629, "step": 147 }, { "epoch": 0.010752688172043012, "grad_norm": 0.9252108931541443, "learning_rate": 9.999048241790178e-06, "loss": 10.4141, "step": 148 }, { "epoch": 0.010752688172043012, "eval_accuracy": 0.06958163443164546, "eval_loss": 10.40625, "eval_runtime": 266.9258, "eval_samples_per_second": 126.503, "eval_steps_per_second": 2.637, "step": 148 }, { "epoch": 0.01082534147050276, "grad_norm": 0.8792810440063477, "learning_rate": 9.999040976460332e-06, "loss": 10.4141, "step": 149 }, { "epoch": 0.01082534147050276, "eval_accuracy": 0.0696740971804333, "eval_loss": 10.40625, "eval_runtime": 267.4905, "eval_samples_per_second": 126.236, "eval_steps_per_second": 2.632, "step": 149 }, { "epoch": 0.010897994768962511, "grad_norm": 0.8908605575561523, "learning_rate": 9.999033711130486e-06, "loss": 10.4219, "step": 150 }, { "epoch": 0.010897994768962511, "eval_accuracy": 0.06974991431852827, "eval_loss": 10.40625, "eval_runtime": 268.0407, "eval_samples_per_second": 125.977, "eval_steps_per_second": 2.626, "step": 150 }, { "epoch": 0.010970648067422262, "grad_norm": 0.9047368764877319, "learning_rate": 9.99902644580064e-06, "loss": 10.4062, "step": 151 }, { "epoch": 0.010970648067422262, "eval_accuracy": 0.06984810894717207, "eval_loss": 10.40625, "eval_runtime": 268.9337, "eval_samples_per_second": 125.559, "eval_steps_per_second": 2.618, "step": 151 }, { "epoch": 0.01104330136588201, "grad_norm": 0.9523606300354004, "learning_rate": 9.999019180470794e-06, "loss": 10.4141, "step": 152 }, { "epoch": 0.01104330136588201, "eval_accuracy": 0.06996532299533799, "eval_loss": 10.40625, "eval_runtime": 268.7819, "eval_samples_per_second": 125.63, "eval_steps_per_second": 2.619, "step": 152 }, { "epoch": 0.011115954664341761, "grad_norm": 0.9536779522895813, "learning_rate": 9.999011915140948e-06, "loss": 10.4141, "step": 153 }, { "epoch": 0.011115954664341761, "eval_accuracy": 0.0701369030554712, "eval_loss": 10.3984375, "eval_runtime": 267.3646, "eval_samples_per_second": 126.296, "eval_steps_per_second": 2.633, "step": 153 }, { "epoch": 0.011188607962801512, "grad_norm": 0.8978484272956848, "learning_rate": 9.999004649811103e-06, "loss": 10.4219, "step": 154 }, { "epoch": 0.011188607962801512, "eval_accuracy": 0.07024372445278713, "eval_loss": 10.3984375, "eval_runtime": 267.0372, "eval_samples_per_second": 126.451, "eval_steps_per_second": 2.636, "step": 154 }, { "epoch": 0.01126126126126126, "grad_norm": 0.9067463874816895, "learning_rate": 9.998997384481255e-06, "loss": 10.4141, "step": 155 }, { "epoch": 0.01126126126126126, "eval_accuracy": 0.07035488818332729, "eval_loss": 10.3984375, "eval_runtime": 266.8902, "eval_samples_per_second": 126.52, "eval_steps_per_second": 2.638, "step": 155 }, { "epoch": 0.011333914559721011, "grad_norm": 0.8969941735267639, "learning_rate": 9.998990119151411e-06, "loss": 10.4062, "step": 156 }, { "epoch": 0.011333914559721011, "eval_accuracy": 0.07053625296765909, "eval_loss": 10.3984375, "eval_runtime": 266.2466, "eval_samples_per_second": 126.826, "eval_steps_per_second": 2.644, "step": 156 }, { "epoch": 0.011406567858180762, "grad_norm": 0.9160457253456116, "learning_rate": 9.998982853821563e-06, "loss": 10.4062, "step": 157 }, { "epoch": 0.011406567858180762, "eval_accuracy": 0.07071909414528711, "eval_loss": 10.390625, "eval_runtime": 266.1843, "eval_samples_per_second": 126.856, "eval_steps_per_second": 2.645, "step": 157 }, { "epoch": 0.01147922115664051, "grad_norm": 0.9947687387466431, "learning_rate": 9.998975588491719e-06, "loss": 10.3906, "step": 158 }, { "epoch": 0.01147922115664051, "eval_accuracy": 0.07083940572448631, "eval_loss": 10.390625, "eval_runtime": 266.0309, "eval_samples_per_second": 126.929, "eval_steps_per_second": 2.646, "step": 158 }, { "epoch": 0.011551874455100261, "grad_norm": 0.9923911690711975, "learning_rate": 9.998968323161873e-06, "loss": 10.3906, "step": 159 }, { "epoch": 0.011551874455100261, "eval_accuracy": 0.07099616395388084, "eval_loss": 10.390625, "eval_runtime": 265.8514, "eval_samples_per_second": 127.015, "eval_steps_per_second": 2.648, "step": 159 }, { "epoch": 0.011624527753560012, "grad_norm": 0.8818120360374451, "learning_rate": 9.998961057832027e-06, "loss": 10.3984, "step": 160 }, { "epoch": 0.011624527753560012, "eval_accuracy": 0.07109409804253118, "eval_loss": 10.390625, "eval_runtime": 265.1784, "eval_samples_per_second": 127.337, "eval_steps_per_second": 2.655, "step": 160 }, { "epoch": 0.011697181052019761, "grad_norm": 0.8733471035957336, "learning_rate": 9.99895379250218e-06, "loss": 10.3984, "step": 161 }, { "epoch": 0.011697181052019761, "eval_accuracy": 0.07113795560809585, "eval_loss": 10.390625, "eval_runtime": 266.4251, "eval_samples_per_second": 126.741, "eval_steps_per_second": 2.642, "step": 161 }, { "epoch": 0.011769834350479512, "grad_norm": 0.8769287467002869, "learning_rate": 9.998946527172335e-06, "loss": 10.3906, "step": 162 }, { "epoch": 0.011769834350479512, "eval_accuracy": 0.07117098628948813, "eval_loss": 10.3828125, "eval_runtime": 266.1962, "eval_samples_per_second": 126.85, "eval_steps_per_second": 2.645, "step": 162 }, { "epoch": 0.011842487648939262, "grad_norm": 0.8957408666610718, "learning_rate": 9.998939261842489e-06, "loss": 10.3906, "step": 163 }, { "epoch": 0.011842487648939262, "eval_accuracy": 0.07124170842326667, "eval_loss": 10.3828125, "eval_runtime": 266.0683, "eval_samples_per_second": 126.911, "eval_steps_per_second": 2.646, "step": 163 }, { "epoch": 0.011915140947399011, "grad_norm": 0.9207865595817566, "learning_rate": 9.998931996512643e-06, "loss": 10.3906, "step": 164 }, { "epoch": 0.011915140947399011, "eval_accuracy": 0.07135113522051714, "eval_loss": 10.3828125, "eval_runtime": 266.3681, "eval_samples_per_second": 126.768, "eval_steps_per_second": 2.643, "step": 164 }, { "epoch": 0.011987794245858762, "grad_norm": 1.0381028652191162, "learning_rate": 9.998924731182797e-06, "loss": 10.3828, "step": 165 }, { "epoch": 0.011987794245858762, "eval_accuracy": 0.0714838079749613, "eval_loss": 10.3828125, "eval_runtime": 266.8907, "eval_samples_per_second": 126.52, "eval_steps_per_second": 2.638, "step": 165 }, { "epoch": 0.012060447544318512, "grad_norm": 1.0229851007461548, "learning_rate": 9.99891746585295e-06, "loss": 10.375, "step": 166 }, { "epoch": 0.012060447544318512, "eval_accuracy": 0.07162267579147201, "eval_loss": 10.375, "eval_runtime": 268.0765, "eval_samples_per_second": 125.96, "eval_steps_per_second": 2.626, "step": 166 }, { "epoch": 0.012133100842778263, "grad_norm": 0.8751774430274963, "learning_rate": 9.998910200523105e-06, "loss": 10.3828, "step": 167 }, { "epoch": 0.012133100842778263, "eval_accuracy": 0.07171681757577321, "eval_loss": 10.375, "eval_runtime": 266.5398, "eval_samples_per_second": 126.687, "eval_steps_per_second": 2.641, "step": 167 }, { "epoch": 0.012205754141238012, "grad_norm": 0.9067946672439575, "learning_rate": 9.998902935193258e-06, "loss": 10.3828, "step": 168 }, { "epoch": 0.012205754141238012, "eval_accuracy": 0.07183573960834065, "eval_loss": 10.375, "eval_runtime": 269.0409, "eval_samples_per_second": 125.509, "eval_steps_per_second": 2.617, "step": 168 }, { "epoch": 0.012278407439697762, "grad_norm": 0.8819664120674133, "learning_rate": 9.998895669863412e-06, "loss": 10.3828, "step": 169 }, { "epoch": 0.012278407439697762, "eval_accuracy": 0.07194832183440072, "eval_loss": 10.375, "eval_runtime": 269.4359, "eval_samples_per_second": 125.325, "eval_steps_per_second": 2.613, "step": 169 }, { "epoch": 0.012351060738157513, "grad_norm": 0.8808407783508301, "learning_rate": 9.998888404533566e-06, "loss": 10.3828, "step": 170 }, { "epoch": 0.012351060738157513, "eval_accuracy": 0.0720716440979687, "eval_loss": 10.375, "eval_runtime": 269.1418, "eval_samples_per_second": 125.462, "eval_steps_per_second": 2.616, "step": 170 }, { "epoch": 0.012423714036617262, "grad_norm": 0.9482495188713074, "learning_rate": 9.99888113920372e-06, "loss": 10.3672, "step": 171 }, { "epoch": 0.012423714036617262, "eval_accuracy": 0.07210840918593382, "eval_loss": 10.3671875, "eval_runtime": 268.1791, "eval_samples_per_second": 125.912, "eval_steps_per_second": 2.625, "step": 171 }, { "epoch": 0.012496367335077013, "grad_norm": 0.8792570233345032, "learning_rate": 9.998873873873874e-06, "loss": 10.375, "step": 172 }, { "epoch": 0.012496367335077013, "eval_accuracy": 0.07209656909067576, "eval_loss": 10.3671875, "eval_runtime": 267.4096, "eval_samples_per_second": 126.274, "eval_steps_per_second": 2.633, "step": 172 }, { "epoch": 0.012569020633536763, "grad_norm": 1.0035219192504883, "learning_rate": 9.998866608544028e-06, "loss": 10.3594, "step": 173 }, { "epoch": 0.012569020633536763, "eval_accuracy": 0.07205164041624912, "eval_loss": 10.3671875, "eval_runtime": 267.2327, "eval_samples_per_second": 126.358, "eval_steps_per_second": 2.634, "step": 173 }, { "epoch": 0.012641673931996512, "grad_norm": 0.9102580547332764, "learning_rate": 9.998859343214182e-06, "loss": 10.375, "step": 174 }, { "epoch": 0.012641673931996512, "eval_accuracy": 0.07202917607903579, "eval_loss": 10.3671875, "eval_runtime": 267.9463, "eval_samples_per_second": 126.022, "eval_steps_per_second": 2.627, "step": 174 }, { "epoch": 0.012714327230456263, "grad_norm": 0.9545760154724121, "learning_rate": 9.998852077884336e-06, "loss": 10.3594, "step": 175 }, { "epoch": 0.012714327230456263, "eval_accuracy": 0.0720582697116381, "eval_loss": 10.359375, "eval_runtime": 266.8569, "eval_samples_per_second": 126.536, "eval_steps_per_second": 2.638, "step": 175 }, { "epoch": 0.012786980528916013, "grad_norm": 0.9329653978347778, "learning_rate": 9.998844812554492e-06, "loss": 10.3672, "step": 176 }, { "epoch": 0.012786980528916013, "eval_accuracy": 0.07217727859087003, "eval_loss": 10.359375, "eval_runtime": 265.4766, "eval_samples_per_second": 127.194, "eval_steps_per_second": 2.652, "step": 176 }, { "epoch": 0.012859633827375762, "grad_norm": 0.903916597366333, "learning_rate": 9.998837547224644e-06, "loss": 10.375, "step": 177 }, { "epoch": 0.012859633827375762, "eval_accuracy": 0.07231886760286792, "eval_loss": 10.359375, "eval_runtime": 266.8553, "eval_samples_per_second": 126.537, "eval_steps_per_second": 2.638, "step": 177 }, { "epoch": 0.012932287125835513, "grad_norm": 0.9190238118171692, "learning_rate": 9.9988302818948e-06, "loss": 10.3672, "step": 178 }, { "epoch": 0.012932287125835513, "eval_accuracy": 0.07257023079877423, "eval_loss": 10.359375, "eval_runtime": 266.7383, "eval_samples_per_second": 126.592, "eval_steps_per_second": 2.639, "step": 178 }, { "epoch": 0.013004940424295263, "grad_norm": 0.8731828927993774, "learning_rate": 9.998823016564952e-06, "loss": 10.3672, "step": 179 }, { "epoch": 0.013004940424295263, "eval_accuracy": 0.0727295075814388, "eval_loss": 10.359375, "eval_runtime": 266.7924, "eval_samples_per_second": 126.567, "eval_steps_per_second": 2.639, "step": 179 }, { "epoch": 0.013077593722755012, "grad_norm": 0.8964665532112122, "learning_rate": 9.998815751235107e-06, "loss": 10.3594, "step": 180 }, { "epoch": 0.013077593722755012, "eval_accuracy": 0.0728463452940586, "eval_loss": 10.3515625, "eval_runtime": 264.1125, "eval_samples_per_second": 127.851, "eval_steps_per_second": 2.666, "step": 180 }, { "epoch": 0.013150247021214763, "grad_norm": 0.8609874844551086, "learning_rate": 9.998808485905261e-06, "loss": 10.3672, "step": 181 }, { "epoch": 0.013150247021214763, "eval_accuracy": 0.07293759218954365, "eval_loss": 10.3515625, "eval_runtime": 264.4015, "eval_samples_per_second": 127.711, "eval_steps_per_second": 2.663, "step": 181 }, { "epoch": 0.013222900319674514, "grad_norm": 0.9078623056411743, "learning_rate": 9.998801220575415e-06, "loss": 10.3594, "step": 182 }, { "epoch": 0.013222900319674514, "eval_accuracy": 0.0730243230584755, "eval_loss": 10.3515625, "eval_runtime": 265.1597, "eval_samples_per_second": 127.346, "eval_steps_per_second": 2.655, "step": 182 }, { "epoch": 0.013295553618134264, "grad_norm": 0.9211888909339905, "learning_rate": 9.99879395524557e-06, "loss": 10.3516, "step": 183 }, { "epoch": 0.013295553618134264, "eval_accuracy": 0.07311939120719788, "eval_loss": 10.3515625, "eval_runtime": 265.0761, "eval_samples_per_second": 127.386, "eval_steps_per_second": 2.656, "step": 183 }, { "epoch": 0.013368206916594013, "grad_norm": 0.9223811030387878, "learning_rate": 9.998786689915723e-06, "loss": 10.3594, "step": 184 }, { "epoch": 0.013368206916594013, "eval_accuracy": 0.07324948751059565, "eval_loss": 10.3515625, "eval_runtime": 267.2168, "eval_samples_per_second": 126.366, "eval_steps_per_second": 2.635, "step": 184 }, { "epoch": 0.013440860215053764, "grad_norm": 0.8940264582633972, "learning_rate": 9.998779424585877e-06, "loss": 10.3516, "step": 185 }, { "epoch": 0.013440860215053764, "eval_accuracy": 0.07333268661517181, "eval_loss": 10.34375, "eval_runtime": 266.2508, "eval_samples_per_second": 126.824, "eval_steps_per_second": 2.644, "step": 185 }, { "epoch": 0.013513513513513514, "grad_norm": 0.8663957118988037, "learning_rate": 9.998772159256031e-06, "loss": 10.3516, "step": 186 }, { "epoch": 0.013513513513513514, "eval_accuracy": 0.07334869535032512, "eval_loss": 10.34375, "eval_runtime": 268.3928, "eval_samples_per_second": 125.812, "eval_steps_per_second": 2.623, "step": 186 }, { "epoch": 0.013586166811973263, "grad_norm": 0.899961531162262, "learning_rate": 9.998764893926185e-06, "loss": 10.3438, "step": 187 }, { "epoch": 0.013586166811973263, "eval_accuracy": 0.07337804952292087, "eval_loss": 10.34375, "eval_runtime": 268.2225, "eval_samples_per_second": 125.892, "eval_steps_per_second": 2.625, "step": 187 }, { "epoch": 0.013658820110433014, "grad_norm": 0.8721891045570374, "learning_rate": 9.998757628596339e-06, "loss": 10.3516, "step": 188 }, { "epoch": 0.013658820110433014, "eval_accuracy": 0.07342506251729515, "eval_loss": 10.34375, "eval_runtime": 268.3043, "eval_samples_per_second": 125.853, "eval_steps_per_second": 2.624, "step": 188 }, { "epoch": 0.013731473408892764, "grad_norm": 0.9019783735275269, "learning_rate": 9.998750363266493e-06, "loss": 10.3516, "step": 189 }, { "epoch": 0.013731473408892764, "eval_accuracy": 0.07348243921363125, "eval_loss": 10.3359375, "eval_runtime": 267.616, "eval_samples_per_second": 126.177, "eval_steps_per_second": 2.631, "step": 189 }, { "epoch": 0.013804126707352513, "grad_norm": 0.9109626412391663, "learning_rate": 9.998743097936647e-06, "loss": 10.3438, "step": 190 }, { "epoch": 0.013804126707352513, "eval_accuracy": 0.07349436615555378, "eval_loss": 10.3359375, "eval_runtime": 268.802, "eval_samples_per_second": 125.62, "eval_steps_per_second": 2.619, "step": 190 }, { "epoch": 0.013876780005812264, "grad_norm": 0.8707013130187988, "learning_rate": 9.998735832606801e-06, "loss": 10.3516, "step": 191 }, { "epoch": 0.013876780005812264, "eval_accuracy": 0.07354867426974475, "eval_loss": 10.3359375, "eval_runtime": 269.0333, "eval_samples_per_second": 125.512, "eval_steps_per_second": 2.617, "step": 191 }, { "epoch": 0.013949433304272014, "grad_norm": 0.9611899256706238, "learning_rate": 9.998728567276955e-06, "loss": 10.3359, "step": 192 }, { "epoch": 0.013949433304272014, "eval_accuracy": 0.07366001169361389, "eval_loss": 10.3359375, "eval_runtime": 266.4167, "eval_samples_per_second": 126.745, "eval_steps_per_second": 2.642, "step": 192 }, { "epoch": 0.014022086602731763, "grad_norm": 0.8997408151626587, "learning_rate": 9.99872130194711e-06, "loss": 10.3359, "step": 193 }, { "epoch": 0.014022086602731763, "eval_accuracy": 0.07367448613769463, "eval_loss": 10.3359375, "eval_runtime": 267.3071, "eval_samples_per_second": 126.323, "eval_steps_per_second": 2.634, "step": 193 }, { "epoch": 0.014094739901191514, "grad_norm": 0.8796170949935913, "learning_rate": 9.998714036617263e-06, "loss": 10.3359, "step": 194 }, { "epoch": 0.014094739901191514, "eval_accuracy": 0.07364093437631546, "eval_loss": 10.328125, "eval_runtime": 267.1483, "eval_samples_per_second": 126.398, "eval_steps_per_second": 2.635, "step": 194 }, { "epoch": 0.014167393199651265, "grad_norm": 0.9038819670677185, "learning_rate": 9.998706771287417e-06, "loss": 10.3359, "step": 195 }, { "epoch": 0.014167393199651265, "eval_accuracy": 0.0735885658376313, "eval_loss": 10.328125, "eval_runtime": 267.3865, "eval_samples_per_second": 126.285, "eval_steps_per_second": 2.633, "step": 195 }, { "epoch": 0.014240046498111014, "grad_norm": 0.9091231822967529, "learning_rate": 9.99869950595757e-06, "loss": 10.3359, "step": 196 }, { "epoch": 0.014240046498111014, "eval_accuracy": 0.0736078168482587, "eval_loss": 10.328125, "eval_runtime": 267.9665, "eval_samples_per_second": 126.012, "eval_steps_per_second": 2.627, "step": 196 }, { "epoch": 0.014312699796570764, "grad_norm": 0.9065552949905396, "learning_rate": 9.998692240627725e-06, "loss": 10.3281, "step": 197 }, { "epoch": 0.014312699796570764, "eval_accuracy": 0.07369920848818455, "eval_loss": 10.328125, "eval_runtime": 267.2862, "eval_samples_per_second": 126.333, "eval_steps_per_second": 2.634, "step": 197 }, { "epoch": 0.014385353095030515, "grad_norm": 0.8987371921539307, "learning_rate": 9.99868497529788e-06, "loss": 10.3359, "step": 198 }, { "epoch": 0.014385353095030515, "eval_accuracy": 0.07384696361336085, "eval_loss": 10.328125, "eval_runtime": 267.5117, "eval_samples_per_second": 126.226, "eval_steps_per_second": 2.632, "step": 198 }, { "epoch": 0.014458006393490264, "grad_norm": 0.891822338104248, "learning_rate": 9.998677709968032e-06, "loss": 10.3203, "step": 199 }, { "epoch": 0.014458006393490264, "eval_accuracy": 0.07396643567480336, "eval_loss": 10.3203125, "eval_runtime": 265.9448, "eval_samples_per_second": 126.97, "eval_steps_per_second": 2.647, "step": 199 }, { "epoch": 0.014530659691950014, "grad_norm": 0.8724116683006287, "learning_rate": 9.998670444638188e-06, "loss": 10.3359, "step": 200 }, { "epoch": 0.014530659691950014, "eval_accuracy": 0.07408226017633752, "eval_loss": 10.3203125, "eval_runtime": 266.1838, "eval_samples_per_second": 126.856, "eval_steps_per_second": 2.645, "step": 200 }, { "epoch": 0.014603312990409765, "grad_norm": 0.8940464854240417, "learning_rate": 9.99866317930834e-06, "loss": 10.3359, "step": 201 }, { "epoch": 0.014603312990409765, "eval_accuracy": 0.07420095061779967, "eval_loss": 10.3203125, "eval_runtime": 264.9987, "eval_samples_per_second": 127.423, "eval_steps_per_second": 2.657, "step": 201 }, { "epoch": 0.014675966288869515, "grad_norm": 0.9207845330238342, "learning_rate": 9.998655913978496e-06, "loss": 10.3281, "step": 202 }, { "epoch": 0.014675966288869515, "eval_accuracy": 0.0742804732135793, "eval_loss": 10.3203125, "eval_runtime": 264.7467, "eval_samples_per_second": 127.545, "eval_steps_per_second": 2.659, "step": 202 }, { "epoch": 0.014748619587329264, "grad_norm": 0.8840688467025757, "learning_rate": 9.998648648648648e-06, "loss": 10.3203, "step": 203 }, { "epoch": 0.014748619587329264, "eval_accuracy": 0.07432360705693994, "eval_loss": 10.3125, "eval_runtime": 264.0804, "eval_samples_per_second": 127.866, "eval_steps_per_second": 2.666, "step": 203 }, { "epoch": 0.014821272885789015, "grad_norm": 0.8885826468467712, "learning_rate": 9.998641383318804e-06, "loss": 10.3203, "step": 204 }, { "epoch": 0.014821272885789015, "eval_accuracy": 0.07433359442335566, "eval_loss": 10.3125, "eval_runtime": 266.5522, "eval_samples_per_second": 126.681, "eval_steps_per_second": 2.641, "step": 204 }, { "epoch": 0.014893926184248766, "grad_norm": 0.897081732749939, "learning_rate": 9.998634117988958e-06, "loss": 10.3281, "step": 205 }, { "epoch": 0.014893926184248766, "eval_accuracy": 0.07427410445818378, "eval_loss": 10.3125, "eval_runtime": 267.4051, "eval_samples_per_second": 126.277, "eval_steps_per_second": 2.633, "step": 205 }, { "epoch": 0.014966579482708514, "grad_norm": 0.9606081247329712, "learning_rate": 9.998626852659112e-06, "loss": 10.3125, "step": 206 }, { "epoch": 0.014966579482708514, "eval_accuracy": 0.07412779677741556, "eval_loss": 10.3125, "eval_runtime": 265.1816, "eval_samples_per_second": 127.335, "eval_steps_per_second": 2.655, "step": 206 }, { "epoch": 0.015039232781168265, "grad_norm": 0.9314731955528259, "learning_rate": 9.998619587329266e-06, "loss": 10.3125, "step": 207 }, { "epoch": 0.015039232781168265, "eval_accuracy": 0.07401284074252625, "eval_loss": 10.3125, "eval_runtime": 266.8876, "eval_samples_per_second": 126.521, "eval_steps_per_second": 2.638, "step": 207 }, { "epoch": 0.015111886079628016, "grad_norm": 0.9583424925804138, "learning_rate": 9.99861232199942e-06, "loss": 10.3047, "step": 208 }, { "epoch": 0.015111886079628016, "eval_accuracy": 0.07404071852182577, "eval_loss": 10.3046875, "eval_runtime": 266.703, "eval_samples_per_second": 126.609, "eval_steps_per_second": 2.64, "step": 208 }, { "epoch": 0.015184539378087765, "grad_norm": 0.9071934223175049, "learning_rate": 9.998605056669574e-06, "loss": 10.3125, "step": 209 }, { "epoch": 0.015184539378087765, "eval_accuracy": 0.07412215174422407, "eval_loss": 10.3046875, "eval_runtime": 266.5993, "eval_samples_per_second": 126.658, "eval_steps_per_second": 2.641, "step": 209 }, { "epoch": 0.015257192676547515, "grad_norm": 0.8879753351211548, "learning_rate": 9.998597791339728e-06, "loss": 10.3125, "step": 210 }, { "epoch": 0.015257192676547515, "eval_accuracy": 0.07420225331776693, "eval_loss": 10.3046875, "eval_runtime": 267.0745, "eval_samples_per_second": 126.433, "eval_steps_per_second": 2.636, "step": 210 }, { "epoch": 0.015329845975007266, "grad_norm": 0.9064663052558899, "learning_rate": 9.998590526009882e-06, "loss": 10.3203, "step": 211 }, { "epoch": 0.015329845975007266, "eval_accuracy": 0.07434856099853515, "eval_loss": 10.3046875, "eval_runtime": 267.2364, "eval_samples_per_second": 126.356, "eval_steps_per_second": 2.634, "step": 211 }, { "epoch": 0.015402499273467015, "grad_norm": 0.888227105140686, "learning_rate": 9.998583260680035e-06, "loss": 10.3047, "step": 212 }, { "epoch": 0.015402499273467015, "eval_accuracy": 0.07441022213031916, "eval_loss": 10.3046875, "eval_runtime": 267.6856, "eval_samples_per_second": 126.144, "eval_steps_per_second": 2.63, "step": 212 }, { "epoch": 0.015475152571926765, "grad_norm": 0.873029887676239, "learning_rate": 9.99857599535019e-06, "loss": 10.3203, "step": 213 }, { "epoch": 0.015475152571926765, "eval_accuracy": 0.07452069108754343, "eval_loss": 10.296875, "eval_runtime": 267.3911, "eval_samples_per_second": 126.283, "eval_steps_per_second": 2.633, "step": 213 }, { "epoch": 0.015547805870386516, "grad_norm": 0.9147621989250183, "learning_rate": 9.998568730020343e-06, "loss": 10.3125, "step": 214 }, { "epoch": 0.015547805870386516, "eval_accuracy": 0.07470987207167884, "eval_loss": 10.296875, "eval_runtime": 267.6606, "eval_samples_per_second": 126.156, "eval_steps_per_second": 2.63, "step": 214 }, { "epoch": 0.015620459168846265, "grad_norm": 0.9260271787643433, "learning_rate": 9.998561464690499e-06, "loss": 10.3047, "step": 215 }, { "epoch": 0.015620459168846265, "eval_accuracy": 0.07488478125395062, "eval_loss": 10.296875, "eval_runtime": 266.1126, "eval_samples_per_second": 126.89, "eval_steps_per_second": 2.645, "step": 215 }, { "epoch": 0.015693112467306015, "grad_norm": 0.9096031785011292, "learning_rate": 9.998554199360651e-06, "loss": 10.2969, "step": 216 }, { "epoch": 0.015693112467306015, "eval_accuracy": 0.07498534969142368, "eval_loss": 10.296875, "eval_runtime": 266.7055, "eval_samples_per_second": 126.608, "eval_steps_per_second": 2.64, "step": 216 }, { "epoch": 0.015765765765765764, "grad_norm": 0.9063442945480347, "learning_rate": 9.998546934030807e-06, "loss": 10.3047, "step": 217 }, { "epoch": 0.015765765765765764, "eval_accuracy": 0.07498459702033147, "eval_loss": 10.296875, "eval_runtime": 267.4598, "eval_samples_per_second": 126.251, "eval_steps_per_second": 2.632, "step": 217 }, { "epoch": 0.015838419064225517, "grad_norm": 0.9258260130882263, "learning_rate": 9.998539668700959e-06, "loss": 10.2969, "step": 218 }, { "epoch": 0.015838419064225517, "eval_accuracy": 0.0748771966452523, "eval_loss": 10.2890625, "eval_runtime": 267.3776, "eval_samples_per_second": 126.29, "eval_steps_per_second": 2.633, "step": 218 }, { "epoch": 0.015911072362685266, "grad_norm": 0.9510604739189148, "learning_rate": 9.998532403371115e-06, "loss": 10.2891, "step": 219 }, { "epoch": 0.015911072362685266, "eval_accuracy": 0.07466393018616653, "eval_loss": 10.2890625, "eval_runtime": 266.9849, "eval_samples_per_second": 126.475, "eval_steps_per_second": 2.637, "step": 219 }, { "epoch": 0.015983725661145014, "grad_norm": 0.888974130153656, "learning_rate": 9.998525138041267e-06, "loss": 10.2969, "step": 220 }, { "epoch": 0.015983725661145014, "eval_accuracy": 0.07440356388604201, "eval_loss": 10.2890625, "eval_runtime": 265.0185, "eval_samples_per_second": 127.414, "eval_steps_per_second": 2.656, "step": 220 }, { "epoch": 0.016056378959604767, "grad_norm": 0.9004181623458862, "learning_rate": 9.998517872711423e-06, "loss": 10.2969, "step": 221 }, { "epoch": 0.016056378959604767, "eval_accuracy": 0.07420781150429394, "eval_loss": 10.2890625, "eval_runtime": 265.4243, "eval_samples_per_second": 127.219, "eval_steps_per_second": 2.652, "step": 221 }, { "epoch": 0.016129032258064516, "grad_norm": 0.8965704441070557, "learning_rate": 9.998510607381577e-06, "loss": 10.2891, "step": 222 }, { "epoch": 0.016129032258064516, "eval_accuracy": 0.07409245018497036, "eval_loss": 10.2890625, "eval_runtime": 265.3546, "eval_samples_per_second": 127.252, "eval_steps_per_second": 2.653, "step": 222 }, { "epoch": 0.016201685556524265, "grad_norm": 0.9500789046287537, "learning_rate": 9.998503342051729e-06, "loss": 10.2891, "step": 223 }, { "epoch": 0.016201685556524265, "eval_accuracy": 0.07416858576083511, "eval_loss": 10.28125, "eval_runtime": 265.6881, "eval_samples_per_second": 127.093, "eval_steps_per_second": 2.65, "step": 223 }, { "epoch": 0.016274338854984017, "grad_norm": 0.9081275463104248, "learning_rate": 9.998496076721884e-06, "loss": 10.2891, "step": 224 }, { "epoch": 0.016274338854984017, "eval_accuracy": 0.07432068321923563, "eval_loss": 10.28125, "eval_runtime": 265.8739, "eval_samples_per_second": 127.004, "eval_steps_per_second": 2.648, "step": 224 }, { "epoch": 0.016346992153443766, "grad_norm": 0.9124018549919128, "learning_rate": 9.998488811392037e-06, "loss": 10.2891, "step": 225 }, { "epoch": 0.016346992153443766, "eval_accuracy": 0.07457499920173441, "eval_loss": 10.28125, "eval_runtime": 265.6262, "eval_samples_per_second": 127.122, "eval_steps_per_second": 2.65, "step": 225 }, { "epoch": 0.016419645451903518, "grad_norm": 0.8811033368110657, "learning_rate": 9.998481546062192e-06, "loss": 10.2969, "step": 226 }, { "epoch": 0.016419645451903518, "eval_accuracy": 0.07480795090477, "eval_loss": 10.28125, "eval_runtime": 265.1697, "eval_samples_per_second": 127.341, "eval_steps_per_second": 2.655, "step": 226 }, { "epoch": 0.016492298750363267, "grad_norm": 0.8931852579116821, "learning_rate": 9.998474280732346e-06, "loss": 10.2812, "step": 227 }, { "epoch": 0.016492298750363267, "eval_accuracy": 0.07488660503390479, "eval_loss": 10.2734375, "eval_runtime": 265.5292, "eval_samples_per_second": 127.169, "eval_steps_per_second": 2.651, "step": 227 }, { "epoch": 0.016564952048823016, "grad_norm": 0.9159207344055176, "learning_rate": 9.9984670154025e-06, "loss": 10.2891, "step": 228 }, { "epoch": 0.016564952048823016, "eval_accuracy": 0.07503638658125239, "eval_loss": 10.2734375, "eval_runtime": 264.9834, "eval_samples_per_second": 127.431, "eval_steps_per_second": 2.657, "step": 228 }, { "epoch": 0.01663760534728277, "grad_norm": 0.9291688203811646, "learning_rate": 9.998459750072654e-06, "loss": 10.2734, "step": 229 }, { "epoch": 0.01663760534728277, "eval_accuracy": 0.0751111905082617, "eval_loss": 10.2734375, "eval_runtime": 265.3402, "eval_samples_per_second": 127.259, "eval_steps_per_second": 2.653, "step": 229 }, { "epoch": 0.016710258645742517, "grad_norm": 0.8605514168739319, "learning_rate": 9.998452484742808e-06, "loss": 10.2969, "step": 230 }, { "epoch": 0.016710258645742517, "eval_accuracy": 0.075049760967583, "eval_loss": 10.2734375, "eval_runtime": 266.7587, "eval_samples_per_second": 126.583, "eval_steps_per_second": 2.639, "step": 230 }, { "epoch": 0.016782911944202266, "grad_norm": 0.9553351998329163, "learning_rate": 9.998445219412962e-06, "loss": 10.2656, "step": 231 }, { "epoch": 0.016782911944202266, "eval_accuracy": 0.07486694873884313, "eval_loss": 10.2734375, "eval_runtime": 266.6123, "eval_samples_per_second": 126.652, "eval_steps_per_second": 2.641, "step": 231 }, { "epoch": 0.01685556524266202, "grad_norm": 0.9175562262535095, "learning_rate": 9.998437954083116e-06, "loss": 10.2734, "step": 232 }, { "epoch": 0.01685556524266202, "eval_accuracy": 0.07473213376667504, "eval_loss": 10.265625, "eval_runtime": 267.4978, "eval_samples_per_second": 126.233, "eval_steps_per_second": 2.632, "step": 232 }, { "epoch": 0.016928218541121767, "grad_norm": 0.9021575450897217, "learning_rate": 9.99843068875327e-06, "loss": 10.2734, "step": 233 }, { "epoch": 0.016928218541121767, "eval_accuracy": 0.0746572719418894, "eval_loss": 10.265625, "eval_runtime": 267.5513, "eval_samples_per_second": 126.208, "eval_steps_per_second": 2.631, "step": 233 }, { "epoch": 0.017000871839581516, "grad_norm": 0.8851971626281738, "learning_rate": 9.998423423423424e-06, "loss": 10.2734, "step": 234 }, { "epoch": 0.017000871839581516, "eval_accuracy": 0.07463700772017634, "eval_loss": 10.265625, "eval_runtime": 267.13, "eval_samples_per_second": 126.407, "eval_steps_per_second": 2.635, "step": 234 }, { "epoch": 0.01707352513804127, "grad_norm": 0.9394397139549255, "learning_rate": 9.998416158093578e-06, "loss": 10.2656, "step": 235 }, { "epoch": 0.01707352513804127, "eval_accuracy": 0.07467038578822655, "eval_loss": 10.265625, "eval_runtime": 265.7106, "eval_samples_per_second": 127.082, "eval_steps_per_second": 2.649, "step": 235 }, { "epoch": 0.017146178436501017, "grad_norm": 0.9121464490890503, "learning_rate": 9.998408892763732e-06, "loss": 10.2656, "step": 236 }, { "epoch": 0.017146178436501017, "eval_accuracy": 0.07478282326984581, "eval_loss": 10.265625, "eval_runtime": 267.6727, "eval_samples_per_second": 126.15, "eval_steps_per_second": 2.63, "step": 236 }, { "epoch": 0.017218831734960766, "grad_norm": 0.8910766839981079, "learning_rate": 9.998401627433887e-06, "loss": 10.2734, "step": 237 }, { "epoch": 0.017218831734960766, "eval_accuracy": 0.07491986730640236, "eval_loss": 10.2578125, "eval_runtime": 268.4345, "eval_samples_per_second": 125.792, "eval_steps_per_second": 2.623, "step": 237 }, { "epoch": 0.01729148503342052, "grad_norm": 0.9403276443481445, "learning_rate": 9.99839436210404e-06, "loss": 10.2656, "step": 238 }, { "epoch": 0.01729148503342052, "eval_accuracy": 0.07518078363540195, "eval_loss": 10.2578125, "eval_runtime": 267.0058, "eval_samples_per_second": 126.465, "eval_steps_per_second": 2.637, "step": 238 }, { "epoch": 0.017364138331880268, "grad_norm": 0.8892084956169128, "learning_rate": 9.998387096774195e-06, "loss": 10.2734, "step": 239 }, { "epoch": 0.017364138331880268, "eval_accuracy": 0.07545047147751449, "eval_loss": 10.2578125, "eval_runtime": 265.2789, "eval_samples_per_second": 127.289, "eval_steps_per_second": 2.654, "step": 239 }, { "epoch": 0.017436791630340016, "grad_norm": 0.9249821305274963, "learning_rate": 9.998379831444348e-06, "loss": 10.2578, "step": 240 }, { "epoch": 0.017436791630340016, "eval_accuracy": 0.07559730023826962, "eval_loss": 10.2578125, "eval_runtime": 265.5188, "eval_samples_per_second": 127.174, "eval_steps_per_second": 2.651, "step": 240 }, { "epoch": 0.01750944492879977, "grad_norm": 0.8785547614097595, "learning_rate": 9.998372566114503e-06, "loss": 10.2734, "step": 241 }, { "epoch": 0.01750944492879977, "eval_accuracy": 0.07564480536374263, "eval_loss": 10.2578125, "eval_runtime": 265.4778, "eval_samples_per_second": 127.193, "eval_steps_per_second": 2.652, "step": 241 }, { "epoch": 0.017582098227259518, "grad_norm": 0.9142479300498962, "learning_rate": 9.998365300784656e-06, "loss": 10.2656, "step": 242 }, { "epoch": 0.017582098227259518, "eval_accuracy": 0.07562578594422054, "eval_loss": 10.25, "eval_runtime": 265.3917, "eval_samples_per_second": 127.235, "eval_steps_per_second": 2.653, "step": 242 }, { "epoch": 0.017654751525719267, "grad_norm": 0.924387514591217, "learning_rate": 9.998358035454811e-06, "loss": 10.2578, "step": 243 }, { "epoch": 0.017654751525719267, "eval_accuracy": 0.07555361636603392, "eval_loss": 10.25, "eval_runtime": 263.7272, "eval_samples_per_second": 128.038, "eval_steps_per_second": 2.669, "step": 243 }, { "epoch": 0.01772740482417902, "grad_norm": 0.9198188185691833, "learning_rate": 9.998350770124965e-06, "loss": 10.2578, "step": 244 }, { "epoch": 0.01772740482417902, "eval_accuracy": 0.07558369426083371, "eval_loss": 10.25, "eval_runtime": 264.6264, "eval_samples_per_second": 127.603, "eval_steps_per_second": 2.66, "step": 244 }, { "epoch": 0.017800058122638768, "grad_norm": 0.9178450703620911, "learning_rate": 9.998343504795119e-06, "loss": 10.2578, "step": 245 }, { "epoch": 0.017800058122638768, "eval_accuracy": 0.07555222681940216, "eval_loss": 10.25, "eval_runtime": 264.3962, "eval_samples_per_second": 127.714, "eval_steps_per_second": 2.663, "step": 245 }, { "epoch": 0.017872711421098517, "grad_norm": 0.8939234614372253, "learning_rate": 9.998336239465273e-06, "loss": 10.2578, "step": 246 }, { "epoch": 0.017872711421098517, "eval_accuracy": 0.07556374847689043, "eval_loss": 10.25, "eval_runtime": 264.4976, "eval_samples_per_second": 127.665, "eval_steps_per_second": 2.662, "step": 246 }, { "epoch": 0.01794536471955827, "grad_norm": 0.9162428379058838, "learning_rate": 9.998328974135427e-06, "loss": 10.2578, "step": 247 }, { "epoch": 0.01794536471955827, "eval_accuracy": 0.07567158308529202, "eval_loss": 10.2421875, "eval_runtime": 264.7261, "eval_samples_per_second": 127.554, "eval_steps_per_second": 2.659, "step": 247 }, { "epoch": 0.018018018018018018, "grad_norm": 0.8900968432426453, "learning_rate": 9.998321708805581e-06, "loss": 10.2578, "step": 248 }, { "epoch": 0.018018018018018018, "eval_accuracy": 0.07578150201364124, "eval_loss": 10.2421875, "eval_runtime": 264.8491, "eval_samples_per_second": 127.495, "eval_steps_per_second": 2.658, "step": 248 }, { "epoch": 0.018090671316477767, "grad_norm": 0.9296072721481323, "learning_rate": 9.998314443475735e-06, "loss": 10.2422, "step": 249 }, { "epoch": 0.018090671316477767, "eval_accuracy": 0.07588282312220648, "eval_loss": 10.2421875, "eval_runtime": 265.9596, "eval_samples_per_second": 126.963, "eval_steps_per_second": 2.647, "step": 249 }, { "epoch": 0.01816332461493752, "grad_norm": 0.9414094686508179, "learning_rate": 9.998307178145889e-06, "loss": 10.2422, "step": 250 }, { "epoch": 0.01816332461493752, "eval_accuracy": 0.07589431583080661, "eval_loss": 10.2421875, "eval_runtime": 265.5852, "eval_samples_per_second": 127.142, "eval_steps_per_second": 2.651, "step": 250 }, { "epoch": 0.018235977913397268, "grad_norm": 0.9078280329704285, "learning_rate": 9.998299912816043e-06, "loss": 10.2422, "step": 251 }, { "epoch": 0.018235977913397268, "eval_accuracy": 0.07588244678666038, "eval_loss": 10.2421875, "eval_runtime": 266.4837, "eval_samples_per_second": 126.713, "eval_steps_per_second": 2.642, "step": 251 }, { "epoch": 0.018308631211857017, "grad_norm": 0.9042601585388184, "learning_rate": 9.998292647486197e-06, "loss": 10.2422, "step": 252 }, { "epoch": 0.018308631211857017, "eval_accuracy": 0.07590384001501174, "eval_loss": 10.234375, "eval_runtime": 265.9021, "eval_samples_per_second": 126.99, "eval_steps_per_second": 2.648, "step": 252 }, { "epoch": 0.01838128451031677, "grad_norm": 0.9029207825660706, "learning_rate": 9.99828538215635e-06, "loss": 10.2422, "step": 253 }, { "epoch": 0.01838128451031677, "eval_accuracy": 0.0758776991690019, "eval_loss": 10.234375, "eval_runtime": 264.8706, "eval_samples_per_second": 127.485, "eval_steps_per_second": 2.658, "step": 253 }, { "epoch": 0.018453937808776518, "grad_norm": 0.901042640209198, "learning_rate": 9.998278116826505e-06, "loss": 10.2422, "step": 254 }, { "epoch": 0.018453937808776518, "eval_accuracy": 0.075944397407326, "eval_loss": 10.234375, "eval_runtime": 266.8196, "eval_samples_per_second": 126.554, "eval_steps_per_second": 2.638, "step": 254 }, { "epoch": 0.018526591107236267, "grad_norm": 0.921441376209259, "learning_rate": 9.998270851496658e-06, "loss": 10.2422, "step": 255 }, { "epoch": 0.018526591107236267, "eval_accuracy": 0.07608468371935663, "eval_loss": 10.234375, "eval_runtime": 266.5157, "eval_samples_per_second": 126.698, "eval_steps_per_second": 2.641, "step": 255 }, { "epoch": 0.01859924440569602, "grad_norm": 0.9235514998435974, "learning_rate": 9.998263586166812e-06, "loss": 10.2422, "step": 256 }, { "epoch": 0.01859924440569602, "eval_accuracy": 0.07609151565696273, "eval_loss": 10.234375, "eval_runtime": 265.5975, "eval_samples_per_second": 127.136, "eval_steps_per_second": 2.651, "step": 256 }, { "epoch": 0.018671897704155768, "grad_norm": 0.8886791467666626, "learning_rate": 9.998256320836966e-06, "loss": 10.2422, "step": 257 }, { "epoch": 0.018671897704155768, "eval_accuracy": 0.07604354734927914, "eval_loss": 10.2265625, "eval_runtime": 266.183, "eval_samples_per_second": 126.856, "eval_steps_per_second": 2.645, "step": 257 }, { "epoch": 0.018744551002615517, "grad_norm": 0.8807479739189148, "learning_rate": 9.99824905550712e-06, "loss": 10.2422, "step": 258 }, { "epoch": 0.018744551002615517, "eval_accuracy": 0.07599326313054261, "eval_loss": 10.2265625, "eval_runtime": 266.5367, "eval_samples_per_second": 126.688, "eval_steps_per_second": 2.641, "step": 258 }, { "epoch": 0.01881720430107527, "grad_norm": 0.902275025844574, "learning_rate": 9.998241790177274e-06, "loss": 10.2344, "step": 259 }, { "epoch": 0.01881720430107527, "eval_accuracy": 0.07592763600108049, "eval_loss": 10.2265625, "eval_runtime": 265.7243, "eval_samples_per_second": 127.075, "eval_steps_per_second": 2.649, "step": 259 }, { "epoch": 0.01888985759953502, "grad_norm": 0.8911043405532837, "learning_rate": 9.998234524847428e-06, "loss": 10.2344, "step": 260 }, { "epoch": 0.01888985759953502, "eval_accuracy": 0.07592697017665277, "eval_loss": 10.2265625, "eval_runtime": 265.6968, "eval_samples_per_second": 127.088, "eval_steps_per_second": 2.65, "step": 260 }, { "epoch": 0.01896251089799477, "grad_norm": 0.9092383980751038, "learning_rate": 9.998227259517584e-06, "loss": 10.2266, "step": 261 }, { "epoch": 0.01896251089799477, "eval_accuracy": 0.0759951448082731, "eval_loss": 10.2265625, "eval_runtime": 265.7199, "eval_samples_per_second": 127.077, "eval_steps_per_second": 2.649, "step": 261 }, { "epoch": 0.01903516419645452, "grad_norm": 0.928420901298523, "learning_rate": 9.998219994187736e-06, "loss": 10.2188, "step": 262 }, { "epoch": 0.01903516419645452, "eval_accuracy": 0.07604878709803636, "eval_loss": 10.21875, "eval_runtime": 265.3942, "eval_samples_per_second": 127.233, "eval_steps_per_second": 2.653, "step": 262 }, { "epoch": 0.01910781749491427, "grad_norm": 0.9022119641304016, "learning_rate": 9.998212728857892e-06, "loss": 10.2266, "step": 263 }, { "epoch": 0.01910781749491427, "eval_accuracy": 0.07615966133969491, "eval_loss": 10.21875, "eval_runtime": 265.8823, "eval_samples_per_second": 127.0, "eval_steps_per_second": 2.648, "step": 263 }, { "epoch": 0.01918047079337402, "grad_norm": 0.8958231210708618, "learning_rate": 9.998205463528044e-06, "loss": 10.2266, "step": 264 }, { "epoch": 0.01918047079337402, "eval_accuracy": 0.07618090982360545, "eval_loss": 10.21875, "eval_runtime": 264.9067, "eval_samples_per_second": 127.468, "eval_steps_per_second": 2.658, "step": 264 }, { "epoch": 0.01925312409183377, "grad_norm": 0.9452428817749023, "learning_rate": 9.9981981981982e-06, "loss": 10.2188, "step": 265 }, { "epoch": 0.01925312409183377, "eval_accuracy": 0.07618519425905335, "eval_loss": 10.21875, "eval_runtime": 265.6026, "eval_samples_per_second": 127.134, "eval_steps_per_second": 2.651, "step": 265 }, { "epoch": 0.01932577739029352, "grad_norm": 0.8848786354064941, "learning_rate": 9.998190932868354e-06, "loss": 10.2266, "step": 266 }, { "epoch": 0.01932577739029352, "eval_accuracy": 0.07619544216546252, "eval_loss": 10.21875, "eval_runtime": 264.5225, "eval_samples_per_second": 127.653, "eval_steps_per_second": 2.661, "step": 266 }, { "epoch": 0.01939843068875327, "grad_norm": 0.89435875415802, "learning_rate": 9.998183667538508e-06, "loss": 10.2188, "step": 267 }, { "epoch": 0.01939843068875327, "eval_accuracy": 0.07622743068688098, "eval_loss": 10.2109375, "eval_runtime": 263.8523, "eval_samples_per_second": 127.977, "eval_steps_per_second": 2.668, "step": 267 }, { "epoch": 0.01947108398721302, "grad_norm": 0.9147275686264038, "learning_rate": 9.998176402208661e-06, "loss": 10.2109, "step": 268 }, { "epoch": 0.01947108398721302, "eval_accuracy": 0.076288020709803, "eval_loss": 10.2109375, "eval_runtime": 264.7583, "eval_samples_per_second": 127.539, "eval_steps_per_second": 2.659, "step": 268 }, { "epoch": 0.01954373728567277, "grad_norm": 0.9651392102241516, "learning_rate": 9.998169136878815e-06, "loss": 10.2109, "step": 269 }, { "epoch": 0.01954373728567277, "eval_accuracy": 0.07618924710339596, "eval_loss": 10.2109375, "eval_runtime": 263.97, "eval_samples_per_second": 127.92, "eval_steps_per_second": 2.667, "step": 269 }, { "epoch": 0.01961639058413252, "grad_norm": 0.9311045408248901, "learning_rate": 9.99816187154897e-06, "loss": 10.2109, "step": 270 }, { "epoch": 0.01961639058413252, "eval_accuracy": 0.07608592852154757, "eval_loss": 10.2109375, "eval_runtime": 263.7922, "eval_samples_per_second": 128.006, "eval_steps_per_second": 2.669, "step": 270 }, { "epoch": 0.01968904388259227, "grad_norm": 0.9008721113204956, "learning_rate": 9.998154606219123e-06, "loss": 10.2188, "step": 271 }, { "epoch": 0.01968904388259227, "eval_accuracy": 0.07606844339309803, "eval_loss": 10.2109375, "eval_runtime": 263.8126, "eval_samples_per_second": 127.996, "eval_steps_per_second": 2.669, "step": 271 }, { "epoch": 0.01976169718105202, "grad_norm": 0.9026838541030884, "learning_rate": 9.998147340889277e-06, "loss": 10.2109, "step": 272 }, { "epoch": 0.01976169718105202, "eval_accuracy": 0.07600967715013018, "eval_loss": 10.203125, "eval_runtime": 265.1164, "eval_samples_per_second": 127.367, "eval_steps_per_second": 2.655, "step": 272 }, { "epoch": 0.01983435047951177, "grad_norm": 0.9332795143127441, "learning_rate": 9.998140075559431e-06, "loss": 10.2188, "step": 273 }, { "epoch": 0.01983435047951177, "eval_accuracy": 0.07610422421886563, "eval_loss": 10.203125, "eval_runtime": 266.0904, "eval_samples_per_second": 126.9, "eval_steps_per_second": 2.646, "step": 273 }, { "epoch": 0.01990700377797152, "grad_norm": 0.8622159361839294, "learning_rate": 9.998132810229585e-06, "loss": 10.2266, "step": 274 }, { "epoch": 0.01990700377797152, "eval_accuracy": 0.07618247306356617, "eval_loss": 10.203125, "eval_runtime": 265.3346, "eval_samples_per_second": 127.262, "eval_steps_per_second": 2.653, "step": 274 }, { "epoch": 0.01997965707643127, "grad_norm": 0.8802500367164612, "learning_rate": 9.998125544899739e-06, "loss": 10.2188, "step": 275 }, { "epoch": 0.01997965707643127, "eval_accuracy": 0.07621292729391207, "eval_loss": 10.203125, "eval_runtime": 263.5634, "eval_samples_per_second": 128.117, "eval_steps_per_second": 2.671, "step": 275 }, { "epoch": 0.02005231037489102, "grad_norm": 0.8940539956092834, "learning_rate": 9.998118279569893e-06, "loss": 10.2109, "step": 276 }, { "epoch": 0.02005231037489102, "eval_accuracy": 0.07612616747609205, "eval_loss": 10.1953125, "eval_runtime": 264.8033, "eval_samples_per_second": 127.517, "eval_steps_per_second": 2.659, "step": 276 }, { "epoch": 0.02012496367335077, "grad_norm": 0.9146431684494019, "learning_rate": 9.998111014240047e-06, "loss": 10.2109, "step": 277 }, { "epoch": 0.02012496367335077, "eval_accuracy": 0.07617022768387385, "eval_loss": 10.1953125, "eval_runtime": 264.0677, "eval_samples_per_second": 127.873, "eval_steps_per_second": 2.666, "step": 277 }, { "epoch": 0.02019761697181052, "grad_norm": 0.9410712122917175, "learning_rate": 9.998103748910201e-06, "loss": 10.1953, "step": 278 }, { "epoch": 0.02019761697181052, "eval_accuracy": 0.07624436578645546, "eval_loss": 10.1953125, "eval_runtime": 263.8716, "eval_samples_per_second": 127.968, "eval_steps_per_second": 2.668, "step": 278 }, { "epoch": 0.02027027027027027, "grad_norm": 0.8908507227897644, "learning_rate": 9.998096483580355e-06, "loss": 10.2031, "step": 279 }, { "epoch": 0.02027027027027027, "eval_accuracy": 0.07630177143167971, "eval_loss": 10.1953125, "eval_runtime": 263.3209, "eval_samples_per_second": 128.235, "eval_steps_per_second": 2.674, "step": 279 }, { "epoch": 0.02034292356873002, "grad_norm": 0.9145093560218811, "learning_rate": 9.998089218250509e-06, "loss": 10.2188, "step": 280 }, { "epoch": 0.02034292356873002, "eval_accuracy": 0.07648229669825482, "eval_loss": 10.1953125, "eval_runtime": 262.7765, "eval_samples_per_second": 128.501, "eval_steps_per_second": 2.679, "step": 280 }, { "epoch": 0.02041557686718977, "grad_norm": 0.9509057402610779, "learning_rate": 9.998081952920663e-06, "loss": 10.1953, "step": 281 }, { "epoch": 0.02041557686718977, "eval_accuracy": 0.07658419678458331, "eval_loss": 10.1875, "eval_runtime": 264.7797, "eval_samples_per_second": 127.529, "eval_steps_per_second": 2.659, "step": 281 }, { "epoch": 0.02048823016564952, "grad_norm": 0.9156680107116699, "learning_rate": 9.998074687590817e-06, "loss": 10.1953, "step": 282 }, { "epoch": 0.02048823016564952, "eval_accuracy": 0.07667509629341042, "eval_loss": 10.1875, "eval_runtime": 264.7774, "eval_samples_per_second": 127.53, "eval_steps_per_second": 2.659, "step": 282 }, { "epoch": 0.02056088346410927, "grad_norm": 0.9039434194564819, "learning_rate": 9.998067422260972e-06, "loss": 10.2031, "step": 283 }, { "epoch": 0.02056088346410927, "eval_accuracy": 0.07671203507470449, "eval_loss": 10.1875, "eval_runtime": 264.8864, "eval_samples_per_second": 127.477, "eval_steps_per_second": 2.658, "step": 283 }, { "epoch": 0.02063353676256902, "grad_norm": 0.9945496320724487, "learning_rate": 9.998060156931125e-06, "loss": 10.1797, "step": 284 }, { "epoch": 0.02063353676256902, "eval_accuracy": 0.0766314992678392, "eval_loss": 10.1875, "eval_runtime": 264.5241, "eval_samples_per_second": 127.652, "eval_steps_per_second": 2.661, "step": 284 }, { "epoch": 0.02070619006102877, "grad_norm": 1.0437395572662354, "learning_rate": 9.99805289160128e-06, "loss": 10.1953, "step": 285 }, { "epoch": 0.02070619006102877, "eval_accuracy": 0.07648548107595259, "eval_loss": 10.1875, "eval_runtime": 265.3496, "eval_samples_per_second": 127.255, "eval_steps_per_second": 2.653, "step": 285 }, { "epoch": 0.02077884335948852, "grad_norm": 0.9568849802017212, "learning_rate": 9.998045626271433e-06, "loss": 10.1953, "step": 286 }, { "epoch": 0.02077884335948852, "eval_accuracy": 0.07641924601983908, "eval_loss": 10.1796875, "eval_runtime": 264.6514, "eval_samples_per_second": 127.59, "eval_steps_per_second": 2.66, "step": 286 }, { "epoch": 0.02085149665794827, "grad_norm": 0.9541803002357483, "learning_rate": 9.998038360941588e-06, "loss": 10.1875, "step": 287 }, { "epoch": 0.02085149665794827, "eval_accuracy": 0.07642651219076761, "eval_loss": 10.1796875, "eval_runtime": 264.1269, "eval_samples_per_second": 127.844, "eval_steps_per_second": 2.665, "step": 287 }, { "epoch": 0.020924149956408022, "grad_norm": 0.8972413539886475, "learning_rate": 9.998031095611742e-06, "loss": 10.1953, "step": 288 }, { "epoch": 0.020924149956408022, "eval_accuracy": 0.0764929209402101, "eval_loss": 10.1796875, "eval_runtime": 262.3613, "eval_samples_per_second": 128.704, "eval_steps_per_second": 2.683, "step": 288 }, { "epoch": 0.02099680325486777, "grad_norm": 0.9032208323478699, "learning_rate": 9.998023830281896e-06, "loss": 10.1875, "step": 289 }, { "epoch": 0.02099680325486777, "eval_accuracy": 0.07653272566143215, "eval_loss": 10.1796875, "eval_runtime": 264.0713, "eval_samples_per_second": 127.871, "eval_steps_per_second": 2.666, "step": 289 }, { "epoch": 0.02106945655332752, "grad_norm": 0.9714264869689941, "learning_rate": 9.99801656495205e-06, "loss": 10.1875, "step": 290 }, { "epoch": 0.02106945655332752, "eval_accuracy": 0.07676037971793419, "eval_loss": 10.1796875, "eval_runtime": 265.0307, "eval_samples_per_second": 127.408, "eval_steps_per_second": 2.656, "step": 290 }, { "epoch": 0.021142109851787272, "grad_norm": 0.9713578820228577, "learning_rate": 9.998009299622204e-06, "loss": 10.1797, "step": 291 }, { "epoch": 0.021142109851787272, "eval_accuracy": 0.07695943227293267, "eval_loss": 10.171875, "eval_runtime": 264.6088, "eval_samples_per_second": 127.611, "eval_steps_per_second": 2.661, "step": 291 }, { "epoch": 0.02121476315024702, "grad_norm": 0.947812557220459, "learning_rate": 9.998002034292358e-06, "loss": 10.1719, "step": 292 }, { "epoch": 0.02121476315024702, "eval_accuracy": 0.07713083863973691, "eval_loss": 10.171875, "eval_runtime": 263.4033, "eval_samples_per_second": 128.195, "eval_steps_per_second": 2.673, "step": 292 }, { "epoch": 0.02128741644870677, "grad_norm": 0.980165421962738, "learning_rate": 9.997994768962512e-06, "loss": 10.1719, "step": 293 }, { "epoch": 0.02128741644870677, "eval_accuracy": 0.07723297031717068, "eval_loss": 10.171875, "eval_runtime": 264.0562, "eval_samples_per_second": 127.878, "eval_steps_per_second": 2.666, "step": 293 }, { "epoch": 0.021360069747166522, "grad_norm": 0.9119016528129578, "learning_rate": 9.997987503632666e-06, "loss": 10.1797, "step": 294 }, { "epoch": 0.021360069747166522, "eval_accuracy": 0.07731897746389849, "eval_loss": 10.171875, "eval_runtime": 265.096, "eval_samples_per_second": 127.377, "eval_steps_per_second": 2.656, "step": 294 }, { "epoch": 0.02143272304562627, "grad_norm": 0.9215472936630249, "learning_rate": 9.99798023830282e-06, "loss": 10.1797, "step": 295 }, { "epoch": 0.02143272304562627, "eval_accuracy": 0.07734911325647462, "eval_loss": 10.171875, "eval_runtime": 264.1241, "eval_samples_per_second": 127.845, "eval_steps_per_second": 2.665, "step": 295 }, { "epoch": 0.021505376344086023, "grad_norm": 0.915708601474762, "learning_rate": 9.997972972972974e-06, "loss": 10.1641, "step": 296 }, { "epoch": 0.021505376344086023, "eval_accuracy": 0.07730580571978501, "eval_loss": 10.1640625, "eval_runtime": 262.6233, "eval_samples_per_second": 128.576, "eval_steps_per_second": 2.681, "step": 296 }, { "epoch": 0.021578029642545772, "grad_norm": 0.9310121536254883, "learning_rate": 9.997965707643128e-06, "loss": 10.1719, "step": 297 }, { "epoch": 0.021578029642545772, "eval_accuracy": 0.07730317137096232, "eval_loss": 10.1640625, "eval_runtime": 263.411, "eval_samples_per_second": 128.191, "eval_steps_per_second": 2.673, "step": 297 }, { "epoch": 0.02165068294100552, "grad_norm": 0.9275549650192261, "learning_rate": 9.997958442313282e-06, "loss": 10.1719, "step": 298 }, { "epoch": 0.02165068294100552, "eval_accuracy": 0.07726713000520125, "eval_loss": 10.1640625, "eval_runtime": 263.956, "eval_samples_per_second": 127.927, "eval_steps_per_second": 2.667, "step": 298 }, { "epoch": 0.021723336239465273, "grad_norm": 0.9178668260574341, "learning_rate": 9.997951176983435e-06, "loss": 10.1719, "step": 299 }, { "epoch": 0.021723336239465273, "eval_accuracy": 0.07729367613564535, "eval_loss": 10.1640625, "eval_runtime": 263.6087, "eval_samples_per_second": 128.095, "eval_steps_per_second": 2.671, "step": 299 }, { "epoch": 0.021795989537925022, "grad_norm": 0.9181063175201416, "learning_rate": 9.997943911653591e-06, "loss": 10.1719, "step": 300 }, { "epoch": 0.021795989537925022, "eval_accuracy": 0.0773372442123284, "eval_loss": 10.1640625, "eval_runtime": 264.6871, "eval_samples_per_second": 127.573, "eval_steps_per_second": 2.66, "step": 300 }, { "epoch": 0.02186864283638477, "grad_norm": 0.9063278436660767, "learning_rate": 9.997936646323743e-06, "loss": 10.1641, "step": 301 }, { "epoch": 0.02186864283638477, "eval_accuracy": 0.07729164971347403, "eval_loss": 10.1640625, "eval_runtime": 263.4469, "eval_samples_per_second": 128.174, "eval_steps_per_second": 2.672, "step": 301 }, { "epoch": 0.021941296134844523, "grad_norm": 0.9040680527687073, "learning_rate": 9.997929380993899e-06, "loss": 10.1562, "step": 302 }, { "epoch": 0.021941296134844523, "eval_accuracy": 0.07716485358332667, "eval_loss": 10.15625, "eval_runtime": 264.4717, "eval_samples_per_second": 127.677, "eval_steps_per_second": 2.662, "step": 302 }, { "epoch": 0.022013949433304272, "grad_norm": 0.9027392864227295, "learning_rate": 9.997922115664051e-06, "loss": 10.1719, "step": 303 }, { "epoch": 0.022013949433304272, "eval_accuracy": 0.0771164799912088, "eval_loss": 10.15625, "eval_runtime": 263.5132, "eval_samples_per_second": 128.142, "eval_steps_per_second": 2.672, "step": 303 }, { "epoch": 0.02208660273176402, "grad_norm": 0.9688916802406311, "learning_rate": 9.997914850334205e-06, "loss": 10.1562, "step": 304 }, { "epoch": 0.02208660273176402, "eval_accuracy": 0.07716798006324811, "eval_loss": 10.15625, "eval_runtime": 265.6792, "eval_samples_per_second": 127.097, "eval_steps_per_second": 2.65, "step": 304 }, { "epoch": 0.022159256030223774, "grad_norm": 0.9013357162475586, "learning_rate": 9.997907585004361e-06, "loss": 10.1641, "step": 305 }, { "epoch": 0.022159256030223774, "eval_accuracy": 0.07729283661788866, "eval_loss": 10.15625, "eval_runtime": 263.3689, "eval_samples_per_second": 128.212, "eval_steps_per_second": 2.673, "step": 305 }, { "epoch": 0.022231909328683522, "grad_norm": 0.9209669828414917, "learning_rate": 9.997900319674513e-06, "loss": 10.1562, "step": 306 }, { "epoch": 0.022231909328683522, "eval_accuracy": 0.07727607521164315, "eval_loss": 10.1484375, "eval_runtime": 262.7117, "eval_samples_per_second": 128.533, "eval_steps_per_second": 2.68, "step": 306 }, { "epoch": 0.02230456262714327, "grad_norm": 0.9404518604278564, "learning_rate": 9.997893054344669e-06, "loss": 10.1641, "step": 307 }, { "epoch": 0.02230456262714327, "eval_accuracy": 0.07732798056811672, "eval_loss": 10.1484375, "eval_runtime": 264.5099, "eval_samples_per_second": 127.659, "eval_steps_per_second": 2.662, "step": 307 }, { "epoch": 0.022377215925603024, "grad_norm": 0.8949778079986572, "learning_rate": 9.997885789014821e-06, "loss": 10.1719, "step": 308 }, { "epoch": 0.022377215925603024, "eval_accuracy": 0.07748395717753088, "eval_loss": 10.1484375, "eval_runtime": 265.2866, "eval_samples_per_second": 127.285, "eval_steps_per_second": 2.654, "step": 308 }, { "epoch": 0.022449869224062773, "grad_norm": 0.9001926183700562, "learning_rate": 9.997878523684977e-06, "loss": 10.1562, "step": 309 }, { "epoch": 0.022449869224062773, "eval_accuracy": 0.07751218234348835, "eval_loss": 10.1484375, "eval_runtime": 265.0408, "eval_samples_per_second": 127.403, "eval_steps_per_second": 2.656, "step": 309 }, { "epoch": 0.02252252252252252, "grad_norm": 0.9069272875785828, "learning_rate": 9.997871258355129e-06, "loss": 10.1719, "step": 310 }, { "epoch": 0.02252252252252252, "eval_accuracy": 0.07750650836140868, "eval_loss": 10.1484375, "eval_runtime": 265.3373, "eval_samples_per_second": 127.261, "eval_steps_per_second": 2.653, "step": 310 }, { "epoch": 0.022595175820982274, "grad_norm": 0.92779940366745, "learning_rate": 9.997863993025285e-06, "loss": 10.1562, "step": 311 }, { "epoch": 0.022595175820982274, "eval_accuracy": 0.07742909903446483, "eval_loss": 10.140625, "eval_runtime": 264.244, "eval_samples_per_second": 127.787, "eval_steps_per_second": 2.664, "step": 311 }, { "epoch": 0.022667829119442023, "grad_norm": 0.9007747769355774, "learning_rate": 9.997856727695438e-06, "loss": 10.1562, "step": 312 }, { "epoch": 0.022667829119442023, "eval_accuracy": 0.07736051911841024, "eval_loss": 10.140625, "eval_runtime": 265.1787, "eval_samples_per_second": 127.337, "eval_steps_per_second": 2.655, "step": 312 }, { "epoch": 0.02274048241790177, "grad_norm": 0.9027653336524963, "learning_rate": 9.997849462365592e-06, "loss": 10.1562, "step": 313 }, { "epoch": 0.02274048241790177, "eval_accuracy": 0.07730719526641676, "eval_loss": 10.140625, "eval_runtime": 262.9549, "eval_samples_per_second": 128.414, "eval_steps_per_second": 2.677, "step": 313 }, { "epoch": 0.022813135716361524, "grad_norm": 0.9862774610519409, "learning_rate": 9.997842197035746e-06, "loss": 10.1406, "step": 314 }, { "epoch": 0.022813135716361524, "eval_accuracy": 0.07735834795179813, "eval_loss": 10.140625, "eval_runtime": 263.4521, "eval_samples_per_second": 128.171, "eval_steps_per_second": 2.672, "step": 314 }, { "epoch": 0.022885789014821273, "grad_norm": 0.9319806694984436, "learning_rate": 9.9978349317059e-06, "loss": 10.1406, "step": 315 }, { "epoch": 0.022885789014821273, "eval_accuracy": 0.07738877323325587, "eval_loss": 10.140625, "eval_runtime": 262.7183, "eval_samples_per_second": 128.529, "eval_steps_per_second": 2.68, "step": 315 }, { "epoch": 0.02295844231328102, "grad_norm": 0.9190651774406433, "learning_rate": 9.997827666376054e-06, "loss": 10.1406, "step": 316 }, { "epoch": 0.02295844231328102, "eval_accuracy": 0.07740159759071141, "eval_loss": 10.140625, "eval_runtime": 264.3693, "eval_samples_per_second": 127.727, "eval_steps_per_second": 2.663, "step": 316 }, { "epoch": 0.023031095611740774, "grad_norm": 0.9385405778884888, "learning_rate": 9.997820401046208e-06, "loss": 10.1328, "step": 317 }, { "epoch": 0.023031095611740774, "eval_accuracy": 0.07745408192494821, "eval_loss": 10.1328125, "eval_runtime": 263.8288, "eval_samples_per_second": 127.988, "eval_steps_per_second": 2.668, "step": 317 }, { "epoch": 0.023103748910200523, "grad_norm": 0.9594412446022034, "learning_rate": 9.997813135716362e-06, "loss": 10.1484, "step": 318 }, { "epoch": 0.023103748910200523, "eval_accuracy": 0.07749652099499298, "eval_loss": 10.1328125, "eval_runtime": 262.8505, "eval_samples_per_second": 128.465, "eval_steps_per_second": 2.678, "step": 318 }, { "epoch": 0.023176402208660272, "grad_norm": 0.9393614530563354, "learning_rate": 9.997805870386516e-06, "loss": 10.1328, "step": 319 }, { "epoch": 0.023176402208660272, "eval_accuracy": 0.07747440404443759, "eval_loss": 10.1328125, "eval_runtime": 263.8311, "eval_samples_per_second": 127.987, "eval_steps_per_second": 2.668, "step": 319 }, { "epoch": 0.023249055507120024, "grad_norm": 0.9211113452911377, "learning_rate": 9.99779860505667e-06, "loss": 10.1328, "step": 320 }, { "epoch": 0.023249055507120024, "eval_accuracy": 0.07746320082471908, "eval_loss": 10.1328125, "eval_runtime": 264.0976, "eval_samples_per_second": 127.858, "eval_steps_per_second": 2.666, "step": 320 }, { "epoch": 0.023321708805579773, "grad_norm": 0.9568068385124207, "learning_rate": 9.997791339726824e-06, "loss": 10.125, "step": 321 }, { "epoch": 0.023321708805579773, "eval_accuracy": 0.07749640519944033, "eval_loss": 10.1328125, "eval_runtime": 264.3338, "eval_samples_per_second": 127.744, "eval_steps_per_second": 2.663, "step": 321 }, { "epoch": 0.023394362104039522, "grad_norm": 0.9372284412384033, "learning_rate": 9.99778407439698e-06, "loss": 10.1406, "step": 322 }, { "epoch": 0.023394362104039522, "eval_accuracy": 0.07760533986559205, "eval_loss": 10.125, "eval_runtime": 264.2267, "eval_samples_per_second": 127.796, "eval_steps_per_second": 2.664, "step": 322 }, { "epoch": 0.023467015402499274, "grad_norm": 0.9022813439369202, "learning_rate": 9.997776809067132e-06, "loss": 10.1328, "step": 323 }, { "epoch": 0.023467015402499274, "eval_accuracy": 0.07770527142752555, "eval_loss": 10.125, "eval_runtime": 263.5487, "eval_samples_per_second": 128.124, "eval_steps_per_second": 2.671, "step": 323 }, { "epoch": 0.023539668700959023, "grad_norm": 0.9569028615951538, "learning_rate": 9.997769543737288e-06, "loss": 10.125, "step": 324 }, { "epoch": 0.023539668700959023, "eval_accuracy": 0.07776036116169688, "eval_loss": 10.125, "eval_runtime": 264.1074, "eval_samples_per_second": 127.853, "eval_steps_per_second": 2.666, "step": 324 }, { "epoch": 0.023612321999418772, "grad_norm": 0.926621675491333, "learning_rate": 9.99776227840744e-06, "loss": 10.125, "step": 325 }, { "epoch": 0.023612321999418772, "eval_accuracy": 0.07774163123105639, "eval_loss": 10.125, "eval_runtime": 264.9077, "eval_samples_per_second": 127.467, "eval_steps_per_second": 2.658, "step": 325 }, { "epoch": 0.023684975297878524, "grad_norm": 0.8989631533622742, "learning_rate": 9.997755013077595e-06, "loss": 10.125, "step": 326 }, { "epoch": 0.023684975297878524, "eval_accuracy": 0.0777147087650662, "eval_loss": 10.125, "eval_runtime": 262.9955, "eval_samples_per_second": 128.394, "eval_steps_per_second": 2.677, "step": 326 }, { "epoch": 0.023757628596338273, "grad_norm": 0.918336033821106, "learning_rate": 9.997747747747748e-06, "loss": 10.1328, "step": 327 }, { "epoch": 0.023757628596338273, "eval_accuracy": 0.07771042432961829, "eval_loss": 10.1171875, "eval_runtime": 264.3981, "eval_samples_per_second": 127.713, "eval_steps_per_second": 2.663, "step": 327 }, { "epoch": 0.023830281894798022, "grad_norm": 0.9403995275497437, "learning_rate": 9.997740482417903e-06, "loss": 10.1172, "step": 328 }, { "epoch": 0.023830281894798022, "eval_accuracy": 0.07768630990577977, "eval_loss": 10.1171875, "eval_runtime": 262.3713, "eval_samples_per_second": 128.699, "eval_steps_per_second": 2.683, "step": 328 }, { "epoch": 0.023902935193257775, "grad_norm": 0.9186561703681946, "learning_rate": 9.997733217088057e-06, "loss": 10.1172, "step": 329 }, { "epoch": 0.023902935193257775, "eval_accuracy": 0.07772889372026533, "eval_loss": 10.1171875, "eval_runtime": 264.6641, "eval_samples_per_second": 127.584, "eval_steps_per_second": 2.66, "step": 329 }, { "epoch": 0.023975588491717523, "grad_norm": 0.9268199801445007, "learning_rate": 9.997725951758211e-06, "loss": 10.125, "step": 330 }, { "epoch": 0.023975588491717523, "eval_accuracy": 0.07778919425430574, "eval_loss": 10.1171875, "eval_runtime": 264.2577, "eval_samples_per_second": 127.781, "eval_steps_per_second": 2.664, "step": 330 }, { "epoch": 0.024048241790177276, "grad_norm": 0.9123356342315674, "learning_rate": 9.997718686428365e-06, "loss": 10.1094, "step": 331 }, { "epoch": 0.024048241790177276, "eval_accuracy": 0.07782868053775802, "eval_loss": 10.1171875, "eval_runtime": 264.6854, "eval_samples_per_second": 127.574, "eval_steps_per_second": 2.66, "step": 331 }, { "epoch": 0.024120895088637025, "grad_norm": 0.9475653767585754, "learning_rate": 9.997711421098517e-06, "loss": 10.1094, "step": 332 }, { "epoch": 0.024120895088637025, "eval_accuracy": 0.07771977482049446, "eval_loss": 10.109375, "eval_runtime": 264.176, "eval_samples_per_second": 127.82, "eval_steps_per_second": 2.665, "step": 332 }, { "epoch": 0.024193548387096774, "grad_norm": 0.9262251853942871, "learning_rate": 9.997704155768673e-06, "loss": 10.1094, "step": 333 }, { "epoch": 0.024193548387096774, "eval_accuracy": 0.07764413137572845, "eval_loss": 10.109375, "eval_runtime": 266.5418, "eval_samples_per_second": 126.686, "eval_steps_per_second": 2.641, "step": 333 }, { "epoch": 0.024266201685556526, "grad_norm": 0.9046162962913513, "learning_rate": 9.997696890438827e-06, "loss": 10.1172, "step": 334 }, { "epoch": 0.024266201685556526, "eval_accuracy": 0.07753328608295808, "eval_loss": 10.109375, "eval_runtime": 264.8485, "eval_samples_per_second": 127.496, "eval_steps_per_second": 2.658, "step": 334 }, { "epoch": 0.024338854984016275, "grad_norm": 0.8864550590515137, "learning_rate": 9.997689625108981e-06, "loss": 10.125, "step": 335 }, { "epoch": 0.024338854984016275, "eval_accuracy": 0.07736963801818111, "eval_loss": 10.109375, "eval_runtime": 262.4932, "eval_samples_per_second": 128.64, "eval_steps_per_second": 2.682, "step": 335 }, { "epoch": 0.024411508282476024, "grad_norm": 0.8957669138908386, "learning_rate": 9.997682359779135e-06, "loss": 10.1172, "step": 336 }, { "epoch": 0.024411508282476024, "eval_accuracy": 0.07715223186808826, "eval_loss": 10.109375, "eval_runtime": 264.8989, "eval_samples_per_second": 127.471, "eval_steps_per_second": 2.658, "step": 336 }, { "epoch": 0.024484161580935776, "grad_norm": 0.9608045816421509, "learning_rate": 9.997675094449289e-06, "loss": 10.1016, "step": 337 }, { "epoch": 0.024484161580935776, "eval_accuracy": 0.07713784427067198, "eval_loss": 10.1015625, "eval_runtime": 264.4843, "eval_samples_per_second": 127.671, "eval_steps_per_second": 2.662, "step": 337 }, { "epoch": 0.024556814879395525, "grad_norm": 0.9367948770523071, "learning_rate": 9.997667829119443e-06, "loss": 10.1094, "step": 338 }, { "epoch": 0.024556814879395525, "eval_accuracy": 0.0773351309434926, "eval_loss": 10.1015625, "eval_runtime": 264.3155, "eval_samples_per_second": 127.753, "eval_steps_per_second": 2.663, "step": 338 }, { "epoch": 0.024629468177855274, "grad_norm": 0.9086586833000183, "learning_rate": 9.997660563789597e-06, "loss": 10.1172, "step": 339 }, { "epoch": 0.024629468177855274, "eval_accuracy": 0.07753942324724832, "eval_loss": 10.1015625, "eval_runtime": 263.7541, "eval_samples_per_second": 128.025, "eval_steps_per_second": 2.669, "step": 339 }, { "epoch": 0.024702121476315026, "grad_norm": 0.936314046382904, "learning_rate": 9.99765329845975e-06, "loss": 10.1094, "step": 340 }, { "epoch": 0.024702121476315026, "eval_accuracy": 0.07769429979891233, "eval_loss": 10.1015625, "eval_runtime": 264.8284, "eval_samples_per_second": 127.505, "eval_steps_per_second": 2.658, "step": 340 }, { "epoch": 0.024774774774774775, "grad_norm": 0.8729653358459473, "learning_rate": 9.997646033129905e-06, "loss": 10.1172, "step": 341 }, { "epoch": 0.024774774774774775, "eval_accuracy": 0.07776137437278254, "eval_loss": 10.1015625, "eval_runtime": 264.1666, "eval_samples_per_second": 127.825, "eval_steps_per_second": 2.665, "step": 341 }, { "epoch": 0.024847428073234524, "grad_norm": 0.9122793078422546, "learning_rate": 9.997638767800059e-06, "loss": 10.0938, "step": 342 }, { "epoch": 0.024847428073234524, "eval_accuracy": 0.07787676464099427, "eval_loss": 10.09375, "eval_runtime": 264.3725, "eval_samples_per_second": 127.725, "eval_steps_per_second": 2.663, "step": 342 }, { "epoch": 0.024920081371694276, "grad_norm": 0.9096229076385498, "learning_rate": 9.997631502470212e-06, "loss": 10.1016, "step": 343 }, { "epoch": 0.024920081371694276, "eval_accuracy": 0.077991257493673, "eval_loss": 10.09375, "eval_runtime": 265.889, "eval_samples_per_second": 126.997, "eval_steps_per_second": 2.648, "step": 343 }, { "epoch": 0.024992734670154025, "grad_norm": 0.9116566181182861, "learning_rate": 9.997624237140366e-06, "loss": 10.0938, "step": 344 }, { "epoch": 0.024992734670154025, "eval_accuracy": 0.07803271230152027, "eval_loss": 10.09375, "eval_runtime": 263.9618, "eval_samples_per_second": 127.924, "eval_steps_per_second": 2.667, "step": 344 }, { "epoch": 0.025065387968613774, "grad_norm": 0.9252493381500244, "learning_rate": 9.99761697181052e-06, "loss": 10.0938, "step": 345 }, { "epoch": 0.025065387968613774, "eval_accuracy": 0.0780303674415792, "eval_loss": 10.09375, "eval_runtime": 264.8551, "eval_samples_per_second": 127.492, "eval_steps_per_second": 2.658, "step": 345 }, { "epoch": 0.025138041267073526, "grad_norm": 0.8922543525695801, "learning_rate": 9.997609706480676e-06, "loss": 10.1016, "step": 346 }, { "epoch": 0.025138041267073526, "eval_accuracy": 0.07808099904697366, "eval_loss": 10.09375, "eval_runtime": 264.1901, "eval_samples_per_second": 127.813, "eval_steps_per_second": 2.665, "step": 346 }, { "epoch": 0.025210694565533275, "grad_norm": 0.8663190603256226, "learning_rate": 9.997602441150828e-06, "loss": 10.1094, "step": 347 }, { "epoch": 0.025210694565533275, "eval_accuracy": 0.07801609563971557, "eval_loss": 10.0859375, "eval_runtime": 262.4011, "eval_samples_per_second": 128.685, "eval_steps_per_second": 2.683, "step": 347 }, { "epoch": 0.025283347863993024, "grad_norm": 0.9128501415252686, "learning_rate": 9.997595175820984e-06, "loss": 10.0938, "step": 348 }, { "epoch": 0.025283347863993024, "eval_accuracy": 0.07798708885377775, "eval_loss": 10.0859375, "eval_runtime": 263.7953, "eval_samples_per_second": 128.005, "eval_steps_per_second": 2.669, "step": 348 }, { "epoch": 0.025356001162452776, "grad_norm": 0.9011194705963135, "learning_rate": 9.997587910491136e-06, "loss": 10.0938, "step": 349 }, { "epoch": 0.025356001162452776, "eval_accuracy": 0.07796170067886012, "eval_loss": 10.0859375, "eval_runtime": 263.0115, "eval_samples_per_second": 128.386, "eval_steps_per_second": 2.677, "step": 349 }, { "epoch": 0.025428654460912525, "grad_norm": 0.9395301342010498, "learning_rate": 9.997580645161292e-06, "loss": 10.0859, "step": 350 }, { "epoch": 0.025428654460912525, "eval_accuracy": 0.0779401337571798, "eval_loss": 10.0859375, "eval_runtime": 265.4304, "eval_samples_per_second": 127.216, "eval_steps_per_second": 2.652, "step": 350 }, { "epoch": 0.025501307759372274, "grad_norm": 0.9046230316162109, "learning_rate": 9.997573379831446e-06, "loss": 10.0859, "step": 351 }, { "epoch": 0.025501307759372274, "eval_accuracy": 0.07796905369645313, "eval_loss": 10.0859375, "eval_runtime": 264.1028, "eval_samples_per_second": 127.856, "eval_steps_per_second": 2.666, "step": 351 }, { "epoch": 0.025573961057832027, "grad_norm": 0.9076169729232788, "learning_rate": 9.9975661145016e-06, "loss": 10.0938, "step": 352 }, { "epoch": 0.025573961057832027, "eval_accuracy": 0.07807518032045319, "eval_loss": 10.078125, "eval_runtime": 263.5124, "eval_samples_per_second": 128.142, "eval_steps_per_second": 2.672, "step": 352 }, { "epoch": 0.025573961057832027, "step": 352, "total_flos": 247015648788480.0, "train_loss": 10.390092329545455, "train_runtime": 94034.2968, "train_samples_per_second": 702.555, "train_steps_per_second": 14.637 } ], "logging_steps": 1, "max_steps": 1376400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 247015648788480.0, "train_batch_size": 48, "trial_name": null, "trial_params": null }